In [1]:
import os

import numpy as np
import pandas as pd

# Optional: report the installed pandas version so readers can tell which
# release produced the outputs in this notebook
pandas_version = pd.__version__
print('pandas version:', pandas_version)


pandas version: 2.3.3


In [2]:
# Location of the sensor log CSV. Set the SENSOR_LOG_CSV environment variable
# to point at your own copy of the file; when it is not set, the original
# hard-coded location is used so existing setups keep working unchanged.
csv_path = os.environ.get(
    'SENSOR_LOG_CSV',
    'C:/Users/UG/Desktop/ML/Missing Values Assignment/python_missingval/sensor_log.csv',
)

# Read the CSV file into a DataFrame
df = pd.read_csv(csv_path)

# Look at the first 5 rows
df.head()

Unnamed: 0,timestamp,temperature_c,humidity_pct,voltage_v
0,2025-10-01 08:00:00,24.5,55.2,3.7
1,2025-10-01 08:00:10,24.7,55.0,3.69
2,2025-10-01 08:00:20,24.6,55.1,
3,2025-10-01 08:00:30,,54.9,3.68
4,2025-10-01 08:01:00,24.9,54.8,3.68


In [3]:
# Report the size of the dataset: df.shape is (rows, columns)
n_rows, n_cols = df.shape
print('Number of rows:', n_rows)
print('Number of columns:', n_cols)

# Per-column summary: data type and number of non-null entries
df.info()


Number of rows: 10
Number of columns: 4
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   timestamp      10 non-null     object 
 1   temperature_c  8 non-null      float64
 2   humidity_pct   9 non-null      float64
 3   voltage_v      9 non-null      float64
dtypes: float64(3), object(1)
memory usage: 452.0+ bytes


In [4]:
# Preview the missing-value mask for the first 5 rows (True marks a missing cell)
df.head().isna()


Unnamed: 0,timestamp,temperature_c,humidity_pct,voltage_v
0,False,False,False,False
1,False,False,False,False
2,False,False,False,True
3,False,True,False,False
4,False,False,False,False


In [5]:
# Build the True/False missing-value mask, then add it up column by column
# (each True counts as 1) to get the number of missing values per column
missing_mask = df.isna()
missing_mask.sum()

timestamp        0
temperature_c    2
humidity_pct     1
voltage_v        1
dtype: int64

In [6]:
# The mean of a True/False mask is the fraction of True values,
# so scaling it by 100 gives the percentage of missing values per column
missing_percent = df.isna().mean().mul(100)
missing_percent.round(2)

timestamp         0.0
temperature_c    20.0
humidity_pct     10.0
voltage_v        10.0
dtype: float64

In [7]:
# Keep only the rows in which every column has a value
# (axis='index' works row-wise; how='any' drops a row on a single missing cell)
df_drop_rows = df.dropna(axis='index', how='any')

original_shape = df.shape
print('Original shape:', original_shape)
print('After dropping rows with any missing values:', df_drop_rows.shape)
df_drop_rows.head()


Original shape: (10, 4)
After dropping rows with any missing values: (6, 4)


Unnamed: 0,timestamp,temperature_c,humidity_pct,voltage_v
0,2025-10-01 08:00:00,24.5,55.2,3.7
1,2025-10-01 08:00:10,24.7,55.0,3.69
4,2025-10-01 08:01:00,24.9,54.8,3.68
6,2025-10-01 08:03:00,25.3,54.7,3.67
7,2025-10-01 08:05:30,25.5,54.9,3.65


In [8]:
# Keep only the columns that have no missing values at all
# (axis=1 works column-wise; how='any' drops a column on a single missing cell)
df_drop_cols = df.dropna(axis=1, how='any')

print('Columns before:', list(df.columns))
print('Columns after dropping any column with missing values:', list(df_drop_cols.columns))
df_drop_cols.head()

Columns before: ['timestamp', 'temperature_c', 'humidity_pct', 'voltage_v']
Columns after dropping any column with missing values: ['timestamp']


Unnamed: 0,timestamp
0,2025-10-01 08:00:00
1,2025-10-01 08:00:10
2,2025-10-01 08:00:20
3,2025-10-01 08:00:30
4,2025-10-01 08:01:00


In [9]:
# Example: constant imputation. Build a new DataFrame whose voltage column has
# every missing reading replaced by 0; the original df is left untouched.
df_constant = df.assign(voltage_v=df['voltage_v'].fillna(0))

df_constant.head()

Unnamed: 0,timestamp,temperature_c,humidity_pct,voltage_v
0,2025-10-01 08:00:00,24.5,55.2,3.7
1,2025-10-01 08:00:10,24.7,55.0,3.69
2,2025-10-01 08:00:20,24.6,55.1,0.0
3,2025-10-01 08:00:30,,54.9,3.68
4,2025-10-01 08:01:00,24.9,54.8,3.68


In [10]:
# Mean imputation: replace each gap in a numeric column with that column's mean
df_mean = df.copy()
numeric_cols = df_mean.select_dtypes(include='number').columns

# Compute one mean per numeric column, keyed by column name ...
column_means = {name: df_mean[name].mean() for name in numeric_cols}
# ... and let fillna apply each value to the column with the matching name
df_mean = df_mean.fillna(value=column_means)

df_mean.head()


Unnamed: 0,timestamp,temperature_c,humidity_pct,voltage_v
0,2025-10-01 08:00:00,24.5,55.2,3.7
1,2025-10-01 08:00:10,24.7,55.0,3.69
2,2025-10-01 08:00:20,24.6,55.1,3.667778
3,2025-10-01 08:00:30,25.075,54.9,3.68
4,2025-10-01 08:01:00,24.9,54.8,3.68


In [11]:
# Sanity check: count the remaining missing values in each numeric column
df_mean.loc[:, numeric_cols].isna().sum()

temperature_c    0
humidity_pct     0
voltage_v        0
dtype: int64

In [12]:
# (Optional) Small example mixing a categorical column and a numeric column,
# each with one missing entry
example = pd.DataFrame(
    {
        'city': ['Accra', 'Accra', 'Kumasi', np.nan, 'Accra'],
        'temperature_c': [30, 31, 29, 28, np.nan],
    }
)

print('Original example DataFrame:')
display(example)

# Work out both replacement values first:
# the most frequent city (the mode) and the median temperature
city_mode = example['city'].mode().iloc[0]
temp_median = example['temperature_c'].median()

# Apply both fills in a single step
example = example.assign(
    city=example['city'].fillna(city_mode),
    temperature_c=example['temperature_c'].fillna(temp_median),
)

print('After filling missing values:')
display(example)

Original example DataFrame:


Unnamed: 0,city,temperature_c
0,Accra,30.0
1,Accra,31.0
2,Kumasi,29.0
3,,28.0
4,Accra,


After filling missing values:


Unnamed: 0,city,temperature_c
0,Accra,30.0
1,Accra,31.0
2,Kumasi,29.0
3,Accra,28.0
4,Accra,29.5


In [13]:
# Build a time-indexed copy of the data: parse the timestamp strings into
# datetimes, then move that column into the index. The original df is unchanged.
df_ts = (
    df
    .assign(timestamp=pd.to_datetime(df['timestamp']))
    .set_index('timestamp')
)

df_ts.head()

Unnamed: 0_level_0,temperature_c,humidity_pct,voltage_v
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-10-01 08:00:00,24.5,55.2,3.7
2025-10-01 08:00:10,24.7,55.0,3.69
2025-10-01 08:00:20,24.6,55.1,
2025-10-01 08:00:30,,54.9,3.68
2025-10-01 08:01:00,24.9,54.8,3.68


In [14]:
# Forward fill down the rows: a gap is filled with the most recent
# earlier value in the same column
df_ffill = df_ts.ffill(axis=0)

df_ffill.head()

Unnamed: 0_level_0,temperature_c,humidity_pct,voltage_v
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-10-01 08:00:00,24.5,55.2,3.7
2025-10-01 08:00:10,24.7,55.0,3.69
2025-10-01 08:00:20,24.6,55.1,3.69
2025-10-01 08:00:30,24.6,54.9,3.68
2025-10-01 08:01:00,24.9,54.8,3.68


In [15]:
# Backward fill up the rows: a gap is filled with the next
# later value in the same column
df_bfill = df_ts.bfill(axis=0)

df_bfill.head()

Unnamed: 0_level_0,temperature_c,humidity_pct,voltage_v
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-10-01 08:00:00,24.5,55.2,3.7
2025-10-01 08:00:10,24.7,55.0,3.69
2025-10-01 08:00:20,24.6,55.1,3.68
2025-10-01 08:00:30,24.9,54.9,3.68
2025-10-01 08:01:00,24.9,54.8,3.68


In [16]:
# Fill gaps by interpolating between the neighbouring readings.
# method='time' uses the datetime index, so each estimate is weighted by how
# much time separates it from the readings on either side.
df_interp = df_ts.interpolate(method='time', axis=0)

df_interp.head()

Unnamed: 0_level_0,temperature_c,humidity_pct,voltage_v
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-10-01 08:00:00,24.5,55.2,3.7
2025-10-01 08:00:10,24.7,55.0,3.69
2025-10-01 08:00:20,24.6,55.1,3.685
2025-10-01 08:00:30,24.675,54.9,3.68
2025-10-01 08:01:00,24.9,54.8,3.68
