In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer

# Optional: Try importing Prophet, but handle case where it's not installed
try:
    from prophet import Prophet
    prophet_available = True
except ImportError:
    print("Prophet not installed. To install: pip install prophet")
    prophet_available = False

# Load the dataset from a CSV expected in the current working directory.
# The resulting frame `df` is the input passed to each imputation function below.
df = pd.read_csv("march_april_dataset.csv")
# Parse the 'Timestamp' column into pandas datetimes (it is read as text by default).
df["Timestamp"] = pd.to_datetime(df["Timestamp"])
print(f"Dataset loaded with {len(df)} rows")

Dataset loaded with 5856 rows


In [22]:

# -----------------------------------------------------------------
# Approach 1: Prophet-Based Imputation with Mean Adjustment
# -----------------------------------------------------------------

def prophet_imputation(df):
    """Impute missing 'Reading' values with Prophet forecasts, rescaled per gap.

    A Prophet model with daily seasonality is fitted on the rows that have
    both a timestamp and a reading. Every missing reading is replaced by the
    model's ``yhat`` for that row. Each contiguous run of missing rows that
    has an observed reading immediately before and after it is then rescaled
    so the filled values sum to ``next observed - previous observed``.

    Gaps are located by row position, so the frame's index labels do not
    need to be consecutive integers. A gap that touches the first or last
    row has no reading on one side; it keeps its unscaled forecast.

    Args:
        df: DataFrame with a 'Timestamp' column (datetimes) and a 'Reading'
            column (numeric, NaN where missing). It is not modified.

    Returns:
        None when Prophet is not installed. A full copy of ``df`` when
        nothing is missing. Otherwise a DataFrame with the columns
        'Timestamp' and 'Reading' on the original index.

    NOTE(review): the scaling target is a *difference* of two readings while
    the scaled quantity is a *sum* of readings. That is only meaningful under
    a specific interpretation of 'Reading' -- confirm with the data owner.
    """
    if not prophet_available:
        print("Prophet not available. Install with: pip install prophet")
        return None

    # Create a copy to avoid modifying original data
    df_prophet = df.copy()

    # Identify missing values (as a positional boolean array)
    missing_mask = df_prophet['Reading'].isna().to_numpy()
    if not missing_mask.any():
        print("No missing values found!")
        return df_prophet

    # Snapshot of the readings as floats; NaN marks the rows to impute.
    observed = df_prophet['Reading'].to_numpy(dtype=float, na_value=np.nan)

    # Set up Prophet. Only rows lacking a timestamp or a reading are excluded
    # from training; NaNs in unrelated columns must not discard usable rows.
    df_prophet_input = df_prophet.rename(columns={"Timestamp": "ds", "Reading": "y"})
    model = Prophet(daily_seasonality=True)
    model.fit(df_prophet_input.dropna(subset=["ds", "y"]))

    # Predict for every row of the input.
    # NOTE(review): predictions are matched to input rows by position, which
    # presumes the forecast comes back in the input's row order -- confirm
    # for data that is not already sorted by timestamp.
    forecast = model.predict(df_prophet_input)

    # Use predictions only where the reading was missing
    imputed = np.where(missing_mask, forecast['yhat'].to_numpy(), observed)

    # Split the missing row positions into contiguous runs (one per gap).
    missing_positions = np.flatnonzero(missing_mask)
    run_breaks = np.flatnonzero(np.diff(missing_positions) > 1) + 1
    last_position = len(imputed) - 1

    for gap in np.split(missing_positions, run_breaks):
        start, stop = gap[0], gap[-1]
        if start == 0 or stop == last_position:
            # No observed reading on one side: no target total to scale to.
            continue

        # Runs are maximal, so both neighbours are observed (non-NaN) readings.
        expected_total = observed[stop + 1] - observed[start - 1]
        imputed_sum = imputed[gap].sum()

        if imputed_sum > 0:  # Avoid division by zero; non-positive sums stay unscaled
            imputed[gap] *= expected_total / imputed_sum

    # Return the final result
    return df_prophet[['Timestamp']].assign(Reading=imputed)

# Apply Prophet imputation.
# The call is skipped when the optional Prophet import failed, leaving df_prophet as None.
df_prophet = prophet_imputation(df) if prophet_available else None
if df_prophet is not None:
    print("Prophet imputation completed")
    # Persist the imputed series; index=False keeps the row index out of the CSV.
    df_prophet.to_csv("prophet_imputed.csv", index=False)

12:21:49 - cmdstanpy - INFO - Chain [1] start processing
12:21:51 - cmdstanpy - INFO - Chain [1] done processing


Prophet imputation completed


In [23]:

# -----------------------------------------------------------------
# Approach 2: Mean-Based Imputation
# -----------------------------------------------------------------

def mean_imputation(df):
    """Fill missing 'Reading' entries with the mean of the observed readings.

    The caller's frame is never modified; the work happens on a copy, which
    is what gets returned. If no reading is missing, a message is printed
    and the untouched copy is returned as-is.

    Args:
        df: DataFrame containing a 'Reading' column (NaN where missing).

    Returns:
        A copy of ``df`` in which every missing 'Reading' holds the mean of
        the non-missing readings.
    """
    filled = df.copy()
    is_gap = filled['Reading'].isna()

    # Guard clause: nothing to impute
    if not is_gap.any():
        print("No missing values found!")
        return filled

    # Average over the observed rows only, then write it into the gaps
    observed_mean = filled.loc[~is_gap, 'Reading'].mean()
    filled.loc[is_gap, 'Reading'] = observed_mean
    return filled

# Apply Mean imputation to the loaded dataset and save the result.
df_mean = mean_imputation(df)
print("Mean imputation completed")
# index=False keeps the row index out of the CSV.
df_mean.to_csv("mean_imputed.csv", index=False)


Mean imputation completed


In [24]:

# -----------------------------------------------------------------
# Approach 3: IQR-Based Outlier Handling and Imputation
# -----------------------------------------------------------------

def iqr_imputation(df):
    """Impute missing 'Reading' values by interpolation, IQR clipping and gap scaling.

    Steps:
      1. Fill missing readings by linear interpolation. Gaps at the very
         start or end are filled with the nearest observed value.
      2. Compute Q1/Q3 on the filled series and replace every value outside
         ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]`` with the median. This applies to
         observed and interpolated values alike.
      3. For each contiguous run of originally-missing rows that has an
         observed reading immediately before and after it, rescale the
         filled values so they sum to ``next observed - previous observed``
         (both taken from the original, unmodified readings).

    Gaps are located by row position, so any index labels are accepted.
    A gap touching the first or last row has no bounding reading on one side
    and is therefore left unscaled.

    Args:
        df: DataFrame with a numeric 'Reading' column (NaN where missing).
            It is not modified.

    Returns:
        A copy of ``df`` with 'Reading' imputed. When nothing is missing, a
        message is printed and the unchanged copy is returned.

    NOTE(review): the scaling target is a *difference* of two readings while
    the scaled quantity is a *sum* of readings. That is only meaningful under
    a specific interpretation of 'Reading' -- confirm with the data owner.
    """
    # Create a copy to avoid modifying original data
    df_iqr = df.copy()

    # Identify missing values (as a positional boolean array)
    missing_mask = df_iqr['Reading'].isna().to_numpy()
    if not missing_mask.any():
        print("No missing values found!")
        return df_iqr

    # Snapshot of the original readings, taken before any filling or clipping,
    # so the per-gap target totals come from genuinely observed values.
    observed = df_iqr['Reading'].to_numpy(dtype=float, na_value=np.nan, copy=True)

    # Step 1: Initial fill with linear interpolation
    # limit_direction='both' also fills a gap at the very start of the data.
    df_iqr['Reading'] = df_iqr['Reading'].interpolate(method='linear', limit_direction='both')

    # Step 2: Apply IQR to detect outliers
    Q1 = df_iqr['Reading'].quantile(0.25)
    Q3 = df_iqr['Reading'].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Step 3: Replace outliers with median
    median_value = df_iqr['Reading'].median()
    outlier_mask = (df_iqr['Reading'] < lower_bound) | (df_iqr['Reading'] > upper_bound)
    df_iqr.loc[outlier_mask, 'Reading'] = median_value

    # Step 4: Scale each gap to match its expected total
    values = df_iqr['Reading'].to_numpy(dtype=float, copy=True)

    # Split the missing row positions into contiguous runs (one per gap).
    missing_positions = np.flatnonzero(missing_mask)
    run_breaks = np.flatnonzero(np.diff(missing_positions) > 1) + 1
    last_position = len(values) - 1

    for gap in np.split(missing_positions, run_breaks):
        start, stop = gap[0], gap[-1]
        if start == 0 or stop == last_position:
            # No observed reading on one side: no target total to scale to.
            continue

        # Runs are maximal, so both neighbours are observed (non-NaN) readings.
        expected_total = observed[stop + 1] - observed[start - 1]
        imputed_sum = values[gap].sum()

        if imputed_sum > 0:  # Avoid division by zero; non-positive sums stay unscaled
            values[gap] *= expected_total / imputed_sum

    # Positional write-back: works regardless of the frame's index labels.
    df_iqr['Reading'] = values
    return df_iqr

# Apply IQR imputation to the loaded dataset and save the result.
df_iqr = iqr_imputation(df)
print("IQR imputation completed")
# index=False keeps the row index out of the CSV.
df_iqr.to_csv("iqr_imputed.csv", index=False)


IQR imputation completed


In [25]:

# -----------------------------------------------------------------
# Approach 4: K-Nearest Neighbors (KNN) Imputation
# -----------------------------------------------------------------

def knn_imputation(df):
    """Impute missing 'Reading' values with a KNN imputer, rescaled per gap.

    Each row is described by (timestamp in epoch seconds, reading). A
    ``KNNImputer`` with 5 distance-weighted neighbours fills the missing
    readings. Each contiguous run of missing rows that has an observed
    reading immediately before and after it is then rescaled so the filled
    values sum to ``next observed - previous observed``.

    Gaps are located by row position, so any index labels are accepted.
    A gap touching the first or last row has no bounding reading on one side
    and keeps its unscaled KNN estimate.

    Args:
        df: DataFrame with a datetime 'Timestamp' column and a numeric
            'Reading' column (NaN where missing). It is not modified.

    Returns:
        A full copy of ``df`` when nothing is missing (after printing a
        message). Otherwise a DataFrame with the columns 'Timestamp' and
        'Reading' on the original index.

    Raises:
        ValueError: if every reading is missing, so there is nothing to
            learn from.

    NOTE(review): the scaling target is a *difference* of two readings while
    the scaled quantity is a *sum* of readings. That is only meaningful under
    a specific interpretation of 'Reading' -- confirm with the data owner.
    """
    # Create a copy to avoid modifying original data
    df_knn = df.copy()

    # Identify missing values (as a positional boolean array)
    missing_mask = df_knn['Reading'].isna().to_numpy()
    if not missing_mask.any():
        print("No missing values found!")
        return df_knn
    if missing_mask.all():
        raise ValueError("Cannot impute 'Reading': the column has no observed values")

    # Snapshot of the readings as floats; NaN marks the rows to impute.
    observed = df_knn['Reading'].to_numpy(dtype=float, na_value=np.nan)

    # Convert timestamps to epoch seconds for KNN. Dividing a Timedelta by
    # one second is independent of the column's datetime unit, unlike
    # dividing the raw integer representation by 10**9, which is only right
    # for nanosecond data.
    timestamps = df_knn['Timestamp']
    epoch = pd.Timestamp("1970-01-01", tz="UTC" if timestamps.dt.tz is not None else None)
    timestamp_seconds = (timestamps - epoch) // pd.Timedelta(seconds=1)

    # Apply KNN imputation
    imputer = KNNImputer(n_neighbors=5, weights='distance')
    data_array = np.column_stack([timestamp_seconds.to_numpy(dtype=float), observed])
    imputed_array = imputer.fit_transform(data_array)

    # Second column holds the readings with the gaps filled in
    imputed = imputed_array[:, 1].copy()

    # Split the missing row positions into contiguous runs (one per gap).
    missing_positions = np.flatnonzero(missing_mask)
    run_breaks = np.flatnonzero(np.diff(missing_positions) > 1) + 1
    last_position = len(imputed) - 1

    for gap in np.split(missing_positions, run_breaks):
        start, stop = gap[0], gap[-1]
        if start == 0 or stop == last_position:
            # No observed reading on one side: no target total to scale to.
            continue

        # Runs are maximal, so both neighbours are observed (non-NaN) readings.
        expected_total = observed[stop + 1] - observed[start - 1]
        imputed_sum = imputed[gap].sum()

        if imputed_sum > 0:  # Avoid division by zero; non-positive sums stay unscaled
            imputed[gap] *= expected_total / imputed_sum

    # Return only the timestamp and the imputed reading
    return df_knn[['Timestamp']].assign(Reading=imputed)

# Apply KNN imputation to the loaded dataset and save the result.
df_knn = knn_imputation(df)
print("KNN imputation completed")
# index=False keeps the row index out of the CSV.
df_knn.to_csv("knn_imputed.csv", index=False)

KNN imputation completed
