In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
import os
import time

# Sklearn imputers
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Reproducibility: fix the seed of NumPy's global random generator.
np.random.seed(42)

# Suppress every warning category so cell output stays readable.
warnings.simplefilter('ignore')

# Folder for baseline artefacts; no error if it already exists.
os.makedirs('results/baselines', exist_ok=True)



In [2]:

# NOTE(review): read_pickle can execute arbitrary code embedded in the file;
# only load artefacts that come from a trusted source.

# Training frame plus the masked validation/test frames.
train_data, val_masked, test_masked = (
    pd.read_pickle(pkl_path)
    for pkl_path in (
        'dataset/train_data_final.pkl',
        'dataset/val_data_masked.pkl',
        'dataset/test_data_masked.pkl',
    )
)

# Reference values used to score the imputations.
val_ground_truth, test_ground_truth = (
    pd.read_pickle(pkl_path)
    for pkl_path in (
        'dataset/val_ground_truth.pkl',
        'dataset/test_ground_truth.pkl',
    )
)

# Per-column selectors marking which entries were masked.
val_mask_indices, test_mask_indices = (
    pd.read_pickle(pkl_path)
    for pkl_path in (
        'dataset/val_mask_indices.pkl',
        'dataset/test_mask_indices.pkl',
    )
)

# Names of the columns that were masked (stored as JSON).
import json
with open('dataset/cols_to_mask.json', 'r') as f:
    cols_to_mask = json.load(f)


In [3]:
# Preview the first five rows of the training frame.
train_data.head(n=5)

Unnamed: 0,timestamp,ping_ms,datarate,jitter,Latitude,Longitude,Altitude,speed_kmh,COG,precipIntensity,...,scenario_A3D,scenario_A3U,drive_mode_2x2,drive_mode_platoon,direction_uplink,measured_qos_delay,hour,day_of_week,poor_signal_quality,date
0,2021-06-22 09:49:54+02:00,,2.378685,-0.108315,0.606259,0.41037,0.518182,-0.471264,-1.053281,0.343206,...,1.0,0.0,0.0,0.0,0.0,0.0,9,1,0.0,2021-06-22
1,2021-06-22 09:49:54+02:00,,0.610473,-0.230853,0.605881,0.410979,-0.081818,-0.471264,0.399327,0.343206,...,1.0,0.0,0.0,0.0,0.0,0.0,9,1,0.0,2021-06-22
2,2021-06-22 09:49:54+02:00,,1.622595,-0.384756,0.592378,0.404193,-0.5,-0.471264,-1.053281,0.344948,...,1.0,0.0,0.0,0.0,0.0,0.0,9,1,0.0,2021-06-22
3,2021-06-22 09:49:54+02:00,2.646228,2.450694,-0.342086,0.593766,0.401496,-0.354545,-0.471264,0.438026,0.344948,...,1.0,0.0,0.0,0.0,0.0,0.0,9,1,0.0,2021-06-22
4,2021-06-22 09:49:55+02:00,,0.462455,0.409555,0.605628,0.410979,-0.072727,-0.471264,0.399327,0.343206,...,1.0,0.0,0.0,0.0,0.0,0.0,9,1,0.0,2021-06-22


In [4]:
# Names of the raw date/time columns of the training frame.
date_time_cols = set(('timestamp', 'date'))

In [8]:
# Remove the raw date/time columns from the training frame.
# errors='ignore' keeps this cell re-runnable: on a second execution the
# columns are already gone and the call is a no-op instead of a KeyError.
train_data = train_data.drop(columns=list(date_time_cols), errors='ignore')

In [9]:
def evaluate_imputation(imputed_data, ground_truth, mask_indices, cols):
    """Score imputed values against the ground truth at the masked entries.

    For each column in ``cols`` the rows selected by ``mask_indices[col]``
    are read from both frames with ``.loc`` and compared.

    Parameters
    ----------
    imputed_data : frame holding the imputed values.
    ground_truth : frame holding the reference values.
    mask_indices : container indexed by column name; each entry must support
        ``.sum()`` and be usable as a ``.loc`` row selector.
    cols : iterable of column names to score.

    Returns
    -------
    dict
        ``{col: {'rmse': ..., 'mae': ..., 'n_samples': ...}}``. A column is
        omitted when its selector sums to zero or when no selected row has a
        non-NaN value in both frames.
    """
    scores = {}

    for column in cols:
        selector = mask_indices[column]

        # A selector that sums to zero means there is nothing to score here.
        if selector.sum() == 0:
            continue

        reference = ground_truth.loc[selector, column]
        estimate = imputed_data.loc[selector, column]

        # Keep only the rows where both the reference and the estimate exist.
        usable = reference.notna() & estimate.notna()
        reference, estimate = reference[usable], estimate[usable]

        if len(reference) == 0:
            continue

        scores[column] = {
            'rmse': np.sqrt(mean_squared_error(reference, estimate)),
            'mae': mean_absolute_error(reference, estimate),
            'n_samples': len(reference),
        }

    return scores


In [6]:
# Baseline: fit a KNNImputer on the training columns listed in cols_to_mask,
# impute the masked validation/test frames, and score the imputed entries
# against the ground truth with evaluate_imputation.
print("BASELINE : KNN IMPUTATION")

print("\nStrategy: Replace missing values using K-nearest neighbors")
print("          Finding similar rows and averaging their values")

# KNN configuration
N_NEIGHBORS = 5
KNN_WEIGHTS = 'uniform'        # every neighbour contributes equally to the average
KNN_METRIC = 'nan_euclidean'   # KNNImputer's default metric, passed explicitly

print("\n  Configuration:")
print(f"  • Number of neighbors (k): {N_NEIGHBORS}")
# 'uniform' is the neighbour weighting, not the distance metric, so the two
# settings are reported on separate lines.
print(f"  • Distance metric: {KNN_METRIC}")
print(f"  • Neighbor weights: {KNN_WEIGHTS} (equal weights)")


def _mean_metric(results, key):
    """Unweighted mean of one per-column metric; NaN when no column was scored."""
    values = [col_metrics[key] for col_metrics in results.values()]
    return np.mean(values) if values else np.float64(np.nan)


# Start timer (covers fitting plus both transforms; the recorded run took ~2648 s)
start_time = time.time()

# Fit KNN imputer on training data
print("\n Fitting KNN imputer on training data...")
knn_imputer = KNNImputer(n_neighbors=N_NEIGHBORS, weights=KNN_WEIGHTS, metric=KNN_METRIC)
knn_imputer.fit(train_data[cols_to_mask])

print(f" Fitted KNN imputer on {len(cols_to_mask)} columns")

# Impute validation set (work on a copy so the masked frame stays untouched)
print(" Imputing validation set...")
val_knn_imputed = val_masked.copy()
val_knn_imputed[cols_to_mask] = knn_imputer.transform(val_masked[cols_to_mask])

# Impute test set
print(" Imputing test set...")
test_knn_imputed = test_masked.copy()
test_knn_imputed[cols_to_mask] = knn_imputer.transform(test_masked[cols_to_mask])

elapsed_time = time.time() - start_time

print(" Imputed validation and test sets")
print(f" Time: {elapsed_time:.2f} seconds")

# NOTE(review): the recorded run reports overall RMSE/MAE of order 1e6-1e8,
# while train_data.head() shows feature values of order 1. The overall score
# is an unweighted mean over columns, so a few columns on a different scale
# may dominate it -- inspect val_results_knn / test_results_knn per column.
# TODO confirm the cause before using these numbers as a baseline.

# Evaluate on validation set
print("\nVALIDATION SET EVALUATION:")
print("-"*80)

val_results_knn = evaluate_imputation(val_knn_imputed, val_ground_truth, val_mask_indices, cols_to_mask)

overall_rmse_val = _mean_metric(val_results_knn, 'rmse')
overall_mae_val = _mean_metric(val_results_knn, 'mae')

print(f"    Overall RMSE: {overall_rmse_val:.4f}")
print(f"    Overall MAE:  {overall_mae_val:.4f}")
print(f"    Columns evaluated: {len(val_results_knn)}")

# Evaluate on test set
print("\n TEST SET EVALUATION:")
print("-"*80)

test_results_knn = evaluate_imputation(test_knn_imputed, test_ground_truth, test_mask_indices, cols_to_mask)

overall_rmse_test = _mean_metric(test_results_knn, 'rmse')
overall_mae_test = _mean_metric(test_results_knn, 'mae')

print(f"  • Overall RMSE: {overall_rmse_test:.4f}")
print(f"  • Overall MAE:  {overall_mae_test:.4f}")
print(f"  • Columns evaluated: {len(test_results_knn)}")

# Store results
baseline_results = {
    'method': f'KNN Imputation (k={N_NEIGHBORS})',
    'val_rmse': overall_rmse_val,
    'val_mae': overall_mae_val,
    'test_rmse': overall_rmse_test,
    'test_mae': overall_mae_test,
    'time_seconds': elapsed_time,
    'n_neighbors': N_NEIGHBORS
}


BASELINE : KNN IMPUTATION

Strategy: Replace missing values using K-nearest neighbors
          Finding similar rows and averaging their values

  Configuration:
  • Number of neighbors (k): 5
  • Distance metric: uniform (equal weights)

 Fitting KNN imputer on training data...
 Fitted KNN imputer on 48 columns
 Imputing validation set...
 Imputing test set...
 Imputed validation and test sets
 Time: 2647.68 seconds

VALIDATION SET EVALUATION:
--------------------------------------------------------------------------------
    Overall RMSE: 154047820.9072
    Overall MAE:  131576752.4522
    Columns evaluated: 48

 TEST SET EVALUATION:
--------------------------------------------------------------------------------
  • Overall RMSE: 7174059.1449
  • Overall MAE:  5603343.7531
  • Columns evaluated: 48


In [7]:
# Display the summary dict assembled at the end of the KNN baseline cell.
baseline_results

{'method': 'KNN Imputation (k=5)',
 'val_rmse': np.float64(154047820.9072064),
 'val_mae': np.float64(131576752.45219249),
 'test_rmse': np.float64(7174059.144911922),
 'test_mae': np.float64(5603343.753072032),
 'time_seconds': 2647.678708553314,
 'n_neighbors': 5}