In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import RobustScaler
import pickle
from pathlib import Path

# Location of the raw export and the shape recorded for it (rows, columns).
RAW_DATA_PATH = 'data/raw/original_data.csv'  # UPDATE THIS PATH
EXPECTED_RAW_SHAPE = (53199, 98)

# Load raw data
df = pd.read_csv(RAW_DATA_PATH)

# Reset index if timestamp is index
# NOTE(review): read_csv is called without index_col, so it is unclear when
# this branch can actually recover a 'timestamp' column - confirm it is needed.
if 'timestamp' not in df.columns:
    df = df.reset_index()

print("STEP 1: DATA LOADING")

print(f"Shape: {df.shape}")
print(f"Columns: {len(df.columns)}")
print(f"Expected: {EXPECTED_RAW_SHAPE}")

# Fail fast with a readable message: the statements below index
# df['timestamp'], which would otherwise raise a bare KeyError before any
# check had a chance to run.
assert 'timestamp' in df.columns, "Timestamp column not found"

# Check for time[s] column
if 'time[s]' in df.columns:
    print(f"\n'time[s]' found with {df['time[s]'].isna().sum()} missing values")
    df = df.drop(columns=['time[s]'])
    print(f"New shape: {df.shape}")

# The assertion above guarantees the column exists, so report it by name
# instead of guessing from whichever column happens to be first.
print("\nTimestamp column: 'timestamp'")
print(f"Timestamp missing values: {df['timestamp'].isna().sum()}")

# Basic verification
assert df.shape[0] == EXPECTED_RAW_SHAPE[0], (
    f"Row count mismatch: {df.shape[0]} != {EXPECTED_RAW_SHAPE[0]}"
)


STEP 1: DATA LOADING
Shape: (53199, 98)
Columns: 98
Expected: (53199, 98)

'time[s]' found with 14160 missing values
New shape: (53199, 97)

Timestamp column: 'timestamp'
Timestamp missing values: 0


In [2]:
print("STEP 2: LOAD TAXONOMY AND VERIFY")

# Load the taxonomy from Day 1
# NOTE: exec() runs the file's contents as code in this namespace (it is what
# defines `column_taxonomy`); only point it at a trusted, project-local file.
# The context manager makes sure the file handle is closed afterwards.
with open('column_taxonomy.py') as taxonomy_file:
    exec(taxonomy_file.read())

# Taxonomy categories whose columns are scaled, paired with the label used in
# the summary printed below.
scaled_categories = [
    ('radio_metrics', "  Radio metrics:      "),
    ('network_qos', "  Network QoS:        "),
    ('traffic_kpis', "  Traffic KPIs:       "),
    ('vehicle_telemetry', "  Vehicle telemetry:  "),
]

# Get features to scale (the continuous features of the categories above)
features_to_scale = []
for category, _ in scaled_categories:
    features_to_scale.extend(column_taxonomy[category])

print(f"Features to scale: {len(features_to_scale)}")

# Verify all features exist in dataframe
missing_features = [f for f in features_to_scale if f not in df.columns]
if missing_features:
    print(f"\n  WARNING: These features not in dataframe:")
    for f in missing_features:
        print(f"  - {f}")
else:
    # Report the real count rather than a hard-coded number.
    print(f"All {len(features_to_scale)} features found in dataframe")

# Show what we have
print(f"\nColumns to use for imputation:")
for category, label in scaled_categories:
    category_count = len([f for f in features_to_scale if f in column_taxonomy[category]])
    print(f"{label}{category_count} features")

# Save features list for later (create the output folder on a fresh checkout)
processed_dir = Path('data/processed')
processed_dir.mkdir(parents=True, exist_ok=True)
with open(processed_dir / 'features_to_scale.pkl', 'wb') as f:
    pickle.dump(features_to_scale, f)

STEP 2: LOAD TAXONOMY AND VERIFY
Features to scale: 50
All 50 features found in dataframe

Columns to use for imputation:
  Radio metrics:      10 features
  Network QoS:        12 features
  Traffic KPIs:       10 features
  Vehicle telemetry:  18 features


In [3]:
print("STEP 3: CREATE MISSINGNESS INDICATORS")

# Create indicators BEFORE any manipulation
# 1 = was missing, 0 = was observed
missingness_indicators = []

for feature in features_to_scale:
    indicator_name = f"{feature}_was_missing"
    df[indicator_name] = df[feature].isna().astype(int)
    missingness_indicators.append(indicator_name)

print(f"Created {len(missingness_indicators)} missingness indicators")

# Verify (the first feature is printed as a worked example)
print(f"\nVerification:")
sample_feature = features_to_scale[0]
sample_indicator = f"{sample_feature}_was_missing"

original_missing = df[sample_feature].isna().sum()
indicator_count = df[sample_indicator].sum()

print(f"  Feature: {sample_feature}")
print(f"  Original NaN count: {original_missing}")
print(f"  Indicator sum: {indicator_count}")
print(f"  Match: {original_missing == indicator_count}")

assert original_missing == indicator_count, "Indicator mismatch!"

# The example above covers a single column; check every feature/indicator pair.
all_nan_counts = df[features_to_scale].isna().sum().to_numpy()
all_indicator_sums = df[missingness_indicators].sum().to_numpy()
assert (all_nan_counts == all_indicator_sums).all(), "Indicator mismatch!"


# Current shape (derived from the frame so the summary cannot go stale)
n_original_columns = df.shape[1] - len(missingness_indicators)
print(f"\nCurrent dataframe shape: {df.shape}")
print(f"  Original features: {n_original_columns}")
print(f"  + Missingness indicators: {len(missingness_indicators)}")
print(f"  = Total: {n_original_columns + len(missingness_indicators)}")

STEP 3: CREATE MISSINGNESS INDICATORS
Created 50 missingness indicators

Verification:
  Feature: serving_cell_snr_1
  Original NaN count: 14161
  Indicator sum: 14161
  Match: True

Current dataframe shape: (53199, 147)
  Original features: 97
  + Missingness indicators: 50
  = Total: 147


In [4]:
print("STEP 4: TIMESTAMP CONVERSION AND SORTING")

# Convert to datetime
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Sort by time (CRITICAL for temporal split)
# A stable sort keeps rows that share a timestamp in their original file
# order, so the row order (and the row-position chunking done later) is
# reproducible even when duplicates exist. The default quicksort gives no
# such guarantee.
df = df.sort_values('timestamp', kind='mergesort').reset_index(drop=True)

print(f"Time range:")
print(f"  Start: {df['timestamp'].min()}")
print(f"  End:   {df['timestamp'].max()}")
print(f"  Duration: {df['timestamp'].max() - df['timestamp'].min()}")

# Check for duplicates
duplicates = df['timestamp'].duplicated().sum()
print(f"\nDuplicate timestamps: {duplicates}")

if duplicates > 0:
    print("  Note: Multiple measurements at same timestamp (normal for this type of data)")

# Sampling rate (gap between consecutive rows; the first row has no predecessor)
time_diffs = df['timestamp'].diff().dropna()
print(f"\nSampling intervals:")
print(f"  Mean:   {time_diffs.mean()}")
print(f"  Median: {time_diffs.median()}")
print(f"  Min:    {time_diffs.min()}")
print(f"  Max:    {time_diffs.max()}")

print("\nData sorted by time")

STEP 4: TIMESTAMP CONVERSION AND SORTING
Time range:
  Start: 2021-12-14 13:30:49+01:00
  End:   2021-12-16 15:07:53+01:00
  Duration: 2 days 01:37:04

Duplicate timestamps: 0

Sampling intervals:
  Mean:   0 days 00:00:03.357720215
  Median: 0 days 00:00:01
  Min:    0 days 00:00:01
  Max:    0 days 16:56:32

Data sorted by time


In [5]:
print("STEP 5: BALANCED TEMPORAL SPLIT (OPTION B)")

from sklearn.model_selection import train_test_split

# Rows are grouped into runs of 100 consecutive rows ("chunks") and whole
# chunks are assigned at random to train / val / test. Local ordering inside a
# chunk is kept while the amount of missing data stays comparable across splits.
df['chunk_id'] = df.index // 100
unique_chunks = df['chunk_id'].unique()

print(f"Total rows: {len(df):,}")
print(f"Total chunks: {df['chunk_id'].nunique()}")

# 70% of the chunks go to train; the remaining 30% is halved into val and test
# (50% of 30% = 15% overall each).
train_chunks, temp_chunks = train_test_split(unique_chunks, test_size=0.30, random_state=42)
val_chunks, test_chunks = train_test_split(temp_chunks, test_size=0.50, random_state=42)

n_chunks = len(unique_chunks)
print(f"\nChunk distribution:")
print(f"  Train chunks: {len(train_chunks)} ({len(train_chunks)/n_chunks*100:.1f}%)")
print(f"  Val chunks:   {len(val_chunks)} ({len(val_chunks)/n_chunks*100:.1f}%)")
print(f"  Test chunks:  {len(test_chunks)} ({len(test_chunks)/n_chunks*100:.1f}%)")


def _rows_for_chunks(chunk_ids):
    """Return the rows of `df` in the given chunks, time-sorted, without chunk_id."""
    return (
        df[df['chunk_id'].isin(chunk_ids)]
        .copy()
        .sort_values('timestamp')
        .reset_index(drop=True)
        .drop('chunk_id', axis=1)
    )


def _missing_share(frame):
    """Percentage of NaN cells among the features_to_scale columns of `frame`."""
    feature_block = frame[features_to_scale]
    return feature_block.isnull().sum().sum() / feature_block.size * 100


train_df = _rows_for_chunks(train_chunks)
val_df = _rows_for_chunks(val_chunks)
test_df = _rows_for_chunks(test_chunks)

n_rows = len(df)
print(f"\nSplit sizes:")
print(f"  Train: {len(train_df):,} rows ({len(train_df)/n_rows*100:.1f}%)")
print(f"  Val:   {len(val_df):,} rows ({len(val_df)/n_rows*100:.1f}%)")
print(f"  Test:  {len(test_df):,} rows ({len(test_df)/n_rows*100:.1f}%)")

# Time ranges for each split
print(f"\nTime ranges:")
print(f"  Train: {train_df['timestamp'].min()} to {train_df['timestamp'].max()}")
print(f"  Val:   {val_df['timestamp'].min()} to {val_df['timestamp'].max()}")
print(f"  Test:  {test_df['timestamp'].min()} to {test_df['timestamp'].max()}")

# CRITICAL: the splits should carry a similar proportion of missing cells
print("MISSING DATA BALANCE CHECK")

train_missing = _missing_share(train_df)
val_missing = _missing_share(val_df)
test_missing = _missing_share(test_df)

for name, missing_pct in (('Train', train_missing), ('Val', val_missing), ('Test', test_missing)):
    print(f"  {name:5s}: {missing_pct:.2f}% missing")

# Largest pairwise gap between the three percentages
max_diff = max(
    abs(left - right)
    for left, right in (
        (train_missing, val_missing),
        (train_missing, test_missing),
        (val_missing, test_missing),
    )
)

print(f"\nMax difference between splits: {max_diff:.2f}%")

if max_diff < 5.0:
    print("Splits are well-balanced (difference < 5%)")
elif max_diff < 10.0:
    print("Splits are acceptable (difference < 10%)")
else:
    print("Splits are imbalanced (difference > 10%)")
    print("Consider re-running with different random_state")

print("\nBalanced temporal split complete")

STEP 5: BALANCED TEMPORAL SPLIT (OPTION B)
Total rows: 53,199
Total chunks: 532

Chunk distribution:
  Train chunks: 372 (69.9%)
  Val chunks:   80 (15.0%)
  Test chunks:  80 (15.0%)

Split sizes:
  Train: 37,200 rows (69.9%)
  Val:   8,000 rows (15.0%)
  Test:  7,999 rows (15.0%)

Time ranges:
  Train: 2021-12-14 13:32:29+01:00 to 2021-12-16 15:06:14+01:00
  Val:   2021-12-14 13:34:09+01:00 to 2021-12-16 15:01:14+01:00
  Test:  2021-12-14 13:30:49+01:00 to 2021-12-16 15:07:53+01:00
MISSING DATA BALANCE CHECK
  Train: 26.16% missing
  Val  : 26.60% missing
  Test : 27.15% missing

Max difference between splits: 1.00%
Splits are well-balanced (difference < 5%)

Balanced temporal split complete


In [6]:
print("STEP 6: FIT SCALER (TRAINING DATA ONLY)")

from sklearn.preprocessing import RobustScaler
import pickle
from pathlib import Path

# Initialize scaler
scaler = RobustScaler()

# Get training features
train_features = train_df[features_to_scale]

print(f"Fitting scaler on training data...")
print(f"  Training samples: {len(train_features):,}")
print(f"  Features to scale: {len(features_to_scale)}")

# Fit scaler (RobustScaler handles NaNs automatically)
scaler.fit(train_features)

print("\nScaler fitted on training data")

# Verification: check the fit BEFORE anything is written to disk, so a bad
# scaler is never persisted for later steps to pick up.
assert hasattr(scaler, 'center_'), "Scaler not fitted correctly!"
assert len(scaler.center_) == len(features_to_scale), "Scaler dimension mismatch!"

# Show scaling parameters for first 5 features
print(f"\nScaling parameters (first 5 features):")
print(f"{'Feature':<30s} {'Center':>10s} {'Scale':>10s}")

for feature, center, scale in zip(features_to_scale[:5], scaler.center_, scaler.scale_):
    print(f"{feature:<30s} {center:>10.2f} {scale:>10.2f}")


# Persist the verified scaler; create the folder on a fresh checkout.
model_dir = Path('models')
model_dir.mkdir(parents=True, exist_ok=True)
with open(model_dir / 'scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

print("\nScaler verification passed")


STEP 6: FIT SCALER (TRAINING DATA ONLY)
Fitting scaler on training data...
  Training samples: 37,200
  Features to scale: 50

Scaler fitted on training data

Scaling parameters (first 5 features):
Feature                            Center      Scale
serving_cell_snr_1                  13.00       6.80
serving_cell_rssi_1                -84.40       7.80
serving_cell_rsrq_1                 -9.40       3.40
serving_cell_rsrp_1               -115.00      12.80
serving_cell_id                      6.00       1.00

Scaler verification passed


In [7]:
print("STEP 7: APPLY SCALING (PRESERVE NaNs)")

def scale_features_preserve_nans(df, scaler, features):
    """
    Return a copy of `df` whose `features` columns have been passed through
    `scaler.transform`, with every originally-missing cell left as NaN.

    Args:
        df: DataFrame holding the columns to transform (not modified)
        scaler: Fitted scaler object exposing `transform`
        features: List of column names to transform

    Returns:
        DataFrame: copy of `df` with the transformed feature columns; all
        other columns are carried over unchanged
    """
    result = df.copy()

    # Record the missing cells up front, before the transform is applied.
    missing_cells = np.isnan(df[features].values)

    transformed = scaler.transform(df[features])

    # Re-impose NaN on the originally missing cells, whatever the transform
    # produced for them.
    transformed[missing_cells] = np.nan

    result[features] = transformed
    return result

# Push train, val and test through the same scaler, announcing each split
# right before it is processed.
scaled_by_split = {}
for split_label, split_frame in (('train', train_df), ('val', val_df), ('test', test_df)):
    print(f"Scaling {split_label} set...")
    scaled_by_split[split_label] = scale_features_preserve_nans(
        split_frame, scaler, features_to_scale
    )

train_df_scaled = scaled_by_split['train']
val_df_scaled = scaled_by_split['val']
test_df_scaled = scaled_by_split['test']

print("\n All splits scaled")


# VERIFICATION 1: NaN PRESERVATION

print("VERIFICATION 1: NaN PRESERVATION")


for name, original, scaled in [
    ('Train', train_df, train_df_scaled),
    ('Val', val_df, val_df_scaled),
    ('Test', test_df, test_df_scaled)
]:
    # Boolean NaN maps of the feature columns before and after scaling.
    original_mask = original[features_to_scale].isnull().to_numpy()
    scaled_mask = scaled[features_to_scale].isnull().to_numpy()

    original_nans = int(original_mask.sum())
    scaled_nans = int(scaled_mask.sum())

    # Equal totals alone would not notice NaNs that moved to other cells, so
    # the cell-by-cell positions are compared as well.
    same_positions = (original_mask == scaled_mask).all()
    match = "✓" if original_nans == scaled_nans and same_positions else "✗"

    print(f"{name:5s}: Original NaNs={original_nans:>6,} | Scaled NaNs={scaled_nans:>6,} | {match}")

    # Assert they match
    assert original_nans == scaled_nans, f"{name}: NaN count changed after scaling!"
    assert same_positions, f"{name}: NaN positions changed after scaling!"

print("\n All NaNs preserved correctly")


# VERIFICATION 2: SCALING WORKED

print("VERIFICATION 2: SCALING EFFECTIVENESS")

# Check first feature as example
sample_feature = features_to_scale[0]
print(f"Sample feature: {sample_feature}")

# Training data - observed values only
train_original = train_df[sample_feature].dropna()
train_scaled = train_df_scaled[sample_feature].dropna()

print(f"\nOriginal (unscaled):")
print(f"  Mean:   {train_original.mean():10.4f}")
print(f"  Median: {train_original.median():10.4f}")
print(f"  Std:    {train_original.std():10.4f}")
print(f"  Min:    {train_original.min():10.4f}")
print(f"  Max:    {train_original.max():10.4f}")

# RobustScaler removes the MEDIAN (not the mean), so only the scaled median
# is expected to land on 0; the mean is shown for information only.
print(f"\nScaled:")
print(f"  Mean:   {train_scaled.mean():10.4f}")
print(f"  Median: {train_scaled.median():10.4f} (should be ~0)")
print(f"  Std:    {train_scaled.std():10.4f}")
print(f"  Min:    {train_scaled.min():10.4f}")
print(f"  Max:    {train_scaled.max():10.4f}")

# Check that the median of the scaled training values is 0 up to rounding
scaled_median = train_scaled.median()
if abs(scaled_median) < 1e-6:
    print("\n Scaling centered correctly (median ≈ 0)")
else:
    print(f"\n  Warning: Median is {scaled_median:.4f}, expected ~0")

# VERIFICATION 3: RANGE CHECK

print("VERIFICATION 3: SCALED RANGES")


print(f"Checking all {len(features_to_scale)} features...")

# Collect every feature whose observed scaled values reach beyond [-10, 10];
# such a wide range might indicate a scaling issue.
extreme_features = []

for feature in features_to_scale:
    observed = train_df_scaled[feature].dropna()
    if observed.empty:
        continue  # nothing observed, nothing to check

    lowest, highest = observed.min(), observed.max()
    if any(abs(bound) > 10 for bound in (lowest, highest)):
        extreme_features.append({'feature': feature, 'min': lowest, 'max': highest})

if not extreme_features:
    print("\n All features in reasonable range")
else:
    print(f"\n  {len(extreme_features)} features have values beyond [-10, 10]:")
    for entry in extreme_features[:5]:  # Show first 5
        print(f"  {entry['feature']:30s}: [{entry['min']:7.2f}, {entry['max']:7.2f}]")
    print("  (This is OK if you have outliers - RobustScaler handles them)")


# VERIFICATION 4: NO DATA LEAKAGE


print("VERIFICATION 4: NO DATA LEAKAGE CHECK")


# The scaler should only know about training data. A fitted RobustScaler keeps
# the per-feature median of the data it was fitted on in `center_`, so those
# values must equal the NaN-skipping medians of the unscaled training split.
# Comparing value ranges across splits (as printed below) cannot show this:
# ranges differ or coincide regardless of what the scaler was fitted on.
train_medians = train_df[features_to_scale].median().to_numpy(dtype=float)
centers_match_train = np.allclose(scaler.center_, train_medians, equal_nan=True)

sample_feature = features_to_scale[0]

train_range = (train_df_scaled[sample_feature].min(),
               train_df_scaled[sample_feature].max())
val_range = (val_df_scaled[sample_feature].min(),
             val_df_scaled[sample_feature].max())
test_range = (test_df_scaled[sample_feature].min(),
              test_df_scaled[sample_feature].max())

print(f"Sample feature: {sample_feature}")
print(f"  Train range: [{train_range[0]:7.2f}, {train_range[1]:7.2f}]")
print(f"  Val range:   [{val_range[0]:7.2f}, {val_range[1]:7.2f}]")
print(f"  Test range:  [{test_range[0]:7.2f}, {test_range[1]:7.2f}]")

# Informational only: either outcome is compatible with a correctly fitted scaler.
if val_range[0] < train_range[0] or val_range[1] > train_range[1]:
    print("\n  Note: Val has values outside train range")
else:
    print("\n  Note: Val range within train range (can happen with balanced splits)")

if centers_match_train:
    print("Scaler centers match the training-split medians")
assert centers_match_train, "Scaler centers differ from the training-split medians!"

print(" SCALING COMPLETE AND VERIFIED ")




STEP 7: APPLY SCALING (PRESERVE NaNs)
Scaling train set...
Scaling val set...
Scaling test set...

 All splits scaled
VERIFICATION 1: NaN PRESERVATION
Train: Original NaNs=486,501 | Scaled NaNs=486,501 | ✓
Val  : Original NaNs=106,398 | Scaled NaNs=106,398 | ✓
Test : Original NaNs=108,601 | Scaled NaNs=108,601 | ✓

 All NaNs preserved correctly
VERIFICATION 2: SCALING EFFECTIVENESS
Sample feature: serving_cell_snr_1

Original (unscaled):
  Mean:      14.0068
  Median:    13.0000
  Std:        4.1626
  Min:        1.5000
  Max:       23.8000

Scaled:
  Mean:       0.1481 (should be ~0)
  Median:     0.0000 (should be ~0)
  Std:        0.6122
  Min:       -1.6912
  Max:        1.5882

 Scaling centered correctly (mean ≈ 0)
VERIFICATION 3: SCALED RANGES
Checking all 50 features...

  17 features have values beyond [-10, 10]:
  delay_std_UL                  : [  -1.29, 1170.63]
  delay_mean_UL                 : [  -1.00,  994.64]
  jitter_UL                     : [  -0.93,  234.84]
  delay

In [8]:

print("STEP 8: SAVE PROCESSED DATASETS")


from pathlib import Path

# Output folder for everything this step writes; created if it is missing so
# the cell also works on a fresh checkout.
processed_dir = Path('data/processed')
processed_dir.mkdir(parents=True, exist_ok=True)

# Save the scaled datasets
print("Saving datasets...")

train_df_scaled.to_csv(processed_dir / 'train.csv', index=False)
print("   train.csv")

val_df_scaled.to_csv(processed_dir / 'val.csv', index=False)
print("   val.csv")

test_df_scaled.to_csv(processed_dir / 'test.csv', index=False)
print(" test.csv")

# Save metadata. Per-split entries are generated from one mapping so the three
# splits cannot drift apart; key names and key order match the layout
# <split>_size, <split>_time_range, <split>_missing_pct.
scaled_splits = {'train': train_df_scaled, 'val': val_df_scaled, 'test': test_df_scaled}

metadata = {
    'features_to_scale': features_to_scale,
    'missingness_indicators': missingness_indicators,
    'column_taxonomy': column_taxonomy,
}

# Split sizes
metadata.update({
    f'{split_name}_size': len(frame)
    for split_name, frame in scaled_splits.items()
})

# Time ranges (first and last timestamp of each split, as strings)
metadata.update({
    f'{split_name}_time_range': (str(frame['timestamp'].min()), str(frame['timestamp'].max()))
    for split_name, frame in scaled_splits.items()
})

# Missing percentages (share of NaN cells among the scaled feature columns)
metadata.update({
    f'{split_name}_missing_pct': (
        frame[features_to_scale].isnull().sum().sum() / frame[features_to_scale].size * 100
    )
    for split_name, frame in scaled_splits.items()
})

with open(processed_dir / 'metadata.pkl', 'wb') as f:
    pickle.dump(metadata, f)

print(" metadata.pkl")

# Display summary
print("SAVED FILES SUMMARY")

print(f"\nDatasets:")
print(f"  train.csv:    {len(train_df_scaled):>6,} rows × {len(train_df_scaled.columns):>3} cols")
print(f"  val.csv:      {len(val_df_scaled):>6,} rows × {len(val_df_scaled.columns):>3} cols")
print(f"  test.csv:     {len(test_df_scaled):>6,} rows × {len(test_df_scaled.columns):>3} cols")

print(f"\nMetadata includes:")
print(f"  - {len(features_to_scale)} features to scale")
print(f"  - {len(missingness_indicators)} missingness indicators")
print(f"  - Time ranges for each split")
print(f"  - Missing percentages: Train={metadata['train_missing_pct']:.2f}%, Val={metadata['val_missing_pct']:.2f}%, Test={metadata['test_missing_pct']:.2f}%")

print("\n All datasets and metadata saved to data/processed/")

STEP 8: SAVE PROCESSED DATASETS
Saving datasets...
   train.csv
   val.csv
 test.csv
 metadata.pkl
SAVED FILES SUMMARY

Datasets:
  train.csv:    37,200 rows × 147 cols
  val.csv:       8,000 rows × 147 cols
  test.csv:      7,999 rows × 147 cols

Metadata includes:
  - 50 features to scale
  - 50 missingness indicators
  - Time ranges for each split
  - Missing percentages: Train=26.16%, Val=26.60%, Test=27.15%

 All datasets and metadata saved to data/processed/


In [9]:
print("STEP 9: CREATE ARTIFICIAL MASKS FOR EVALUATION")

def create_artificial_masks(df, features, mask_ratio=0.15, seed=42):
    """
    Artificially mask observed values for evaluation

    For every feature, a random `mask_ratio` share of the observed cells is
    set to NaN in a copy of `df`, and the hidden values are recorded so that
    imputations can be scored against them.

    A cell counts as observed when its value is not NaN and, if a
    "<feature>_was_missing" column exists, that indicator equals 0.

    Randomness comes from a private `np.random.RandomState(seed)`. It yields
    the same selections as seeding the global NumPy generator with `seed`,
    but the global generator is left untouched.

    Args:
        df: DataFrame with scaled data (not modified); its index must be
            unique because ground truth is keyed by index label
        features: List of features to mask
        mask_ratio: Proportion of OBSERVED values to mask (0.15 = 15%)
        seed: Random seed for reproducibility

    Returns:
        df_masked: DataFrame with additional masks applied
        ground_truth: Dict {(row_idx, feature): original_value}

    Raises:
        ValueError: if `mask_ratio` is outside [0, 1] or the index of `df`
            contains duplicate labels
    """
    if not 0 <= mask_ratio <= 1:
        raise ValueError(f"mask_ratio must be between 0 and 1, got {mask_ratio}")
    if not df.index.is_unique:
        # With repeated labels a single selected label would address several
        # rows, masking more cells than were recorded as ground truth.
        raise ValueError("df must have a unique index to key the ground truth by row label")

    # Local generator: reproducible without reseeding the global NumPy RNG.
    rng = np.random.RandomState(seed)

    df_masked = df.copy()
    ground_truth = {}

    print(f"  Mask ratio: {mask_ratio*100:.0f}% of observed values")
    print(f"  Random seed: {seed}")

    for feature in features:
        # Find OBSERVED values (not originally missing)
        indicator_col = f"{feature}_was_missing"

        observed_mask = df[feature].notna()
        if indicator_col in df.columns:
            # observed = indicator is 0 AND value is not NaN
            observed_mask = observed_mask & (df[indicator_col] == 0)

        # Index labels of the observed cells (no per-feature frame copy needed)
        observed_indices = df.index[observed_mask.to_numpy()].tolist()

        # Randomly select some to mask; skip features with nothing to select
        n_to_mask = int(len(observed_indices) * mask_ratio)
        if n_to_mask == 0:
            continue

        masked_indices = rng.choice(
            observed_indices,
            size=n_to_mask,
            replace=False
        )

        # Store ground truth BEFORE masking (one vectorized lookup per feature)
        hidden_values = df.loc[masked_indices, feature].to_numpy()
        ground_truth.update(
            ((idx, feature), value)
            for idx, value in zip(masked_indices, hidden_values)
        )

        # Apply artificial mask (set to NaN)
        df_masked.loc[masked_indices, feature] = np.nan

    return df_masked, ground_truth


# Hide a share of the observed VALIDATION cells and keep their true values
print("VALIDATION SET")

val_df_masked, val_ground_truth = create_artificial_masks(
    val_df_scaled, features_to_scale, mask_ratio=0.15, seed=42
)

val_truth_count = len(val_ground_truth)

print(f"\nArtificially masked validation set")
print(f"  Ground truth entries: {val_truth_count:,}")
print(f"  Coverage: {val_truth_count / (len(val_df_scaled) * len(features_to_scale)) * 100:.2f}% of val data")

# NaN totals before and after masking; the growth must equal the number of
# recorded ground-truth entries.
val_original_nans = val_df_scaled[features_to_scale].isna().sum().sum()
val_masked_nans = val_df_masked[features_to_scale].isna().sum().sum()
val_nan_increase = val_masked_nans - val_original_nans

print(f"  Original NaNs: {val_original_nans:,}")
print(f"  After masking: {val_masked_nans:,}")
print(f"  Increase: {val_nan_increase:,} (should match ground truth size)")

assert val_truth_count == val_nan_increase, "Mismatch in val masking!"


# Hide a share of the observed TEST cells and keep their true values


print("TEST SET")

# A different seed than the validation set, so the two masks are drawn independently
test_df_masked, test_ground_truth = create_artificial_masks(
    test_df_scaled, features_to_scale, mask_ratio=0.15, seed=123
)

test_truth_count = len(test_ground_truth)

print(f"\n Artificially masked test set")
print(f"  Ground truth entries: {test_truth_count:,}")
print(f"  Coverage: {test_truth_count / (len(test_df_scaled) * len(features_to_scale)) * 100:.2f}% of test data")

# NaN totals before and after masking; the growth must equal the number of
# recorded ground-truth entries.
test_original_nans = test_df_scaled[features_to_scale].isna().sum().sum()
test_masked_nans = test_df_masked[features_to_scale].isna().sum().sum()
test_nan_increase = test_masked_nans - test_original_nans

print(f"  Original NaNs: {test_original_nans:,}")
print(f"  After masking: {test_masked_nans:,}")
print(f"  Increase: {test_nan_increase:,} (should match ground truth size)")

assert test_truth_count == test_nan_increase, "Mismatch in test masking!"


# Write the masked frames and their ground truth to disk


print("SAVING MASKED DATASETS AND GROUND TRUTH")


# Masked datasets as CSV (row index not written)
for masked_frame, csv_path, csv_label in (
    (val_df_masked, 'data/processed/val_masked.csv', "  val_masked.csv"),
    (test_df_masked, 'data/processed/test_masked.csv', "  test_masked.csv"),
):
    masked_frame.to_csv(csv_path, index=False)
    print(csv_label)

# Ground-truth dictionaries as pickles
for truth, pkl_path, pkl_label in (
    (val_ground_truth, 'data/processed/val_ground_truth.pkl', "  val_ground_truth.pkl"),
    (test_ground_truth, 'data/processed/test_ground_truth.pkl', "  test_ground_truth.pkl"),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(truth, f)
    print(pkl_label)


print("PREPROCESSING COMPLETE")

STEP 9: CREATE ARTIFICIAL MASKS FOR EVALUATION
VALIDATION SET
  Mask ratio: 15% of observed values
  Random seed: 42

Artificially masked validation set
  Ground truth entries: 44,020
  Coverage: 11.00% of val data
  Original NaNs: 106,398
  After masking: 150,418
  Increase: 44,020 (should match ground truth size)
TEST SET
  Mask ratio: 15% of observed values
  Random seed: 123

 Artificially masked test set
  Ground truth entries: 43,669
  Coverage: 10.92% of test data
  Original NaNs: 108,601
  After masking: 152,270
  Increase: 43,669 (should match ground truth size)
SAVING MASKED DATASETS AND GROUND TRUTH
  val_masked.csv
  test_masked.csv
  val_ground_truth.pkl
  test_ground_truth.pkl
PREPROCESSING COMPLETE
