In [1]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings


In [2]:


# Load the engineered dataset
df_engineered = pd.read_parquet('results/df_engineered.parquet')

print(f"\nDataset shape: {df_engineered.shape}")



Dataset shape: (204942, 69)


In [3]:
df_engineered.head()

Unnamed: 0,timestamp,ping_ms,datarate,jitter,Latitude,Longitude,Altitude,speed_kmh,COG,precipIntensity,...,scenario_A2U,scenario_A3D,scenario_A3U,drive_mode_2x2,drive_mode_platoon,direction_uplink,measured_qos_delay,hour,day_of_week,poor_signal_quality
216,2021-06-22 09:49:54+02:00,,68700000.0,0.000848,52.514013,13.335172,41.9,0.0,0.0,0.0652,...,0,1,0,0,1,0,0,9,1,0
217,2021-06-22 09:49:54+02:00,,24500000.0,0.000512,52.514008,13.335195,35.3,0.0,259.0,0.0652,...,0,1,0,0,1,0,0,9,1,0
218,2021-06-22 09:49:54+02:00,,49800000.0,9e-05,52.51383,13.334935,30.7,0.0,0.0,0.0653,...,0,1,0,0,1,0,0,9,1,0
219,2021-06-22 09:49:54+02:00,1396.0,70500000.0,0.000207,52.513848,13.334832,32.3,0.0,265.9,0.0653,...,0,1,0,0,1,0,0,9,1,0
220,2021-06-22 09:49:55+02:00,,20800000.0,0.002268,52.514005,13.335195,35.4,0.0,259.0,0.0652,...,0,1,0,0,1,0,0,9,1,0


In [7]:
# Missing percentage for each feature in df_engineered
feature_missing_pct = (
    df_engineered.isna()
    .mean()
    .mul(100)
    .round(2)
    .sort_values(ascending=False)
)

# Convert to a feature list with percentages
feature_missing_list = [
    {"feature": col, "missing_pct": pct}
    for col, pct in feature_missing_pct.items()
]

# Optional: DataFrame view
feature_missing_df = pd.DataFrame(feature_missing_list)

feature_missing_df

Unnamed: 0,feature,missing_pct
0,scell_downlink_rbs_mcs_mean,85.81
1,ping_ms,28.16
2,datarate,7.65
3,jitter,7.65
4,Pos in Ref Round,6.91
...,...,...
64,measurement,0.00
65,target_datarate,0.00
66,operator,0.00
67,PCell_DL_RBs_MCS_Low,0.00


In [8]:
df_engineered = df_engineered.drop(columns=['scell_downlink_rbs_mcs_mean'])


In [9]:
df_engineered

Unnamed: 0,timestamp,ping_ms,datarate,jitter,Latitude,Longitude,Altitude,speed_kmh,COG,precipIntensity,...,scenario_A2U,scenario_A3D,scenario_A3U,drive_mode_2x2,drive_mode_platoon,direction_uplink,measured_qos_delay,hour,day_of_week,poor_signal_quality
216,2021-06-22 09:49:54+02:00,,68700000.0,0.000848,52.514013,13.335172,41.9,0.0000,0.0,0.0652,...,0,1,0,0,1,0,0,9,1,0
217,2021-06-22 09:49:54+02:00,,24500000.0,0.000512,52.514008,13.335195,35.3,0.0000,259.0,0.0652,...,0,1,0,0,1,0,0,9,1,0
218,2021-06-22 09:49:54+02:00,,49800000.0,0.000090,52.513830,13.334935,30.7,0.0000,0.0,0.0653,...,0,1,0,0,1,0,0,9,1,0
219,2021-06-22 09:49:54+02:00,1396.0,70500000.0,0.000207,52.513848,13.334832,32.3,0.0000,265.9,0.0653,...,0,1,0,0,1,0,0,9,1,0
220,2021-06-22 09:49:55+02:00,,20800000.0,0.002268,52.514005,13.335195,35.4,0.0000,259.0,0.0652,...,0,1,0,0,1,0,0,9,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207429,2021-06-24 18:59:55+02:00,,178000000.0,0.000123,52.513883,13.335023,28.0,2.7780,81.6,0.3287,...,0,1,0,0,0,0,0,18,3,0
207430,2021-06-24 18:59:56+02:00,,178000000.0,0.000091,52.513885,13.335027,28.0,0.7408,81.6,0.3287,...,0,1,0,0,0,0,0,18,3,0
207431,2021-06-24 18:59:57+02:00,,165000000.0,0.000076,52.513887,13.335030,28.1,0.0000,81.6,0.3287,...,0,1,0,0,0,0,0,18,3,0
207432,2021-06-24 18:59:58+02:00,540.0,177000000.0,0.000040,52.513890,13.335032,28.1,0.0000,81.6,0.3287,...,0,1,0,0,0,0,0,18,3,0


In [10]:

print("TEMPORAL SPLITTING STRATEGY")

# Check data distribution by date
df_engineered['date'] = df_engineered['timestamp'].dt.date

print("Data distribution by date:")
date_counts = df_engineered.groupby('date').size()
for date, count in date_counts.items():
    pct = (count / len(df_engineered)) * 100
    print(f"  {date}: {count:>7,} rows ({pct:>5.2f}%)")

print(f"\nTotal: {len(df_engineered):,} rows")

TEMPORAL SPLITTING STRATEGY
Data distribution by date:
  2021-06-22:  79,564 rows (38.82%)
  2021-06-23:  79,015 rows (38.55%)
  2021-06-24:  46,363 rows (22.62%)

Total: 204,942 rows


In [11]:

print("EXECUTING TEMPORAL SPLIT (70/15/15)")

# Sort by timestamp to ensure chronological order
df_sorted = df_engineered.sort_values('timestamp').reset_index(drop=True)

print(f"\nSorted by timestamp")
# Calculate split indices
n_total = len(df_sorted)
n_train = int(n_total * 0.70)
n_val = int(n_total * 0.15)
n_test = n_total - n_train - n_val  # Remainder goes to test

print(f"\n SPLIT SIZES:")
print(f"  Total:      {n_total:>7,} rows (100.0%)")
print(f"  Train:      {n_train:>7,} rows ({n_train/n_total*100:>5.1f}%)")
print(f"  Validation: {n_val:>7,} rows ({n_val/n_total*100:>5.1f}%)")
print(f"  Test:       {n_test:>7,} rows ({n_test/n_total*100:>5.1f}%)")

# Split the data
train_data = df_sorted.iloc[:n_train].copy()
val_data = df_sorted.iloc[n_train:n_train+n_val].copy()
test_data = df_sorted.iloc[n_train+n_val:].copy()

print(f"\n Data split complete")

# Verify temporal order (no leakage)
print(f"\nTEMPORAL VERIFICATION (No leakage check):")
print(f"  Train period:      {train_data['timestamp'].min()} to {train_data['timestamp'].max()}")
print(f"  Validation period: {val_data['timestamp'].min()} to {val_data['timestamp'].max()}")
print(f"  Test period:       {test_data['timestamp'].min()} to {test_data['timestamp'].max()}")

# Check no overlap
train_max = train_data['timestamp'].max()
val_min = val_data['timestamp'].min()
val_max = val_data['timestamp'].max()
test_min = test_data['timestamp'].min()

if train_max < val_min and val_max < test_min:
    print(f"\n Train < Val < Test")
else:
    print(f"\n Possible temporal overlap!")


EXECUTING TEMPORAL SPLIT (70/15/15)

Sorted by timestamp

 SPLIT SIZES:
  Total:      204,942 rows (100.0%)
  Train:      143,459 rows ( 70.0%)
  Validation:  30,741 rows ( 15.0%)
  Test:        30,742 rows ( 15.0%)

 Data split complete

TEMPORAL VERIFICATION (No leakage check):
  Train period:      2021-06-22 09:49:54+02:00 to 2021-06-23 15:51:37+02:00
  Validation period: 2021-06-23 15:51:37+02:00 to 2021-06-24 10:19:02+02:00
  Test period:       2021-06-24 10:19:02+02:00 to 2021-06-24 18:59:59+02:00

 Possible temporal overlap!


In [12]:
# Check distribution balance
print(f"\n DISTRIBUTION BALANCE CHECK:")


# Check operator distribution
print("\nOperator distribution:")
print(f"{'Split':<12} {'Operator 0':>12} {'Operator 1':>12}")

for split_name, split_data in [('Train', train_data), ('Validation', val_data), ('Test', test_data)]:
    if 'operator' in split_data.columns:
        op_counts = split_data['operator'].value_counts()
        for op in [0, 1]:
            if op in op_counts.index:
                count = op_counts[op]
                pct = count / len(split_data) * 100
                if op == 0:
                    print(f"{split_name:<12} {pct:>10.1f}%", end='')
                else:
                    print(f" {pct:>10.1f}%")

# Check missing percentages
print("\nMissing data percentages:")
print(f"{'Split':<12} {'Overall':>10}")

for split_name, split_data in [('Train', train_data), ('Validation', val_data), ('Test', test_data)]:
    overall_missing = (split_data.isnull().sum().sum() / (split_data.shape[0] * split_data.shape[1]) * 100)
    print(f"{split_name:<12} {overall_missing:>9.2f}%")


print("SPLIT QUALITY VERIFIED")


 DISTRIBUTION BALANCE CHECK:

Operator distribution:
Split          Operator 0   Operator 1
Train              49.8%       50.2%
Validation         50.0%       50.0%
Test               50.4%       49.6%

Missing data percentages:
Split           Overall
Train             3.63%
Validation        0.89%
Test              1.28%
SPLIT QUALITY VERIFIED


In [13]:

# Move overlapping train-val timestamp to validation
overlap_timestamp_1 = train_data['timestamp'].max()
if overlap_timestamp_1 in val_data['timestamp'].values:
    mask = train_data['timestamp'] == overlap_timestamp_1
    rows_to_move = train_data[mask].copy()
    train_data = train_data[~mask].reset_index(drop=True)
    val_data = pd.concat([rows_to_move, val_data]).sort_values('timestamp').reset_index(drop=True)
    print(f"  ✓ Moved {len(rows_to_move)} rows from train to validation")

# Move overlapping val-test timestamp to test
overlap_timestamp_2 = val_data['timestamp'].max()
if overlap_timestamp_2 in test_data['timestamp'].values:
    mask = val_data['timestamp'] == overlap_timestamp_2
    rows_to_move = val_data[mask].copy()
    val_data = val_data[~mask].reset_index(drop=True)
    test_data = pd.concat([rows_to_move, test_data]).sort_values('timestamp').reset_index(drop=True)
    print(f"  ✓ Moved {len(rows_to_move)} rows from validation to test")

# Verify no overlap now
train_timestamps = set(train_data['timestamp'])
val_timestamps = set(val_data['timestamp'])
test_timestamps = set(test_data['timestamp'])

overlap_check = (
    len(train_timestamps & val_timestamps) + 
    len(val_timestamps & test_timestamps) + 
    len(train_timestamps & test_timestamps)
)

if overlap_check == 0:
    print(f"\n  NO OVERLAP - Clean split achieved!")
else:
    print(f"\n Still {overlap_check} overlapping timestamps")

# Final split sizes
print(f"\n FINAL SPLIT SIZES:")
print(f"  Train:      {len(train_data):>7,} rows ({len(train_data)/len(df_sorted)*100:>5.1f}%)")
print(f"  Validation: {len(val_data):>7,} rows ({len(val_data)/len(df_sorted)*100:>5.1f}%)")
print(f"  Test:       {len(test_data):>7,} rows ({len(test_data)/len(df_sorted)*100:>5.1f}%)")
print(f"  Total:      {len(train_data)+len(val_data)+len(test_data):>7,} rows")


train_data.to_pickle('dataset/train_data.pkl')
val_data.to_pickle('dataset/val_data.pkl')
test_data.to_pickle('dataset/test_data.pkl')

print("Saved: dataset/train_data.pkl")
print("Saved: dataset/val_data.pkl")
print("Saved: dataset/test_data.pkl")

# Save split indices for reference
split_info = {
    'n_train': len(train_data),
    'n_val': len(val_data),
    'n_test': len(test_data),
    'train_start': str(train_data['timestamp'].min()),
    'train_end': str(train_data['timestamp'].max()),
    'val_start': str(val_data['timestamp'].min()),
    'val_end': str(val_data['timestamp'].max()),
    'test_start': str(test_data['timestamp'].min()),
    'test_end': str(test_data['timestamp'].max()),
    'split_date': str(datetime.now()),
    'random_seed': 42
}

import json
with open('dataset/split_info_1.json', 'w') as f:
    json.dump(split_info, f, indent=2)

print("Saved: dataset/split_info.json")




 Still 2 overlapping timestamps

 FINAL SPLIT SIZES:
  Train:      143,459 rows ( 70.0%)
  Validation:  30,741 rows ( 15.0%)
  Test:        30,742 rows ( 15.0%)
  Total:      204,942 rows
Saved: dataset/train_data.pkl
Saved: dataset/val_data.pkl
Saved: dataset/test_data.pkl
Saved: dataset/split_info.json


### Standardization

In [14]:
train_data.head()

Unnamed: 0,timestamp,ping_ms,datarate,jitter,Latitude,Longitude,Altitude,speed_kmh,COG,precipIntensity,...,scenario_A3D,scenario_A3U,drive_mode_2x2,drive_mode_platoon,direction_uplink,measured_qos_delay,hour,day_of_week,poor_signal_quality,date
0,2021-06-22 09:49:54+02:00,,68700000.0,0.000848,52.514013,13.335172,41.9,0.0,0.0,0.0652,...,1,0,0,1,0,0,9,1,0,2021-06-22
1,2021-06-22 09:49:54+02:00,,24500000.0,0.000512,52.514008,13.335195,35.3,0.0,259.0,0.0652,...,1,0,0,1,0,0,9,1,0,2021-06-22
2,2021-06-22 09:49:54+02:00,,49800000.0,9e-05,52.51383,13.334935,30.7,0.0,0.0,0.0653,...,1,0,0,1,0,0,9,1,0,2021-06-22
3,2021-06-22 09:49:54+02:00,1396.0,70500000.0,0.000207,52.513848,13.334832,32.3,0.0,265.9,0.0653,...,1,0,0,1,0,0,9,1,0,2021-06-22
4,2021-06-22 09:49:55+02:00,,20800000.0,0.002268,52.514005,13.335195,35.4,0.0,259.0,0.0652,...,1,0,0,1,0,0,9,1,0,2021-06-22


In [15]:
val_data.head()

Unnamed: 0,timestamp,ping_ms,datarate,jitter,Latitude,Longitude,Altitude,speed_kmh,COG,precipIntensity,...,scenario_A3D,scenario_A3U,drive_mode_2x2,drive_mode_platoon,direction_uplink,measured_qos_delay,hour,day_of_week,poor_signal_quality,date
143459,2021-06-23 15:51:37+02:00,50.5,394000.0,0.002718,52.513095,13.32976,34.5,0.0,84.3,0.0646,...,0,0,0,1,1,1,15,2,1,2021-06-23
143460,2021-06-23 15:51:37+02:00,25.2,403000.0,0.004402,52.513088,13.329952,32.5,0.0,83.6,0.0646,...,0,0,0,1,1,1,15,2,1,2021-06-23
143461,2021-06-23 15:51:37+02:00,22.8,403000.0,0.002579,52.513112,13.329863,35.9,0.0,84.0,0.0646,...,0,0,0,1,1,1,15,2,1,2021-06-23
143462,2021-06-23 15:51:38+02:00,23.1,394000.0,0.003982,52.513088,13.329952,32.5,0.0,83.6,0.0646,...,0,0,0,1,1,1,15,2,1,2021-06-23
143463,2021-06-23 15:51:38+02:00,27.04,403000.0,0.003064,52.513158,13.331445,28.4,7.9636,83.6,0.0646,...,0,0,0,1,1,1,15,2,0,2021-06-23


In [16]:
import pandas as pd
from sklearn.preprocessing import RobustScaler

# Select numerical columns only
num_cols = train_data.select_dtypes(include=["int64", "float64"]).columns

# Initialize scaler
scaler = RobustScaler()

# Fit ONLY on training data
train_data_scaled = train_data.copy()
train_data_scaled[num_cols] = scaler.fit_transform(train_data[num_cols])

# Transform validation data using the same scaler
val_data_scaled = val_data.copy()
val_data_scaled[num_cols] = scaler.transform(val_data[num_cols])

In [17]:
train_data_scaled.head()

Unnamed: 0,timestamp,ping_ms,datarate,jitter,Latitude,Longitude,Altitude,speed_kmh,COG,precipIntensity,...,scenario_A3D,scenario_A3U,drive_mode_2x2,drive_mode_platoon,direction_uplink,measured_qos_delay,hour,day_of_week,poor_signal_quality,date
0,2021-06-22 09:49:54+02:00,,2.378685,-0.108315,0.606259,0.41037,0.518182,-0.471264,-1.053281,0.343206,...,1.0,0.0,0.0,0.0,0.0,0.0,9,1,0.0,2021-06-22
1,2021-06-22 09:49:54+02:00,,0.610473,-0.230853,0.605881,0.410979,-0.081818,-0.471264,0.399327,0.343206,...,1.0,0.0,0.0,0.0,0.0,0.0,9,1,0.0,2021-06-22
2,2021-06-22 09:49:54+02:00,,1.622595,-0.384756,0.592378,0.404193,-0.5,-0.471264,-1.053281,0.344948,...,1.0,0.0,0.0,0.0,0.0,0.0,9,1,0.0,2021-06-22
3,2021-06-22 09:49:54+02:00,2.646228,2.450694,-0.342086,0.593766,0.401496,-0.354545,-0.471264,0.438026,0.344948,...,1.0,0.0,0.0,0.0,0.0,0.0,9,1,0.0,2021-06-22
4,2021-06-22 09:49:55+02:00,,0.462455,0.409555,0.605628,0.410979,-0.072727,-0.471264,0.399327,0.343206,...,1.0,0.0,0.0,0.0,0.0,0.0,9,1,0.0,2021-06-22


In [18]:
val_data_scaled.head()

Unnamed: 0,timestamp,ping_ms,datarate,jitter,Latitude,Longitude,Altitude,speed_kmh,COG,precipIntensity,...,scenario_A3D,scenario_A3U,drive_mode_2x2,drive_mode_platoon,direction_uplink,measured_qos_delay,hour,day_of_week,poor_signal_quality,date
143459,2021-06-23 15:51:37+02:00,0.007061,-0.353882,0.573669,0.536724,0.269129,-0.154545,-0.471264,-0.580482,0.332753,...,0.0,0.0,0.0,0.0,1.0,1.0,15,2,1.0,2021-06-23
143460,2021-06-23 15:51:37+02:00,-0.042564,-0.353522,1.187819,0.536219,0.274131,-0.336364,-0.471264,-0.584408,0.332753,...,0.0,0.0,0.0,0.0,1.0,1.0,15,2,1.0,2021-06-23
143461,2021-06-23 15:51:37+02:00,-0.047272,-0.353522,0.522976,0.537986,0.271826,-0.027273,-0.471264,-0.582165,0.332753,...,0.0,0.0,0.0,0.0,1.0,1.0,15,2,1.0,2021-06-23
143462,2021-06-23 15:51:38+02:00,-0.046683,-0.353882,1.034646,0.536219,0.274131,-0.336364,-0.471264,-0.584408,0.332753,...,0.0,0.0,0.0,0.0,1.0,1.0,15,2,1.0,2021-06-23
143463,2021-06-23 15:51:38+02:00,-0.038955,-0.353522,0.699854,0.541519,0.313106,-0.709091,-0.224138,-0.584408,0.332753,...,0.0,0.0,0.0,0.0,1.0,1.0,15,2,0.0,2021-06-23


In [19]:

# Set random seed for reproducibility
np.random.seed(42)

# Mask rate
MASK_RATE = 0.15

print(f"\n  CONFIGURATION:")
print(f"    Mask rate: {MASK_RATE*100:.0f}%")
print(f"    Random seed: 42")
print(f"    Target sets: Validation & Test only")

# Identify numeric columns to mask (exclude metadata, identifiers, engineered features)
exclude_cols = ['timestamp', 'ts_gps', 'device', 'measurement', 'area', 
                'scenario', 'drive_mode', 'direction', 'measured_qos', 'operator',
                'date', 'hour', 'day_of_week', 'row_missing_count', 'row_missing_pct']

# Also exclude missing indicator columns and one-hot encoded columns
exclude_cols += [col for col in val_data_scaled.columns if col.endswith('_missing')]
exclude_cols += [col for col in val_data_scaled.columns if any(prefix in col for prefix in 
                ['device_', 'scenario_', 'drive_mode_', 'direction_', 'measured_qos_', 'area_'])]

# Get columns to mask (numeric features only)
all_numeric = val_data_scaled.select_dtypes(include=[np.number]).columns.tolist()
cols_to_mask = [col for col in all_numeric if col not in exclude_cols]




  CONFIGURATION:
    Mask rate: 15%
    Random seed: 42
    Target sets: Validation & Test only


In [21]:

print("EXECUTING MASKING ON VALIDATION & TEST SETS")

# Function to create masks
def create_evaluation_masks(data, cols_to_mask, mask_rate=0.15):
    
    masked_data = data.copy()
    ground_truth = data[cols_to_mask].copy()
    mask_indices = pd.DataFrame(False, index=data.index, columns=cols_to_mask)
    
    total_masked = 0
    
    for col in cols_to_mask:
        # Find non-missing values
        non_missing_mask = data[col].notna()
        non_missing_indices = data[non_missing_mask].index
        n_non_missing = len(non_missing_indices)
        
        if n_non_missing == 0:
            continue  
        
        # Randomly select mask_rate% to mask
        n_to_mask = int(n_non_missing * mask_rate)
        
        if n_to_mask > 0:
            indices_to_mask = np.random.choice(non_missing_indices, 
                                              size=n_to_mask, 
                                              replace=False)
            
            # Apply mask
            masked_data.loc[indices_to_mask, col] = np.nan
            mask_indices.loc[indices_to_mask, col] = True
            
            total_masked += n_to_mask
    
    return masked_data, ground_truth, mask_indices, total_masked


print("\n MASKING VALIDATION SET...")

val_masked, val_ground_truth, val_mask_indices, val_total_masked = create_evaluation_masks(
    val_data_scaled, cols_to_mask, MASK_RATE
)

val_total_cells = len(val_data_scaled) * len(cols_to_mask)
val_originally_missing = val_data_scaled[cols_to_mask].isnull().sum().sum()
val_non_missing = val_total_cells - val_originally_missing
val_masked_pct = (val_total_masked / val_non_missing * 100)

print(f"  Total cells: {val_total_cells:,}")
print(f"  Originally missing: {val_originally_missing:,}")
print(f"  Non-missing cells: {val_non_missing:,}")
print(f"  Artificially masked: {val_total_masked:,} ({val_masked_pct:.2f}% of non-missing)")


print("\n MASKING TEST SET...")

test_masked, test_ground_truth, test_mask_indices, test_total_masked = create_evaluation_masks(
    test_data, cols_to_mask, MASK_RATE
)

test_total_cells = len(test_data) * len(cols_to_mask)
test_originally_missing = test_data[cols_to_mask].isnull().sum().sum()
test_non_missing = test_total_cells - test_originally_missing
test_masked_pct = (test_total_masked / test_non_missing * 100)

print(f"  Total cells: {test_total_cells:,}")
print(f"  Originally missing: {test_originally_missing:,}")
print(f"  Non-missing cells: {test_non_missing:,}")
print(f"  Artificially masked: {test_total_masked:,} ({test_masked_pct:.2f}% of non-missing)")

print("VERIFICATION:")
print("="*80)

print(f"\nValidation set:")
print(f"  • Before masking: {(val_data[cols_to_mask].isnull().sum().sum() / val_total_cells * 100):.2f}% missing")
print(f"  • After masking:  {(val_masked[cols_to_mask].isnull().sum().sum() / val_total_cells * 100):.2f}% missing")
print(f"  • Increase:       {((val_masked[cols_to_mask].isnull().sum().sum() - val_data[cols_to_mask].isnull().sum().sum()) / val_total_cells * 100):.2f}%")

print(f"\nTest set:")
print(f"  • Before masking: {(test_data[cols_to_mask].isnull().sum().sum() / test_total_cells * 100):.2f}% missing")
print(f"  • After masking:  {(test_masked[cols_to_mask].isnull().sum().sum() / test_total_cells * 100):.2f}% missing")
print(f"  • Increase:       {((test_masked[cols_to_mask].isnull().sum().sum() - test_data[cols_to_mask].isnull().sum().sum()) / test_total_cells * 100):.2f}%")

print("\n Masking complete!")

EXECUTING MASKING ON VALIDATION & TEST SETS

 MASKING VALIDATION SET...
  Total cells: 1,475,568
  Originally missing: 18,827
  Non-missing cells: 1,456,741
  Artificially masked: 218,498 (15.00% of non-missing)

 MASKING TEST SET...
  Total cells: 1,475,616
  Originally missing: 27,161
  Non-missing cells: 1,448,455
  Artificially masked: 217,249 (15.00% of non-missing)
VERIFICATION:

Validation set:
  • Before masking: 1.28% missing
  • After masking:  16.08% missing
  • Increase:       14.81%

Test set:
  • Before masking: 1.84% missing
  • After masking:  16.56% missing
  • Increase:       14.72%

 Masking complete!


In [None]:

print("SAVING MASKED DATA & GROUND TRUTH")

# Save train data (no masking - use as-is)
train_data_scaled.to_pickle('dataset/train_data_final.pkl')
print(" Saved: dataset/train_data_final.pkl (original, no masking)")

# Save validation - masked version
val_masked.to_pickle('dataset/val_data_masked.pkl')
print(" Saved: dataset/val_data_masked.pkl (with artificial masks)")

# Save validation - ground truth
val_ground_truth.to_pickle('dataset/val_ground_truth.pkl')
print(" Saved: dataset/val_ground_truth.pkl (original values)")

# Save validation - mask indices
val_mask_indices.to_pickle('dataset/val_mask_indices.pkl')
print(" Saved: dataset/val_mask_indices.pkl (boolean mask)")

# Save test - masked version
test_masked.to_pickle('dataset/test_data_masked.pkl')
print(" Saved: dataset/test_data_masked.pkl (with artificial masks)")

# Save test - ground truth
test_ground_truth.to_pickle('dataset/test_ground_truth.pkl')
print(" Saved: dataset/test_ground_truth.pkl (original values)")

# Save test - mask indices
test_mask_indices.to_pickle('dataset/test_mask_indices.pkl')
print(" Saved: dataset/test_mask_indices.pkl (boolean mask)")

# Save list of masked columns
with open('dataset/cols_to_mask.json', 'w') as f:
    json.dump(cols_to_mask, f, indent=2)
print(" Saved: dataset/cols_to_mask.json (list of masked columns)")

# Create summary
summary = {
    'train': {
        'n_rows': len(train_data),
        'n_cols': len(train_data.columns),
        'masking': 'none',
        'file': 'train_data_final.pkl'
    },
    'validation': {
        'n_rows': len(val_masked),
        'n_cols': len(val_masked.columns),
        'n_masked_cols': len(cols_to_mask),
        'n_artificially_masked': val_total_masked,
        'mask_rate': f'{MASK_RATE*100:.0f}%',
        'files': {
            'data': 'val_data_masked.pkl',
            'ground_truth': 'val_ground_truth.pkl',
            'mask_indices': 'val_mask_indices.pkl'
        }
    },
    'test': {
        'n_rows': len(test_masked),
        'n_cols': len(test_masked.columns),
        'n_masked_cols': len(cols_to_mask),
        'n_artificially_masked': test_total_masked,
        'mask_rate': f'{MASK_RATE*100:.0f}%',
        'files': {
            'data': 'test_data_masked.pkl',
            'ground_truth': 'test_ground_truth.pkl',
            'mask_indices': 'test_mask_indices.pkl'
        }
    },
    'created_date': str(datetime.now()),
    'random_seed': 42
}

with open('dataset/masking_summary.json', 'w') as f:
    json.dump(summary, f, indent=2)
print(" Saved: dataset/masking_summary.json (summary)")





SAVING MASKED DATA & GROUND TRUTH
 Saved: dataset/train_data_final.pkl (original, no masking)
 Saved: dataset/train_data_final.csv (original, no masking)
 Saved: dataset/val_data_masked.pkl (with artificial masks)
 Saved: dataset/val_ground_truth.pkl (original values)
 Saved: dataset/val_mask_indices.pkl (boolean mask)
 Saved: dataset/test_data_masked.pkl (with artificial masks)
 Saved: dataset/test_ground_truth.pkl (original values)
 Saved: dataset/test_mask_indices.pkl (boolean mask)
 Saved: dataset/cols_to_mask.json (list of masked columns)
 Saved: dataset/masking_summary.json (summary)
