# Route Change Prediction - Advanced Analysis

This notebook focuses on understanding feature importance — in particular the temporal dynamics captured by lag features — and on handling class imbalance.

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, precision_recall_curve

from data_loader import load_data, parse_rtts_column
from feature_engineering import calculate_rtt_stats, add_temporal_features

## 1. Data Loading (Targeted Sample)
Loading a 2M row sample to allow for expensive sorting/grouping operations required for lag features.

In [2]:
# Row budget handed to load_data; kept to a sample so the sorting/grouping
# needed for the lag features stays affordable.
SAMPLE_SIZE = 2_000_000
df = load_data('train.csv', sample_size=SAMPLE_SIZE)

## 2. Advanced Feature Engineering
Calculating Lags, Diffs, Volatility and Spike Features.

In [None]:
# Run the feature-engineering helpers in order, rebinding `df` after each step
# so every helper receives the previous helper's output.
for transform in (parse_rtts_column, calculate_rtt_stats, add_temporal_features):
    df = transform(df)

# Inspect the resulting schema and a small preview of selected columns.
print(df.columns)
display(df.head(10)[['rtt_mean', 'volatility_rtt_mean', 'z_score_rtt']])

## 3. Handling Class Imbalance (Undersampling)
The dataset is highly imbalanced. We correct this for analysis by undersampling the majority class (No Change) to a 1:2 minority-to-majority ratio.

In [None]:
print("Original Distribution:")
print(df['route_changed'].value_counts())

# Split the frame by label: 1 is treated as the minority class and
# 0 ("No Change") as the majority class.
df_minority = df[df['route_changed'] == 1]
df_majority = df[df['route_changed'] == 0]

# Undersample the majority class towards a 1:2 minority:majority ratio.
# DataFrame.sample without replacement raises ValueError when `n` exceeds the
# number of available rows, so the request is capped at the majority size.
# If the sample holds fewer than 2x as many majority rows, all of them are kept.
n_majority_keep = min(len(df_majority), len(df_minority) * 2)
df_majority_downsampled = df_majority.sample(n=n_majority_keep, random_state=42)

# Stack both classes back together; rows are not shuffled here.
df_balanced = pd.concat([df_minority, df_majority_downsampled])

print("\nBalanced Distribution (for Analysis):")
print(df_balanced['route_changed'].value_counts())

## 4. Visualizing Feature Correlations (Volatility)

In [None]:
# Distribution of RTT volatility per class; outlier markers are hidden so the
# boxes themselves stay readable.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(
    data=df_balanced,
    x='route_changed',
    y='volatility_rtt_mean',
    showfliers=False,
    ax=ax,
)
ax.set_title('Impact of RTT Volatility (Instability) on Route Change')
plt.show()

In [None]:
# Distribution of the change in RTT standard deviation per class; outlier
# markers are hidden so the boxes themselves stay readable.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(
    data=df_balanced,
    x='route_changed',
    y='diff_rtt_std',
    showfliers=False,
    ax=ax,
)
ax.set_title('Change in Jitter (Diff RTT Std) vs Route Change')
plt.show()

## 5. Model Training (Balanced)

In [None]:
# Columns fed to the model.
features = [
    'rtt_mean', 'rtt_std', 'rtt_min', 'rtt_max', 'packet_loss_ratio',
    'prev_rtt_mean', 'prev_rtt_std', 'diff_rtt_mean', 'diff_rtt_std',
    'time_since_last', 'tr_attempts', 'total_probes_sent',
    'volatility_rtt_mean', 'z_score_rtt', 'packet_loss_trend'
]
target = 'route_changed'

# Use the balanced dataset for training to learn the signal better.
# Infinite values (e.g. from division) are converted to NaN *before* the
# missing-value filter. Otherwise dropna never sees them and they come back
# as NaN after the split, leaving the "clean" matrices with missing entries.
df_model = df_balanced.copy()
df_model[features] = df_model[features].replace([np.inf, -np.inf], np.nan)
df_model = df_model.dropna(subset=features)

X = df_model[features]
y = df_model[target]

# Stratified 80/20 split keeps the class mix identical in both partitions.
# NOTE(review): validation rows are drawn from the undersampled frame, so the
# validation metrics reflect the resampled class mix, not the original
# distribution. Confirm this is intended.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Training Data: {X_train.shape}")

In [None]:
# LightGBM classifier; the early-stopping callback monitors the validation
# set supplied through eval_set with a patience of 10 rounds.
lgbm_params = dict(
    n_estimators=200,
    learning_rate=0.03,
    num_leaves=31,
    random_state=42,
)
clf = lgb.LGBMClassifier(**lgbm_params)
clf.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    callbacks=[lgb.early_stopping(10)],
)

# Hard labels for the report; second predict_proba column for the ROC-AUC.
y_pred = clf.predict(X_val)
y_prob = clf.predict_proba(X_val)[:, 1]

val_auc = roc_auc_score(y_val, y_prob)
val_report = classification_report(y_val, y_pred)

print("ROC-AUC:", val_auc)
print("\nClassification Report:\n", val_report)

In [None]:
# Rank features by total split gain; plot_importance returns the Axes it drew
# on, so the title is set directly on that object.
importance_ax = lgb.plot_importance(clf, importance_type='gain', figsize=(10, 8))
importance_ax.set_title('Feature Importance (Gain)')
plt.show()