In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import spearmanr
from sklearn.model_selection import KFold, GroupKFold
import json
import joblib

### Training

#### With multiple featuers: 
'latitude', 'longitude', 'month', 'AT', 'avg_ndvi', 'population', 'IGP', 'has_wikipedia', 'media_count', 'Tier_Tier2', 'Tier_Tier3'

> Numerical Features:
- latitude, longitude (geographic position)
- year, month (temporal)
- AT, avg_ndvi (environmental)
- population (demographic)
- IGP, has_wikipedia, media_count (binary/count features)

> Categorical Features:
- Tier only (3 levels: Tier1, Tier2, Tier3)

In [5]:
# Paths
base_dir = Path("/home/diya.thakor/AirQuality/BASELINE")
data_dir = base_dir / "data"
results_dir = base_dir / "results"
results_dir.mkdir(parents=True, exist_ok=True)

# Load data
train_df = pd.read_csv(data_dir / "train_2022.csv")

# Feature selection
# feature_cols = [
#     'latitude', 'longitude', 'month', 'AT', 'avg_ndvi',
#     'population', 'IGP', 'has_wikipedia', 'media_count'
# ]

feature_cols = [
    'latitude', 'longitude', 'month', 'AT', 'avg_ndvi',
    'population', 'IGP', 'has_wikipedia', 'media_count'
]

# Encode Tier
# tier_le = LabelEncoder()
# train_df['Tier_encoded'] = tier_le.fit_transform(train_df['Tier'])
# feature_cols.append('Tier_encoded')

# One-hot encode Tier
train_df = pd.get_dummies(train_df, columns=['Tier'], prefix='Tier', drop_first=True)

print("Columns after get_dummies:", train_df.columns.tolist())

tier_cols = [col for col in train_df.columns if 'Tier' in col and col != 'Tier']
print(f"Tier columns found: {tier_cols}")

feature_cols.extend(tier_cols)

# No need for Tier_Tier1 - it's the reference (both others = 0)

X = train_df[feature_cols].values
y = train_df['PM2.5'].values
cities = train_df['city'].values

# Standardize features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Hyperparameter grid
alphas = [0.01, 0.1, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0]
k = 5
# kf = KFold(n_splits=k, shuffle=True, random_state=42)
gkf = GroupKFold(n_splits=k)

# Store all split details
cv_details = {}

for alpha in alphas:
    fold_maes = []
    fold_details = []
    for fold, (train_idx, val_idx) in enumerate(gkf.split(X, y, groups=cities)):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]
        cities_train = cities[train_idx]
        cities_val = cities[val_idx]

        # Train model
        model = Ridge(alpha=alpha)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        mae = mean_absolute_error(y_val, y_pred)
        fold_maes.append(mae)

        # Store fold details
        fold_details.append({
            'fold': fold,
            'alpha': alpha,
            'train_cities': list(np.unique(cities_train)),
            'val_cities': list(np.unique(cities_val)),
            'mae': mae,
            'n_train': len(train_idx),
            'n_val': len(val_idx)
        })
    cv_details[alpha] = {
        'fold_maes': fold_maes,
        'mean_mae': np.mean(fold_maes),
        'std_mae': np.std(fold_maes),
        'folds': fold_details
    }

# Find best alpha
best_alpha = min(cv_details, key=lambda a: cv_details[a]['mean_mae'])
best_mean_mae = cv_details[best_alpha]['mean_mae']
best_std_mae = cv_details[best_alpha]['std_mae']

# Train final model on all data
final_model = Ridge(alpha=best_alpha)
final_model.fit(X, y)

# Save details
with open(results_dir / "cv_details.json", "w") as f:
    json.dump(cv_details, f, indent=2)

# Print summary
print("="*60)
print("Ridge Regression Cross-Validation Summary")
print("="*60)
print(f"Best alpha: {best_alpha}")
print(f"Mean MAE (CV): {best_mean_mae:.2f} ± {best_std_mae:.2f}")
print("All details saved to cv_details.json")
print("="*60)


Columns after get_dummies: ['city', 'state', 'YearMonth', 'year', 'month', 'PM2.5', 'AT', 'avg_ndvi', 'latitude', 'longitude', 'population', 'IGP', 'has_wikipedia', 'media_count', 'Tier_Tier2', 'Tier_Tier3']
Tier columns found: ['Tier_Tier2', 'Tier_Tier3']
Ridge Regression Cross-Validation Summary
Best alpha: 100.0
Mean MAE (CV): 24.88 ± 2.83
All details saved to cv_details.json


In [6]:
# Save the trained model to disk
model_path = results_dir / "ridge_final_model.joblib"
joblib.dump(final_model, model_path)

# Save scaler and encoder
joblib.dump(scaler, results_dir / "scaler.joblib")
# joblib.dump(tier_le, results_dir / "tier_encoder.joblib")

['/home/diya.thakor/AirQuality/BASELINE/results/scaler.joblib']

#### With lat,lon,month and year only

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error
import joblib
import json

# Paths
base_dir = Path("/home/diya.thakor/AirQuality/BASELINE")
data_dir = base_dir / "data"
results_dir = base_dir / "results"
results_dir.mkdir(parents=True, exist_ok=True)

# Load data
train_df = pd.read_csv(data_dir / "train_2022.csv")

# Feature selection - ONLY latitude, longitude, month, year
feature_cols = ['latitude', 'longitude', 'month', 'year']

print(f"\nFeatures used: {feature_cols}")
print(f"Total features: {len(feature_cols)}")

X = train_df[feature_cols].values
y = train_df['PM2.5'].values
cities = train_df['city'].values

# Standardize features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Hyperparameter grid
alphas = [0.01, 0.1, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0]
k = 5
gkf = GroupKFold(n_splits=k)

# Store all split details
cv_details = {}

print("\nRunning GroupKFold Cross-Validation...")
print("="*60)

for alpha in alphas:
    fold_maes = []
    fold_details = []
    for fold, (train_idx, val_idx) in enumerate(gkf.split(X, y, groups=cities)):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]
        cities_train = cities[train_idx]
        cities_val = cities[val_idx]

        # Train model
        model = Ridge(alpha=alpha)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        mae = mean_absolute_error(y_val, y_pred)
        fold_maes.append(mae)

        # Store fold details
        fold_details.append({
            'fold': fold,
            'alpha': alpha,
            'train_cities': sorted(list(np.unique(cities_train))),
            'val_cities': sorted(list(np.unique(cities_val))),
            'mae': float(mae),
            'n_train': len(train_idx),
            'n_val': len(val_idx)
        })
    
    cv_details[alpha] = {
        'fold_maes': fold_maes,
        'mean_mae': float(np.mean(fold_maes)),
        'std_mae': float(np.std(fold_maes)),
        'folds': fold_details
    }

# Find best alpha
best_alpha = min(cv_details, key=lambda a: cv_details[a]['mean_mae'])
best_mean_mae = cv_details[best_alpha]['mean_mae']
best_std_mae = cv_details[best_alpha]['std_mae']

# Train final model on all data
final_model = Ridge(alpha=best_alpha)
final_model.fit(X, y)

# Save model, scaler, and details with '_minimal' suffix
joblib.dump(final_model, results_dir / "ridge_minimal_model.joblib")
joblib.dump(scaler, results_dir / "ridge_minimal_scaler.joblib")
with open(results_dir / "ridge_minimal_feature_cols.json", "w") as f:
    json.dump(feature_cols, f)
with open(results_dir / "ridge_minimal_cv_details.json", "w") as f:
    json.dump(cv_details, f, indent=2)

# Print summary
print("="*60)
print("Ridge Regression CV Summary - MINIMAL FEATURES")
print("="*60)
print(f"Features used: {feature_cols}")
print(f"Best alpha: {best_alpha}")
print(f"Mean MAE (CV): {best_mean_mae:.2f} ± {best_std_mae:.2f}")
print("All details saved with 'minimal' suffix")
print("="*60)


Features used: ['latitude', 'longitude', 'month', 'year']
Total features: 4

Running GroupKFold Cross-Validation...
Ridge Regression CV Summary - MINIMAL FEATURES
Features used: ['latitude', 'longitude', 'month', 'year']
Best alpha: 100.0
Mean MAE (CV): 26.15 ± 2.10
All details saved with 'minimal' suffix


### Testing

#### With multiple featuers: 
'latitude', 'longitude', 'month', 'AT', 'avg_ndvi', 'population', 'IGP', 'has_wikipedia', 'media_count', 'Tier_Tier2', 'Tier_Tier3'

In [5]:
# Paths
base_dir = Path("/home/diya.thakor/AirQuality/BASELINE")
data_dir = base_dir / "data"
results_dir = base_dir / "results"

# Load test data
test_df = pd.read_csv(data_dir / "test_2023.csv")

# Load saved scaler and encoder
scaler = joblib.load(results_dir / "scaler.joblib")
# tier_le = joblib.load(results_dir / "tier_encoder.joblib")

# Encode Tier in test data using loaded encoder
# test_df['Tier_encoded'] = tier_le.transform(test_df['Tier'])

test_df = pd.get_dummies(test_df, columns=['Tier'], prefix='Tier', drop_first=True)

# Select features (same as training)
feature_cols = [
    'latitude', 'longitude', 'month', 'AT', 'avg_ndvi',
    'population', 'IGP', 'has_wikipedia', 'media_count',
    'Tier_Tier2', 'Tier_Tier3'
]

X_test = test_df[feature_cols].values

# Scale features using loaded scaler
X_test = scaler.transform(X_test)

# Load trained model
model = joblib.load(results_dir / "ridge_final_model.joblib")

# Predict on test set
y_test_pred = model.predict(X_test)
y_test_true = test_df['PM2.5'].values

# Evaluate
mae = mean_absolute_error(y_test_true, y_test_pred)
rmse = np.sqrt(mean_squared_error(y_test_true, y_test_pred))
r2 = r2_score(y_test_true, y_test_pred)
spearman = spearmanr(y_test_true, y_test_pred)[0]

print("="*60)
print("Ridge Regression Test Set Evaluation (2023)")
print("="*60)
print(f"Test MAE: {mae:.2f}")
print(f"Test RMSE: {rmse:.2f}")
print(f"Test R²: {r2:.3f}")
print(f"Test Spearman: {spearman:.3f}")
print("="*60)

# Save predictions for further analysis
test_df['predicted_PM2.5'] = y_test_pred
test_df.to_csv(results_dir / "ridge_test_predictions.csv", index=False)

Ridge Regression Test Set Evaluation (2023)
Test MAE: 22.53
Test RMSE: 30.93
Test R²: 0.254
Test Spearman: 0.493


#### With lat,lon,month and year

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
import joblib
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import spearmanr
import json

# Paths
base_dir = Path("/home/diya.thakor/AirQuality/BASELINE")
data_dir = base_dir / "data"
results_dir = base_dir / "results"

# Load test data
test_df = pd.read_csv(data_dir / "test_2023.csv")

# Load feature columns (should be: latitude, longitude, month, year)
with open(results_dir / "ridge_minimal_feature_cols.json", "r") as f:
    feature_cols = json.load(f)

print(f"Features used: {feature_cols}")

# Select features (NO one-hot encoding needed)
X_test = test_df[feature_cols].values

# Load saved scaler and model
scaler = joblib.load(results_dir / "ridge_minimal_scaler.joblib")
model = joblib.load(results_dir / "ridge_minimal_model.joblib")

# Scale features using loaded scaler
X_test = scaler.transform(X_test)

# Predict on test set
y_test_pred = model.predict(X_test)
y_test_true = test_df['PM2.5'].values

# Evaluate
mae = mean_absolute_error(y_test_true, y_test_pred)
rmse = np.sqrt(mean_squared_error(y_test_true, y_test_pred))
r2 = r2_score(y_test_true, y_test_pred)
spearman = spearmanr(y_test_true, y_test_pred)[0]

print("="*60)
print("Ridge Regression Test Set Evaluation - MINIMAL FEATURES (2023)")
print("="*60)
print(f"Features: {feature_cols}")
print(f"Test MAE: {mae:.2f}")
print(f"Test RMSE: {rmse:.2f}")
print(f"Test R²: {r2:.3f}")
print(f"Test Spearman: {spearman:.3f}")
print("="*60)

# Save predictions for further analysis
test_df['predicted_PM2.5'] = y_test_pred
test_df.to_csv(results_dir / "ridge_minimal_test_predictions.csv", index=False)

print(f"\nPredictions saved to: ridge_minimal_test_predictions.csv")


Features used: ['latitude', 'longitude', 'month', 'year']
Ridge Regression Test Set Evaluation - MINIMAL FEATURES (2023)
Features: ['latitude', 'longitude', 'month', 'year']
Test MAE: 25.36
Test RMSE: 34.52
Test R²: 0.072
Test Spearman: 0.287

Predictions saved to: ridge_minimal_test_predictions.csv
