In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error
import xgboost as xgb
import joblib
import json

### Training

#### With multiple features:
'latitude', 'longitude', 'month', 'AT', 'avg_ndvi', 'population', 'IGP', 'has_wikipedia', 'media_count', 'Tier_Tier2', 'Tier_Tier3'

In [3]:
# Train an XGBoost PM2.5 regressor on the full feature set.
#
# Steps: load the 2022 training CSV, one-hot encode `Tier`, standardise the
# features, grid-search hyperparameters with city-grouped 5-fold CV (so a city
# never appears in both the train and validation part of a fold), refit the
# best setting on all rows, and save model / scaler / feature list / CV log.

# Paths
# NOTE(review): absolute, user-specific location; adjust when running elsewhere.
base_dir = Path("/home/diya.thakor/AirQuality/BASELINE")
data_dir = base_dir / "data"
results_dir = base_dir / "results"
results_dir.mkdir(parents=True, exist_ok=True)

# Load training data
train_df = pd.read_csv(data_dir / "train_2022.csv")

# One-hot encode Tier. drop_first=True removes the first level, so the set of
# Tier_* columns produced here depends on which levels occur in this file.
train_df = pd.get_dummies(train_df, columns=['Tier'], prefix='Tier', drop_first=True)

# Features: fixed numeric columns plus whatever Tier_* dummies were created
main_features = [
    'latitude', 'longitude', 'month', 'AT', 'avg_ndvi',
    'population', 'IGP', 'has_wikipedia', 'media_count'
]
tier_cols = [col for col in train_df.columns if col.startswith('Tier_')]
feature_cols = main_features + tier_cols

X = train_df[feature_cols].values
y = train_df['PM2.5'].values
cities = train_df['city'].values  # grouping key for GroupKFold

# No need for StandardScaler with XGBoost, but keep for consistency.
# The scaler is fitted on all training rows before the folds are formed; the
# same fitted scaler is saved below and reused at test time.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Hyperparameter grid
param_grid = {
    'n_estimators': [100, 300],
    'max_depth': [3, 6],
    'learning_rate': [0.05, 0.1],
    'subsample': [1.0],
    'colsample_bytree': [1.0],
}

# Every combination of the grid, in the same order the nested loops would
# visit them (n_estimators outermost, colsample_bytree innermost).
param_candidates = [
    {
        'n_estimators': n_est,
        'max_depth': max_d,
        'learning_rate': lr,
        'subsample': sub,
        'colsample_bytree': colsub,
        'random_state': 42,
        'n_jobs': -1,
        'tree_method': 'hist',
        'device': 'cuda:0'
    }
    for n_est in param_grid['n_estimators']
    for max_d in param_grid['max_depth']
    for lr in param_grid['learning_rate']
    for sub in param_grid['subsample']
    for colsub in param_grid['colsample_bytree']
]

# GroupKFold setup
k = 5
kf = GroupKFold(n_splits=k)

# The splits do not depend on the hyperparameters, so compute them once and
# reuse the same folds for every candidate.
cv_splits = list(kf.split(X, y, groups=cities))

cv_details = {}

# Evaluate every candidate on every fold and log the details
for params in param_candidates:
    fold_maes = []
    fold_details = []

    for fold, (train_idx, val_idx) in enumerate(cv_splits):
        model = xgb.XGBRegressor(**params)
        model.fit(X[train_idx], y[train_idx])
        # Predicting on a NumPy array with a CUDA booster is what triggers the
        # XGBoost device-mismatch warning seen in the cell output.
        y_pred = model.predict(X[val_idx])

        # Plain Python float so the JSON dump never depends on NumPy scalar types
        mae = float(mean_absolute_error(y[val_idx], y_pred))

        fold_maes.append(mae)
        fold_details.append({
            'fold': fold,
            'params': params,
            # np.unique already returns sorted values; tolist() yields native types
            'train_cities': np.unique(cities[train_idx]).tolist(),
            'val_cities': np.unique(cities[val_idx]).tolist(),
            'mae': mae,
            'n_train': len(train_idx),
            'n_val': len(val_idx)
        })

    cv_details[str(params)] = {
        'params': params,
        'fold_maes': fold_maes,
        'mean_mae': float(np.mean(fold_maes)),
        'std_mae': float(np.std(fold_maes)),
        'folds': fold_details
    }

# Find best params (lowest mean MAE); ties resolve to the first candidate evaluated
best_params_key = min(cv_details, key=lambda key: cv_details[key]['mean_mae'])
best_params = cv_details[best_params_key]['params']
best_mean_mae = cv_details[best_params_key]['mean_mae']
best_std_mae = cv_details[best_params_key]['std_mae']

# Train final model on all data
final_model = xgb.XGBRegressor(**best_params)
final_model.fit(X, y)

# Save all artifacts
joblib.dump(final_model, results_dir / "xgboost_final_model.joblib")
joblib.dump(scaler, results_dir / "xgboost_scaler.joblib")
with open(results_dir / "xgboost_feature_cols.json", "w") as f:
    json.dump(feature_cols, f)
with open(results_dir / "xgboost_cv_details.json", "w") as f:
    json.dump(cv_details, f, indent=2)

print("="*60)
print("XGBoost GroupKFold Cross-Validation Summary (GPU: cuda:0)")
print("="*60)
print(f"Best params: {best_params}")
print(f"Mean MAE (CV): {best_mean_mae:.3f} ± {best_std_mae:.3f}")
print("All details saved to xgboost_cv_details.json")
print("="*60)

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


XGBoost GroupKFold Cross-Validation Summary (GPU: cuda:0)
Best params: {'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.1, 'subsample': 1.0, 'colsample_bytree': 1.0, 'random_state': 42, 'n_jobs': -1, 'tree_method': 'hist', 'device': 'cuda:0'}
Mean MAE (CV): 15.943 ± 1.336
All details saved to xgboost_cv_details.json


#### With latitude, longitude, month and year only

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error
import joblib
import json

# Train the "minimal" XGBoost PM2.5 regressor (latitude, longitude, month, year):
# city-grouped 5-fold CV over a small grid, refit of the best setting on all
# rows, and export of model / scaler / feature list / CV log.

# Paths
base_dir = Path("/home/diya.thakor/AirQuality/BASELINE")
data_dir = base_dir / "data"
results_dir = base_dir / "results"
results_dir.mkdir(parents=True, exist_ok=True)

# Load training data
train_df = pd.read_csv(data_dir / "train_2022.csv")

# Features - ONLY latitude, longitude, month, year
# NOTE(review): the file name suggests a single year of data; if so, 'year' is
# constant during training - confirm this feature is intended.
feature_cols = ['latitude', 'longitude', 'month', 'year']

print(f"\nFeatures used: {feature_cols}")
print(f"Total features: {len(feature_cols)}")

y = train_df['PM2.5'].values
cities = train_df['city'].values  # grouping key for GroupKFold

# StandardScaler (keep for consistency, though XGBoost doesn't strictly need it)
scaler = StandardScaler()
X = scaler.fit_transform(train_df[feature_cols].values)

# Hyperparameter grid
param_grid = {
    'n_estimators': [100, 300],
    'max_depth': [3, 6],
    'learning_rate': [0.05, 0.1],
    'subsample': [1.0],
    'colsample_bytree': [1.0],
}

# Expand the grid into one flat list of settings; the clause order keeps
# n_estimators as the slowest-varying and colsample_bytree as the fastest.
candidate_params = [
    {
        'n_estimators': n_est,
        'max_depth': max_d,
        'learning_rate': lr,
        'subsample': sub,
        'colsample_bytree': colsub,
        'random_state': 42,
        'n_jobs': -1,
        'tree_method': 'hist',
        'device': 'cuda:0'
    }
    for n_est in param_grid['n_estimators']
    for max_d in param_grid['max_depth']
    for lr in param_grid['learning_rate']
    for sub in param_grid['subsample']
    for colsub in param_grid['colsample_bytree']
]

# GroupKFold setup
k = 5
kf = GroupKFold(n_splits=k)

cv_details = {}

print("\nRunning GroupKFold Cross-Validation with GPU acceleration...")
print("="*60)

# Score every setting on every fold and keep a per-fold log
for params in candidate_params:
    fold_maes = []
    fold_details = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y, groups=cities)):
        model = xgb.XGBRegressor(**params)
        model.fit(X[train_idx], y[train_idx])
        y_pred = model.predict(X[val_idx])
        mae = mean_absolute_error(y[val_idx], y_pred)

        fold_maes.append(mae)
        fold_record = {
            'fold': fold,
            'params': params,
            'train_cities': sorted(np.unique(cities[train_idx])),
            'val_cities': sorted(np.unique(cities[val_idx])),
            'mae': float(mae),
            'n_train': len(train_idx),
            'n_val': len(val_idx)
        }
        fold_details.append(fold_record)

    # Keyed by the string form of the settings dict
    cv_details[str(params)] = {
        'params': params,
        'fold_maes': fold_maes,
        'mean_mae': float(np.mean(fold_maes)),
        'std_mae': float(np.std(fold_maes)),
        'folds': fold_details
    }

# Pick the setting with the lowest mean MAE across folds
best_params_key = min(cv_details, key=lambda key: cv_details[key]['mean_mae'])
best_entry = cv_details[best_params_key]
best_params = best_entry['params']
best_mean_mae = best_entry['mean_mae']
best_std_mae = best_entry['std_mae']

# Refit on all training rows with the winning setting
final_model = xgb.XGBRegressor(**best_params)
final_model.fit(X, y)

# Save all artifacts with '_minimal' suffix
joblib.dump(final_model, results_dir / "xgboost_minimal_model.joblib")
joblib.dump(scaler, results_dir / "xgboost_minimal_scaler.joblib")
with open(results_dir / "xgboost_minimal_feature_cols.json", "w") as f:
    json.dump(feature_cols, f)
with open(results_dir / "xgboost_minimal_cv_details.json", "w") as f:
    json.dump(cv_details, f, indent=2)

print("="*60)
print("XGBoost GroupKFold CV Summary - MINIMAL FEATURES (GPU: cuda:0)")
print("="*60)
print(f"Features used: {feature_cols}")
print(f"Best params: {best_params}")
print(f"Mean MAE (CV): {best_mean_mae:.3f} ± {best_std_mae:.3f}")
print("All details saved with 'minimal' suffix")
print("="*60)


Features used: ['latitude', 'longitude', 'month', 'year']
Total features: 4

Running GroupKFold Cross-Validation with GPU acceleration...


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


XGBoost GroupKFold CV Summary - MINIMAL FEATURES (GPU: cuda:0)
Features used: ['latitude', 'longitude', 'month', 'year']
Best params: {'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.1, 'subsample': 1.0, 'colsample_bytree': 1.0, 'random_state': 42, 'n_jobs': -1, 'tree_method': 'hist', 'device': 'cuda:0'}
Mean MAE (CV): 15.880 ± 1.797
All details saved with 'minimal' suffix


### Testing

#### With multiple features:
'latitude', 'longitude', 'month', 'AT', 'avg_ndvi', 'population', 'IGP', 'has_wikipedia', 'media_count', 'Tier_Tier2', 'Tier_Tier3'

In [4]:
import json
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Evaluate the saved full-feature XGBoost model on the 2023 test CSV:
# rebuild the training feature matrix layout, apply the saved scaler, predict,
# report MAE / RMSE / R² / Spearman, and write the predictions next to the inputs.

# Paths
base_dir = Path("/home/diya.thakor/AirQuality/BASELINE")
data_dir = base_dir / "data"
results_dir = base_dir / "results"

# Load test data
test_df = pd.read_csv(data_dir / "test_2023.csv")

# Load saved scaler
scaler = joblib.load(results_dir / "xgboost_scaler.joblib")

# Load feature columns (defines the exact column set and order used in training)
with open(results_dir / "xgboost_feature_cols.json", "r") as f:
    feature_cols = json.load(f)

# One-hot encode Tier so that it matches training.
# Encoding with drop_first=True here would drop whichever level happens to sort
# first in the *test* file, which differs from training when a level is absent.
# Instead, encode every level and then align to the saved training schema.
cols_before_encoding = set(test_df.columns)
test_df = pd.get_dummies(test_df, columns=['Tier'], prefix='Tier')
tier_dummy_cols = [col for col in test_df.columns if col not in cols_before_encoding]

# Training Tier_* columns that the test data has no rows for: all-zero indicator
for col in feature_cols:
    if col.startswith('Tier_') and col not in test_df.columns:
        test_df[col] = 0

# Dummies the model was not trained on (the level dropped in training, or a
# level unseen in training) are removed; such rows end up with all Tier_*
# indicators at zero.
unused_tier_cols = [col for col in tier_dummy_cols if col not in feature_cols]
test_df = test_df.drop(columns=unused_tier_cols)

# Select features (same columns, same order as training)
X_test = test_df[feature_cols].values

# Scale features using loaded scaler (for consistency, even if not strictly needed for XGBoost)
X_test = scaler.transform(X_test)

# Load trained XGBoost model
model = joblib.load(results_dir / "xgboost_final_model.joblib")

# Predict on test set
y_test_pred = model.predict(X_test)
y_test_true = test_df['PM2.5'].values

# Evaluate
mae = mean_absolute_error(y_test_true, y_test_pred)
rmse = np.sqrt(mean_squared_error(y_test_true, y_test_pred))
r2 = r2_score(y_test_true, y_test_pred)
spearman = spearmanr(y_test_true, y_test_pred)[0]  # first element is the correlation

print("="*60)
print("XGBoost Test Set Evaluation (2023)")
print("="*60)
print(f"Test MAE: {mae:.2f}")
print(f"Test RMSE: {rmse:.2f}")
print(f"Test R²: {r2:.3f}")
print(f"Test Spearman: {spearman:.3f}")
print("="*60)

# Save predictions for further analysis
test_df['predicted_PM2.5'] = y_test_pred
test_df.to_csv(results_dir / "xgboost_test_predictions.csv", index=False)

XGBoost Test Set Evaluation (2023)
Test MAE: 16.50
Test RMSE: 24.75
Test R²: 0.523
Test Spearman: 0.745


#### With latitude, longitude, month and year only

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
import joblib
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import spearmanr
import json

# Evaluate the saved minimal-feature XGBoost model on the 2023 test CSV and
# write the predictions to disk.

# Paths
base_dir = Path("/home/diya.thakor/AirQuality/BASELINE")
data_dir = base_dir / "data"
results_dir = base_dir / "results"

# Load test data
test_df = pd.read_csv(data_dir / "test_2023.csv")

# Feature list saved by the training cell (should be: latitude, longitude, month, year)
with open(results_dir / "xgboost_minimal_feature_cols.json", "r") as f:
    feature_cols = json.load(f)

print(f"Features used: {feature_cols}")

# Raw feature matrix, columns in training order (no one-hot encoding involved)
X_test_raw = test_df[feature_cols].values

# Load saved scaler and model
scaler = joblib.load(results_dir / "xgboost_minimal_scaler.joblib")
model = joblib.load(results_dir / "xgboost_minimal_model.joblib")

# Apply the training-time scaling, then predict
X_test = scaler.transform(X_test_raw)
y_test_pred = model.predict(X_test)
y_test_true = test_df['PM2.5'].values

# Evaluate
mae = mean_absolute_error(y_test_true, y_test_pred)
mse = mean_squared_error(y_test_true, y_test_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_true, y_test_pred)
spearman = spearmanr(y_test_true, y_test_pred)[0]  # first element is the correlation

# Report
print("="*60)
print("XGBoost Test Set Evaluation - MINIMAL FEATURES (2023)")
print("="*60)
print(f"Features: {feature_cols}")
print(f"Test MAE: {mae:.2f}")
print(f"Test RMSE: {rmse:.2f}")
print(f"Test R²: {r2:.3f}")
print(f"Test Spearman: {spearman:.3f}")
print("="*60)

# Save predictions for further analysis
test_df['predicted_PM2.5'] = y_test_pred
test_df.to_csv(results_dir / "xgboost_minimal_test_predictions.csv", index=False)

print(f"\nPredictions saved to: xgboost_minimal_test_predictions.csv")

Features used: ['latitude', 'longitude', 'month', 'year']
XGBoost Test Set Evaluation - MINIMAL FEATURES (2023)
Features: ['latitude', 'longitude', 'month', 'year']
Test MAE: 16.72
Test RMSE: 24.40
Test R²: 0.536
Test Spearman: 0.752

Predictions saved to: xgboost_minimal_test_predictions.csv
