In [3]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error
import joblib
import json

### Training

#### With multiple features:
'latitude', 'longitude', 'month', 'AT', 'avg_ndvi', 'population', 'IGP', 'has_wikipedia', 'media_count', 'Tier_Tier2', 'Tier_Tier3'

In [2]:
# Train a Random Forest PM2.5 regressor on the full feature set.
# Hyperparameters are chosen by city-grouped K-fold cross-validation (lowest
# mean MAE); the winning configuration is then refit on all training rows and
# the model, feature list and CV log are written to `results_dir`.
from itertools import product

# Paths
# NOTE(review): absolute, user-specific location — adjust when running elsewhere.
base_dir = Path("/home/diya.thakor/AirQuality/BASELINE")
data_dir = base_dir / "data"
results_dir = base_dir / "results"
results_dir.mkdir(parents=True, exist_ok=True)

# Load training data
train_df = pd.read_csv(data_dir / "train_2022.csv")

# One-hot encode Tier; drop_first removes the first category so it becomes
# the implicit reference level (all Tier_* columns equal to 0).
train_df = pd.get_dummies(train_df, columns=['Tier'], prefix='Tier', drop_first=True)

# Features (include your one-hot Tier columns)
main_features = [
    'latitude', 'longitude', 'month', 'AT', 'avg_ndvi',
    'population', 'IGP', 'has_wikipedia', 'media_count'
]
tier_cols = [col for col in train_df.columns if col.startswith('Tier_')]
feature_cols = main_features + tier_cols

X = train_df[feature_cols].values
y = train_df['PM2.5'].values
cities = train_df['city'].values

# Hyperparameter grid
param_grid = {
    'n_estimators': [100, 300],
    'max_depth': [6, 12],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# GroupKFold setup: rows are grouped by city, so a city never appears in both
# the training and the validation part of the same fold.
k = 5
kf = GroupKFold(n_splits=k)

# The splits do not depend on the hyperparameters, so compute them (and the
# per-fold city lists) once instead of once per parameter combination.
# .tolist() turns NumPy scalars into built-in Python values, which keeps the
# CV log JSON-serialisable even if city labels are numeric.
folds = []
for fold, (train_idx, val_idx) in enumerate(kf.split(X, y, groups=cities)):
    folds.append({
        'fold': fold,
        'train_idx': train_idx,
        'val_idx': val_idx,
        'train_cities': sorted(np.unique(cities[train_idx]).tolist()),
        'val_cities': sorted(np.unique(cities[val_idx]).tolist()),
    })

cv_details = {}

# Try all parameter settings and log details.
# product() walks the grid in the same order as the equivalent nested loops.
for n_est, max_d, min_split, min_leaf in product(
    param_grid['n_estimators'],
    param_grid['max_depth'],
    param_grid['min_samples_split'],
    param_grid['min_samples_leaf'],
):
    params = {
        'n_estimators': n_est,
        'max_depth': max_d,
        'min_samples_split': min_split,
        'min_samples_leaf': min_leaf,
        'random_state': 42,
        'n_jobs': -1
    }
    fold_maes = []
    fold_details = []

    for split in folds:
        train_idx, val_idx = split['train_idx'], split['val_idx']
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        model = RandomForestRegressor(**params)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        # Store a plain float rather than a NumPy scalar for JSON output.
        mae = float(mean_absolute_error(y_val, y_pred))

        fold_maes.append(mae)
        fold_details.append({
            'fold': split['fold'],
            'params': params,
            'train_cities': list(split['train_cities']),
            'val_cities': list(split['val_cities']),
            'mae': mae,
            'n_train': len(train_idx),
            'n_val': len(val_idx)
        })

    cv_details[str(params)] = {
        'params': params,
        'fold_maes': fold_maes,
        'mean_mae': float(np.mean(fold_maes)),
        'std_mae': float(np.std(fold_maes)),
        'folds': fold_details
    }

# Find best params (lowest mean MAE); ties resolve to the first combination tried.
best_params_key = min(cv_details, key=lambda key: cv_details[key]['mean_mae'])
best_params = cv_details[best_params_key]['params']
best_mean_mae = cv_details[best_params_key]['mean_mae']
best_std_mae = cv_details[best_params_key]['std_mae']

# Train final model on all data
final_model = RandomForestRegressor(**best_params)
final_model.fit(X, y)

# Save all artifacts (the feature list is needed at test time to rebuild X
# with the same columns in the same order).
joblib.dump(final_model, results_dir / "randomforest_final_model.joblib")
with open(results_dir / "randomforest_feature_cols.json", "w") as f:
    json.dump(feature_cols, f)
with open(results_dir / "randomforest_cv_details.json", "w") as f:
    json.dump(cv_details, f, indent=2)

print("="*60)
print("Random Forest GroupKFold Cross-Validation Summary (No Scaler)")
print("="*60)
print(f"Best params: {best_params}")
print(f"Mean MAE (CV): {best_mean_mae:.3f} ± {best_std_mae:.3f}")
print("All details saved to randomforest_cv_details.json")
print("="*60)

Random Forest GroupKFold Cross-Validation Summary (No Scaler)
Best params: {'n_estimators': 300, 'max_depth': 12, 'min_samples_split': 5, 'min_samples_leaf': 2, 'random_state': 42, 'n_jobs': -1}
Mean MAE (CV): 16.027 ± 1.206
All details saved to randomforest_cv_details.json


#### With only latitude, longitude, month, and year

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error
import joblib
import json

# Train a Random Forest PM2.5 regressor on location/time features only.
# Hyperparameters are chosen by city-grouped K-fold cross-validation (lowest
# mean MAE); the winning configuration is then refit on all training rows and
# the model, feature list and CV log are written with a "minimal" suffix.
from itertools import product

# Paths
# NOTE(review): absolute, user-specific location — adjust when running elsewhere.
base_dir = Path("/home/diya.thakor/AirQuality/BASELINE")
data_dir = base_dir / "data"
results_dir = base_dir / "results"
results_dir.mkdir(parents=True, exist_ok=True)

# Load training data
train_df = pd.read_csv(data_dir / "train_2022.csv")

# Features - ONLY latitude, longitude, month, year
# NOTE(review): the file name suggests a single training year; if 'year' is
# constant here the forest cannot split on it — confirm this is intended.
feature_cols = ['latitude', 'longitude', 'month', 'year']

print(f"\nFeatures used: {feature_cols}")
print(f"Total features: {len(feature_cols)}")

X = train_df[feature_cols].values
y = train_df['PM2.5'].values
cities = train_df['city'].values

# Hyperparameter grid
param_grid = {
    'n_estimators': [100, 300],
    'max_depth': [6, 12],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# GroupKFold setup: rows are grouped by city, so a city never appears in both
# the training and the validation part of the same fold.
k = 5
kf = GroupKFold(n_splits=k)

# The splits do not depend on the hyperparameters, so compute them (and the
# per-fold city lists) once instead of once per parameter combination.
# .tolist() turns NumPy scalars into built-in Python values, which keeps the
# CV log JSON-serialisable even if city labels are numeric.
folds = []
for fold, (train_idx, val_idx) in enumerate(kf.split(X, y, groups=cities)):
    folds.append({
        'fold': fold,
        'train_idx': train_idx,
        'val_idx': val_idx,
        'train_cities': sorted(np.unique(cities[train_idx]).tolist()),
        'val_cities': sorted(np.unique(cities[val_idx]).tolist()),
    })

cv_details = {}

print("\nRunning GroupKFold Cross-Validation...")
print("="*60)

# Try all parameter settings and log details.
# product() walks the grid in the same order as the equivalent nested loops.
for n_est, max_d, min_split, min_leaf in product(
    param_grid['n_estimators'],
    param_grid['max_depth'],
    param_grid['min_samples_split'],
    param_grid['min_samples_leaf'],
):
    params = {
        'n_estimators': n_est,
        'max_depth': max_d,
        'min_samples_split': min_split,
        'min_samples_leaf': min_leaf,
        'random_state': 42,
        'n_jobs': -1
    }
    fold_maes = []
    fold_details = []

    for split in folds:
        train_idx, val_idx = split['train_idx'], split['val_idx']
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        model = RandomForestRegressor(**params)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        # Store a plain float rather than a NumPy scalar for JSON output.
        mae = float(mean_absolute_error(y_val, y_pred))

        fold_maes.append(mae)
        fold_details.append({
            'fold': split['fold'],
            'params': params,
            'train_cities': list(split['train_cities']),
            'val_cities': list(split['val_cities']),
            'mae': mae,
            'n_train': len(train_idx),
            'n_val': len(val_idx)
        })

    cv_details[str(params)] = {
        'params': params,
        'fold_maes': fold_maes,
        'mean_mae': float(np.mean(fold_maes)),
        'std_mae': float(np.std(fold_maes)),
        'folds': fold_details
    }

# Find best params (lowest mean MAE); ties resolve to the first combination tried.
best_params_key = min(cv_details, key=lambda key: cv_details[key]['mean_mae'])
best_params = cv_details[best_params_key]['params']
best_mean_mae = cv_details[best_params_key]['mean_mae']
best_std_mae = cv_details[best_params_key]['std_mae']

# Train final model on all data
final_model = RandomForestRegressor(**best_params)
final_model.fit(X, y)

# Save all artifacts with suffix to distinguish from full feature model
joblib.dump(final_model, results_dir / "randomforest_minimal_model.joblib")
with open(results_dir / "randomforest_minimal_feature_cols.json", "w") as f:
    json.dump(feature_cols, f)
with open(results_dir / "randomforest_minimal_cv_details.json", "w") as f:
    json.dump(cv_details, f, indent=2)

print("="*60)
print("Random Forest GroupKFold CV Summary (Minimal Features)")
print("="*60)
print(f"Features used: {feature_cols}")
print(f"Best params: {best_params}")
print(f"Mean MAE (CV): {best_mean_mae:.3f} ± {best_std_mae:.3f}")
print("All details saved with 'minimal' suffix")
print("="*60)


Features used: ['latitude', 'longitude', 'month', 'year']
Total features: 4

Running GroupKFold Cross-Validation...
Random Forest GroupKFold CV Summary (Minimal Features)
Features used: ['latitude', 'longitude', 'month', 'year']
Best params: {'n_estimators': 300, 'max_depth': 12, 'min_samples_split': 5, 'min_samples_leaf': 2, 'random_state': 42, 'n_jobs': -1}
Mean MAE (CV): 15.268 ± 1.308
All details saved with 'minimal' suffix


### Testing

#### With multiple features:
'latitude', 'longitude', 'month', 'AT', 'avg_ndvi', 'population', 'IGP', 'has_wikipedia', 'media_count', 'Tier_Tier2', 'Tier_Tier3'

In [3]:
import pandas as pd
import numpy as np
from pathlib import Path
import joblib
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import spearmanr
import json

# Evaluate the saved full-feature Random Forest on the test file and store
# the per-row predictions next to the other result artifacts.

# Paths
# NOTE(review): absolute, user-specific location — adjust when running elsewhere.
base_dir = Path("/home/diya.thakor/AirQuality/BASELINE")
data_dir = base_dir / "data"
results_dir = base_dir / "results"

# Load test data
test_df = pd.read_csv(data_dir / "test_2023.csv")

# Load feature columns (order must match training)
with open(results_dir / "randomforest_feature_cols.json", "r") as f:
    feature_cols = json.load(f)

# One-hot encode Tier in test set.
# Do NOT use drop_first here: it drops whichever category sorts first in
# *this* file, which is only the training reference level if the test set
# contains exactly the same Tier categories. Encode every category instead
# and align the result to the columns saved at training time.
raw_cols = set(test_df.columns)
test_df = pd.get_dummies(test_df, columns=['Tier'], prefix='Tier')
dummy_cols = [col for col in test_df.columns if col not in raw_cols]

# Dummies the model was not trained on (the reference level, or a Tier value
# unseen in training) are dropped; such rows end up with all Tier_* == 0.
test_df = test_df.drop(columns=[col for col in dummy_cols if col not in feature_cols])

# Tier dummies that training produced but this file lacks are all-zero here.
# Any other missing feature is a genuine data problem and must not be hidden.
missing_cols = [col for col in feature_cols if col not in test_df.columns]
missing_non_tier = [col for col in missing_cols if not col.startswith('Tier_')]
if missing_non_tier:
    raise KeyError(f"Test data is missing required feature columns: {missing_non_tier}")
for col in missing_cols:
    test_df[col] = 0

# Select features
X_test = test_df[feature_cols].values

# Load trained Random Forest model
rf_model = joblib.load(results_dir / "randomforest_final_model.joblib")

# Predict on test set
y_test_pred = rf_model.predict(X_test)
y_test_true = test_df['PM2.5'].values

# Evaluate
mae = mean_absolute_error(y_test_true, y_test_pred)
rmse = np.sqrt(mean_squared_error(y_test_true, y_test_pred))
r2 = r2_score(y_test_true, y_test_pred)
# spearmanr returns (correlation, p-value); only the correlation is reported.
spearman = spearmanr(y_test_true, y_test_pred)[0]

print("="*60)
print("Random Forest Test Set Evaluation (2023, No Scaler)")
print("="*60)
print(f"Test MAE: {mae:.2f}")
print(f"Test RMSE: {rmse:.2f}")
print(f"Test R²: {r2:.3f}")
print(f"Test Spearman: {spearman:.3f}")
print("="*60)

# Save predictions for further analysis
test_df['predicted_PM2.5'] = y_test_pred
test_df.to_csv(results_dir / "randomforest_test_predictions.csv", index=False)

Random Forest Test Set Evaluation (2023, No Scaler)
Test MAE: 16.55
Test RMSE: 25.44
Test R²: 0.496
Test Spearman: 0.740


#### With only latitude, longitude, month, and year

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
import joblib
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import spearmanr
import json

# Score the saved minimal-feature Random Forest on the test file, print the
# regression metrics and write the per-row predictions to the results folder.

# Input / output locations
base_dir = Path("/home/diya.thakor/AirQuality/BASELINE")
data_dir = base_dir / "data"
results_dir = base_dir / "results"

# Held-out test rows
test_df = pd.read_csv(data_dir / "test_2023.csv")

# Column list written at training time (should be: latitude, longitude, month, year)
feature_cols_path = results_dir / "randomforest_minimal_feature_cols.json"
feature_cols = json.loads(feature_cols_path.read_text())

print(f"Features used: {feature_cols}")

# Build the design matrix directly; the minimal features need no one-hot encoding
X_test = test_df[feature_cols].values

# Restore the fitted minimal model
rf_model = joblib.load(results_dir / "randomforest_minimal_model.joblib")

# Predictions and ground truth
y_test_pred = rf_model.predict(X_test)
y_test_true = test_df['PM2.5'].values

# Metrics
mae = mean_absolute_error(y_test_true, y_test_pred)
rmse = np.sqrt(mean_squared_error(y_test_true, y_test_pred))
r2 = r2_score(y_test_true, y_test_pred)
# First element of the spearmanr result is the rank correlation
spearman = spearmanr(y_test_true, y_test_pred)[0]

# Assemble the report and emit it in one go
rule = "=" * 60
report_lines = [
    rule,
    "Random Forest Test Set Evaluation - MINIMAL FEATURES",
    rule,
    f"Features: {feature_cols}",
    f"Test MAE: {mae:.2f}",
    f"Test RMSE: {rmse:.2f}",
    f"Test R²: {r2:.3f}",
    f"Test Spearman: {spearman:.3f}",
    rule,
]
print("\n".join(report_lines))

# Persist predictions alongside the original test columns for later comparison
predictions_path = results_dir / "randomforest_minimal_test_predictions.csv"
test_df['predicted_PM2.5'] = y_test_pred
test_df.to_csv(predictions_path, index=False)

print(f"\nPredictions saved to: {predictions_path.name}")

Features used: ['latitude', 'longitude', 'month', 'year']
Random Forest Test Set Evaluation - MINIMAL FEATURES
Features: ['latitude', 'longitude', 'month', 'year']
Test MAE: 16.26
Test RMSE: 24.45
Test R²: 0.534
Test Spearman: 0.757

Predictions saved to: randomforest_minimal_test_predictions.csv
