In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split, TimeSeriesSplit, GridSearchCV
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import pickle
import warnings
warnings.filterwarnings('ignore')

In [None]:

"""
Modeling for rainfall trend analysis in Eastern Nepal.
This script splits the feature-engineered data, trains regression and classification models,
evaluates performance, tunes hyperparameters, performs cross-validation, and saves the best models.
"""

# Define file paths
PREPROCESSED_PATH = '../Data/Preprocessed'
OUTPUT_PATH = '../Outputs'

# Ensure output directory exists
os.makedirs(OUTPUT_PATH, exist_ok=True)

def load_data():
    """Load feature-engineered data."""
    try:
        data = pd.read_csv(os.path.join(PREPROCESSED_PATH, 'feature_engineered_data.csv'))
        data['date'] = pd.to_datetime(data['date'])
        print("Feature-engineered data loaded successfully.")
        return data
    except FileNotFoundError as e:
        raise FileNotFoundError(f"Error: {e}. Check if feature_engineered_data.csv exists in {PREPROCESSED_PATH}")

def split_data(data, target, features):
    """Split data into training and validation sets without shuffling."""
    data = data.sort_values('date')  # Ensure chronological order
    X = data[features].fillna(0)
    y = data[target].fillna(0)
    
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, shuffle=False, random_state=42
    )
    
    print(f"Data split: Training ({len(X_train)} rows), Validation ({len(X_val)} rows)")
    return X_train, X_val, y_train, y_val

def evaluate_regression(y_true, y_pred):
    """Evaluate regression model performance."""
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    return {'MAE': mae, 'RMSE': rmse, 'R2': r2}

def evaluate_classification(y_true, y_pred, y_pred_proba=None):
    """Evaluate classification model performance."""
    metrics = {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred, zero_division=0),
        'Recall': recall_score(y_true, y_pred, zero_division=0),
        'F1': f1_score(y_true, y_pred, zero_division=0)
    }
    if y_pred_proba is not None:
        metrics['ROC-AUC'] = roc_auc_score(y_true, y_pred_proba)
    return metrics

def train_and_evaluate(models, X_train, X_val, y_train, y_val, task='regression'):
    """Train models and evaluate performance."""
    results = {}
    for name, model in models.items():
        print(f"\nTraining {name}...")
        model.fit(X_train, y_train)
        if task == 'regression':
            y_pred = model.predict(X_val)
            metrics = evaluate_regression(y_val, y_pred)
        else:
            y_pred = model.predict(X_val)
            y_pred_proba = model.predict_proba(X_val)[:, 1] if hasattr(model, 'predict_proba') else None
            metrics = evaluate_classification(y_val, y_pred, y_pred_proba)
        
        results[name] = {'model': model, 'metrics': metrics}
        print(f"{name} Metrics:", metrics)
    return results

def hyperparameter_tuning(model, param_grid, X_train, y_train, task='regression'):
    """Perform hyperparameter tuning with GridSearchCV."""
    print(f"\nTuning hyperparameters for {model.__class__.__name__}...")
    scoring = 'r2' if task == 'regression' else 'f1'
    cv = TimeSeriesSplit(n_splits=3)
    grid_search = GridSearchCV(model, param_grid, cv=cv, scoring=scoring, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best {scoring} score: {grid_search.best_score_:.4f}")
    return grid_search.best_estimator_

def cross_validation(model, X, y, task='regression'):
    """Perform time-series cross-validation."""
    print(f"\nCross-validating {model.__class__.__name__}...")
    cv = TimeSeriesSplit(n_splits=5)
    scores = []
    
    for train_idx, val_idx in cv.split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        model.fit(X_train, y_train)
        if task == 'regression':
            y_pred = model.predict(X_val)
            score = r2_score(y_val, y_pred)
        else:
            y_pred = model.predict(X_val)
            score = f1_score(y_val, y_pred, zero_division=0)
        scores.append(score)
    
    print(f"Cross-validation scores: {scores}")
    print(f"Mean CV score: {np.mean(scores):.4f} (±{np.std(scores):.4f})")
    return scores

def save_model(model, name):
    """Save trained model to file."""
    output_file = os.path.join(OUTPUT_PATH, f'{name}_model.pkl')
    with open(output_file, 'wb') as f:
        pickle.dump(model, f)
    print(f"Saved {name} model to {output_file}")

def main():
    """Main function to execute modeling steps."""
    # Step: Load and split data
    data = load_data()
    
    # Features (exclude targets and non-numeric columns)
    features = [
        'ele(meter)', 'lat(deg)', 'lon(deg)', 'year', 'month', 'day_of_year',
        'yearly_rainfall', 'monthly_rainfall', 'prev_day_rainfall', 'rolling_mean_7d',
        'station_name_x_encoded', 'district_encoded',
        'log_rainfall_sum', 'log_yearly_rainfall', 'log_monthly_rainfall',
        'log_prev_day_rainfall', 'log_rolling_mean_7d', 'log_day_of_year',
        'pca_component_1', 'pca_component_2', 'pca_component_3'
    ]
    features = [f for f in features if f in data.columns]  # Filter available features
    
    # Regression task
    print("\n--- Regression Task (Predicting rainfall_sum) ---")
    X_train_reg, X_val_reg, y_train_reg, y_val_reg = split_data(data, 'rainfall_sum', features)
    
    # Choose algorithms (Regression)
    regression_models = {
        'LinearRegression': LinearRegression(),
        'RandomForestRegressor': RandomForestRegressor(random_state=42),
        'GradientBoostingRegressor': GradientBoostingRegressor(random_state=42)
    }
    
    # Train and make predictions
    reg_results = train_and_evaluate(regression_models, X_train_reg, X_val_reg, y_train_reg, y_val_reg, task='regression')
    
    # Hyperparameter tuning (Random Forest as example)
    rf_reg = RandomForestRegressor(random_state=42)
    rf_reg_param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5]
    }
    best_rf_reg = hyperparameter_tuning(rf_reg, rf_reg_param_grid, X_train_reg, y_train_reg, task='regression')
    reg_results['TunedRandomForestRegressor'] = {
        'model': best_rf_reg,
        'metrics': evaluate_regression(y_val_reg, best_rf_reg.predict(X_val_reg))
    }
    print("Tuned RandomForestRegressor Metrics:", reg_results['TunedRandomForestRegressor']['metrics'])
    
    # Cross-validation (Best model)
    cv_scores_reg = cross_validation(best_rf_reg, X_train_reg, y_train_reg, task='regression')
    
    # Classification task
    print("\n--- Classification Task (Predicting extreme_rainfall) ---")
    X_train_clf, X_val_clf, y_train_clf, y_val_clf = split_data(data, 'extreme_rainfall', features)
    
    # Choose algorithms (Classification)
    classification_models = {
        'LogisticRegression': LogisticRegression(random_state=42),
        'RandomForestClassifier': RandomForestClassifier(random_state=42),
        'GradientBoostingClassifier': GradientBoostingClassifier(random_state=42)
    }
    
    # Train and make predictions
    clf_results = train_and_evaluate(classification_models, X_train_clf, X_val_clf, y_train_clf, y_val_clf, task='classification')
    
    # Hyperparameter tuning (Random Forest as example)
    rf_clf = RandomForestClassifier(random_state=42)
    rf_clf_param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5]
    }
    best_rf_clf = hyperparameter_tuning(rf_clf, rf_clf_param_grid, X_train_clf, y_train_clf, task='classification')
    clf_results['TunedRandomForestClassifier'] = {
        'model': best_rf_clf,
        'metrics': evaluate_classification(y_val_clf, best_rf_clf.predict(X_val_clf), best_rf_clf.predict_proba(X_val_clf)[:, 1])
    }
    print("Tuned RandomForestClassifier Metrics:", clf_results['TunedRandomForestClassifier']['metrics'])
    
    # Cross-validation (Best model)
    cv_scores_clf = cross_validation(best_rf_clf, X_train_clf, y_train_clf, task='classification')
    
    # Save evaluation results
    with open(os.path.join(OUTPUT_PATH, 'model_evaluation.txt'), 'w') as f:
        f.write("Regression Results:\n")
        for name, result in reg_results.items():
            f.write(f"{name}: {result['metrics']}\n")
        f.write(f"\nCross-validation (TunedRandomForestRegressor): {cv_scores_reg}\n")
        f.write(f"Mean CV Score: {np.mean(cv_scores_reg):.4f} (±{np.std(cv_scores_reg):.4f})\n")
        f.write("\nClassification Results:\n")
        for name, result in clf_results.items():
            f.write(f"{name}: {result['metrics']}\n")
        f.write(f"\nCross-validation (TunedRandomForestClassifier): {cv_scores_clf}\n")
        f.write(f"Mean CV Score: {np.mean(cv_scores_clf):.4f} (±{np.std(cv_scores_clf):.4f})\n")
    
    # Save the best models
    save_model(best_rf_reg, 'best_random_forest_regressor')
    save_model(best_rf_clf, 'best_random_forest_classifier')
    
    print("Modeling completed successfully.")

if __name__ == "__main__":
    main()

Feature-engineered data loaded successfully.

--- Regression Task (Predicting rainfall_sum) ---
Data split: Training (172403 rows), Validation (43101 rows)

Training LinearRegression...
LinearRegression Metrics: {'MAE': 0.16553317330954534, 'RMSE': 0.3631788987097078, 'R2': 0.8600617647783155}

Training RandomForestRegressor...
RandomForestRegressor Metrics: {'MAE': 5.292177981573996e-05, 'RMSE': 0.0012119240583506156, 'R2': 0.9999984417182236}

Training GradientBoostingRegressor...
GradientBoostingRegressor Metrics: {'MAE': 0.0027113283440877186, 'RMSE': 0.008856669656250529, 'R2': 0.9999167783982105}

Tuning hyperparameters for RandomForestRegressor...
