# Setup and Data Loading

In [57]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, accuracy_score
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from kaggle.api.kaggle_api_extended import KaggleApi
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
import time


# Initialize Kaggle API
api = KaggleApi()
api.authenticate()

# Data Preprocessing

In [41]:
dataset.day.max()

365

In [42]:
# Load dataset
dataset = pd.read_csv('train.csv')

# Display first few rows
dataset.head()

Unnamed: 0,id,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall
0,0,1,1017.4,21.2,20.6,19.9,19.4,87.0,88.0,1.1,60.0,17.2,1
1,1,2,1019.5,16.2,16.9,15.8,15.4,95.0,91.0,0.0,50.0,21.9,1
2,2,3,1024.1,19.4,16.1,14.6,9.3,75.0,47.0,8.3,70.0,18.1,1
3,3,4,1013.4,18.1,17.8,16.9,16.8,95.0,95.0,0.0,60.0,35.6,1
4,4,5,1021.8,21.3,18.4,15.2,9.6,52.0,45.0,3.6,40.0,24.8,0


# Feature Engineering

In [43]:
# Feature Engineering for rainfall prediction

def create_basic_features(df):
    """Create basic engineered features from the dataset"""
    features_df = df.copy()
    
    # 1. Temperature range (difference between max and min temperature)
    features_df['temp_range'] = features_df['maxtemp'] - features_df['mintemp']
    
    # 2. Dew point depression (difference between temperature and dew point)
    features_df['dewpoint_depression'] = features_df['temparature'] - features_df['dewpoint']
    
    # 3. Heat index - a measure of how hot it feels when humidity is factored in
    # Simple formula: HI = 0.5 * (T + 61.0 + ((T-68.0)*1.2) + (RH*0.094))
    features_df['heat_index'] = 0.5 * (features_df['temparature'] + 61.0 + 
                                     ((features_df['temparature']-68.0)*1.2) + 
                                     (features_df['humidity']*0.094))
    
    # 4. Seasonal feature using sine and cosine transformation of day of year
    features_df['day_sin'] = np.sin(2 * np.pi * features_df['day']/365)
    features_df['day_cos'] = np.cos(2 * np.pi * features_df['day']/365)
    
    # 5. Pressure trend (difference from daily average)
    # Calculate the deviation from mean
    mean_pressure = features_df['pressure'].mean()
    features_df['pressure_deviation'] = features_df['pressure'] - mean_pressure
    
    return features_df

def add_previous_day_features(df):
    """Add features from the previous day"""
    features_df = df.copy()
    
    # Sort by id to ensure days are in order
    features_df = features_df.sort_values('day')
    
    # Weather columns to use for previous day features
    weather_columns = [
        'pressure', 'maxtemp', 'temparature', 'mintemp', 'dewpoint', 
        'humidity', 'cloud', 'sunshine', 'winddirection', 'windspeed',
    ]
    
    # Create columns for previous day features
    for col in weather_columns:
        features_df[f'{col}_day_before'] = features_df[col].shift(1)
    
    # For the first day, we don't have previous data, so fill with the same day's values
    # This is a simplification; in a real scenario, you might want to handle this differently
    for col in weather_columns:
        features_df[f'{col}_day_before'].fillna(features_df[col], inplace=True)
    
    # Calculate day-over-day changes
    for col in weather_columns:
        features_df[f'{col}_change'] = features_df[col] - features_df[f'{col}_day_before']
    
    return features_df

def engineer_features(df):
    """Complete feature engineering process"""
    # First create basic features
    df_with_features = create_basic_features(df)
    
    # Then add previous day features
    df_with_features = add_previous_day_features(df_with_features)
    
    return df_with_features

# Apply feature engineering to the dataset
dataset = engineer_features(dataset)

# Display the new features
basic_features = ['temp_range', 'dewpoint_depression', 'heat_index', 'day_sin', 'day_cos', 'pressure_deviation']
previous_day_cols = [col for col in dataset.columns if '_day_before' in col or '_change' in col]
new_features = basic_features + previous_day_cols[:10]  # Show first 10 previous day features to keep output manageable

dataset[new_features].describe()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  features_df[f'{col}_day_before'].fillna(features_df[col], inplace=True)


Unnamed: 0,temp_range,dewpoint_depression,heat_index,day_sin,day_cos,pressure_deviation,pressure_day_before,maxtemp_day_before,temparature_day_before,mintemp_day_before,dewpoint_day_before,humidity_day_before,cloud_day_before,sunshine_day_before,winddirection_day_before,windspeed_day_before
count,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0
mean,4.195708,3.498493,19.904082,0.014148,-0.002722,-6.29171e-14,1013.60379,26.365799,23.953744,22.170959,20.455205,82.035616,75.721918,3.744475,104.858584,21.790639
std,1.525268,1.91601,5.747178,0.708346,0.706042,5.655366,5.655946,5.65433,5.221871,5.058568,5.288194,7.799955,18.026498,3.626293,80.004692,9.883292
min,-0.1,-2.2,0.707,-0.999991,-0.999963,-14.60215,999.0,10.4,7.4,4.0,-0.3,39.0,2.0,0.0,10.0,4.4
25%,3.1,2.2,15.06925,-0.693281,-0.708627,-5.002146,1008.6,21.3,19.325,17.7,16.8,77.0,69.0,0.4,40.0,14.125
50%,4.2,3.3,21.674,0.038722,-0.01291,-0.6021461,1013.0,27.8,25.5,23.85,22.15,82.0,83.0,2.4,70.0,20.5
75%,5.2,4.6,24.7795,0.723644,0.702527,4.172854,1017.775,31.2,28.4,26.4,25.0,88.0,88.0,6.8,200.0,27.9
max,10.8,15.5,28.0,0.999991,1.0,20.99785,1034.6,36.0,31.5,29.8,26.7,98.0,100.0,12.1,300.0,59.5


# Model Training

In [44]:
# Split data into train and test sets
X = dataset.drop('rainfall', axis=1)
y = dataset['rainfall']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## Single Models

In [None]:

# Dictionary to store the best models
best_models = {}

# Define parameter grids for each model (keeping them small for limited resources)
param_grids = {
    'XGBoost': {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2]
    },
    'LightGBM': {
        'n_estimators': [50, 100, 200],
        'num_leaves': [31, 50, 70],
        'learning_rate': [0.01, 0.1]
    },
    'RandomForest': {
        'n_estimators': [100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5]
    },
    'CatBoost': {
        'iterations': [50, 100],
        'depth': [4, 6, 8],
        'learning_rate': [0.01, 0.1]
    }
}

# Models to tune
models_to_tune = {
    'XGBoost': xgb.XGBClassifier(random_state=42),
    'LightGBM': lgb.LGBMClassifier(random_state=42),
    'RandomForest': RandomForestClassifier(random_state=42),
    'CatBoost': cb.CatBoostClassifier(random_state=42, verbose=0)
}

# Hyperparameter tuning with RandomizedSearchCV (more efficient than GridSearchCV)
print("Performing lightweight hyperparameter tuning:")
for name, model in models_to_tune.items():
    print(f"\nTuning {name}...")
    start_time = time.time()
    
    # Use RandomizedSearchCV instead of GridSearchCV to save computational resources
    # n_iter=5 means only 5 random combinations will be tried
    search = RandomizedSearchCV(
        model, 
        param_distributions=param_grids[name],
        n_iter=5,  # Try only 5 parameter combinations
        cv=3,      # 3-fold CV instead of 5-fold to save time
        scoring='roc_auc',
        n_jobs=-1, # Use all available cores
        random_state=42
    )
    
    search.fit(X_train, y_train)
    
    # Save the best model
    best_models[name] = search.best_estimator_
    
    # Evaluate on test set
    y_pred = search.best_estimator_.predict(X_test)
    y_pred_proba = search.best_estimator_.predict_proba(X_test)[:, 1]
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    
    # Print results
    elapsed_time = time.time() - start_time
    print(f"Best parameters: {search.best_params_}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"ROC AUC: {roc_auc:.4f}")
    print(f"Tuning completed in {elapsed_time:.2f} seconds")

# Compare tuned models
print("\nComparison of tuned models:")
for name, model in best_models.items():
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    print(f"{name} - Accuracy: {accuracy:.4f}, ROC AUC: {roc_auc:.4f}")

# Identify the best overall model
best_model_name = max(best_models.items(), key=lambda x: roc_auc_score(y_test, x[1].predict_proba(X_test)[:, 1]))[0]
print(f"\nBest model: {best_model_name}")


Performing lightweight hyperparameter tuning:

Tuning XGBoost...
Best parameters: {'n_estimators': 50, 'max_depth': 3, 'learning_rate': 0.1}
Accuracy: 0.8447
ROC AUC: 0.8668
Tuning completed in 28.01 seconds

Tuning LightGBM...
[LightGBM] [Info] Number of positive: 1328, number of negative: 424
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001291 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6239
[LightGBM] [Info] Number of data points in the train set: 1752, number of used features: 38
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.757991 -> initscore=1.141696
[LightGBM] [Info] Start training from score 1.141696
Best parameters: {'num_leaves': 50, 'n_estimators': 50, 'learning_rate': 0.01}
Accuracy: 0.8014
ROC AUC: 0.8561
Tuning completed in 7.23 seconds

Tuning RandomForest...
Best parameters: {'n_estimators': 200, 'min_samples_split': 2, 'max_depth': 10}
Accuracy: 0.8493
ROC AUC: 0.8739
Tu

## Stacking

In [60]:
# Define base models using the tuned models from best_models dictionary
base_models = [
    ('XGBoost', best_models['XGBoost']),
    ('LightGBM', best_models['LightGBM']),
    ('CatBoost', best_models['CatBoost']),
    ('RandomForest', best_models['RandomForest'])
]

# Define meta-learner
meta_learner = LogisticRegression(random_state=42)

# Create stacking classifier
stacking_clf = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_learner,
    cv=5,
    stack_method='predict_proba'
)

# Train the stacking classifier on the train split first for evaluation
stacking_clf.fit(X_train, y_train)

# Make predictions
y_pred = stacking_clf.predict(X_test)

# Evaluate the model
y_pred_proba = stacking_clf.predict_proba(X_test)[:, 1]
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"Stacking Classifier Accuracy: {accuracy:.4f}")
print(f"Stacking Classifier ROC AUC: {roc_auc:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Compare with individual base models
print("\nComparing with base models:")
for name, model in base_models:
    model_pred = model.predict(X_test)
    model_pred_proba = model.predict_proba(X_test)[:, 1]
    accuracy_base = accuracy_score(y_test, model_pred)
    roc_auc_base = roc_auc_score(y_test, model_pred_proba)
    print(f"{name} Accuracy: {accuracy_base:.4f}, ROC AUC: {roc_auc_base:.4f}")

# After evaluation, train the final model on the entire dataset
# This makes the model stronger for predictions on the test set
print("\nTraining final model on full dataset...")
final_stacking_clf = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_learner,
    cv=5,
    stack_method='predict_proba'
)
final_stacking_clf.fit(X, y)

# Use this model for making final predictions
stacking_clf = final_stacking_clf

[LightGBM] [Info] Number of positive: 1328, number of negative: 424
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000440 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6239
[LightGBM] [Info] Number of data points in the train set: 1752, number of used features: 38
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.757991 -> initscore=1.141696
[LightGBM] [Info] Start training from score 1.141696
[LightGBM] [Info] Number of positive: 1062, number of negative: 339
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000423 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6053
[LightGBM] [Info] Number of data points in the train set: 1401, number of used features: 38
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.758030 -> initscore=1.141909
[LightGBM] [Info] Start training from score 1.141909
[LightGBM] [Info] Number

# Submission

In [61]:
# Load test data
test_data = pd.read_csv('test.csv')

test_data = engineer_features(test_data)

test_data.head()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  features_df[f'{col}_day_before'].fillna(features_df[col], inplace=True)


Unnamed: 0,id,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,...,pressure_change,maxtemp_change,temparature_change,mintemp_change,dewpoint_change,humidity_change,cloud_change,sunshine_change,winddirection_change,windspeed_change
0,2190,1,1019.5,17.5,15.8,12.7,14.9,96.0,99.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
365,2555,1,1013.3,21.1,19.9,17.1,16.9,90.0,91.0,0.0,...,-6.2,3.6,4.1,4.4,2.0,-6.0,-8.0,0.0,10.0,19.5
366,2556,2,1018.5,16.6,14.6,12.5,12.5,97.0,96.0,0.0,...,5.2,-4.5,-5.3,-4.6,-4.4,7.0,5.0,0.0,-40.0,-18.8
1,2191,2,1016.5,17.5,16.5,15.8,15.1,97.0,99.0,0.0,...,-2.0,0.9,1.9,3.3,2.6,0.0,3.0,0.0,30.0,10.3
2,2192,3,1023.9,11.2,10.4,9.4,8.9,86.0,96.0,0.0,...,7.4,-6.3,-6.1,-6.4,-6.2,-11.0,-3.0,0.0,-10.0,-18.4


In [None]:
# Ensure test data has same features as training data
X_test_submission = test_data[X.columns]

# Make predictions
test_predictions = stacking_clf.predict_proba(X_test_submission)[:, 1]

# Create submission file
submission = pd.DataFrame({
    'id': test_data['id'],
    'rainfall': test_predictions
})

# Save submission file
submission.to_csv('submission.csv', index=False)

# Submit to Kaggle
api.competition_submit('submission.csv', 'Stacking Classifier Submission  - with feature engineering + HPT', 'playground-series-s5e3')

print("Submission completed successfully!")

100%|██████████| 17.8k/17.8k [00:00<00:00, 38.4kB/s]


Submission completed successfully!
