In [7]:
# %% -------------------- Imports and Setup --------------------
# Imports
import pandas as pd
import numpy as np
import sys
import pathlib
import random

# Set seed for reproducibility
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# Setup path
project_root = pathlib.Path().resolve().parent
sys.path.append(str(project_root / 'src'))

import util



In [8]:

# %% -------------------- Load and Preprocess Data --------------------

# Load datasets
train_df, test_df = util.load_data('../data/train.csv', '../data/test.csv')

# Separate target
target = 'rainfall'
train_y = train_df[target]
train_X = train_df.drop(columns=[target])

# Drop 'id' column before training
train_X = train_X.drop(columns=['id'])
test_df_no_id = test_df.drop(columns=['id'])  # Keep 'id' for submission

# Preprocess (fill NaNs, align columns)
train_X, test_X = util.preprocess_train_test(train_X, test_df_no_id)
test_X = util.align_columns(train_X, test_X)  # Ensure same columns


Train shape: (2190, 13), Test shape: (730, 12)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df[col].fillna(fill_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df[col].fillna(fill_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values 

In [9]:


# %% -------------------- Logistic Regression --------------------
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

print("\n===== Logistic Regression Cross-Validation (AUC) =====")
# Define model
logistic_model = LogisticRegression(max_iter=1000, solver='liblinear')

# Cross-validation with ROC AUC
util.cross_validate_classification(logistic_model, train_X, train_y, n_splits=5, stratify=True)  # Make sure your util supports AUC internally!

# Train model
print("\nTraining Logistic Regression...")
logistic_model.fit(train_X, train_y)

# Predict probabilities and submit
print("Generating probability predictions and writing submission for Logistic Regression...")
logistic_probs = logistic_model.predict_proba(test_X)[:, 1]  # Probabilities for class 1
util.write_submission(test_df, logistic_probs, id_column='id', output_path='submission_logistic.csv', prediction_column='rainfall')




===== Logistic Regression Cross-Validation (AUC) =====
Fold 1 Accuracy: 0.8995
Fold 2 Accuracy: 0.8470
Fold 3 Accuracy: 0.8425
Fold 4 Accuracy: 0.8607
Fold 5 Accuracy: 0.8699

Average Accuracy: 0.8639

Training Logistic Regression...
Generating probability predictions and writing submission for Logistic Regression...
Submission file written to: submission_logistic.csv


In [10]:

# %% -------------------- XGBoost Classifier --------------------
from xgboost import XGBClassifier

print("\n===== XGBoost Classifier Cross-Validation (AUC) =====")
# Define model
xgb_model = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    random_state=SEED,
    eval_metric='auc'  # AUC for binary classification
)

# Cross-validation with ROC AUC
util.cross_validate_classification(xgb_model, train_X, train_y, n_splits=5, stratify=True)  # Again, ensure AUC is used inside util

# Train model
print("\nTraining XGBoost Classifier...")
xgb_model.fit(train_X, train_y)

# Predict probabilities and submit
print("Generating probability predictions and writing submission for XGBoost Classifier...")
xgb_probs = xgb_model.predict_proba(test_X)[:, 1]  # Probabilities for class 1
util.write_submission(test_df, xgb_probs, id_column='id', output_path='submission_xgboost.csv', prediction_column='rainfall')



===== XGBoost Classifier Cross-Validation (AUC) =====
Fold 1 Accuracy: 0.8813
Fold 2 Accuracy: 0.8333
Fold 3 Accuracy: 0.8425
Fold 4 Accuracy: 0.8630
Fold 5 Accuracy: 0.8539

Average Accuracy: 0.8548

Training XGBoost Classifier...
Generating probability predictions and writing submission for XGBoost Classifier...
Submission file written to: submission_xgboost.csv


In [11]:


# %% -------------------- CatBoost Classifier --------------------
from catboost import CatBoostClassifier

print("\n===== CatBoost Classifier Cross-Validation (AUC) =====")
# Define model
cat_model = CatBoostClassifier(iterations=500, learning_rate=0.05, depth=5, random_state=SEED, verbose=100, eval_metric='AUC')

# Cross-validation with ROC AUC
util.cross_validate_classification(cat_model, train_X, train_y, n_splits=5, stratify=True)

# Train model
print("\nTraining CatBoost Classifier...")
cat_model.fit(train_X, train_y)

# Predict probabilities and submit
print("Generating probability predictions and writing submission for CatBoost Classifier...")
cat_probs = cat_model.predict_proba(test_X)[:, 1]  # Probabilities for class 1
util.write_submission(test_df, cat_probs, id_column='id', output_path='submission_catboost.csv', prediction_column='rainfall')



===== CatBoost Classifier Cross-Validation (AUC) =====
0:	total: 4.06ms	remaining: 2.02s
100:	total: 418ms	remaining: 1.65s
200:	total: 803ms	remaining: 1.19s
300:	total: 1.2s	remaining: 793ms
400:	total: 1.6s	remaining: 394ms
499:	total: 1.99s	remaining: 0us
Fold 1 Accuracy: 0.8790
0:	total: 4.09ms	remaining: 2.04s
100:	total: 445ms	remaining: 1.76s
200:	total: 876ms	remaining: 1.3s
300:	total: 1.3s	remaining: 863ms
400:	total: 1.73s	remaining: 427ms
499:	total: 2.12s	remaining: 0us
Fold 2 Accuracy: 0.8447
0:	total: 3.92ms	remaining: 1.96s
100:	total: 537ms	remaining: 2.12s
200:	total: 1.33s	remaining: 1.98s
300:	total: 1.82s	remaining: 1.2s
400:	total: 2.21s	remaining: 547ms
499:	total: 2.72s	remaining: 0us
Fold 3 Accuracy: 0.8311
0:	total: 3.64ms	remaining: 1.82s
100:	total: 377ms	remaining: 1.49s
200:	total: 749ms	remaining: 1.11s
300:	total: 1.12s	remaining: 742ms
400:	total: 1.49s	remaining: 369ms
499:	total: 1.86s	remaining: 0us
Fold 4 Accuracy: 0.8539
0:	total: 7.53ms	remainin