<a href="https://colab.research.google.com/github/bythyag/machine-learning-projects/blob/main/rainfall_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
# library imports
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import resample
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import time
from sklearn.metrics import accuracy_score, roc_auc_score, cohen_kappa_score, roc_curve, classification_report
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

In [11]:
url = 'https://raw.githubusercontent.com/amankharwal/Website-data/refs/heads/master/weatherAUS.csv' #load dataset :)
weather_data = pd.read_csv(url)

In [12]:
weather_data.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,0.0,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,25.0,1010.6,1007.8,,,17.2,24.3,No,0.0,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,0.0,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,16.0,1017.6,1012.8,,,18.1,26.5,No,1.0,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0.2,No


In [13]:
# Data Prerocessing

# convert to categorical variables
dir_to_deg = {
    'N': 0, 'NNE': 22.5, 'NE': 45, 'ENE': 67.5,
    'E': 90, 'ESE': 112.5, 'SE': 135, 'SSE': 157.5,
    'S': 180, 'SSW': 202.5, 'SW': 225, 'WSW': 247.5,
    'W': 270, 'WNW': 292.5, 'NW': 315, 'NNW': 337.5
}

weather_data['WindDir9am'] = weather_data['WindDir9am'].map(dir_to_deg)
weather_data['WindDir3pm'] = weather_data['WindDir3pm'].map(dir_to_deg)
weather_data['WindGustDir'] = weather_data['WindGustDir'].map(dir_to_deg)
weather_data['RainToday'] = weather_data['RainToday'].map({'No': 0, 'Yes': 1})
weather_data['RainTomorrow'] = weather_data['RainTomorrow'].map({'No': 0, 'Yes': 1})

# Convert 'Date' column to datetime format
weather_data['Date'] = pd.to_datetime(weather_data['Date'])


# rebalance the dataset

no = weather_data[weather_data.RainTomorrow == 0]
yes = weather_data[weather_data.RainTomorrow == 1]

yes_oversampled = resample(yes, replace=True, n_samples=len(no), random_state=123)
weather_data = pd.concat([no, yes_oversampled])

# Median Imputer for continuous features
median_imputer = SimpleImputer(strategy='median')
weather_data[["Sunshine", "Evaporation"]] = median_imputer.fit_transform(weather_data[["Sunshine", "Evaporation"]])

# Mode Imputer for categorical/discrete features
mode_imputer = SimpleImputer(strategy='most_frequent')
weather_data[["Cloud3pm", "Cloud9am"]] = mode_imputer.fit_transform(weather_data[["Cloud3pm", "Cloud9am"]])

# Median Imputer for continuous features
median_imputer = SimpleImputer(strategy='median')
weather_data[["RISK_MM", "Cloud9am", "Cloud3pm", "Humidity9am", "Humidity3pm", "Rainfall", "WindGustSpeed"]] = median_imputer.fit_transform(weather_data[["RISK_MM", "Cloud9am", "Cloud3pm", "Humidity9am", "Humidity3pm", "Rainfall", "WindGustSpeed"]])

# Mode Imputer for categorical/discrete features
mode_imputer = SimpleImputer(strategy='most_frequent')
weather_data[["RainToday"]] = mode_imputer.fit_transform(weather_data[["RainToday"]])

# Median Imputer for continuous features
from sklearn.impute import SimpleImputer
median_imputer = SimpleImputer(strategy='median')
weather_data[["Pressure9am", "Pressure3pm"]] = median_imputer.fit_transform(weather_data[["Pressure9am", "Pressure3pm"]])

# Mode Imputer for categorical/discrete features
mode_imputer = SimpleImputer(strategy='most_frequent')
weather_data[["WindGustDir", "WindDir9am", "WindDir3pm"]] = mode_imputer.fit_transform(weather_data[["WindGustDir", "WindDir9am", "WindDir3pm"]])

# remove outliers

numeric_cols = weather_data.select_dtypes(include=['float64']).columns

Q1 = weather_data[numeric_cols].quantile(0.25)
Q3 = weather_data[numeric_cols].quantile(0.75)
IQR = Q3 - Q1

# Keep only rows within 1.5 * IQR for all numeric columns
weather_data_clean = weather_data[~((weather_data[numeric_cols] < (Q1 - 1.5 * IQR)) | (weather_data[numeric_cols] > (Q3 + 1.5 * IQR))).any(axis=1)]

# drop missing values
weather_data.dropna(inplace=True)



In [15]:
# min-max scaling

continuous_cols = [
    'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
    'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm',
    'Humidity9am', 'Humidity3pm',
    'Pressure9am', 'Pressure3pm',
    'Cloud9am', 'Cloud3pm',
    'Temp9am', 'Temp3pm',
    'RISK_MM'
]

# Initialize scaler
scaler = MinMaxScaler()

weather_data[continuous_cols] = scaler.fit_transform(weather_data[continuous_cols])

In [17]:
features = weather_data[['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustDir',
                       'WindGustSpeed', 'WindDir9am', 'WindDir3pm', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am',
                       'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm',
                       'RainToday']]
target = weather_data['RainTomorrow']

# Split into test and train
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.25, random_state=12345)

In [34]:
# Function to run the model and print results

def run_model(model_name, model, X_train, y_train, X_test, y_test, verbose=True):
    t0=time.time()
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    time_taken = time.time()-t0
    print("Model = {}".format(model_name))
    print(classification_report(y_test,y_pred,digits=5))
    print("ROC Area under Curve = {}".format(roc_auc))
    print("Time taken for training = {}".format(time_taken))
    

In [35]:
# Logistic Regression
params_lr = {'penalty': 'l1', 'solver':'liblinear'}

model_lr = LogisticRegression(**params_lr)
run_model("Logistic Regression",model_lr, X_train, y_train, X_test, y_test)

Model = Logistic Regression
              precision    recall  f1-score   support

           0    0.77171   0.79163   0.78155     26727
           1    0.78473   0.76434   0.77440     26560

    accuracy                        0.77803     53287
   macro avg    0.77822   0.77799   0.77797     53287
weighted avg    0.77820   0.77803   0.77799     53287

ROC Area under Curve = 0.7779894038777176
Time taken for training = 18.922438144683838


In [36]:

# Decision Tree

params_dt = {'max_depth': 16,
             'max_features': "sqrt"}

model_dt = DecisionTreeClassifier(**params_dt)
run_model("Decision Tree", model_dt, X_train, y_train, X_test, y_test)


Model = Decision Tree
              precision    recall  f1-score   support

           0    0.85394   0.80847   0.83058     26727
           1    0.81707   0.86084   0.83838     26560

    accuracy                        0.83458     53287
   macro avg    0.83550   0.83466   0.83448     53287
weighted avg    0.83556   0.83458   0.83447     53287

ROC Area under Curve = 0.8346571041151923
Time taken for training = 0.33980894088745117


In [37]:
# Random Forest

params_rf = {'max_depth': 16,
             'min_samples_leaf': 1,
             'min_samples_split': 2,
             'n_estimators': 100,
             'random_state': 12345}

model_rf = RandomForestClassifier(**params_rf)
run_model("Random Forest Classifier",model_rf, X_train, y_train, X_test, y_test)

Model = Random Forest Classifier
              precision    recall  f1-score   support

           0    0.90985   0.87002   0.88949     26727
           1    0.87472   0.91325   0.89357     26560

    accuracy                        0.89157     53287
   macro avg    0.89228   0.89164   0.89153     53287
weighted avg    0.89234   0.89157   0.89152     53287

ROC Area under Curve = 0.8916360469377793
Time taken for training = 19.339380025863647


In [39]:
# XGBoost

params_xgb ={'n_estimators': 500,
            'max_depth': 16}

model_xgb = xgb.XGBClassifier(**params_xgb)
run_model("XGBoost Classifier", model_xgb, X_train, y_train, X_test, y_test)


Model = XGBoost Classifier
              precision    recall  f1-score   support

           0    0.97318   0.91230   0.94176     26727
           1    0.91697   0.97470   0.94496     26560

    accuracy                        0.94340     53287
   macro avg    0.94508   0.94350   0.94336     53287
weighted avg    0.94516   0.94340   0.94335     53287

ROC Area under Curve = 0.9434986099972907
Time taken for training = 5.516936302185059
