# In [None]:  (notebook cell marker left over from the export; commented out so the file parses as Python)
# Importing essential libraries for numerical operations and data handling
import numpy as np                                                            # For numerical computations and array handling
import pandas as pd                                                           # For data manipulation and analysis using DataFrames

# Importing libraries for data visualization
import matplotlib.pyplot as plt                                               # For creating static, animated, and interactive plots
import seaborn as sb                                                          # For advanced data visualization, built on top of matplotlib

# Importing tools for data preprocessing and model evaluation
from sklearn.model_selection import train_test_split                          # To split dataset into training and testing sets
from sklearn.preprocessing import StandardScaler                              # To scale/normalize feature data
from sklearn import metrics                                                   # For evaluating the performance of machine learning models

# Importing machine learning models
from sklearn.svm import SVC                                                   # Support Vector Classifier - good for classification problems
from xgboost import XGBClassifier                                             # Extreme Gradient Boosting Classifier - powerful ensemble model
from sklearn.linear_model import LogisticRegression                           # Logistic Regression - simple linear model for classification

# Importing technique for handling imbalanced datasets
from imblearn.over_sampling import RandomOverSampler

# Suppressing warning messages to keep output clean
import warnings
warnings.filterwarnings('ignore')                                             # Ignores warnings (e.g., from libraries or deprecated methods)

# Read the rainfall dataset from 'Rainfall.csv' into a pandas DataFrame.
df = pd.read_csv('Rainfall.csv')

# Quick inspection of the data. The bare expressions below only render
# output when they are the last statement of a notebook cell; in a plain
# script they are evaluated and discarded.
df.head(5)   # first five rows
df.shape     # (number of rows, number of columns)

# Column names, non-null counts and dtypes; info() prints its report directly.
df.info()

# Descriptive statistics, transposed so that each column becomes one row.
df.describe().transpose()

# DATA CLEANING

# Number of missing (null) values in each column.
df.isna().sum()

# Column labels exactly as they were read from the CSV.
df.columns

# Strip leading/trailing whitespace from every column label, modifying the
# DataFrame in place so later lookups such as df['rainfall'] work reliably.
df.rename(columns=str.strip, inplace=True)

# Column labels after stripping.
df.columns

# Fill missing values with the column mean.
#
# Only numeric columns are imputed: a mean is undefined for text columns, and
# calling .mean() on one would raise. Non-numeric columns are left untouched,
# so any nulls they contain remain visible in the count below.
numeric_cols = df.select_dtypes(include='number').columns

# DataFrame.mean() skips NaN by default; fillna() with the resulting Series
# fills each column with its own mean. Columns without nulls are unchanged.
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# Total number of missing values left in the whole DataFrame after filling
# (0 unless a non-numeric or entirely empty column still has nulls).
df.isnull().sum().sum()

# Pie chart of the class distribution in the 'rainfall' column.
# The counts are computed once and reused for both slice sizes and labels.
rain_counts = df['rainfall'].value_counts()

plt.pie(
    rain_counts.values,         # slice sizes: occurrences of each value
    labels=rain_counts.index,   # slice labels: the distinct values
    autopct='%1.1f%%',          # percentage with one decimal on each slice
)

# Render the chart.
plt.show()

# Heatmap flagging strongly correlated feature pairs.
#
# Correlation is computed on numeric columns only, since a text column would
# make DataFrame.corr() fail on recent pandas versions. The absolute value is
# thresholded so strong negative correlations (< -0.8) are flagged as well:
# they are just as redundant as strong positive ones.
strong_corr = df.select_dtypes(include='number').corr().abs() > 0.8

plt.figure(figsize=(10, 10))    # 10x10 inch canvas
sb.heatmap(
    strong_corr,
    annot=True,                 # write True/False into every cell
    cbar=False,                 # a colour bar adds nothing to a boolean matrix
)

plt.show()

# Drop 'maxtemp' and 'mintemp' from the DataFrame, in place.
df.drop(['maxtemp', 'mintemp'], axis=1, inplace=True)

# MODEL TRAINING

# Target (dependent variable): the 'rainfall' column.
target = df['rainfall']

# Predictors (independent variables): everything except 'day' and the target.
features = df.drop(columns=['day', 'rainfall'])

# Hold out 20% of the rows for validation.
#   stratify=target  keeps the class proportions equal in both splits
#   random_state=2   makes the split reproducible across runs
X_train, X_val, Y_train, Y_val = train_test_split(
    features,
    target,
    test_size=0.2,
    stratify=target,
    random_state=2,
)

# Balance the training classes by duplicating rows of the minority class.
# Only the training split is resampled; the validation split keeps its
# original class distribution.
ros = RandomOverSampler(
    sampling_strategy='minority',   # resample the minority class only
    random_state=22,                # reproducible sampling
)
X, Y = ros.fit_resample(X_train, Y_train)

# Standardize the features for stable and fast training. The scaler is
# fitted on the (balanced) training data only, and the same fitted scaler is
# then applied to both the training and the validation features.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)
X_val = scaler.transform(X_val)

# Candidate classifiers to train and compare.
# NOTE(review): recent XGBoost releases appear to require integer class
# labels (0..n-1) -- confirm that 'rainfall' is already numerically encoded.
models = [
    LogisticRegression(),                   # simple, interpretable linear model
    XGBClassifier(),                        # gradient-boosted tree ensemble
    SVC(kernel='rbf', probability=True),    # probability=True enables predict_proba
]

# Train every model on the balanced, standardized training set and report
# ROC AUC on both the training and the validation data. Iterating over the
# list directly keeps the loop correct if models are added or removed.
for model in models:
    model.fit(X, Y)

    # The estimator's repr identifies which model the scores belong to.
    print(f'{model} : ')

    # Column 1 of predict_proba is the probability of the second class,
    # which is what roc_auc_score expects as the score for binary targets.
    train_preds = model.predict_proba(X)
    print('Training ROC AUC : ', metrics.roc_auc_score(Y, train_preds[:, 1]))

    val_preds = model.predict_proba(X_val)
    print('Validation ROC AUC : ', metrics.roc_auc_score(Y_val, val_preds[:, 1]))

    print()     # blank line between the reports of successive models

# Importing required libraries for plotting and evaluation
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn import metrics

# Detailed validation diagnostics for the third model in the list (the SVC).
svc_model = models[2]

# Confusion matrix of the SVC's predictions on the validation set.
ConfusionMatrixDisplay.from_estimator(svc_model, X_val, Y_val)

# Render the plot.
plt.show()

# Per-class precision, recall and F1 of the SVC on the validation set.
val_labels_pred = svc_model.predict(X_val)
print(metrics.classification_report(Y_val, val_labels_pred))







