# Reading the dataset

In [1]:
import pandas as pd
from easygui import fileopenbox
import warnings
from warnings import filterwarnings
filterwarnings('ignore')

In [68]:
# Interactive helper: calls easygui's fileopenbox(); the returned path is only
# shown as the cell output and is not assigned to any variable.
# NOTE(review): presumably needs a desktop session and blocks an unattended
# "Restart & Run All" - confirm whether this cell should stay in the notebook.
fileopenbox()

'D:\\Datasets\\Rainfall.csv'

In [2]:
# Load the rainfall dataset from disk into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='D:\\Datasets\\Rainfall.csv')

# Data Pre-processing

Checking the data types of variables

In [3]:
# Print the per-column summary (non-null counts and dtypes) of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 19 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           145460 non-null  object 
 1   Location       145460 non-null  object 
 2   MinTemp        143975 non-null  float64
 3   MaxTemp        144199 non-null  float64
 4   Rainfall       142199 non-null  float64
 5   WindGustDir    135134 non-null  object 
 6   WindGustSpeed  135197 non-null  float64
 7   WindDir9am     134894 non-null  object 
 8   WindDir3pm     141232 non-null  object 
 9   WindSpeed9am   143693 non-null  float64
 10  WindSpeed3pm   142398 non-null  float64
 11  Humidity9am    142806 non-null  float64
 12  Humidity3pm    140953 non-null  float64
 13  Pressure9am    130395 non-null  float64
 14  Pressure3pm    130432 non-null  float64
 15  Temp9am        143693 non-null  float64
 16  Temp3pm        141851 non-null  float64
 17  RainToday      142199 non-nul

In [4]:
# Display summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns.
df.describe()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm
count,143975.0,144199.0,142199.0,135197.0,143693.0,142398.0,142806.0,140953.0,130395.0,130432.0,143693.0,141851.0
mean,12.194034,23.221348,2.360918,40.03523,14.043426,18.662657,68.880831,51.539116,1017.64994,1015.255889,16.990631,21.68339
std,6.398495,7.119049,8.47806,13.607062,8.915375,8.8098,19.029164,20.795902,7.10653,7.037414,6.488753,6.93665
min,-8.5,-4.8,0.0,6.0,0.0,0.0,0.0,0.0,980.5,977.1,-7.2,-5.4
25%,7.6,17.9,0.0,31.0,7.0,13.0,57.0,37.0,1012.9,1010.4,12.3,16.6
50%,12.0,22.6,0.0,39.0,13.0,19.0,70.0,52.0,1017.6,1015.2,16.7,21.1
75%,16.9,28.2,0.8,48.0,19.0,24.0,83.0,66.0,1022.4,1020.0,21.6,26.4
max,33.9,48.1,371.0,135.0,130.0,87.0,100.0,100.0,1041.0,1039.6,40.2,46.7


Dropping the insignificant columns

In [5]:
# Remove the 'Date' column, which is not used as a feature.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: running it a second time no longer raises KeyError
# because 'Date' has already been dropped.
df = df.drop(columns=['Date'], errors='ignore')

Segregating the columns as numerical and categorical

In [6]:
# Partition the column labels by dtype: object columns are treated as
# categorical, every other dtype as numerical.
cat_cols = df.select_dtypes(include=['object']).columns
num_cols = df.select_dtypes(exclude=['object']).columns

In [7]:
# Display the labels of the numerical columns.
num_cols

Index(['MinTemp', 'MaxTemp', 'Rainfall', 'WindGustSpeed', 'WindSpeed9am',
       'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am',
       'Pressure3pm', 'Temp9am', 'Temp3pm'],
      dtype='object')

In [8]:
# Display the labels of the categorical (object-dtype) columns.
cat_cols

Index(['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday',
       'RainTomorrow'],
      dtype='object')

Checking for null values

In [9]:
# Report how many missing values each categorical column contains.
for col in cat_cols:
    print(col, ':', df[col].isna().sum())

Location : 0
WindGustDir : 10326
WindDir9am : 10566
WindDir3pm : 4228
RainToday : 3261
RainTomorrow : 0


In [10]:
# Report how many missing values each numerical column contains.
for col in num_cols:
    print(col, ':', df[col].isna().sum())

MinTemp : 1485
MaxTemp : 1261
Rainfall : 3261
WindGustSpeed : 10263
WindSpeed9am : 1767
WindSpeed3pm : 3062
Humidity9am : 2654
Humidity3pm : 4507
Pressure9am : 15065
Pressure3pm : 15028
Temp9am : 1767
Temp3pm : 3609


Imputing the null values:
  

for categorical columns- values imputed using the mode value of the column
  

for numerical columns- values imputed using the Iterative imputer

In [11]:
# Print the most frequent value(s) of every categorical column. Series.mode()
# returns a Series, which is why each printed value carries an index and dtype.
for col in cat_cols:
    print(col, ':', df[col].mode())

Location : 0    Canberra
dtype: object
WindGustDir : 0    W
dtype: object
WindDir9am : 0    N
dtype: object
WindDir3pm : 0    SE
dtype: object
RainToday : 0    No
dtype: object
RainTomorrow : 0    No
dtype: object


In [12]:
# Fill missing gust directions with the column's most frequent value ('W' for
# this dataset). fillna() is used instead of replace(np.nan, ...) because numpy
# is never imported in this notebook, so the original line raised NameError on
# a fresh kernel; computing the mode also avoids hard-coding the value.
df['WindGustDir'] = df['WindGustDir'].fillna(df['WindGustDir'].mode()[0])

In [13]:
# Fill missing 9am wind directions with the column's most frequent value ('N'
# for this dataset). fillna() is used instead of replace(np.nan, ...) because
# numpy is never imported in this notebook, so the original line raised
# NameError on a fresh kernel; computing the mode avoids hard-coding the value.
df['WindDir9am'] = df['WindDir9am'].fillna(df['WindDir9am'].mode()[0])

In [14]:
# Fill missing 3pm wind directions with the column's most frequent value ('SE'
# for this dataset). fillna() is used instead of replace(np.nan, ...) because
# numpy is never imported in this notebook, so the original line raised
# NameError on a fresh kernel; computing the mode avoids hard-coding the value.
df['WindDir3pm'] = df['WindDir3pm'].fillna(df['WindDir3pm'].mode()[0])

In [15]:
# Fill missing RainToday flags with the column's most frequent value ('No' for
# this dataset). fillna() is used instead of replace(np.nan, ...) because numpy
# is never imported in this notebook, so the original line raised NameError on
# a fresh kernel; computing the mode avoids hard-coding the value.
df['RainToday'] = df['RainToday'].fillna(df['RainToday'].mode()[0])

In [16]:
# Re-check the missing-value count of each numerical column before imputing them.
for col in num_cols:
    print(col, df[col].isna().sum())

MinTemp 1485
MaxTemp 1261
Rainfall 3261
WindGustSpeed 10263
WindSpeed9am 1767
WindSpeed3pm 3062
Humidity9am 2654
Humidity3pm 4507
Pressure9am 15065
Pressure3pm 15028
Temp9am 1767
Temp3pm 3609


In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [18]:
# Iterative imputer configured to use ordinary linear regression as its
# per-feature estimator.
IT = IterativeImputer(
    estimator=LinearRegression(),
)

In [19]:
# Display the numerical column labels again; these are the columns that are
# imputed in the next cell.
num_cols

Index(['MinTemp', 'MaxTemp', 'Rainfall', 'WindGustSpeed', 'WindSpeed9am',
       'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am',
       'Pressure3pm', 'Temp9am', 'Temp3pm'],
      dtype='object')

In [20]:
# Impute the missing values of all numerical columns in one pass and write the
# result back into the frame.
# `num_cols` (derived from the dtypes earlier in the notebook) replaces two
# hand-typed copies of the same twelve column names, so the selection on the
# left and right of the assignment can no longer drift apart.
df.loc[:, num_cols] = IT.fit_transform(df.loc[:, num_cols])

Encoding the categorical variables

In [21]:
# Features: every column except the last one.
X = df.iloc[:, :-1]
# Target: the last column, wrapped in a one-column DataFrame.
Y = pd.DataFrame(df.iloc[:, -1])

In [22]:
# One-hot encode the categorical feature columns.
X = pd.get_dummies(data=X)

Check for Data imbalance in dependent variable

In [24]:
# Share of each target class, expressed as a percentage of all rows.
Y.value_counts().div(df.shape[0]).mul(100)

RainTomorrow
No              75.839406
Yes             24.160594
dtype: float64

Balancing the data using SMOTE

In [26]:
from imblearn.over_sampling import SMOTE

In [27]:
# SMOTE oversampler used to balance the target classes.
# The seed is fixed so the synthetic samples - and therefore every metric
# reported further down - are reproducible across kernel restarts. 466 is the
# same seed the notebook already uses for the train/test split.
sm = SMOTE(random_state=466)

In [28]:
# Resample the encoded features and target with SMOTE and rebind X and Y to the
# balanced result.
# NOTE(review): this runs before the train/test split further down, so the test
# set is drawn from resampled data and training rows may be synthesised from
# rows that later land in the test set - TODO confirm whether SMOTE should be
# fit on the training split only; the reported scores may be optimistic.
X,Y=sm.fit_resample(X,Y)

In [29]:
# Make sure the resampled features are held in a DataFrame.
X = pd.DataFrame(data=X)

Splitting the data into train and test

In [30]:
from sklearn.model_selection import train_test_split

In [31]:
# Hold out 30% of the rows for testing; the seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=466,
)

Scaling the data

In [32]:
from sklearn.preprocessing import StandardScaler

In [33]:
# Scaler instance; it is fitted on the training features in the next cell.
sc=StandardScaler()

In [34]:
# Fit the scaler on the training features and scale them in one step.
train_sc = sc.fit_transform(X=x_train)

In [35]:
# Scale the test features with the scaler fitted on the training data.
test_sc = sc.transform(X=x_test)

# Building the model using XGBoost Algorithm

In [36]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
import xgboost

In [37]:
from xgboost import XGBClassifier
# 10-fold cross-validation splitter used by the grid search.
kfold = KFold(n_splits=10)
# Base classifier created with library defaults; its hyper-parameters are
# supplied by the grid search.
xgb = XGBClassifier()

In [38]:
# Hyper-parameter grid for the XGBoost search. Every entry is pinned to a single
# value except `n_estimators`, which is the only parameter actually searched.
param_grid = dict(
    base_score=[0.5],
    booster=['gbtree'],
    colsample_bylevel=[1],
    colsample_bynode=[1],
    colsample_bytree=[1],
    gamma=[0],
    gpu_id=[-1],
    importance_type=['gain'],
    learning_rate=[0.1],
    max_delta_step=[0],
    max_depth=[6],
    min_child_weight=[1],
    n_estimators=[100, 200, 300],
    n_jobs=[-1],
    num_parallel_tree=[1],
    objective=['binary:logistic'],
    random_state=[7592],
    reg_alpha=[0],
    reg_lambda=[1],
    scale_pos_weight=[1],
    subsample=[1],
    tree_method=['auto'],
    validate_parameters=[1],
    verbosity=[0],
)

In [39]:
# Grid search over `param_grid`, scored by accuracy with the 10-fold splitter.
grid_xgb = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    cv=kfold,
    scoring='accuracy',
)

In [40]:
# Run the cross-validated search on the scaled training data.
grid_xgb.fit(X=train_sc, y=y_train)

GridSearchCV(cv=KFold(n_splits=10, random_state=None, shuffle=False),
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan,...
                         'importance_type': ['gain'], 'learning_rate': [0.1],
                         'max_delta_step': [0], 'max_depth': [6],
                         'min_child_weight': [1],
                         'n_estimators': [100, 200, 300], 'n_jobs': [-1],
                         'num_parallel_tree': [1],
  

Visualising the best parameters for the algorithm

In [41]:
# Display the parameter combination selected by the grid search.
grid_xgb.best_params_

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 1,
 'gamma': 0,
 'gpu_id': -1,
 'importance_type': 'gain',
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 6,
 'min_child_weight': 1,
 'n_estimators': 300,
 'n_jobs': -1,
 'num_parallel_tree': 1,
 'objective': 'binary:logistic',
 'random_state': 7592,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'subsample': 1,
 'tree_method': 'auto',
 'validate_parameters': 1,
 'verbosity': 0}

# Evaluating the XGB model

In [67]:
from sklearn.metrics import*

In [99]:
# Accuracy of the tuned XGBoost model on the training split.
grid_xgb.score(X=train_sc, y=y_train)

0.9157418318851089

In [43]:
# Accuracy of the tuned XGBoost model on the held-out test split.
grid_xgb.score(X=test_sc, y=y_test)

0.9006949690285542

In [48]:
# Class predictions of the tuned XGBoost model for the test split.
pred_test = grid_xgb.predict(X=test_sc)

In [50]:
# Confusion matrix of true vs. predicted classes on the test split.
confusion_matrix(y_true=y_test, y_pred=pred_test)

array([[30838,  2179],
       [ 4394, 28779]], dtype=int64)

In [64]:
# Per-class precision, recall and F1 for the XGBoost predictions.
xgb_report = classification_report(y_true=y_test, y_pred=pred_test)
print(xgb_report)

              precision    recall  f1-score   support

          No       0.88      0.93      0.90     33017
         Yes       0.93      0.87      0.90     33173

    accuracy                           0.90     66190
   macro avg       0.90      0.90      0.90     66190
weighted avg       0.90      0.90      0.90     66190



# Building the model using Quadratic Discriminant Analysis Algorithm

In [85]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [86]:
# Quadratic Discriminant Analysis estimator created with its default settings.
qda=QuadraticDiscriminantAnalysis()

In [98]:
# Parameter grid for QDA; each parameter has a single candidate value, so the
# search evaluates exactly one configuration.
param_grid1 = dict(
    priors=[None],
    reg_param=[0.0],
    store_covariance=[False],
    tol=[0.0001],
)

In [87]:
# Grid search wrapper around QDA, scored by accuracy (default CV splitter).
grid_qda = GridSearchCV(
    estimator=qda,
    param_grid=param_grid1,
    scoring='accuracy',
)

In [88]:
# Fit the QDA grid search on the scaled training data.
grid_qda.fit(X=train_sc, y=y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=QuadraticDiscriminantAnalysis(priors=None, reg_param=0.0,
                                                     store_covariance=False,
                                                     tol=0.0001),
             iid='deprecated', n_jobs=None,
             param_grid={'priors': [None], 'reg_param': [0.0],
                         'store_covariance': [False], 'tol': [0.0001]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

Visualising the best parameters for the algorithm

In [102]:
# Display the parameter combination selected by the QDA grid search.
grid_qda.best_params_

{'priors': None, 'reg_param': 0.0, 'store_covariance': False, 'tol': 0.0001}

# Evaluating the QDA model

In [90]:
# Accuracy of the QDA model on the training split.
grid_qda.score(X=train_sc, y=y_train)

0.8356794136310071

In [91]:
# Accuracy of the QDA model on the held-out test split.
grid_qda.score(X=test_sc, y=y_test)

0.8361383894848164

In [94]:
# Class predictions of the QDA model for the test split.
pred_test1 = grid_qda.predict(X=test_sc)

In [101]:
# Per-class precision, recall and F1 for the QDA predictions.
qda_report = classification_report(y_true=y_test, y_pred=pred_test1)
print(qda_report)

              precision    recall  f1-score   support

          No       0.75      1.00      0.86     33017
         Yes       1.00      0.67      0.80     33173

    accuracy                           0.84     66190
   macro avg       0.88      0.84      0.83     66190
weighted avg       0.88      0.84      0.83     66190



# Conclusion

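Since this is a rainfall prediction model, the recall on the "Yes" class (rain tomorrow) is the metric that matters most. The 100% recall reported for the Quadratic Discriminant Analysis algorithm is for the "No" class; its recall on the "Yes" class is only 67%, whereas XGBoost reaches 87% on that class and also has the higher overall accuracy (90% vs. 84%).


From this, we can conclude that XGBoost, rather than Quadratic Discriminant Analysis, is the better model for this dataset.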