In [2]:
# Main imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split,StratifiedKFold,KFold,cross_val_score
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

#Models
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Load the dataset from 'preprocessed.csv' (path is relative to the
# notebook's working directory).
df = pd.read_csv(filepath_or_buffer='preprocessed.csv')

In [4]:
# Separate the label column (Y) from every remaining column (X).
Y = df["RainTomorrow"]
X = df.drop(columns=["RainTomorrow"])

In [5]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

Because the data contains outliers, the features are scaled with RobustScaler.

In [7]:
from sklearn.preprocessing import RobustScaler

# The scaler is fitted on the training split only and then applied to both
# splits, so no information from the test rows enters the scaling statistics.
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

Function to display accuracy and other metrics

In [8]:
def eval_metric(model, X_train, y_train, X_test, y_test):
    """Print evaluation reports for an already-fitted classifier.

    For the test split and then the training split, the confusion matrix and
    the classification report are printed. The test-set accuracy, expressed
    as a percentage, is printed last.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_train, y_train : training features and their true labels.
    X_test, y_test : held-out features and their true labels.

    Returns
    -------
    None. All results are written to stdout.
    """
    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)

    # Both splits share the same report layout; the test split goes first.
    report_sections = (
        ("Test_Set", y_test, test_predictions),
        ("Train_Set", y_train, train_predictions),
    )
    for heading, truth, predicted in report_sections:
        print(heading)
        print("confussion matrix")
        print(confusion_matrix(truth, predicted))
        print(classification_report(truth, predicted))
        print()

    test_accuracy = accuracy_score(y_test, test_predictions)
    print("Accuracy :", test_accuracy * 100, '\n')

Logistic Regression : 84.37%

In [9]:
# Logistic regression baseline. max_iter is set to 1000 -- presumably so the
# solver has enough iterations to converge on this data.
classifier1 = LogisticRegression(max_iter=1000, random_state=0).fit(X_train, y_train)
y_pred = classifier1.predict(X_test)
eval_metric(classifier1, X_train, y_train, X_test, y_test)

Test_Set
confussion matrix
[[21552  1174]
 [ 3371  2995]]
              precision    recall  f1-score   support

           0       0.86      0.95      0.90     22726
           1       0.72      0.47      0.57      6366

    accuracy                           0.84     29092
   macro avg       0.79      0.71      0.74     29092
weighted avg       0.83      0.84      0.83     29092


Train_Set
confussion matrix
[[86025  4832]
 [13342 12169]]
              precision    recall  f1-score   support

           0       0.87      0.95      0.90     90857
           1       0.72      0.48      0.57     25511

    accuracy                           0.84    116368
   macro avg       0.79      0.71      0.74    116368
weighted avg       0.83      0.84      0.83    116368


Accuracy : 84.37714835693662 



In [10]:
# Disabled: the RBF-kernel SVC takes too long to run on a dataset of this size.
# classifier2 = SVC(kernel = 'rbf', random_state = 0)
# classifier2.fit(X_train, y_train)
# y_pred = classifier2.predict(X_test)
# eval_metric(classifier2, X_train, y_train, X_test, y_test)

In [11]:
# Disabled: KNeighborsClassifier takes too long to run on a dataset of this size.
# classifier3 = KNeighborsClassifier(n_neighbors = 2)
# classifier3.fit(X_train, y_train)
# y_pred = classifier3.predict(X_test)
# eval_metric(classifier3, X_train, y_train, X_test, y_test)

Decision Tree : 78.86%

In [12]:
# Single decision tree with default hyper-parameters; the seed is fixed so the
# fitted tree is repeatable.
classifier4 = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
y_pred = classifier4.predict(X_test)
eval_metric(classifier4, X_train, y_train, X_test, y_test)

Test_Set
confussion matrix
[[19569  3157]
 [ 2991  3375]]
              precision    recall  f1-score   support

           0       0.87      0.86      0.86     22726
           1       0.52      0.53      0.52      6366

    accuracy                           0.79     29092
   macro avg       0.69      0.70      0.69     29092
weighted avg       0.79      0.79      0.79     29092


Train_Set
confussion matrix
[[90856     1]
 [    6 25505]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     90857
           1       1.00      1.00      1.00     25511

    accuracy                           1.00    116368
   macro avg       1.00      1.00      1.00    116368
weighted avg       1.00      1.00      1.00    116368


Accuracy : 78.86704248590678 



RandomForest : 85.73%

In [13]:
# Random forest with 500 trees. random_state is fixed (matching the other
# seeded models in this notebook) so that the fitted forest, and therefore the
# metrics printed below, are reproducible when the notebook is re-run from a
# fresh kernel. Without it every run trains a different forest.
classifier5 = RandomForestClassifier(n_estimators=500, random_state=0)
classifier5.fit(X_train, y_train)
y_pred = classifier5.predict(X_test)
eval_metric(classifier5, X_train, y_train, X_test, y_test)

Test_Set
confussion matrix
[[21753   973]
 [ 3178  3188]]
              precision    recall  f1-score   support

           0       0.87      0.96      0.91     22726
           1       0.77      0.50      0.61      6366

    accuracy                           0.86     29092
   macro avg       0.82      0.73      0.76     29092
weighted avg       0.85      0.86      0.85     29092


Train_Set
confussion matrix
[[90856     1]
 [    6 25505]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     90857
           1       1.00      1.00      1.00     25511

    accuracy                           1.00    116368
   macro avg       1.00      1.00      1.00    116368
weighted avg       1.00      1.00      1.00    116368


Accuracy : 85.73147256977863 



Naive Bayes : 80.37%

In [14]:
# Gaussian naive Bayes with default settings.
classifier6 = GaussianNB().fit(X_train, y_train)
y_pred = classifier6.predict(X_test)
eval_metric(classifier6, X_train, y_train, X_test, y_test)

Test_Set
confussion matrix
[[19591  3135]
 [ 2574  3792]]
              precision    recall  f1-score   support

           0       0.88      0.86      0.87     22726
           1       0.55      0.60      0.57      6366

    accuracy                           0.80     29092
   macro avg       0.72      0.73      0.72     29092
weighted avg       0.81      0.80      0.81     29092


Train_Set
confussion matrix
[[77887 12970]
 [10060 15451]]
              precision    recall  f1-score   support

           0       0.89      0.86      0.87     90857
           1       0.54      0.61      0.57     25511

    accuracy                           0.80    116368
   macro avg       0.71      0.73      0.72    116368
weighted avg       0.81      0.80      0.81    116368


Accuracy : 80.37604839818506 



XGBoost : 86.01%

In [22]:
# Hyper-parameters forwarded unchanged to XGBClassifier.
params = dict(
    objective='binary:logistic',
    max_depth=4,
    alpha=10,
    learning_rate=0.5,
    n_estimators=500,
)
classifier7 = XGBClassifier(**params)
classifier7.fit(X_train, y_train)
y_pred = classifier7.predict(X_test)
eval_metric(classifier7, X_train, y_train, X_test, y_test)

Test_Set
confussion matrix
[[21435  1291]
 [ 2779  3587]]
              precision    recall  f1-score   support

           0       0.89      0.94      0.91     22726
           1       0.74      0.56      0.64      6366

    accuracy                           0.86     29092
   macro avg       0.81      0.75      0.78     29092
weighted avg       0.85      0.86      0.85     29092


Train_Set
confussion matrix
[[87633  3224]
 [ 8926 16585]]
              precision    recall  f1-score   support

           0       0.91      0.96      0.94     90857
           1       0.84      0.65      0.73     25511

    accuracy                           0.90    116368
   macro avg       0.87      0.81      0.83    116368
weighted avg       0.89      0.90      0.89    116368


Accuracy : 86.00989962876392 



Gradient Boost : 85.63%

In [17]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with 500 boosting stages. random_state is fixed, as for the
# other seeded models in this notebook, so repeated runs produce the same
# fitted model and the same reported metrics.
classifier8 = GradientBoostingClassifier(n_estimators=500, random_state=0)
classifier8.fit(X_train, y_train)
y_pred = classifier8.predict(X_test)

In [18]:
# Report train/test metrics for the gradient boosting model.
eval_metric(
    model=classifier8,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

Test_Set
confussion matrix
[[21594  1132]
 [ 3046  3320]]
              precision    recall  f1-score   support

           0       0.88      0.95      0.91     22726
           1       0.75      0.52      0.61      6366

    accuracy                           0.86     29092
   macro avg       0.81      0.74      0.76     29092
weighted avg       0.85      0.86      0.85     29092


Train_Set
confussion matrix
[[86676  4181]
 [11635 13876]]
              precision    recall  f1-score   support

           0       0.88      0.95      0.92     90857
           1       0.77      0.54      0.64     25511

    accuracy                           0.86    116368
   macro avg       0.83      0.75      0.78    116368
weighted avg       0.86      0.86      0.86    116368


Accuracy : 85.63866355011686 



Cross Validating RandomForest

In [16]:
# 5-fold stratified cross-validation of the random forest configuration.
# Note: this runs on the full X / Y frames, not on the scaled train split.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
scores = cross_val_score(
    classifier5,
    X,
    Y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)
# Per-fold accuracies followed by their mean as a percentage.
print(scores, scores.mean() * 100)

[0.85906778 0.8580022  0.85494294 0.85776158 0.85738347] 85.7431596315138


Cross Validating XGBoost

In [24]:
# 5-fold stratified cross-validation of the XGBoost configuration.
# Note: this runs on the full X / Y frames, not on the scaled train split.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
scores = cross_val_score(
    classifier7,
    X,
    Y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)
# Per-fold accuracies followed by their mean as a percentage.
print(scores, scores.mean() * 100)

[0.8614052  0.85862093 0.85838031 0.86150832 0.86051148] 86.00852468032448
