In [None]:
import pickle
import pandas as pd
import numpy as np
import csv
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.svm import SVC

# Path of the CSV holding the rain dataset.
FilePath = '/content/balanced_dataset.csv'

DF = pd.read_csv(FilePath, delimiter=',')

# Column index of the raw frame, captured before the target column is dropped.
columns = DF.columns

# Drop rows whose RainToday value is missing.
DF = DF.dropna(subset=['RainToday'])
c = 'RainToday'

# Split the target from the features. `y` is kept as a one-column DataFrame
# because the later cells were written against that shape.
y = DF[['RainTomorrow']]
DF = DF.drop(columns=['RainTomorrow'])

# Feature columns grouped by dtype (computed after the target was removed).
numerical_col = [i for i in DF.columns if DF[i].dtype in ('float64', 'int64')]
objects_col = [i for i in DF.columns if DF[i].dtype == 'object']

# Checking class balance of the target.
# Count matching rows on the column itself: boolean-masking the whole
# DataFrame (y[y == i]) keeps every row and only blanks non-matching cells,
# so len() of it always reported the total row count for both classes.
for i in [0,1]:
  print(f'Rain Tomorrow {i} :', (y['RainTomorrow'] == i).sum())

# Preprocessing based on the training data
def pre_process(X,y):
    """Split the data, then impute, one-hot encode and min-max scale the features.

    Every statistic used for preprocessing (medians, modes, dummy columns,
    scaling range) is derived from the training split only and then applied
    to the test split.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame. Must contain the columns listed in the module-level
        ``numerical_col`` as well as 'Location', 'WindGustDir', 'WindDir9am',
        'WindDir3pm' and 'RainToday'.
    y : pandas.DataFrame or pandas.Series
        Target aligned with ``X``; also used to stratify the split.

    Returns
    -------
    X_train, X_test : numpy.ndarray
        Scaled feature matrices with identical column layout.
    y_train, y_test
        Target splits, same type as ``y``.
    """
    mode_filled_cols = ('WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday')
    one_hot_cols = ('Location',) + mode_filled_cols

    def _encode(frame):
        """Return the numeric columns followed by one-hot blocks for each categorical column."""
        dummy_blocks = [pd.get_dummies(frame[col], prefix=col) for col in one_hot_cols]
        return pd.concat([frame[numerical_col]] + dummy_blocks, axis=1)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=0)

    # Median imputation for numeric columns. numeric_only=True is required on
    # pandas >= 2.0, where median() raises on object columns instead of
    # silently skipping them.
    train_medians = X_train.median(numeric_only=True)
    X_train = X_train.fillna(train_medians)
    X_test = X_test.fillna(train_medians)

    # Mode imputation for categorical columns. Assign the result back rather
    # than calling fillna(inplace=True) on a column selection: that chained
    # form warns on pandas 2.x and does nothing under Copy-on-Write.
    for col in mode_filled_cols:
        train_mode = X_train[col].mode()[0]
        X_train[col] = X_train[col].fillna(train_mode)
        X_test[col] = X_test[col].fillna(train_mode)

    X_train = _encode(X_train)
    # Align the test columns to the training layout: a category missing from
    # the test split becomes an all-zero column, and one unseen in training is
    # dropped, so the scaler always receives the columns it was fitted on.
    X_test = _encode(X_test).reindex(columns=X_train.columns, fill_value=0)

    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test, y_train, y_test

Rain Tomorrow 0 : 62402
Rain Tomorrow 1 : 62402


In [None]:
X_train, X_test, y_train, y_test = pre_process(DF,y)
# training Gradient Boosting
clf = GradientBoostingClassifier(learning_rate=0.09,n_estimators=300,max_depth=6,random_state=1234)
# y_train is a one-column DataFrame; flatten it to 1-D so scikit-learn does not
# emit a DataConversionWarning about a column-vector y.
clf.fit(X_train, np.ravel(y_train))

# getting probabilities (second column of predict_proba, i.e. clf.classes_[1])
y_pred_test = clf.predict_proba(X_test)[:,1]
y_pred_test

  y = column_or_1d(y, warn=True)


array([0.58979045, 0.12651289, 0.10632526, ..., 0.0664689 , 0.59426769,
       0.80630219])

In [None]:
# Turn the predicted probabilities into hard 0/1 labels:
# strictly above the cut-off -> 1, otherwise 0.
thre = 0.5
above_cutoff = y_pred_test > thre
y_pred_test = above_cutoff.astype(int)
print("threshold =",thre)

threshold = 0.5


In [None]:
# predicting values
# Train labels come from clf.predict, test labels from the thresholded
# probabilities computed in the previous cell.
y_pred_train=clf.predict(X_train)
test_accu = accuracy_score(y_test, y_pred_test)
train_acc = accuracy_score(y_train,y_pred_train)

print('Model test accuracy score : {0:0.4f}'. format(test_accu))
print('Model train accuracy score : {0:0.4f}'. format(train_acc))

#confusion matrix (rows = actual class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred_test)
cm_matrix = pd.DataFrame(data=cm, columns=['Predict Negative:0', 'Predict Positive:1'], 
                              index=['Actual Negative:0', 'Actual positive:1'])
print(cm_matrix)

# Number of test samples per class.
v_c=y_test.value_counts()

# null Accuracy: accuracy of always predicting the most frequent class.
# Use max()/sum() so the result is a plain scalar and does not assume that
# class 0 is the majority; indexing v_c[0] on the value counts of a
# one-column DataFrame returned a Series instead of a number.
null_accuracy = v_c.max() / v_c.sum()

print("Null accuracy",null_accuracy)
# pickle.dump(clf,open("Boosting-trees.model",'wb'))

Model test accuracy score : 0.8032
Model train accuracy score : 0.8725
                   Predict Negative:0  Predict Positive:1
Actual Negative:0                4997                1244
Actual positive:1                1212                5028
Null accuracy RainTomorrow
0               0.50004
dtype: float64
