## 1. Importing the necessary libraries

In [121]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix, accuracy_score

## 2. Importing the dataset

In [122]:
df = pd.read_csv('https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillUp/labs/ML-FinalAssignment/Weather_Data.csv')

df.head()

Unnamed: 0,Date,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2/1/2008,19.5,22.4,15.6,6.2,0.0,W,41,S,SSW,...,92,84,1017.6,1017.4,8,8,20.7,20.9,Yes,Yes
1,2/2/2008,19.5,25.6,6.0,3.4,2.7,W,41,W,E,...,83,73,1017.9,1016.4,7,7,22.4,24.8,Yes,Yes
2,2/3/2008,21.6,24.5,6.6,2.4,0.1,W,41,ESE,ESE,...,88,86,1016.7,1015.6,7,8,23.5,23.0,Yes,Yes
3,2/4/2008,20.2,22.8,18.8,2.2,0.0,W,41,NNE,E,...,83,90,1014.2,1011.8,8,8,21.4,20.9,Yes,Yes
4,2/5/2008,19.7,25.7,77.4,4.8,0.0,W,41,NNE,W,...,88,74,1008.3,1004.8,8,8,22.5,25.5,Yes,Yes


In [123]:
df.isna().sum()

Date             0
MinTemp          0
MaxTemp          0
Rainfall         0
Evaporation      0
Sunshine         0
WindGustDir      0
WindGustSpeed    0
WindDir9am       0
WindDir3pm       0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Cloud9am         0
Cloud3pm         0
Temp9am          0
Temp3pm          0
RainToday        0
RainTomorrow     0
dtype: int64

In [124]:
df_sydney_processed=pd.get_dummies(data=df, columns=['RainToday', 'WindGustDir', 'WindDir9am', 'WindDir3pm'])
df_sydney_processed.dtypes

Date               object
MinTemp           float64
MaxTemp           float64
Rainfall          float64
Evaporation       float64
                   ...   
WindDir3pm_SSW      uint8
WindDir3pm_SW       uint8
WindDir3pm_W        uint8
WindDir3pm_WNW      uint8
WindDir3pm_WSW      uint8
Length: 68, dtype: object

In [125]:
df_sydney_processed.replace(['No','Yes'],[0,1],inplace=True)

In [126]:
df_sydney_processed.drop('Date',axis=1,inplace=True)

In [127]:
df_sydney_processed = df_sydney_processed.astype(float)
df_sydney_processed.dtypes

MinTemp           float64
MaxTemp           float64
Rainfall          float64
Evaporation       float64
Sunshine          float64
                   ...   
WindDir3pm_SSW    float64
WindDir3pm_SW     float64
WindDir3pm_W      float64
WindDir3pm_WNW    float64
WindDir3pm_WSW    float64
Length: 67, dtype: object

## 3. Training Set and Test Set

In [128]:
features=df_sydney_processed.drop('RainTomorrow',axis=1)
Y=df_sydney_processed['RainTomorrow']

In [129]:
x_train, x_test, y_train, y_test = train_test_split(features,Y,test_size=0.2,random_state=10)
print("X_train: ",x_train.shape)
print("X_test: ",x_test.shape)
print("Y_train: ",y_train.shape)
print("Y_test: ",y_test.shape)


X_train:  (2616, 66)
X_test:  (655, 66)
Y_train:  (2616,)
Y_test:  (655,)


## 4. Linear Regression

In [130]:
LinearReg = linear_model.LinearRegression()

In [131]:
LinearReg.fit(x_train,y_train)

LinearRegression()

In [132]:
predictions=LinearReg.predict(x_test)

In [133]:
from sklearn.metrics import r2_score

In [134]:
LinearRegression_MAE = np.mean(np.absolute(predictions - y_test))
LinearRegression_MSE = np.mean((predictions - y_test) ** 2)
LinearRegression_R2 = r2_score(y_test , predictions)

In [135]:
Report = {'Metrics':["MAE","MSE","R2"],'Results':[LinearRegression_MAE,LinearRegression_MSE,LinearRegression_R2]}
pd.DataFrame(Report)

Unnamed: 0,Metrics,Results
0,MAE,0.25632
1,MSE,0.115722
2,R2,0.427125


## 5. KNN 

In [136]:
KNN= KNeighborsClassifier(n_neighbors = 4).fit(x_train,y_train)

In [137]:
predictions=KNN.predict(x_test)

In [138]:
KNN_Accuracy_Score = metrics.accuracy_score(y_test, predictions)
KNN_JaccardIndex = jaccard_score(y_test, predictions)
KNN_F1_Score = f1_score(y_test, predictions, average='weighted')

In [139]:
Report = {'Metrics':["KNN_Accuracy_Score","KNN_JaccardIndex","KNN_F1_Score"],'Results':[KNN_Accuracy_Score,KNN_JaccardIndex,KNN_F1_Score]}
pd.DataFrame(Report)

Unnamed: 0,Metrics,Results
0,KNN_Accuracy_Score,0.818321
1,KNN_JaccardIndex,0.425121
2,KNN_F1_Score,0.802375


## 6. Decision tree

In [140]:
Tree = DecisionTreeClassifier(max_depth=4, random_state=35)
Tree.fit(x_train,y_train)

DecisionTreeClassifier(max_depth=4, random_state=35)

In [141]:
predictions= Tree.predict(x_test)

In [142]:
Tree_Accuracy_Score = metrics.accuracy_score(y_test, predictions)
Tree_JaccardIndex = jaccard_score(y_test, predictions)
Tree_F1_Score = f1_score(y_test, predictions, average='weighted')

In [143]:
Report = {'Metrics':["Tree_Accuracy_Score","Tree_JaccardIndex","Tree_F1_Score"],'Results':[Tree_Accuracy_Score,Tree_JaccardIndex,Tree_F1_Score]}
pd.DataFrame(Report)

Unnamed: 0,Metrics,Results
0,Tree_Accuracy_Score,0.819847
1,Tree_JaccardIndex,0.470852
2,Tree_F1_Score,0.812532


## 7. Logistic Regression

In [144]:
x_train, x_test, y_train, y_test = train_test_split(features,Y,test_size=0.2,random_state=1)

In [145]:
LR=LogisticRegression(C=0.01, solver='liblinear').fit(x_train,y_train)

In [146]:
predictions = LR.predict(x_test)

In [147]:
LR_Accuracy_Score = metrics.accuracy_score(y_test, predictions)
LR_JaccardIndex = jaccard_score(y_test, predictions)
LR_F1_Score = f1_score(y_test, predictions, average='weighted')

In [148]:
yhat_prob=LR.predict_proba(x_test)
LR_Log_Loss = log_loss(y_test, yhat_prob)

In [149]:
Report = {'Metrics':["LR_Accuracy_Score","LR_JaccardIndex","LR_F1_Score","LR_Log_Loss"],'Results':[LR_Accuracy_Score,LR_JaccardIndex,LR_F1_Score,LR_Log_Loss]}
pd.DataFrame(Report)

Unnamed: 0,Metrics,Results
0,LR_Accuracy_Score,0.827481
1,LR_JaccardIndex,0.484018
2,LR_F1_Score,0.820545
3,LR_Log_Loss,0.380085


## 8. SVM

In [150]:
SVM = svm.SVC(kernel='rbf')
SVM.fit(x_train,y_train)

SVC()

In [151]:
predictions = SVM.predict(x_test)

In [152]:
SVM_Accuracy_Score = metrics.accuracy_score(y_test, predictions)
SVM_JaccardIndex = jaccard_score(y_test, predictions,average='macro')
SVM_F1_Score = f1_score(y_test, predictions, average='weighted')

In [153]:
Report = {'Metrics':["SVM_Accuracy_Score","SVM_JaccardIndex","SVM_F1_Score"],'Results':[SVM_Accuracy_Score,SVM_JaccardIndex,SVM_F1_Score]}
pd.DataFrame(Report)

Unnamed: 0,Metrics,Results
0,SVM_Accuracy_Score,0.722137
1,SVM_JaccardIndex,0.361069
2,SVM_F1_Score,0.605622


## 9. Final Report

In [154]:
Final_Report = {'Metrics':["LinearRegression_MAE","LinearRegression_MSE","LinearRegression_R2","KNN_Accuracy_Score","KNN_JaccardIndex","KNN_F1_Score","Tree_Accuracy_Score","Tree_JaccardIndex","Tree_F1_Score","LR_Accuracy_Score","LR_JaccardIndex","LR_F1_Score","LR_Log_Loss","SVM_Accuracy_Score","SVM_JaccardIndex","SVM_F1_Score"],'Results':[LinearRegression_MAE,LinearRegression_MSE,LinearRegression_R2,KNN_Accuracy_Score,KNN_JaccardIndex,KNN_F1_Score,Tree_Accuracy_Score,Tree_JaccardIndex,Tree_F1_Score,LR_Accuracy_Score,LR_JaccardIndex,LR_F1_Score,LR_Log_Loss,SVM_Accuracy_Score,SVM_JaccardIndex,SVM_F1_Score]}
final_Report_df=pd.DataFrame(Final_Report)

In [155]:
final_Report_df.head(5)

Unnamed: 0,Metrics,Results
0,LinearRegression_MAE,0.25632
1,LinearRegression_MSE,0.115722
2,LinearRegression_R2,0.427125
3,KNN_Accuracy_Score,0.818321
4,KNN_JaccardIndex,0.425121


In [156]:
final_Report_df[5:10]

Unnamed: 0,Metrics,Results
5,KNN_F1_Score,0.802375
6,Tree_Accuracy_Score,0.819847
7,Tree_JaccardIndex,0.470852
8,Tree_F1_Score,0.812532
9,LR_Accuracy_Score,0.827481


In [157]:
final_Report_df[10:16]

Unnamed: 0,Metrics,Results
10,LR_JaccardIndex,0.484018
11,LR_F1_Score,0.820545
12,LR_Log_Loss,0.380085
13,SVM_Accuracy_Score,0.722137
14,SVM_JaccardIndex,0.361069
15,SVM_F1_Score,0.605622
