In [16]:
#Importing libraries
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
#Reading training data
weather=pd.read_csv('Weather Training Data.csv')
weather.head(10)

In [2]:
#Information about data
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99516 entries, 0 to 99515
Data columns (total 23 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   row ID         99516 non-null  object 
 1   Location       99516 non-null  object 
 2   MinTemp        99073 non-null  float64
 3   MaxTemp        99286 non-null  float64
 4   Rainfall       98537 non-null  float64
 5   Evaporation    56985 non-null  float64
 6   Sunshine       52199 non-null  float64
 7   WindGustDir    92995 non-null  object 
 8   WindGustSpeed  93036 non-null  float64
 9   WindDir9am     92510 non-null  object 
 10  WindDir3pm     96868 non-null  object 
 11  WindSpeed9am   98581 non-null  float64
 12  WindSpeed3pm   97681 non-null  float64
 13  Humidity9am    98283 non-null  float64
 14  Humidity3pm    97010 non-null  float64
 15  Pressure9am    89768 non-null  float64
 16  Pressure3pm    89780 non-null  float64
 17  Cloud9am       61944 non-null  float64
 18  Cloud3

In [3]:
#verifying percentage of missing values
weather.isnull().mean()*100

row ID            0.000000
Location          0.000000
MinTemp           0.445155
MaxTemp           0.231119
Rainfall          0.983761
Evaporation      42.737851
Sunshine         47.547128
WindGustDir       6.552715
WindGustSpeed     6.511516
WindDir9am        7.040074
WindDir3pm        2.660879
WindSpeed9am      0.939547
WindSpeed3pm      1.843925
Humidity9am       1.238997
Humidity3pm       2.518188
Pressure9am       9.795410
Pressure3pm       9.783351
Cloud9am         37.754733
Cloud3pm         40.196551
Temp9am           0.616986
Temp3pm           1.913260
RainToday         0.983761
RainTomorrow      0.000000
dtype: float64

In [4]:
#removing features with missing values more than 30% and irrelevant features
weather=weather.drop(['row ID','Location','Evaporation','Sunshine','Cloud9am','Cloud3pm'],axis=1)

In [5]:
#fill missing continous features
continous_columns = weather.select_dtypes(include=['float64','int64']).drop(['RainTomorrow'],axis=1).columns
for c in continous_columns:
    mean = weather[c].mean()
    weather[c]=weather[c].fillna(mean)

#fill missing categorical features
categorical_columns = weather.select_dtypes(include=['object']).columns
for c in categorical_columns:
    most_frequent_value = weather[c].mode()[0]
    weather[c]=weather[c].fillna(most_frequent_value)
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99516 entries, 0 to 99515
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MinTemp        99516 non-null  float64
 1   MaxTemp        99516 non-null  float64
 2   Rainfall       99516 non-null  float64
 3   WindGustDir    99516 non-null  object 
 4   WindGustSpeed  99516 non-null  float64
 5   WindDir9am     99516 non-null  object 
 6   WindDir3pm     99516 non-null  object 
 7   WindSpeed9am   99516 non-null  float64
 8   WindSpeed3pm   99516 non-null  float64
 9   Humidity9am    99516 non-null  float64
 10  Humidity3pm    99516 non-null  float64
 11  Pressure9am    99516 non-null  float64
 12  Pressure3pm    99516 non-null  float64
 13  Temp9am        99516 non-null  float64
 14  Temp3pm        99516 non-null  float64
 15  RainToday      99516 non-null  object 
 16  RainTomorrow   99516 non-null  int64  
dtypes: float64(12), int64(1), object(4)
memory usage: 

In [6]:
#Normalization continous features
scaler = StandardScaler()
weather[continous_columns] = scaler.fit_transform(weather[continous_columns])

#Convert categorical features to continous

label_encoders={}
for c in categorical_columns:
    label_encoders[c] = LabelEncoder()
    weather[c]=label_encoders[c].fit_transform(weather[c])

weather.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,0.19191,-0.044818,-0.207557,13,0.306357,13,14,0.676626,0.613461,0.112555,-1.434774,-1.478566,-1.223345,-0.010827,0.017285,0,0
1,-0.749029,0.264744,-0.278597,14,0.306357,6,15,-1.12917,0.38411,-1.311772,-1.288534,-1.049122,-1.118737,0.035548,0.38145,0,0
2,0.834885,1.277857,-0.160198,13,0.077905,1,7,-0.790583,0.154759,0.692836,-0.898561,-1.019505,-1.387729,0.1283,1.168047,0,0
3,0.380098,0.912011,-0.254917,14,1.220168,13,13,0.563764,0.613461,-0.731491,-1.386028,-1.25644,-1.477393,0.561142,1.051514,0,0
4,-0.701982,0.48988,-0.278597,13,-0.379,10,13,-0.903445,-0.189268,-1.100761,-1.581014,-0.634487,-0.775026,-0.103579,0.556249,0,0


In [7]:
#Divide features=X and target=Y, using 80 % of data for train and 20 % for train
X=weather.drop(['RainTomorrow'],axis=1)
Y=weather['RainTomorrow']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=.2)

In [12]:
#=========
#KNN Model
clf = KNeighborsClassifier(n_neighbors = 5)
#Training
clf.fit(X_train,y_train)
#Testing accuracy KNN
test_predictions = clf.predict(X_test)
from sklearn.metrics import accuracy_score
score_knn=accuracy_score(y_test, test_predictions)
accuracy_knn = round(score_knn*100, 2)
print("The classification accuracy of KNN model is "+ str(accuracy_knn)+"%")

The classification accuracy of KNN model is 82.63%


In [14]:
#=======
#Decision Tree Model
dtree_model = DecisionTreeClassifier()
#Training Model
dtree_model.fit(X_train, y_train)
#Testing accuracy
dtree_model.predict(X_test)
dtree_model_score = dtree_model.score(X_test, y_test)
accuracy_dtree = round(dtree_model_score*100, 2)
print("The classification accuracy of Decision Tree model is "+ str(accuracy_dtree)+"%")

The classification accuracy of Decision Tree model is 77.69%


In [17]:
#======
#Random Forest Model
rforest_model = RandomForestClassifier()
#Training
rforest_model.fit(X_train, y_train)
#Testing
rforest_model.predict(X_test)
rforest_model_score = rforest_model.score(X_test, y_test)
accuracy_rforest = round(rforest_model_score*100, 2)
print("The classification accuracy of Random Forest model is "+ str(accuracy_rforest)+"%")

The classification accuracy of Random Forest model is 84.82%
