<a href="https://colab.research.google.com/github/alexis-anciado/moop/blob/main/Official_codes_for_Lab_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
import re
import sys
assert sys.version_info >= (3, 5)

import sklearn
# Compare the scikit-learn version numerically: a plain string comparison is
# lexicographic (for example "0.9" >= "0.20" evaluates to True).
# The minimum is 1.2 because this notebook calls
# BaggingClassifier(estimator=...); releases before 1.2 spell that keyword
# `base_estimator`.
_sklearn_version = tuple(
    int(part) for part in re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups()
)
assert _sklearn_version >= (1, 2), (
    "scikit-learn >= 1.2 is required, found " + sklearn.__version__
)

import numpy as np
import os
import pandas as pd

# Seed NumPy's global generator so runs are repeatable.
np.random.seed(42)

from sklearn.ensemble import BaggingClassifier, VotingClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

import warnings
# NOTE(review): this silences every warning, including scikit-learn's
# undefined-metric warnings raised when a class is never predicted.
warnings.filterwarnings("ignore")

## Data Preprocessing

In [2]:
# Path of the Seattle weather CSV (presumably the Colab sample-data folder).
DATA_PATH = "/content/sample_data/seattle-weather.csv"

dataset = pd.read_csv(DATA_PATH)

# Summarise column names, dtypes and non-null counts.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1461 entries, 0 to 1460
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           1461 non-null   object 
 1   precipitation  1461 non-null   float64
 2   temp_max       1461 non-null   float64
 3   temp_min       1461 non-null   float64
 4   wind           1461 non-null   float64
 5   weather        1461 non-null   object 
dtypes: float64(4), object(2)
memory usage: 68.6+ KB


In [3]:
# Flag rows that repeat an earlier row in every column, then count them.
duplicate_mask = dataset.duplicated()
duplicate_mask.sum()

0

In [4]:
# Distinct weather categories, in order of first appearance in the data.
weather_column = dataset['weather']
weather_column.unique()

array(['drizzle', 'rain', 'sun', 'snow', 'fog'], dtype=object)

In [5]:
from sklearn.preprocessing import LabelEncoder

# Learn the weather categories, then store their integer codes in a new
# `weather_label` column next to the original text column.
LE = LabelEncoder()
LE.fit(dataset['weather'])
dataset['weather_label'] = LE.transform(dataset['weather'])

# Preview the first rows, including the new column.
dataset.head(n=20)

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather,weather_label
0,2012-01-01,0.0,12.8,5.0,4.7,drizzle,0
1,2012-01-02,10.9,10.6,2.8,4.5,rain,2
2,2012-01-03,0.8,11.7,7.2,2.3,rain,2
3,2012-01-04,20.3,12.2,5.6,4.7,rain,2
4,2012-01-05,1.3,8.9,2.8,6.1,rain,2
5,2012-01-06,2.5,4.4,2.2,2.2,rain,2
6,2012-01-07,0.0,7.2,2.8,2.3,rain,2
7,2012-01-08,0.0,10.0,2.8,2.0,sun,4
8,2012-01-09,4.3,9.4,5.0,3.4,rain,2
9,2012-01-10,1.0,6.1,0.6,3.4,rain,2


In [6]:
# Map each integer label back to its weather name. Keys are inserted in the
# order the labels first appear in the data.
weather_dictionary = {
    label: name
    for label, name in zip(dataset['weather_label'], dataset['weather'])
}
weather_dictionary

{0: 'drizzle', 2: 'rain', 4: 'sun', 3: 'snow', 1: 'fog'}

In [7]:
def date_time(dataset):
    """Return a copy of ``dataset`` with calendar columns derived from ``date``.

    The ``date`` column is parsed with ``pd.to_datetime`` and three integer
    columns are added: ``year``, ``month`` and ``day``.

    The input frame is left untouched. The work is done on a copy so that
    frames displayed or reused elsewhere are not silently modified.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing a ``date`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``date`` converted to datetime and the ``year``,
        ``month`` and ``day`` columns appended.

    Raises
    ------
    KeyError
        If ``dataset`` has no ``date`` column.
    """
    result = dataset.copy()

    result['date'] = pd.to_datetime(result['date'])
    result['year'] = result['date'].dt.year
    result['month'] = result['date'].dt.month
    result['day'] = result['date'].dt.day

    return result

# Parse the dates and add the year / month / day columns.
dataset_final = date_time(dataset)
# Display the resulting frame.
dataset_final

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather,weather_label,year,month,day
0,2012-01-01,0.0,12.8,5.0,4.7,drizzle,0,2012,1,1
1,2012-01-02,10.9,10.6,2.8,4.5,rain,2,2012,1,2
2,2012-01-03,0.8,11.7,7.2,2.3,rain,2,2012,1,3
3,2012-01-04,20.3,12.2,5.6,4.7,rain,2,2012,1,4
4,2012-01-05,1.3,8.9,2.8,6.1,rain,2,2012,1,5
...,...,...,...,...,...,...,...,...,...,...
1456,2015-12-27,8.6,4.4,1.7,2.9,rain,2,2015,12,27
1457,2015-12-28,1.5,5.0,1.7,1.3,rain,2,2015,12,28
1458,2015-12-29,0.0,7.2,0.6,2.6,fog,1,2015,12,29
1459,2015-12-30,0.0,5.6,-1.0,3.4,sun,4,2015,12,30


In [8]:
# Rebuild from `dataset` instead of re-assigning `dataset_final` from itself,
# so the cell can be re-run safely. The self-referential form raises KeyError
# on a second run because `weather` and `date` are already gone.
# The text `weather` column is dropped because `weather_label` carries the
# same information as integers; `date` becomes the index.
dataset_final = (
    date_time(dataset)
    .drop(columns=['weather'])
    .set_index('date')
)
dataset_final

Unnamed: 0_level_0,precipitation,temp_max,temp_min,wind,weather_label,year,month,day
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2012-01-01,0.0,12.8,5.0,4.7,0,2012,1,1
2012-01-02,10.9,10.6,2.8,4.5,2,2012,1,2
2012-01-03,0.8,11.7,7.2,2.3,2,2012,1,3
2012-01-04,20.3,12.2,5.6,4.7,2,2012,1,4
2012-01-05,1.3,8.9,2.8,6.1,2,2012,1,5
...,...,...,...,...,...,...,...,...
2015-12-27,8.6,4.4,1.7,2.9,2,2015,12,27
2015-12-28,1.5,5.0,1.7,1.3,2,2015,12,28
2015-12-29,0.0,7.2,0.6,2.6,1,2015,12,29
2015-12-30,0.0,5.6,-1.0,3.4,4,2015,12,30


In [9]:
# Features are every column except the encoded target.
X = dataset_final.drop(columns=['weather_label'])
y = dataset_final['weather_label']

k = 20
kfold = KFold(n_splits=k, shuffle=True, random_state=42)

scaler = StandardScaler()
# NOTE(review): nothing in the visible notebook appends to this list.
accuracy_scores = []

# Only the final fold produced by the splitter is kept. A loop that merely
# re-binds these names on every iteration ends in the same state, so the
# selection is made explicit here and the hold-out split that the loop
# immediately overwrote is dropped.
# NOTE(review): the later cells fit on X_train and score on X_test, so every
# reported metric comes from this single fold (about 1/k of the data), not
# from an average over the k folds. Use cross_val_score with `kfold` for a
# true k-fold estimate.
train_index, test_index = list(kfold.split(X))[-1]
X__train, X__test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Fit the scaler on the training part only, then apply it to both parts.
X_train = scaler.fit_transform(X__train)
X_test = scaler.transform(X__test)

## Preliminaries

### Without Bagging and K-Fold

In [10]:
# Plain 80/20 hold-out split of the unscaled features for the baseline models.
holdout_split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_, X_test_, y_train_, y_test_ = holdout_split

In [11]:
# Baseline learners, trained on the hold-out split without bagging.
NBModel_clf = GaussianNB(var_smoothing=1e-09)
SVMModel_clf = SVC(kernel='linear', probability=True)

# Majority-vote ensemble over the two baseline learners.
baseline_estimators = [('NB Model', NBModel_clf), ('SVM Model', SVMModel_clf)]
HardVoting_clf = VotingClassifier(estimators=baseline_estimators, voting='hard')
HardVoting_clf.fit(X_train_, y_train_)

In [12]:
# Fit each baseline learner on the hold-out training part and report its
# accuracy on the hold-out test part.
for clf in (NBModel_clf, SVMModel_clf):
    y_pred_ = clf.fit(X_train_, y_train_).predict(X_test_)
    print(type(clf).__name__, accuracy_score(y_test_, y_pred_))

GaussianNB 0.8430034129692833
SVC 0.8327645051194539


### With K-Fold

In [13]:
# Same two learners as the baseline, now trained on the scaled fold data.
NB_Model_clf = GaussianNB(var_smoothing=1e-09)
SVM_Model_clf = SVC(kernel='linear', probability=True)

# Majority-vote ensemble over the two learners.
fold_estimators = [('NB Model', NB_Model_clf), ('SVM Model', SVM_Model_clf)]
Hard_Voting_clf = VotingClassifier(estimators=fold_estimators, voting='hard')
Hard_Voting_clf.fit(X_train, y_train)

In [14]:
# Naive Bayes: fit on the scaled training fold, score on the held-out fold.
y_pred__NB = NB_Model_clf.fit(X_train, y_train).predict(X_test)
print('NB Model:', accuracy_score(y_test, y_pred__NB))

# Linear SVM: same procedure.
y_pred__SVM = SVM_Model_clf.fit(X_train, y_train).predict(X_test)
print('SVM Model:', accuracy_score(y_test, y_pred__SVM))

NB Model: 0.9041095890410958
SVM Model: 0.821917808219178


# Model

In [15]:
# Base learners that the bagging wrappers train on resampled data.
nb = GaussianNB(var_smoothing=1e-09)
svm = SVC(C=100, kernel='linear', gamma='scale')

# Bagged versions: ten Naive Bayes members, a single SVM member.
NBModel = BaggingClassifier(estimator=nb, n_estimators=10, random_state=42)
SVMModel = BaggingClassifier(estimator=svm, n_estimators=1, random_state=42)

# Majority-vote ensemble over the two bagged models.
bagged_estimators = [('NB Model', NBModel), ('SVM Model', SVMModel)]
HardVoting = VotingClassifier(estimators=bagged_estimators, voting='hard')
HardVoting.fit(X_train, y_train)

In [16]:
# Bagged Naive Bayes: fit, predict on the held-out fold, report accuracy.
y_pred_NB = NBModel.fit(X_train, y_train).predict(X_test)
print('NB Model:', accuracy_score(y_test, y_pred_NB))

# Bagged SVM: same procedure.
y_pred_SVM = SVMModel.fit(X_train, y_train).predict(X_test)
print('SVM Model:', accuracy_score(y_test, y_pred_SVM))

# The voting ensemble is not refitted here; it is used as already fitted.
y_pred_HV = HardVoting.predict(X_test)
print('Hard Voting Score:', accuracy_score(y_test, y_pred_HV))

NB Model: 0.9178082191780822
SVM Model: 0.9041095890410958
Hard Voting Score: 0.9315068493150684


# Evaluation

In [17]:
# Summary metrics for the bagged Naive Bayes predictions, as percentages.
# Precision, recall and F1 are support-weighted averages over the classes.
print("Naive Bayes Model")
print("Accuracy:", accuracy_score(y_test, y_pred_NB)*100)
for metric_name, metric_fn in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_name, metric_fn(y_test, y_pred_NB, average='weighted')*100)

Naive Bayes Model
Accuracy: 91.78082191780823
Precision: 92.44979623901408
Recall: 91.78082191780823
F1 Score: 91.06099444715088


In [18]:
# Per-class precision / recall / F1 table for the bagged Naive Bayes model.
nb_report = classification_report(y_test, y_pred_NB)
print("                 NAIVE BAYES CLASSIFICATION REPORT", nb_report, sep="\n")

                 NAIVE BAYES CLASSIFICATION REPORT
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      0.40      0.57         5
           2       0.97      0.95      0.96        37
           3       0.50      0.50      0.50         2
           4       0.87      1.00      0.93        27

    accuracy                           0.92        73
   macro avg       0.87      0.77      0.79        73
weighted avg       0.92      0.92      0.91        73



In [19]:
# Summary metrics for the bagged SVM predictions, as percentages.
# Precision, recall and F1 are support-weighted averages over the classes.
print("Support Vector Machine")
print("Accuracy:", accuracy_score(y_test, y_pred_SVM)*100)
for metric_name, metric_fn in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_name, metric_fn(y_test, y_pred_SVM, average='weighted')*100)

Support Vector Machine
Accuracy: 90.41095890410958
Precision: 85.05603985056041
Recall: 90.41095890410958
F1 Score: 87.16894977168951


In [20]:
# Per-class precision / recall / F1 table for the bagged SVM model.
svm_report = classification_report(y_test, y_pred_SVM)
print("           SUPPORT VECTOR MACHINE CLASSIFICATION REPORT", svm_report, sep="\n")

           SUPPORT VECTOR MACHINE CLASSIFICATION REPORT
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       0.00      0.00      0.00         5
           2       0.97      0.97      0.97        37
           3       1.00      0.50      0.67         2
           4       0.82      1.00      0.90        27

    accuracy                           0.90        73
   macro avg       0.76      0.69      0.71        73
weighted avg       0.85      0.90      0.87        73



In [21]:
# Summary metrics for the hard-voting ensemble predictions, as percentages.
# Precision, recall and F1 are support-weighted averages over the classes.
print("Hard Voting Ensemble Model")
print("Accuracy:", accuracy_score(y_test, y_pred_HV)*100)
for metric_name, metric_fn in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_name, metric_fn(y_test, y_pred_HV, average='weighted')*100)

Hard Voting Ensemble Model
Accuracy: 93.15068493150685
Precision: 93.85771100309324
Recall: 93.15068493150685
F1 Score: 92.2306948287109


In [22]:
# Per-class precision / recall / F1 table for the hard-voting ensemble.
hv_report = classification_report(y_test, y_pred_HV)
print("           HARD VOTING ENSEMBLE CLASSIFICATION REPORT", hv_report, sep="\n")

           HARD VOTING ENSEMBLE CLASSIFICATION REPORT
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      0.40      0.57         5
           2       0.97      0.97      0.97        37
           3       1.00      0.50      0.67         2
           4       0.87      1.00      0.93        27

    accuracy                           0.93        73
   macro avg       0.97      0.77      0.83        73
weighted avg       0.94      0.93      0.92        73

