## CS74 Homework 2
Professor VS Subrahmanian  
November 3, 2020  
Angela Zhang

### Resources Consulted
* [Scikit Learn DecisionTreeClassifier](https://scikit-learn.org/stable/modules/tree.html)
* [Scikit Learn GaussianNB](https://scikit-learn.org/stable/modules/naive_bayes.html)
* [Scikit Learn Linear SVM](https://scikit-learn.org/stable/modules/svm.html)
* [Categorical Encoding](https://pbpython.com/categorical-encoding.html)

In [99]:
# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as ply

# decision trees, naive bayes, linear SVM
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC

# preprocessing data
from sklearn.preprocessing import OrdinalEncoder, normalize, MinMaxScaler
from sklearn.impute import SimpleImputer

# splitting data
from sklearn.model_selection import train_test_split

# SMOTE
from imblearn.over_sampling import SMOTE

# quality of classifiers
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, f1_score

import math

## CSV Cleanup

In [116]:
# cleandata(csv_file) -> X, y for testing

def cleandata(csv_file):
    df = pd.read_csv(csv_file) # importing dataset
    
    df = df.drop(columns=['WindGustDir']) # dropping WindGustDir because of NaN
    
    # change RainToday and RainTomorrow to 0 and 1 for no and yes
    noyes_d = {'No': -1, 'Yes': 1}
    df['RainTomorrow'] = df['RainTomorrow'].map(noyes_d).fillna(df['RainTomorrow'])
    df['RainToday'] = df['RainToday'].map(noyes_d).fillna(df['RainToday'])
    
    wind_dict = {
        "0": 0,
        "E": 1,
        "ENE": 2,
        "ESE": 3,
        "N": 4,
        "NE": 5,
        "NNE": 6,
        "NNW": 7,
        "NW": 8,
        "S": 9,
        "SE": 10,
        "SSE": 11,
        "SSW": 12,
        "SW": 13,
        "W": 14,
        "WNW": 15,
        "WSW": 16,
    }
    # Replace Wind Directions with Numbers from wind_dict
    df['WindDir9am'] = df['WindDir9am'].map(wind_dict).fillna(df['WindDir9am'])
    df['WindDir3pm'] = df['WindDir3pm'].map(wind_dict).fillna(df['WindDir3pm'])
    
    # normalize data
    column_names_to_not_normalize = ['RainToday', 'RainTomorrow']
    column_names_to_normalize = [x for x in list(df) if x not in column_names_to_not_normalize ]
    x = df[column_names_to_normalize].values
    x_scaled = MinMaxScaler().fit_transform(x)
    df_temp = pd.DataFrame(x_scaled, columns=column_names_to_normalize, index = df.index)
    df[column_names_to_normalize] = df_temp
    
    return df

In [117]:
# Getting the df for Classifiers
df = cleandata("./Weather.csv")
df

Unnamed: 0,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,0.626812,0.534934,0.000000,0.312500,0.8750,0.3125,0.195652,0.228070,0.79,0.718750,0.984886,0.983023,0.553425,0.539150,-1,-1
1,0.442029,0.456332,0.003656,0.270833,0.8750,0.3750,0.434783,0.228070,0.78,0.541667,0.987197,0.985435,0.372603,0.460850,-1,-1
2,0.728261,0.589520,0.001828,0.447917,0.1250,0.3125,0.195652,0.385965,0.78,0.645833,0.973046,0.972798,0.638356,0.592841,-1,-1
3,0.344203,0.414847,0.197441,0.000000,0.8750,0.1875,0.326087,0.263158,0.69,0.531250,0.978437,0.978200,0.386301,0.409396,1,-1
4,0.829710,0.617904,0.000000,0.000000,0.1875,0.3125,0.086957,0.421053,0.79,0.656250,0.978340,0.976367,0.679452,0.606264,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,0.909420,0.818777,0.000000,0.729167,0.6875,0.7500,0.239130,0.421053,0.58,0.562500,0.973238,0.974149,0.838356,0.751678,-1,-1
2496,0.789855,0.598253,0.000000,0.250000,0.8750,0.1250,0.130435,0.157895,0.70,0.708333,0.975452,0.976174,0.652055,0.563758,-1,-1
2497,0.634058,0.613537,0.000000,0.812500,0.2500,0.9375,0.478261,0.754386,0.31,0.208333,0.967174,0.970869,0.701370,0.514541,-1,-1
2498,0.373188,0.375546,0.000000,0.000000,0.8750,0.7500,0.565217,0.228070,0.48,0.312500,0.979496,0.981673,0.309589,0.375839,-1,-1


## Classifiers without AdaBoost

In [118]:
# preparing the data
X = df.drop('RainTomorrow', axis=1)
y = df['RainTomorrow']

# oversampling using SMOTE
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)

In [119]:
# Naïve Bayes without Boosting
gnb = GaussianNB()
sc_gnb = cross_val_score(gnb, X, y, scoring='f1_weighted', cv=10)
total_f1_gnb = sum(sc_gnb)
print("f1 using 10-fold: " + str(total_f1_gnb/10))

f1 using 10-fold: 0.7431405317279745


In [120]:
# Decision Tree without Boosting
dt = DecisionTreeClassifier()
sc_dt = cross_val_score(dt, X, y, scoring='f1_weighted', cv=10)
total_f1_dt = sum(sc_dt)
print("f1 using 10-fold: " + str(total_f1_dt/10))

f1 using 10-fold: 0.805616888033765


In [121]:
# Linear SVM without Boosting
svc = LinearSVC()
sc_svc = cross_val_score(svc, X, y, scoring='f1_weighted', cv=10)
total_f1_gnb = sum(sc_svc)
print("f1: " + str(total_f1_gnb/10))

f1: 0.7668499726881457


In [122]:
# cleandata(csv_file) -> X, y for testing
# slight changes to csv cleandata function for test data

def cleantestdata(csv_file):
    df = pd.read_csv(csv_file) # importing dataset
    
    df = df.drop(columns=['WindGustDir']) # dropping WindGustDir because of NaN
    
    # change RainToday and RainTomorrow to 0 and 1 for no and yes
    noyes_d = {'No': -1, 'Yes': 1}
    df['RainToday'] = df['RainToday'].map(noyes_d).fillna(df['RainToday'])
    
    wind_dict = {
        "0": 0,
        "E": 1,
        "ENE": 2,
        "ESE": 3,
        "N": 4,
        "NE": 5,
        "NNE": 6,
        "NNW": 7,
        "NW": 8,
        "S": 9,
        "SE": 10,
        "SSE": 11,
        "SSW": 12,
        "SW": 13,
        "W": 14,
        "WNW": 15,
        "WSW": 16,
    }
    # Replace Wind Directions with Numbers from wind_dict
    df['WindDir9am'] = df['WindDir9am'].map(wind_dict).fillna(df['WindDir9am'])
    df['WindDir3pm'] = df['WindDir3pm'].map(wind_dict).fillna(df['WindDir3pm'])
    
    # normalize data
    column_names_to_not_normalize = ['RainToday']
    column_names_to_normalize = [x for x in list(df) if x not in column_names_to_not_normalize ]
    x = df[column_names_to_normalize].values
    x_scaled = MinMaxScaler().fit_transform(x)
    df_temp = pd.DataFrame(x_scaled, columns=column_names_to_normalize, index = df.index)
    df[column_names_to_normalize] = df_temp
    
    X = df.drop('RainTomorrow', axis=1)
    
    return X

## AdaBoost

In [123]:
class AdaBoost:
    def __init__(self, classifier):
        self.classifier = classifier
        self.classifiers = []
        self.amount_of_say = []
        self.weights = []
        self.predictions = []

    def set_weights(self, incorrect_rows, N):
        aos = self.amount_of_say[-1]
        for i in range(N):
            if i in incorrect_rows:
                self.weights[i] *= math.e**aos
            else:
                self.weights[i] *= (math.e**((-1) * aos))
        sum_weights = sum(self.weights)
        for i in range(len(self.weights)):
            self.weights[i] /= sum_weights # normalize sample weights to add up to 1
    
    def fit(self, df):
        N = int(len(df) / 2) # sample size
        self.weights = [1/len(df)]*len(df) # weights for all df
        
        for i in range(10):
            # sample a training set using weights
            ts = df.sample(n=N, weights=self.weights)

            # learn a classifier
            X_train = ts.drop('RainTomorrow', axis=1)
            y_train = ts['RainTomorrow']

            clf = self.classifier.fit(X_train, y_train)
            predictions = clf.predict(df.drop('RainTomorrow', axis=1))
            
            y = df['RainTomorrow']

            # total error
            total_error = 0
            incorrect_rows = []
            for pred_ind in range(len(predictions)):
                if predictions[pred_ind] != y[pred_ind]:
                    total_error += self.weights[pred_ind]
                    incorrect_rows.append(pred_ind)

            # amount of say of classifier
            amount_of_say = (math.log((1-total_error)/total_error))/2
            self.classifiers.append(clf)
            self.amount_of_say.append(amount_of_say)

            # set weights accordingly
            self.set_weights(incorrect_rows, N)
    
    def predict(self, X):
        ada_predictions = [0]*len(X)
        for ind in range(len(self.classifiers)):
            clf = self.classifiers[ind]
            amount_of_say = self.amount_of_say[ind]
            prediction = clf.predict(X)
            for i in range(len(ada_predictions)):
                ada_predictions[i] += (amount_of_say * prediction[i])
        
        for i in range(len(ada_predictions)):
            ada_predictions[i] = 1 if ada_predictions[i] > 0 else -1
        
        self.predictions = ada_predictions
        

In [129]:
# F1_weighted for Adaboost
# Adaboost with GNB
trn_gnb = GaussianNB()
trn_gnb_adaboost = AdaBoost(trn_gnb)
trn_gnb_adaboost.fit(df)
trn_gnb_adaboost.predict(df.drop('RainTomorrow', axis=1))
trn_gnb_ada_predictions = trn_gnb_adaboost.predictions
gnb_ada_f1 = f1_score(y_true=df['RainTomorrow'], y_pred=trn_gnb_ada_predictions, average='weighted')
print('Weighted F1 for Adaboost with GNB: ' + str(gnb_ada_f1))

# Adaboost with DT
trn_dt = DecisionTreeClassifier()
trn_dt_adaboost = AdaBoost(trn_dt)
trn_dt_adaboost.fit(df)
trn_dt_adaboost.predict(df.drop('RainTomorrow', axis=1))
trn_dt_ada_predictions = trn_dt_adaboost.predictions
dt_ada_f1 = f1_score(y_true=df['RainTomorrow'], y_pred=trn_dt_ada_predictions, average='weighted')
print('Weighted F1 for Adaboost with DT: ' + str(dt_ada_f1))

# Adaboost with LinearSVM
trn_svc = LinearSVC()
trn_svc_adaboost = AdaBoost(trn_svc)
trn_svc_adaboost.fit(df)
trn_svc_adaboost.predict(df.drop('RainTomorrow', axis=1))
trn_svc_ada_predictions = trn_svc_adaboost.predictions
svc_ada_f1 = f1_score(y_true=df['RainTomorrow'], y_pred=trn_svc_ada_predictions, average='weighted')
print('Weighted F1 for Adaboost with SVC: ' + str(svc_ada_f1))

Weighted F1 for Adaboost with GNB: 0.8051834080777164
Weighted F1 for Adaboost with DT: 0.8750117056243701
Weighted F1 for Adaboost with SVC: 0.8211523256852066


## CSV Prediction Generation (6 files)

In [27]:
# using classifiers to generate predictions for test csv
testX = cleantestdata("./weather_test.csv")
index_list = list(range(0, len(testX)))

# Gaussian Naive Bayes
gnb.fit(X, y)
gnb_predictions = gnb.predict(testX)
gnb_predictions_df = pd.DataFrame({'index': index_list, 'RainTomorrow': gnb_predictions})
gnb_predictions_df.to_csv('gnb_predictions.csv')

  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


In [100]:
# Decision Tree
dt.fit(X, y)
dt_predictions = dt.predict(testX)
dt_predictions_df = pd.DataFrame({'index': index_list, 'RainTomorrow': dt_predictions})
dt_predictions_df.to_csv('dt_predictions.csv')

# Linear SVM
svc.fit(X, y)
svc_predictions = svc.predict(testX)
svc_predictions_df = pd.DataFrame({'index': index_list, 'RainTomorrow': svc_predictions})
svc_predictions_df.to_csv('svc_predictions.csv')

In [115]:
# Making CSV for adaboost predictions

testX = cleantestdata("./weather_test.csv")
index_list = list(range(0, len(testX)))

# AdaBoost for GNB
gnb = GaussianNB()
gnb_adaboost = AdaBoost(gnb)
gnb_adaboost.fit(df)
gnb_adaboost.predict(testX)
gnb_ada_predictions = gnb_adaboost.predictions
gnb_ada_predictions_df = pd.DataFrame({'index': index_list, 'RainTomorrow': gnb_ada_predictions})
gnb_ada_predictions_df.to_csv('gnb_adaboost_predictions.csv')

# AdaBoost for DT
dt = DecisionTreeClassifier()
dt_adaboost = AdaBoost(dt)
dt_adaboost.fit(df)
dt_adaboost.predict(testX)
dt_ada_predictions = dt_adaboost.predictions
dt_ada_predictions_df = pd.DataFrame({'index': index_list, 'RainTomorrow': dt_ada_predictions})
dt_ada_predictions_df.to_csv('dt_adaboost_predictions.csv')

# AdaBoost for Linear SVM
svc = LinearSVC()
svc_adaboost = AdaBoost(svc)
svc_adaboost.fit(df)
svc_adaboost.predict(testX)
svc_ada_predictions = svc_adaboost.predictions
svc_ada_predictions_df = pd.DataFrame({'index': index_list, 'RainTomorrow': svc_ada_predictions})
svc_ada_predictions_df.to_csv('svc_adaboost_predictions.csv')

  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


[-1, -1, -1, 1, -1, -1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, 1,