In [102]:
# To speed things up, we replace our own decision tree implementation with the one from sklearn
# and compare the accuracy of this AdaBoost implementation with sklearn's AdaBoost implementation

In [98]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from copy import deepcopy
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [99]:
class AdaBoost:
    """AdaBoost classifier over depth-1 decision trees, using weighted resampling.

    Each boosting round fits a ``DecisionTreeClassifier(max_depth=1)`` on the
    current training sample, scores it with uniform sample weights, and then
    draws the next training sample (with replacement) so that misclassified
    rows are more likely to be picked. Predictions are a weighted vote of the
    stumps, each stump voting with its ``alpha``.

    Parameters
    ----------
    n_estimators : int, default=50
        Maximum number of boosting rounds.
    random_state : int or None, default=None
        Seed for the per-stump seeds and for the resampling. ``None`` keeps
        the run non-deterministic.

    Attributes
    ----------
    decision_stumps : list of (estimator, float)
        Fitted stumps paired with their vote weight, in training order.
    """

    def __init__(self, n_estimators=50, random_state=None):
        self.n_estimators = n_estimators
        self.random_state = random_state
        self.decision_stumps = []

    def __evalute_performance(self, total_error):
        """Return the vote weight ``alpha = 0.5 * ln((1 - err) / err)``.

        The error is clipped away from 0 and 1 so that a perfect (or perfectly
        wrong) stump yields a large finite weight instead of +/-inf.
        """
        eps = 1e-10
        err = min(max(float(total_error), eps), 1.0 - eps)
        return np.log((1 - err) / err) / 2

    def fit(self, X, y):
        """Train up to ``n_estimators`` stumps on ``X`` / ``y``.

        Parameters
        ----------
        X : pandas.DataFrame or array-like of shape (n_samples, n_features)
            Training features. The caller's object is never modified.
        y : pandas.Series or array-like of shape (n_samples,)
            Class labels, matched to the rows of ``X`` by position.

        Raises
        ------
        ValueError
            If ``X`` is empty or ``X`` and ``y`` have different lengths.

        Notes
        -----
        Any stumps from a previous ``fit`` call are discarded. Training stops
        early when a stump's error exceeds 0.5, or right after a stump that
        classifies the current sample perfectly.
        """
        is_frame = isinstance(X, pd.DataFrame)
        # reset_index(drop=True) returns a new frame, so the input stays untouched.
        X_cur = X.reset_index(drop=True) if is_frame else np.asarray(X)
        y_cur = np.asarray(y)

        n_samples = X_cur.shape[0]
        if n_samples == 0:
            raise ValueError("Cannot fit AdaBoost on an empty training set.")
        if y_cur.shape[0] != n_samples:
            raise ValueError(
                f"X and y have inconsistent lengths: {n_samples} != {y_cur.shape[0]}"
            )

        rng = np.random.RandomState(self.random_state)
        # Start from scratch so that calling fit() twice does not stack ensembles.
        self.decision_stumps = []

        for _ in range(self.n_estimators):
            # Resampling variant: every round starts from uniform weights because
            # the emphasis on hard rows is already encoded in the sample itself.
            weights = np.full(n_samples, 1.0 / n_samples)

            # Seed range is independent of n_estimators (a (1, n_estimators)
            # range is empty when n_estimators == 1).
            stump = DecisionTreeClassifier(
                max_depth=1, random_state=rng.randint(0, 2**31 - 1)
            )
            stump.fit(X_cur, y_cur)

            missed = np.asarray(stump.predict(X_cur) != y_cur, dtype=bool)
            total_error = weights[missed].sum()
            if total_error > .5:
                # Worse than chance on this sample: stop boosting.
                break

            alpha = self.__evalute_performance(total_error)
            self.decision_stumps.append((stump, alpha))

            if not missed.any():
                # Nothing left to emphasise; further rounds would only repeat
                # uniform resampling.
                break

            # Up-weight mistakes, down-weight hits, then renormalise.
            weights[missed] *= np.exp(alpha)
            weights[~missed] *= np.exp(-alpha)
            weights /= weights.sum()

            # Draw the next training sample proportionally to the new weights.
            picked = rng.choice(n_samples, size=n_samples, replace=True, p=weights)
            X_cur = X_cur.iloc[picked].reset_index(drop=True) if is_frame else X_cur[picked]
            y_cur = y_cur[picked]

    def predict(self, X):
        """Predict a label for every row of ``X`` by alpha-weighted voting.

        Parameters
        ----------
        X : pandas.DataFrame or array-like of shape (n_samples, n_features)

        Returns
        -------
        list
            One predicted label per row, in row order.

        Raises
        ------
        ValueError
            If the model holds no stumps (``fit`` was not called, or the very
            first stump was rejected).
        """
        if not self.decision_stumps:
            raise ValueError(
                "This AdaBoost instance has no fitted stumps; call fit() first."
            )

        votes = [{} for _ in range(X.shape[0])]
        for tree, alpha in self.decision_stumps:
            for row, label in enumerate(tree.predict(X)):
                votes[row][label] = votes[row].get(label, 0) + alpha
        # For each row pick the label with the largest accumulated alpha.
        return [max(tally, key=tally.get) for tally in votes]

### Dataset from here: https://www.kaggle.com/datasets/jsphyg/weather-dataset-rattle-package?rvi=1

In [100]:
# Look at the data

In [32]:
# Load the weather observations (the CSV path is relative to the working directory),
# then list each column's dtype and non-null count to see where data is missing.
df = pd.read_csv('weatherAUS.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           145460 non-null  object 
 1   Location       145460 non-null  object 
 2   MinTemp        143975 non-null  float64
 3   MaxTemp        144199 non-null  float64
 4   Rainfall       142199 non-null  float64
 5   Evaporation    82670 non-null   float64
 6   Sunshine       75625 non-null   float64
 7   WindGustDir    135134 non-null  object 
 8   WindGustSpeed  135197 non-null  float64
 9   WindDir9am     134894 non-null  object 
 10  WindDir3pm     141232 non-null  object 
 11  WindSpeed9am   143693 non-null  float64
 12  WindSpeed3pm   142398 non-null  float64
 13  Humidity9am    142806 non-null  float64
 14  Humidity3pm    140953 non-null  float64
 15  Pressure9am    130395 non-null  float64
 16  Pressure3pm    130432 non-null  float64
 17  Cloud9am       89572 non-null

In [33]:
# Preview the first rows of the raw frame
df.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [None]:
# Preprocessing data

In [34]:
# Clean the raw frame in a single chain (no in-place mutation):
#   1. drop rows whose target variable (RainTomorrow) is NaN,
#   2. drop the features with many NaN values, together with the Date column,
#   3. drop every remaining row that still contains a NaN.
df_preprocessing = (
    df
    .dropna(subset=['RainTomorrow'])
    .drop(['Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm', 'Date'], axis=1)
    .dropna()
)
df_preprocessing.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 112925 entries, 0 to 145458
Data columns (total 18 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Location       112925 non-null  object 
 1   MinTemp        112925 non-null  float64
 2   MaxTemp        112925 non-null  float64
 3   Rainfall       112925 non-null  float64
 4   WindGustDir    112925 non-null  object 
 5   WindGustSpeed  112925 non-null  float64
 6   WindDir9am     112925 non-null  object 
 7   WindDir3pm     112925 non-null  object 
 8   WindSpeed9am   112925 non-null  float64
 9   WindSpeed3pm   112925 non-null  float64
 10  Humidity9am    112925 non-null  float64
 11  Humidity3pm    112925 non-null  float64
 12  Pressure9am    112925 non-null  float64
 13  Pressure3pm    112925 non-null  float64
 14  Temp9am        112925 non-null  float64
 15  Temp3pm        112925 non-null  float64
 16  RainToday      112925 non-null  object 
 17  RainTomorrow   112925 non-nul

In [None]:
# Single out X and y

In [35]:
# Features: every column from 'Location' through 'RainToday'
# (label-based .loc slices include both endpoints).
X = df_preprocessing.loc[:, 'Location':'RainToday']
# Target: whether it rains the next day.
y = df_preprocessing['RainTomorrow']

In [None]:
# Columns with numerical types. Only these are used, so that the results can be compared with the implementation from sklearn

In [36]:
# Numerical feature columns used for training, one per line for easier diffing.
numerical_columns = [
    'MinTemp',
    'MaxTemp',
    'Rainfall',
    'WindGustSpeed',
    'WindSpeed9am',
    'WindSpeed3pm',
    'Humidity9am',
    'Humidity3pm',
    'Pressure9am',
    'Pressure3pm',
    'Temp9am',
    'Temp3pm',
]

In [None]:
# Split into train and test sets

In [37]:
# Hold out 20% of the rows for testing. The fixed random_state makes the split
# reproducible across kernel restarts; without it every run evaluates on
# different rows.
X_train, X_test, y_train, y_test = train_test_split(
    X[numerical_columns], y, test_size=0.2, random_state=42
)

In [93]:
# Train the custom AdaBoost on the numerical training features
ab = AdaBoost(n_estimators=50)
ab.fit(X_train, y_train)

In [95]:
# Predict labels for the held-out test rows with the custom model
y_test_hat = ab.predict(X_test)

#### Metrics for current implementation

In [97]:
# Per-class precision / recall / F1 of the custom implementation on the test set
print(classification_report(y_test, y_test_hat))

              precision    recall  f1-score   support

          No       0.87      0.94      0.90     17566
         Yes       0.71      0.49      0.58      5019

    accuracy                           0.84     22585
   macro avg       0.79      0.71      0.74     22585
weighted avg       0.83      0.84      0.83     22585



### Implementation from sklearn

In [72]:
from sklearn.ensemble import AdaBoostClassifier
# n_estimators is spelled out so the baseline visibly uses the same number of
# boosting rounds as the custom AdaBoost (50); random_state pins the
# estimator's internal randomness so the comparison is repeatable.
clf = AdaBoostClassifier(n_estimators=50, random_state=42)
clf.fit(X_train, y_train)

AdaBoostClassifier()

In [73]:
# Predict labels for the same held-out test rows with the sklearn baseline
y_test_hat_clf = clf.predict(X_test)

#### Metrics for implementation from sklearn

In [74]:
# Per-class precision / recall / F1 of the sklearn baseline on the test set
print(classification_report(y_test, y_test_hat_clf))

              precision    recall  f1-score   support

          No       0.87      0.95      0.91     17566
         Yes       0.73      0.49      0.58      5019

    accuracy                           0.85     22585
   macro avg       0.80      0.72      0.74     22585
weighted avg       0.84      0.85      0.83     22585



### The accuracy of the current implementation (0.84) and of the implementation from sklearn (0.85) is nearly the same