In [538]:
from copy import deepcopy
from multiprocessing import Pool, cpu_count
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [207]:
def get_gini_index_by_feature_value(y):
    """Return the Gini impurity of the label collection ``y``.

    The impurity is ``1 - sum(p_k ** 2)``, where ``p_k`` is the relative
    frequency of each distinct value found in ``y``.
    """
    class_counts = np.unique(y, return_counts=True)[1]
    proportions = class_counts / class_counts.sum()
    return 1 - np.square(proportions).sum()

In [208]:
def evaluate_gini_index_for_threhold(data):
    """Return the weighted Gini impurity of a binary split at one threshold.

    ``data`` is a single ``(X_feature, y, thr)`` tuple (the form in which
    ``AdaBoost`` hands work items to ``pool.map``). The labels are divided
    into the rows where the feature is ``<= thr`` and the rows where it is
    ``> thr``; each side's Gini impurity is weighted by its share of ``y``.
    """
    X_feature, y, thr = data
    n_total = len(y)

    labels_at_or_below = y[X_feature <= thr]
    labels_above = y[X_feature > thr]

    share_at_or_below = len(labels_at_or_below) / n_total
    share_above = len(labels_above) / n_total

    gini_at_or_below = get_gini_index_by_feature_value(labels_at_or_below)
    gini_above = get_gini_index_by_feature_value(labels_above)
    return share_at_or_below * gini_at_or_below + share_above * gini_above

In [209]:
def get_thresholds(X_feature, y):
    """Return candidate split thresholds for a numerical feature.

    A candidate is the midpoint between two consecutive *distinct* feature
    values. It is kept unless the rows on both sides of it (the rows holding
    those two values) all carry one and the same label, in which case a split
    there cannot separate anything.

    Compared with a scan that only looks at adjacent rows, this also finds
    boundaries next to tied feature values with mixed labels, and the result
    does not depend on how ties are ordered by the sort.

    Parameters
    ----------
    X_feature : pandas.Series
        Numerical feature values. Missing values are ignored.
    y : pandas.Series
        Labels. Matched to ``X_feature`` by position when both share the same
        index, otherwise by index label.

    Returns
    -------
    list of float
        Unique thresholds in ascending order; empty when fewer than two
        distinct feature values exist or no boundary is informative.
    """
    if not y.index.equals(X_feature.index):
        y = y.loc[X_feature.index]
    values = X_feature.to_numpy()
    labels = y.to_numpy()

    # Missing feature values cannot be ordered, so they yield no thresholds.
    present = ~pd.isna(values)
    values, labels = values[present], labels[present]
    if values.size < 2:
        return []

    order = np.argsort(values, kind="stable")
    values, labels = values[order], labels[order]

    # Element i of these arrays describes the adjacent pair (i, i + 1).
    value_changed = values[1:] != values[:-1]
    if not value_changed.any():
        return []
    label_changed = labels[1:] != labels[:-1]

    # Rows with equal feature values form a group; a group is "mixed" when a
    # label change happens between two of its own rows.
    group_id = np.concatenate(([0], np.cumsum(value_changed)))
    mixed_group = np.zeros(group_id[-1] + 1, dtype=bool)
    mixed_group[group_id[1:][label_changed & ~value_changed]] = True

    # Keep a boundary when the labels differ across it or either neighbouring
    # group is mixed.
    boundary = np.flatnonzero(value_changed)
    keep = (
        label_changed[boundary]
        | mixed_group[group_id[boundary]]
        | mixed_group[group_id[boundary + 1]]
    )
    left = boundary[keep]
    midpoints = (values[left] + values[left + 1]) / 2
    return sorted(set(midpoints.tolist()))

In [211]:
class Leaf:
    """One branch of a decision stump together with its training statistics.

    Parameters
    ----------
    feature_value
        Feature value this leaf stands for. ``AdaBoost.fit`` passes the
        category value for categorical splits and ``None`` for threshold
        splits.
    class_
        Class predicted by this leaf.
    classes, classes_distrib
        Distinct classes seen in the leaf and their counts; default to
        empty lists.
    wrong_class_samples_indices, right_class_samples_indices
        Index labels of the training rows the leaf misclassifies / classifies
        correctly; default to empty lists.
    """

    # The last four parameters previously used ``name: None`` (an annotation)
    # where ``name=None`` (a default) was intended, which made them mandatory
    # even though the body already handles ``None``.
    def __init__(self, feature_value, class_, classes=None, classes_distrib=None,
                 wrong_class_samples_indices=None, right_class_samples_indices=None):
        self.feature_value = feature_value
        self.class_ = class_
        self.classes = [] if classes is None else classes
        self.classes_distrib = [] if classes_distrib is None else classes_distrib
        self.wrong_class_samples_indices = [] if wrong_class_samples_indices is None else wrong_class_samples_indices
        self.right_class_samples_indices = [] if right_class_samples_indices is None else right_class_samples_indices

    def __repr__(self):
        return f"Leaf(feature_value={self.feature_value}, class_={self.class_}, classes={self.classes}, classes_distrib={self.classes_distrib})"

In [420]:
class DecisionStump:
    """A single-split weak learner.

    Stores the feature it splits on, the numerical ``threshold`` (``None``
    when not set), the list of leaves and the stump's vote weight ``alpha``.
    """

    def __init__(self, feature, threshold=None, leafs=None, alpha=None):
        self.feature = feature
        self.threshold = threshold
        # A fresh list per instance avoids sharing one mutable default.
        self.leafs = leafs if leafs is not None else []
        self.alpha = alpha

    def __repr__(self):
        shown = ("feature", "threshold", "alpha", "leafs")
        body = ", ".join(f"{name}={getattr(self, name)}" for name in shown)
        return f"DecisionStump({body})"

In [540]:
class AdaBoost:
    """Boosted ensemble of one-level decision stumps, trained by weighted resampling.

    Each round picks the feature (and, for numerical features, the threshold)
    with the lowest weighted Gini index, builds one leaf per branch that
    predicts the branch's majority class, gives the stump a vote weight
    ``alpha`` derived from its error, and then draws the next round's training
    set with replacement, favouring the rows the stump got wrong.

    Parameters
    ----------
    n_iteration : int, default 50
        Maximum number of boosting rounds (stumps).
    random_state : int, numpy.random.RandomState or None, default None
        Seed or generator for the resampling step. ``None`` leaves the
        resampling unseeded.
    verbose : bool, default True
        When true, progress messages are printed during ``fit``.
    """

    # Errors are clipped to [_EPS, 1 - _EPS] so that alpha stays finite.
    _EPS = 1e-10

    def __init__(self, n_iteration=50, random_state=None, verbose=True):
        self.n_iteration = n_iteration
        self.random_state = random_state
        self.verbose = verbose
        self.decision_stumps = []
        # Most frequent training class; used by predict() when no stump votes.
        self.default_class_ = None

    def __log(self, message):
        """Print ``message`` unless verbosity is switched off."""
        if self.verbose:
            print(message)

    def __evalute_performance(self, total_error):
        """Return the stump weight ``0.5 * ln((1 - error) / error)``.

        The error is clipped away from 0 and 1 first: a perfect stump would
        otherwise get an infinite alpha and turn the sample weights into NaN.
        """
        total_error = np.clip(total_error, self._EPS, 1 - self._EPS)
        return np.log((1 - total_error) / total_error) / 2

    def __get_gini_index(self, X_feature, y):
        """Return ``(gini_index, threshold)`` for splitting ``y`` on one feature.

        Numerical features are scored at every candidate threshold and the
        best one is returned. Any other feature is split by distinct value
        and reported with ``threshold=None``. A numerical feature with no
        candidate threshold returns ``(inf, None)``.
        """
        dtype = X_feature.dtype
        if pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype):
            possible_thresholds = get_thresholds(X_feature, y)
            if not possible_thresholds:
                return np.inf, None
            data = [(X_feature, y, thr) for thr in possible_thresholds]
            # Never ask for zero workers (single-core machines) and never for
            # more workers than there are thresholds to score.
            n_workers = max(1, min(cpu_count() - 1, len(data)))
            if n_workers == 1:
                gs = [evaluate_gini_index_for_threhold(item) for item in data]
            else:
                with Pool(n_workers) as pool:
                    gs = pool.map(evaluate_gini_index_for_threhold, data)
            best = int(np.argmin(gs))
            return gs[best], possible_thresholds[best]

        gini_index = 0
        unique_values, unique_counts = np.unique(X_feature, return_counts=True)
        weights = unique_counts / np.sum(unique_counts)
        for weight, value in zip(weights, unique_values):
            gini_index += weight * get_gini_index_by_feature_value(y[X_feature == value])
        return gini_index, None

    def __select_feature_with_min_gini_index(self, X, y):
        """Return ``(column, threshold)`` of the feature with the lowest Gini index."""
        gis = []
        for col in X.columns:
            self.__log(f"Checking feature {col}")
            gis.append(self.__get_gini_index(X[col], y))
        idx = np.argmin([el[0] for el in gis])
        self.__log(f"Best feature is {X.columns[idx]}")
        return X.columns[idx], gis[idx][1]

    def __build_leaf(self, y_branch, samples_weights, feature_value):
        """Build the majority-class leaf for one branch.

        Returns ``(leaf, error)`` where ``error`` is the summed weight of the
        branch rows whose label differs from the majority class.
        """
        classes, counts = np.unique(y_branch, return_counts=True)
        majority_class = classes[np.argmax(counts)]
        wrong_class_samples_indices = y_branch[y_branch != majority_class].index
        right_class_samples_indices = y_branch[y_branch == majority_class].index
        leaf = Leaf(feature_value=feature_value,
                    class_=majority_class,
                    classes=classes,
                    classes_distrib=counts,
                    wrong_class_samples_indices=wrong_class_samples_indices,
                    right_class_samples_indices=right_class_samples_indices)
        return leaf, np.sum(samples_weights.loc[wrong_class_samples_indices])

    def fit(self, X, y):
        """Train the ensemble on ``X`` (DataFrame) and ``y`` (Series).

        Any stumps from a previous call are discarded. Training stops after
        ``n_iteration`` rounds, or earlier when a stump's error exceeds 0.5.

        Raises
        ------
        ValueError
            If ``X`` has columns but no rows.
        """
        X_copy = deepcopy(X)
        y_copy = deepcopy(y)
        self.decision_stumps = []  # refit from scratch instead of accumulating
        self.default_class_ = None
        if not len(X_copy.columns):
            return
        if X_copy.shape[0] == 0:
            raise ValueError("Cannot fit AdaBoost on an empty training set.")

        classes, counts = np.unique(y_copy, return_counts=True)
        self.default_class_ = classes[np.argmax(counts)]

        # One generator for the whole fit, so successive rounds draw different
        # samples while the run as a whole is reproducible for a given seed.
        if isinstance(self.random_state, np.random.RandomState):
            rng = self.random_state
        else:
            rng = np.random.RandomState(self.random_state)

        iteration = 0
        while iteration < self.n_iteration:
            # Resampling leaves duplicated labels behind; a fresh 0..n-1 index
            # keeps X, y and the weights aligned.
            X_copy = X_copy.reset_index(drop=True)
            y_copy = y_copy.reset_index(drop=True)
            n_samples = X_copy.shape[0]
            # Weights restart uniform each round; emphasis on hard rows is
            # carried over through the resampled data itself.
            samples_weights = pd.Series(np.ones(n_samples) / n_samples, name="samples_weights")

            current_feature, threshold = self.__select_feature_with_min_gini_index(X_copy, y_copy)
            feature_column = X_copy[current_feature]
            if threshold is None:
                branches = [(value, feature_column == value) for value in np.unique(feature_column)]
            else:
                # Order matters: predict() reads leafs[0] as "<=" and leafs[1] as ">".
                branches = [(None, feature_column <= threshold), (None, feature_column > threshold)]

            leafs = []
            total_error = 0
            for feature_value, mask in branches:
                leaf, branch_error = self.__build_leaf(y_copy[mask], samples_weights, feature_value)
                leafs.append(leaf)
                total_error += branch_error

            if total_error > .5:
                break
            alpha = self.__evalute_performance(total_error)
            stump = DecisionStump(feature=current_feature, threshold=threshold, leafs=leafs, alpha=alpha)
            self.decision_stumps.append(stump)

            # Up-weight misclassified rows, down-weight the rest, renormalise.
            for leaf in leafs:
                samples_weights.loc[leaf.wrong_class_samples_indices] *= np.exp(alpha)
                samples_weights.loc[leaf.right_class_samples_indices] *= np.exp(-alpha)
            samples_weights /= np.sum(samples_weights)

            X_copy = X_copy.sample(n=n_samples, weights=samples_weights, replace=True, random_state=rng)
            y_copy = y_copy.loc[X_copy.index]
            iteration += 1
            self.__log(f"Iteration #{iteration}. Total error={total_error}. Alpha={alpha}")

    def predict(self, X):
        """Return a list with the alpha-weighted majority vote for each row of ``X``.

        A row for which no stump casts a vote (for example a category value
        never seen in training) gets the most frequent training class, or
        ``None`` if that is unknown.

        Raises
        ------
        ValueError
            If the ensemble contains no stumps.
        """
        if not self.decision_stumps:
            raise ValueError("AdaBoost has no decision stumps; call fit() before predict().")
        fallback = getattr(self, "default_class_", None)

        # Pull each needed column out once instead of building a row Series
        # for every (row, stump) pair.
        feature_values = {}
        category_to_class = {}
        for position, stump in enumerate(self.decision_stumps):
            if stump.feature not in feature_values:
                feature_values[stump.feature] = X[stump.feature].to_numpy()
            if stump.threshold is None:
                category_to_class[position] = {leaf.feature_value: leaf.class_ for leaf in stump.leafs}

        y_hat = []
        for i in range(X.shape[0]):
            predictions = {}
            for position, stump in enumerate(self.decision_stumps):
                value = feature_values[stump.feature][i]
                if stump.threshold is None:
                    lookup = category_to_class[position]
                    if value not in lookup:
                        continue  # unseen category: this stump abstains
                    voted_class = lookup[value]
                elif value <= stump.threshold:
                    voted_class = stump.leafs[0].class_
                else:
                    voted_class = stump.leafs[1].class_
                predictions[voted_class] = predictions.get(voted_class, 0) + stump.alpha
            y_hat.append(max(predictions, key=predictions.get) if predictions else fallback)
        return y_hat

In [541]:
# Toy "play tennis" dataset, written one observation per row and then
# transposed into the column-oriented dict the DataFrame is built from.
toy_columns = ["Outlook", "Temp", "Humidity", "Wind", "Tennis"]
toy_rows = [
    ("Sunny",    30, "High", "Weak",   "No"),
    ("Sunny",    30, "High", "Strong", "No"),
    ("Overcast", 30, "High", "Weak",   "Yes"),
    ("Rain",     20, "High", "Weak",   "Yes"),
    ("Rain",     10, "Norm", "Strong", "Yes"),
    ("Rain",     10, "Norm", "Weak",   "No"),
    ("Overcast", 10, "Norm", "Weak",   "Yes"),
    ("Sunny",    20, "High", "Weak",   "No"),
    ("Sunny",    10, "Norm", "Weak",   "Yes"),
    ("Rain",     20, "Norm", "Strong", "Yes"),
    ("Sunny",    20, "Norm", "Strong", "Yes"),
    ("Overcast", 20, "High", "Strong", "Yes"),
    ("Overcast", 30, "Norm", "Weak",   "Yes"),
    ("Rain",     20, "High", "Strong", "No"),
]
data_dict = {name: list(column) for name, column in zip(toy_columns, zip(*toy_rows))}
df = pd.DataFrame(data_dict)
df

Unnamed: 0,Outlook,Temp,Humidity,Wind,Tennis
0,Sunny,30,High,Weak,No
1,Sunny,30,High,Strong,No
2,Overcast,30,High,Weak,Yes
3,Rain,20,High,Weak,Yes
4,Rain,10,Norm,Strong,Yes
5,Rain,10,Norm,Weak,No
6,Overcast,10,Norm,Weak,Yes
7,Sunny,20,High,Weak,No
8,Sunny,10,Norm,Weak,Yes
9,Rain,20,Norm,Strong,Yes


In [542]:
# Features are the columns from Outlook through Wind (label slices with .loc
# include both ends); the target is the Tennis column.
X = df.loc[:,'Outlook':'Wind']
y = df['Tennis']

In [543]:
# Train the custom AdaBoost ensemble on the toy tennis data, allowing at most
# 10 boosting rounds. fit() prints its progress for every round.
ab = AdaBoost(n_iteration=10)
ab.fit(X, y)

Checking feature Outlook
Checking feature Temp
Checking feature Humidity
Checking feature Wind
Best feature is Outlook
Iteration #1. Total error=0.2857142857142857. Alpha=0.45814536593707755
Checking feature Outlook
Checking feature Temp
Checking feature Humidity
Checking feature Wind
Best feature is Outlook
Iteration #2. Total error=0.21428571428571427. Alpha=0.6496414920651304
Checking feature Outlook
Checking feature Temp
Checking feature Humidity
Checking feature Wind
Best feature is Outlook
Iteration #3. Total error=0.2857142857142857. Alpha=0.45814536593707755
Checking feature Outlook
Checking feature Temp
Checking feature Humidity
Checking feature Wind
Best feature is Temp
Iteration #4. Total error=0.2857142857142857. Alpha=0.45814536593707755
Checking feature Outlook
Checking feature Temp
Checking feature Humidity
Checking feature Wind
Best feature is Outlook
Iteration #5. Total error=0.14285714285714285. Alpha=0.8958797346140276
Checking feature Outlook
Checking feature Temp
C

In [544]:
# In-sample check: predict the same rows the ensemble was trained on.
ab.predict(X)

['Yes', 'Yes', 'Yes', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'Yes', 'No']

In [545]:
# Load weatherAUS.csv (path is relative to the working directory) and list
# its columns, non-null counts and dtypes.
# NOTE: this rebinds `df`, replacing the toy tennis frame defined earlier.
df = pd.read_csv('weatherAUS.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           145460 non-null  object 
 1   Location       145460 non-null  object 
 2   MinTemp        143975 non-null  float64
 3   MaxTemp        144199 non-null  float64
 4   Rainfall       142199 non-null  float64
 5   Evaporation    82670 non-null   float64
 6   Sunshine       75625 non-null   float64
 7   WindGustDir    135134 non-null  object 
 8   WindGustSpeed  135197 non-null  float64
 9   WindDir9am     134894 non-null  object 
 10  WindDir3pm     141232 non-null  object 
 11  WindSpeed9am   143693 non-null  float64
 12  WindSpeed3pm   142398 non-null  float64
 13  Humidity9am    142806 non-null  float64
 14  Humidity3pm    140953 non-null  float64
 15  Pressure9am    130395 non-null  float64
 16  Pressure3pm    130432 non-null  float64
 17  Cloud9am       89572 non-null

In [546]:
# Preview the first five rows of the weather data.
df.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [547]:
# Keep only rows with a known target, discard the Evaporation, Sunshine,
# Cloud9am, Cloud3pm and Date columns, then drop every row that still has a
# missing value.
df_preprocessing = (
    df.dropna(subset=['RainTomorrow'])
      .drop(columns=['Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm', 'Date'])
      .dropna()
)
df_preprocessing.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 112925 entries, 0 to 145458
Data columns (total 18 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Location       112925 non-null  object 
 1   MinTemp        112925 non-null  float64
 2   MaxTemp        112925 non-null  float64
 3   Rainfall       112925 non-null  float64
 4   WindGustDir    112925 non-null  object 
 5   WindGustSpeed  112925 non-null  float64
 6   WindDir9am     112925 non-null  object 
 7   WindDir3pm     112925 non-null  object 
 8   WindSpeed9am   112925 non-null  float64
 9   WindSpeed3pm   112925 non-null  float64
 10  Humidity9am    112925 non-null  float64
 11  Humidity3pm    112925 non-null  float64
 12  Pressure9am    112925 non-null  float64
 13  Pressure3pm    112925 non-null  float64
 14  Temp9am        112925 non-null  float64
 15  Temp3pm        112925 non-null  float64
 16  RainToday      112925 non-null  object 
 17  RainTomorrow   112925 non-nul

In [548]:
# Features are every column from Location through RainToday; the target is
# RainTomorrow. This rebinds X and y from the toy example.
X = df_preprocessing.loc[:, 'Location':'RainToday']
y = df_preprocessing['RainTomorrow']

In [533]:
# Names of the float-valued feature columns; used to select the inputs for
# the scikit-learn baseline.
numerical_columns = [
    'MinTemp',
    'MaxTemp',
    'Rainfall',
    'WindGustSpeed',
    'WindSpeed9am',
    'WindSpeed3pm',
    'Humidity9am',
    'Humidity3pm',
    'Pressure9am',
    'Pressure3pm',
    'Temp9am',
    'Temp3pm',
]

In [549]:
# Hold out 20% of the rows for evaluation. The seed is fixed so the split,
# and with it every metric computed on the test set, is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [550]:
# Train the custom ensemble on the weather training split, allowing at most
# 50 boosting rounds. This rebinds `ab`, replacing the toy-data model.
ab = AdaBoost(n_iteration=50)
ab.fit(X_train, y_train)

Checking feature Location
Checking feature MinTemp
Checking feature MaxTemp
Checking feature Rainfall
Checking feature WindGustDir
Checking feature WindGustSpeed
Checking feature WindDir9am
Checking feature WindDir3pm
Checking feature WindSpeed9am
Checking feature WindSpeed3pm
Checking feature Humidity9am
Checking feature Humidity3pm
Checking feature Pressure9am
Checking feature Pressure3pm
Checking feature Temp9am
Checking feature Temp3pm
Checking feature RainToday
Best feature is Humidity3pm
Iteration #1. Total error=0.1745516936019482. Alpha=0.776852852130462
Checking feature Location
Checking feature MinTemp
Checking feature MaxTemp
Checking feature Rainfall
Checking feature WindGustDir
Checking feature WindGustSpeed
Checking feature WindDir9am
Checking feature WindDir3pm
Checking feature WindSpeed9am
Checking feature WindSpeed3pm
Checking feature Humidity9am
Checking feature Humidity3pm
Checking feature Pressure9am
Checking feature Pressure3pm
Checking feature Temp9am
Checking fea

Checking feature WindSpeed9am
Checking feature WindSpeed3pm
Checking feature Humidity9am
Checking feature Humidity3pm
Checking feature Pressure9am
Checking feature Pressure3pm
Checking feature Temp9am
Checking feature Temp3pm
Checking feature RainToday
Best feature is Location
Iteration #15. Total error=0.44690059774186397. Alpha=0.10660077141004504
Checking feature Location
Checking feature MinTemp
Checking feature MaxTemp
Checking feature Rainfall
Checking feature WindGustDir
Checking feature WindGustSpeed
Checking feature WindDir9am
Checking feature WindDir3pm
Checking feature WindSpeed9am
Checking feature WindSpeed3pm
Checking feature Humidity9am
Checking feature Humidity3pm
Checking feature Pressure9am
Checking feature Pressure3pm
Checking feature Temp9am
Checking feature Temp3pm
Checking feature RainToday
Best feature is Humidity3pm
Iteration #16. Total error=0.44589329200796995. Alpha=0.10863880704865143
Checking feature Location
Checking feature MinTemp
Checking feature MaxTemp

Checking feature Temp3pm
Checking feature RainToday
Best feature is Humidity3pm
Iteration #29. Total error=0.4593756918308613. Alpha=0.08142811100292538
Checking feature Location
Checking feature MinTemp
Checking feature MaxTemp
Checking feature Rainfall
Checking feature WindGustDir
Checking feature WindGustSpeed
Checking feature WindDir9am
Checking feature WindDir3pm
Checking feature WindSpeed9am
Checking feature WindSpeed3pm
Checking feature Humidity9am
Checking feature Humidity3pm
Checking feature Pressure9am
Checking feature Pressure3pm
Checking feature Temp9am
Checking feature Temp3pm
Checking feature RainToday
Best feature is Location
Iteration #30. Total error=0.4649878237768429. Alpha=0.07013914305713541
Checking feature Location
Checking feature MinTemp
Checking feature MaxTemp
Checking feature Rainfall
Checking feature WindGustDir
Checking feature WindGustSpeed
Checking feature WindDir9am
Checking feature WindDir3pm
Checking feature WindSpeed9am
Checking feature WindSpeed3pm


Checking feature MaxTemp
Checking feature Rainfall
Checking feature WindGustDir
Checking feature WindGustSpeed
Checking feature WindDir9am
Checking feature WindDir3pm
Checking feature WindSpeed9am
Checking feature WindSpeed3pm
Checking feature Humidity9am
Checking feature Humidity3pm
Checking feature Pressure9am
Checking feature Pressure3pm
Checking feature Temp9am
Checking feature Temp3pm
Checking feature RainToday
Best feature is Location
Iteration #44. Total error=0.4629067965463803. Alpha=0.07432295610520429
Checking feature Location
Checking feature MinTemp
Checking feature MaxTemp
Checking feature Rainfall
Checking feature WindGustDir
Checking feature WindGustSpeed
Checking feature WindDir9am
Checking feature WindDir3pm
Checking feature WindSpeed9am
Checking feature WindSpeed3pm
Checking feature Humidity9am
Checking feature Humidity3pm
Checking feature Pressure9am
Checking feature Pressure3pm
Checking feature Temp9am
Checking feature Temp3pm
Checking feature RainToday
Best featur

In [551]:
# Predict the held-out rows with the custom ensemble (returns a list of labels).
y_test_hat = ab.predict(X_test)

In [552]:
# Per-class precision, recall and F1 of the custom ensemble on the test split.
print(classification_report(y_test, y_test_hat))

              precision    recall  f1-score   support

          No       0.87      0.94      0.90     17651
         Yes       0.70      0.49      0.57      4934

    accuracy                           0.84     22585
   macro avg       0.78      0.71      0.74     22585
weighted avg       0.83      0.84      0.83     22585



In [553]:
# Baseline for comparison: scikit-learn's AdaBoostClassifier, fitted on the
# numerical columns only. The seed is fixed so the baseline is reproducible.
# (AdaBoostClassifier is imported with the other dependencies at the top.)
clf = AdaBoostClassifier(random_state=42)
clf.fit(X_train[numerical_columns], y_train)

AdaBoostClassifier()

In [554]:
# Predict the held-out rows with the scikit-learn baseline, using the same
# numerical columns it was fitted on.
y_test_hat_clf = clf.predict(X_test[numerical_columns])

In [555]:
# Per-class precision, recall and F1 of the scikit-learn baseline on the test split.
print(classification_report(y_test, y_test_hat_clf))

              precision    recall  f1-score   support

          No       0.87      0.95      0.91     17651
         Yes       0.73      0.48      0.58      4934

    accuracy                           0.85     22585
   macro avg       0.80      0.72      0.74     22585
weighted avg       0.84      0.85      0.84     22585

