In [19]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [88]:
class GradientBoosting:
    """Binary classifier trained by gradient boosting of regression trees.

    The model keeps one additive score per sample on the log-odds scale.
    Each boosting round fits a regression tree to the residuals ``y - p``
    and then replaces every leaf's value with the Newton step
    ``sum(residuals) / sum(p * (1 - p))`` computed over the training samples
    that fall in that leaf.

    Parameters
    ----------
    n_estimators : int
        Number of boosting rounds (one tree per round).
    max_depth : int
        Maximum depth passed to every ``DecisionTreeRegressor``.
    learning_rate : float
        Shrinkage applied to each tree's leaf values when updating the score.
    threshold : float
        Probability at or above which ``predict`` returns class 1.
    random_state : int, RandomState instance or None
        Forwarded unchanged to every ``DecisionTreeRegressor``.

    Attributes
    ----------
    tree : list
        Fitted trees, in boosting order. Their ``tree_.value`` leaf entries
        hold the Newton-step values, not the raw residual means.
    F0 : float
        Initial log-odds score, set by ``fit``.
    """

    # Smallest margin kept between a probability and 0/1 before taking log-odds.
    _EPS = np.finfo(float).eps
    # Leaf denominators below this are treated as zero (no update for that leaf).
    _MIN_DENOMINATOR = 1e-150

    def __init__(self, n_estimators=100, max_depth=1, learning_rate=0.1, threshold=.5,
                 random_state=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.threshold = threshold
        self.random_state = random_state
        self.tree = []

    def __get_log_odds__(self, p):
        """Return ``log(p / (1 - p))``, with ``p`` clipped away from 0 and 1 so the result is finite."""
        p = np.clip(p, self._EPS, 1 - self._EPS)
        return np.log(p/(1-p))

    def __get_p__(self, log_odds):
        """Return the logistic sigmoid of ``log_odds``.

        Written as ``exp(-log(1 + exp(-x)))`` so that large positive or
        negative scores saturate to 1 or 0 instead of producing ``inf/inf``.
        """
        return np.exp(-np.logaddexp(0.0, -np.asarray(log_odds, dtype=float)))

    def _raw_scores(self, X):
        """Return the accumulated log-odds score of every row of ``X``."""
        F = np.full(X.shape[0], self.F0, dtype=float)
        for dt in self.tree:
            # Leaf values were overwritten with the Newton steps during fit.
            F += self.learning_rate * dt.tree_.value[dt.apply(X), 0, 0]
        return F

    def fit(self, X, y):
        """Fit the ensemble on features ``X`` and binary 0/1 labels ``y``.

        Any trees left from a previous call are discarded first.
        """
        # Work on a plain float array so arithmetic and boolean masks are
        # positional regardless of the container (e.g. a pandas Series).
        y = np.asarray(y, dtype=float).ravel()
        self.tree = []
        self.F0 = self.__get_log_odds__(y.mean())
        F = np.full(X.shape[0], self.F0, dtype=float)
        for _ in range(self.n_estimators):
            p = self.__get_p__(F)
            r = y - p
            hessian = p * (1 - p)
            dt = DecisionTreeRegressor(max_depth=self.max_depth, random_state=self.random_state)
            dt.fit(X, r)
            leaves_indices = dt.apply(X)
            for id_ in np.unique(leaves_indices):
                filter_ = leaves_indices == id_
                denominator = hessian[filter_].sum()
                if denominator < self._MIN_DENOMINATOR:
                    # Every sample in this leaf is saturated; skip the update
                    # rather than divide by (almost) zero.
                    gamma = 0.0
                else:
                    gamma = r[filter_].sum() / denominator
                F[filter_] += self.learning_rate*gamma
                # Store the step in the tree so predict can read it back.
                dt.tree_.value[id_, 0, 0] = gamma
            self.tree.append(dt)

    def predict(self, X):
        """Return an integer array of 0/1 labels: 1 where the predicted probability is >= ``threshold``."""
        prob = self.__get_p__(self._raw_scores(X))
        return (prob >= self.threshold).astype(int)

In [72]:
from sklearn.datasets import make_classification

In [83]:
# Synthetic two-feature binary problem used as a quick sanity check.
# Both calls are seeded so the generated data and the 80/20 split are the same on every run.
X, y = make_classification(n_samples=1000, n_features=2, n_classes=2, n_redundant=0, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [89]:
# Train the custom booster on the synthetic training split with 100 rounds;
# max_depth and learning_rate stay at the class defaults (1 and 0.1).
gb = GradientBoosting(n_estimators=100)
gb.fit(X_train, y_train)

In [90]:
# Hard 0/1 label predictions for the held-out synthetic rows.
y_hat = gb.predict(X_test)

In [91]:
# Per-class precision / recall / F1 of the custom booster on the synthetic test split.
print(classification_report(y_test, y_hat))

              precision    recall  f1-score   support

           0       0.87      0.98      0.92       101
           1       0.98      0.85      0.91        99

    accuracy                           0.92       200
   macro avg       0.92      0.91      0.91       200
weighted avg       0.92      0.92      0.91       200



In [92]:
# Load the weather dataset from a CSV expected in the working directory,
# then list its columns with their non-null counts and dtypes.
df = pd.read_csv('weatherAUS.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           145460 non-null  object 
 1   Location       145460 non-null  object 
 2   MinTemp        143975 non-null  float64
 3   MaxTemp        144199 non-null  float64
 4   Rainfall       142199 non-null  float64
 5   Evaporation    82670 non-null   float64
 6   Sunshine       75625 non-null   float64
 7   WindGustDir    135134 non-null  object 
 8   WindGustSpeed  135197 non-null  float64
 9   WindDir9am     134894 non-null  object 
 10  WindDir3pm     141232 non-null  object 
 11  WindSpeed9am   143693 non-null  float64
 12  WindSpeed3pm   142398 non-null  float64
 13  Humidity9am    142806 non-null  float64
 14  Humidity3pm    140953 non-null  float64
 15  Pressure9am    130395 non-null  float64
 16  Pressure3pm    130432 non-null  float64
 17  Cloud9am       89572 non-null

In [93]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [94]:
# Build the modelling frame in one pass, leaving the raw `df` untouched:
#   1. keep only rows whose target (RainTomorrow) is known,
#   2. remove the listed columns (four measurement columns and Date),
#   3. discard every row that still has a missing value.
df_preprocessing = (
    df
    .dropna(subset=['RainTomorrow'])
    .drop(columns=['Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm', 'Date'])
    .dropna()
)
df_preprocessing.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 112925 entries, 0 to 145458
Data columns (total 18 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Location       112925 non-null  object 
 1   MinTemp        112925 non-null  float64
 2   MaxTemp        112925 non-null  float64
 3   Rainfall       112925 non-null  float64
 4   WindGustDir    112925 non-null  object 
 5   WindGustSpeed  112925 non-null  float64
 6   WindDir9am     112925 non-null  object 
 7   WindDir3pm     112925 non-null  object 
 8   WindSpeed9am   112925 non-null  float64
 9   WindSpeed3pm   112925 non-null  float64
 10  Humidity9am    112925 non-null  float64
 11  Humidity3pm    112925 non-null  float64
 12  Pressure9am    112925 non-null  float64
 13  Pressure3pm    112925 non-null  float64
 14  Temp9am        112925 non-null  float64
 15  Temp3pm        112925 non-null  float64
 16  RainToday      112925 non-null  object 
 17  RainTomorrow   112925 non-nul

In [99]:
# Features: every column from 'Location' through 'RainToday'
# (label-based .loc slices include both endpoints).
X = df_preprocessing.loc[:, 'Location':'RainToday']
# Target: RainTomorrow converted to integer category codes.
# NOTE(review): presumably 'No' -> 0 and 'Yes' -> 1 - confirm the column holds only those two values.
y = df_preprocessing['RainTomorrow'].astype('category').cat.codes

In [106]:
# Feature columns used for modelling below; the remaining columns of X
# (Location, the wind directions and RainToday) are left out.
numerical_columns = [
    'MinTemp',
    'MaxTemp',
    'Rainfall',
    'WindGustSpeed',
    'WindSpeed9am',
    'WindSpeed3pm',
    'Humidity9am',
    'Humidity3pm',
    'Pressure9am',
    'Pressure3pm',
    'Temp9am',
    'Temp3pm',
]

In [107]:
# Hold out 20% of the rows for evaluation, using only the columns in numerical_columns.
# random_state pins the shuffle so the train/test split is identical on every run.
X_train, X_test, y_train, y_test = train_test_split(X[numerical_columns], y, test_size=0.2, random_state=42)

In [123]:
# Train the custom booster on the weather training split: 400 rounds of depth-4 trees
# (learning_rate stays at the class default of 0.1). This rebinds `gb`.
gb = GradientBoosting(n_estimators=400, max_depth=4)
gb.fit(X_train, y_train)

In [124]:
# Hard 0/1 label predictions for the held-out weather rows.
y_hat = gb.predict(X_test)

In [125]:
# Per-class precision / recall / F1 of the custom booster on the weather test split.
print(classification_report(y_test, y_hat))

              precision    recall  f1-score   support

           0       0.87      0.95      0.91     17523
           1       0.76      0.52      0.62      5062

    accuracy                           0.86     22585
   macro avg       0.82      0.74      0.77     22585
weighted avg       0.85      0.86      0.85     22585



In [126]:
from sklearn.ensemble import GradientBoostingClassifier

In [129]:
# Reference run: scikit-learn's GradientBoostingClassifier with the same n_estimators,
# learning_rate and max_depth as the custom model, trained and scored on the same split.
sklearn_gbm = GradientBoostingClassifier(n_estimators=400, learning_rate=0.1, max_depth=4)
sklearn_gbm.fit(X_train, y_train)
# Note: this rebinds y_hat, replacing the custom model's predictions.
y_hat = sklearn_gbm.predict(X_test)
print(classification_report(y_test, y_hat))

              precision    recall  f1-score   support

           0       0.87      0.95      0.91     17523
           1       0.76      0.52      0.62      5062

    accuracy                           0.86     22585
   macro avg       0.82      0.74      0.77     22585
weighted avg       0.85      0.86      0.85     22585

