In [None]:
# TODO
# 1. Add comments and docstrings.
# 2. Investigate why the accuracy metrics don't match sklearn's implementation.

In [1]:
import numpy as np
import pandas as pd
from pprint import pprint
from multiprocessing import Pool, cpu_count
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from decision_tree_for_classification_from_scratch import Node, DecisionTree

In [43]:
class RandomForest:
    """Random-forest classifier assembled from the project's ``DecisionTree``.

    Every tree is trained on a random subset of the rows (``bootstrap_frac`` of
    the training set, drawn *without* replacement) and on a random subset of
    ``int(sqrt(n_features))`` columns that is chosen once per tree.
    Predictions are a hard majority vote over the trees.

    NOTE(review): these design choices differ from
    ``sklearn.ensemble.RandomForestClassifier``, which draws bootstrap rows
    with replacement, re-draws the candidate features at every split rather
    than once per tree, and averages class probabilities instead of counting
    votes. That is a likely contributor to any gap between the two models.

    Parameters
    ----------
    n_estimator : int, default 100
        Number of trees to train.
    max_depth : int, default 4
        Forwarded to ``DecisionTree``.
    bootstrap_frac : float, default 0.5
        Fraction of the training rows given to each tree.
    min_samples_for_splitting : int, default 2
        Forwarded to ``DecisionTree``.
    random_state : int or None, default None
        Seed for the row/column sampling. ``None`` keeps the previous
        behaviour of drawing from NumPy's global random state.
    verbose : bool, default True
        Print one progress line per fitted tree.
    """

    def __init__(self, n_estimator=100, max_depth=4, bootstrap_frac=0.5, min_samples_for_splitting=2,
                 random_state=None, verbose=True):
        self.n_estimator = n_estimator
        self.max_depth = max_depth
        self.bootstrap_frac = bootstrap_frac
        self.min_samples_for_splitting = min_samples_for_splitting
        self.random_state = random_state
        self.verbose = verbose
        self.forest = []

    def fit(self, X, y):
        """Train ``n_estimator`` trees on random row/column subsets of ``X``.

        ``X`` must be a pandas DataFrame and ``y`` a pandas Series sharing its
        index, because the target rows are looked up by index label. Any
        previously fitted trees are discarded.

        Raises
        ------
        ValueError
            If ``X`` has no rows or no columns.
        """
        if len(X) == 0 or len(X.columns) == 0:
            raise ValueError("X must contain at least one row and one column")

        # Clamp to 1 so a tiny fraction (or a single column) never produces an
        # empty training subset.
        features_count = max(1, int(np.sqrt(len(X.columns))))
        row_count = max(1, int(self.bootstrap_frac * len(X)))

        # One generator shared by all trees: its state advances with every
        # draw, so trees differ while the whole fit stays reproducible.
        # ``None`` makes pandas fall back to NumPy's global random state.
        rng = None if self.random_state is None else np.random.RandomState(self.random_state)

        # Build into a fresh list so that calling fit() twice does not stack
        # the new trees on top of the old ones.
        forest = []
        for tree_num in range(self.n_estimator):
            if self.verbose:
                print(f"Fit tree #{(tree_num+1)}")
            X_subset = (
                X.sample(row_count, random_state=rng)
                 .sample(features_count, axis=1, random_state=rng)
            )
            y_subset = y.loc[X_subset.index]
            dt = DecisionTree(max_depth=self.max_depth, min_samples_for_splitting=self.min_samples_for_splitting)
            dt.fit(X_subset, y_subset)
            forest.append(dt)
        self.forest = forest

    def predict(self, X):
        """Return the majority-vote label for every row of ``X`` as a list.

        Each tree's ``predict`` is expected to return a pair whose first item
        holds one label per row; the second item is ignored. Ties go to the
        label that sorts first, because ``np.unique`` returns sorted classes
        and ``np.argmax`` picks the first maximum.

        Raises
        ------
        ValueError
            If the forest has not been fitted.
        """
        if not self.forest:
            raise ValueError("RandomForest is not fitted yet; call fit() first")

        predictions = []
        for tree in self.forest:
            tree_predict, _ = tree.predict(X)
            predictions.append(tree_predict)
        # Shape (n_trees, n_samples): column i holds every tree's vote for row i.
        predictions = np.array(predictions)

        output = []
        for sample in range(predictions.shape[-1]):
            classes, counts = np.unique(predictions[:, sample], return_counts=True)
            output.append(classes[np.argmax(counts)])
        return output

In [44]:
# Toy "play tennis" dataset, written one observation per row for readability.
_toy_columns = ["Outlook", "Temp", "Humidity", "Wind", "Tennis"]
_toy_rows = [
    ("Sunny",    30, "High", "Weak",   "No"),
    ("Sunny",    30, "High", "Strong", "No"),
    ("Overcast", 30, "High", "Weak",   "Yes"),
    ("Rain",     20, "High", "Weak",   "Yes"),
    ("Rain",     10, "Norm", "Strong", "Yes"),
    ("Rain",     10, "Norm", "Weak",   "No"),
    ("Overcast", 10, "Norm", "Weak",   "Yes"),
    ("Sunny",    20, "High", "Weak",   "No"),
    ("Sunny",    10, "Norm", "Weak",   "Yes"),
    ("Rain",     20, "Norm", "Strong", "Yes"),
    ("Sunny",    20, "Norm", "Strong", "Yes"),
    ("Overcast", 20, "High", "Strong", "Yes"),
    ("Overcast", 30, "Norm", "Weak",   "Yes"),
    ("Rain",     20, "High", "Strong", "No"),
]
# Transpose the rows back into the column-oriented mapping used to build the frame.
data_dict = {
    column: list(values)
    for column, values in zip(_toy_columns, zip(*_toy_rows))
}
df = pd.DataFrame(data_dict)
df

Unnamed: 0,Outlook,Temp,Humidity,Wind,Tennis
0,Sunny,30,High,Weak,No
1,Sunny,30,High,Strong,No
2,Overcast,30,High,Weak,Yes
3,Rain,20,High,Weak,Yes
4,Rain,10,Norm,Strong,Yes
5,Rain,10,Norm,Weak,No
6,Overcast,10,Norm,Weak,Yes
7,Sunny,20,High,Weak,No
8,Sunny,10,Norm,Weak,Yes
9,Rain,20,Norm,Strong,Yes


In [45]:
# Predictors are every column from 'Outlook' through 'Wind'; 'Tennis' is the target.
X, y = df.loc[:, 'Outlook':'Wind'], df['Tennis']

In [58]:
# Small forest (20 shallow trees) as a smoke test on the 14-row toy dataset.
rf = RandomForest(n_estimator=20, max_depth=3)
rf.fit(X, y)

Fit tree #1
Fit tree #2
Fit tree #3
Fit tree #4
Fit tree #5
Fit tree #6
Fit tree #7
Fit tree #8
Fit tree #9
Fit tree #10
Fit tree #11
Fit tree #12
Fit tree #13
Fit tree #14
Fit tree #15
Fit tree #16
Fit tree #17
Fit tree #18
Fit tree #19
Fit tree #20


In [59]:
# Predict on the same rows the forest was fitted on, so the metrics computed
# from y_hat describe training fit rather than generalisation.
y_hat = rf.predict(X)

In [60]:
# Per-class precision / recall / F1 of the toy forest (evaluated on its own training rows).
print(classification_report(y, y_hat))

              precision    recall  f1-score   support

          No       1.00      0.60      0.75         5
         Yes       0.82      1.00      0.90         9

    accuracy                           0.86        14
   macro avg       0.91      0.80      0.82        14
weighted avg       0.88      0.86      0.85        14



In [61]:
# Load weatherAUS.csv (path is relative to the working directory).
# NOTE: this rebinds `df`, which held the toy tennis frame until now.
df = pd.read_csv('weatherAUS.csv')
# Print dtypes and non-null counts to see which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           145460 non-null  object 
 1   Location       145460 non-null  object 
 2   MinTemp        143975 non-null  float64
 3   MaxTemp        144199 non-null  float64
 4   Rainfall       142199 non-null  float64
 5   Evaporation    82670 non-null   float64
 6   Sunshine       75625 non-null   float64
 7   WindGustDir    135134 non-null  object 
 8   WindGustSpeed  135197 non-null  float64
 9   WindDir9am     134894 non-null  object 
 10  WindDir3pm     141232 non-null  object 
 11  WindSpeed9am   143693 non-null  float64
 12  WindSpeed3pm   142398 non-null  float64
 13  Humidity9am    142806 non-null  float64
 14  Humidity3pm    140953 non-null  float64
 15  Pressure9am    130395 non-null  float64
 16  Pressure3pm    130432 non-null  float64
 17  Cloud9am       89572 non-null

In [62]:
# Preview the first rows of the raw weather frame.
df.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [63]:
# Build the modelling frame in one chain, leaving the raw `df` untouched:
#   1. keep only rows that have a target value,
#   2. drop five columns ('Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm', 'Date'),
#   3. drop every remaining row that still contains a missing value.
df_preprocessing = (
    df.dropna(subset=['RainTomorrow'])
      .drop(columns=['Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm', 'Date'])
      .dropna()
)
df_preprocessing.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 112925 entries, 0 to 145458
Data columns (total 18 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Location       112925 non-null  object 
 1   MinTemp        112925 non-null  float64
 2   MaxTemp        112925 non-null  float64
 3   Rainfall       112925 non-null  float64
 4   WindGustDir    112925 non-null  object 
 5   WindGustSpeed  112925 non-null  float64
 6   WindDir9am     112925 non-null  object 
 7   WindDir3pm     112925 non-null  object 
 8   WindSpeed9am   112925 non-null  float64
 9   WindSpeed3pm   112925 non-null  float64
 10  Humidity9am    112925 non-null  float64
 11  Humidity3pm    112925 non-null  float64
 12  Pressure9am    112925 non-null  float64
 13  Pressure3pm    112925 non-null  float64
 14  Temp9am        112925 non-null  float64
 15  Temp3pm        112925 non-null  float64
 16  RainToday      112925 non-null  object 
 17  RainTomorrow   112925 non-nul

In [64]:
# Target: 'RainTomorrow'. Predictors: every column from 'Location' through 'RainToday'.
y = df_preprocessing['RainTomorrow']
X = df_preprocessing.loc[:, 'Location':'RainToday']

In [65]:
# Names of the float-valued feature columns; the split below keeps only these.
numerical_columns = [
    'MinTemp',
    'MaxTemp',
    'Rainfall',
    'WindGustSpeed',
    'WindSpeed9am',
    'WindSpeed3pm',
    'Humidity9am',
    'Humidity3pm',
    'Pressure9am',
    'Pressure3pm',
    'Temp9am',
    'Temp3pm',
]

In [66]:
# Hold out 20% of the rows for evaluation, using only the numerical features.
# The fixed seed keeps the split, and every metric computed from it,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X[numerical_columns], y, test_size=0.2, random_state=42
)

In [68]:
# The row-sampling keyword is `bootstrap_frac` (see RandomForest.__init__);
# the previous spelling `buggging_frac` raises TypeError on a fresh kernel.
rf = RandomForest(n_estimator=100, max_depth=5, bootstrap_frac=0.5, min_samples_for_splitting=50)
rf.fit(X_train, y_train)

Fit tree #1
Fit tree #2
Fit tree #3
Fit tree #4
Fit tree #5
Fit tree #6
Fit tree #7
Fit tree #8
Fit tree #9
Fit tree #10
Fit tree #11
Fit tree #12
Fit tree #13
Fit tree #14
Fit tree #15
Fit tree #16
Fit tree #17
Fit tree #18
Fit tree #19
Fit tree #20
Fit tree #21
Fit tree #22
Fit tree #23
Fit tree #24
Fit tree #25
Fit tree #26
Fit tree #27
Fit tree #28
Fit tree #29
Fit tree #30
Fit tree #31
Fit tree #32
Fit tree #33
Fit tree #34
Fit tree #35
Fit tree #36
Fit tree #37
Fit tree #38
Fit tree #39
Fit tree #40
Fit tree #41
Fit tree #42
Fit tree #43
Fit tree #44
Fit tree #45
Fit tree #46
Fit tree #47
Fit tree #48
Fit tree #49
Fit tree #50
Fit tree #51
Fit tree #52
Fit tree #53
Fit tree #54
Fit tree #55
Fit tree #56
Fit tree #57
Fit tree #58
Fit tree #59
Fit tree #60
Fit tree #61
Fit tree #62
Fit tree #63
Fit tree #64
Fit tree #65
Fit tree #66
Fit tree #67
Fit tree #68
Fit tree #69
Fit tree #70
Fit tree #71
Fit tree #72
Fit tree #73
Fit tree #74
Fit tree #75
Fit tree #76
Fit tree #77
Fit tree

In [71]:
# Training-set prediction is currently disabled; re-enable it together with the
# matching classification_report call to compare train vs. test scores.
# y_train_hat = rf.predict(X_train)
# Predict on the held-out test split.
y_test_hat = rf.predict(X_test)

In [72]:
# print(classification_report(y_train, y_train_hat))

In [73]:
# Per-class precision / recall / F1 of the custom forest on the held-out test split.
print(classification_report(y_test, y_test_hat))

              precision    recall  f1-score   support

          No       0.80      1.00      0.89     17581
         Yes       0.90      0.10      0.18      5004

    accuracy                           0.80     22585
   macro avg       0.85      0.55      0.53     22585
weighted avg       0.82      0.80      0.73     22585



In [74]:
from sklearn.ensemble import RandomForestClassifier

# Reference model with hyper-parameters mirroring the custom forest:
# 100 trees, depth 5, at least 50 samples to split, half of the rows per tree.
# n_estimators is spelled out so the parity is visible; random_state makes the
# baseline reproducible.
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    min_samples_split=50,
    max_samples=0.5,
    bootstrap=True,
    random_state=42,
)
clf.fit(X_train, y_train)
y_hat_clf = clf.predict(X_test)
print(classification_report(y_test, y_hat_clf))

              precision    recall  f1-score   support

          No       0.85      0.97      0.91     17581
         Yes       0.81      0.41      0.55      5004

    accuracy                           0.85     22585
   macro avg       0.83      0.69      0.73     22585
weighted avg       0.84      0.85      0.83     22585

