In [26]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from joblib import Parallel, delayed
from sklearn.model_selection import train_test_split
from sklearn.datasets import (
    make_regression,
    make_classification,
)
from sklearn.metrics import accuracy_score, mean_absolute_error,mean_squared_error

# Our model Random Forest

In [27]:
class RandomForest:
    def __init__(self, regression=False, n_estimators=100, max_depth=None,
                 max_features=1.0, n_jobs=-1, random_state=0, ccp_alpha=0.0):
        self.regression = regression
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.max_features = max_features
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.ccp_alpha = ccp_alpha
        self.trained_trees_info = []
        self.general_random = np.random.RandomState(self.random_state)

    #bootstrapping with random subspaces method
    def _rsm_bootstrapping(self, X, y):
        n_samples, n_features = X.shape
        if self.regression:
            max_features = self.max_features * n_features
        else:
            max_features = np.sqrt(n_features)

        sample_indexes = self.general_random.choice(n_samples, n_samples)
        features = self.general_random.choice(X.columns, round(max_features))
        X_b, y_b = X.iloc[sample_indexes][features], y.iloc[sample_indexes]

        return X_b, y_b

    def _train_tree(self, X, y):
        if self.regression:
            tree = DecisionTreeRegressor(max_depth=self.max_depth,
                                         random_state=self.random_state,
                                         ccp_alpha=self.ccp_alpha)
        else:
            tree = DecisionTreeClassifier(max_depth=self.max_depth,
                                          random_state=self.random_state,
                                          ccp_alpha=self.ccp_alpha)

        return tree.fit(X, y), X.columns

    def fit(self, X, y):
        boot_data = (self._rsm_bootstrapping(X, y) for _ in range(self.n_estimators))
        train_trees = (delayed(self._train_tree)(X_b, y_b) for X_b, y_b in boot_data)
        self.trained_trees_info = Parallel(n_jobs=self.n_jobs)(train_trees)

    def predict(self, samples):
        prediction = (delayed(tree_i.predict)(samples[tree_i_features])
                      for (tree_i, tree_i_features) in self.trained_trees_info)

        trees_predictions = pd.DataFrame(Parallel(n_jobs=self.n_jobs)(prediction))

        if self.regression:
            forest_prediction = trees_predictions.mean(axis=0)
        else:
            forest_prediction = trees_predictions.mode(axis=0).iloc[0]

        return np.array(forest_prediction)

# Dataset for classification

In [28]:
X_clsf, y_clsf = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=5,
    n_redundant=2,
    n_classes=2,
    random_state=42
)

X_clsf = pd.DataFrame(X_clsf, columns=[f"f{i}" for i in range(X_clsf.shape[1])])
y_clsf = pd.Series(y_clsf)

# Dataset for regression

In [29]:
from sklearn.datasets import make_regression

X_reg, y_reg = make_regression(
    n_samples=1000,
    n_features=10,
    n_informative=5,
    noise=10,
    random_state=42
)

X_reg = pd.DataFrame(X_reg, columns=[f"f{i}" for i in range(X_reg.shape[1])])
y_reg = pd.Series(y_reg)


# Noisy classification dataset

In [30]:
X_noisy, y_noisy = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=2,
    n_redundant=10,
    flip_y=0.25,
    class_sep=0.5,
    random_state=42
)

X_noisy = pd.DataFrame(X_noisy, columns=[f"f{i}" for i in range(X_noisy.shape[1])])
y_noisy = pd.Series(y_noisy)


# How number of estimators affects on metrics

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X_clsf, y_clsf, test_size=0.3)
for n in [1,5,10,50,100]:
    rf = RandomForest(n_estimators=n)
    rf.fit(X_train,y_train)
    preds = rf.predict(X_test)
    print(n,accuracy_score(y_test,preds))

1 0.5466666666666666
5 0.74
10 0.81
50 0.8733333333333333
100 0.8866666666666667


## At 10 estimators there is a jump from 5 estimators. After 10 probably there is overfitting

# How max depth insfluence?

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X_clsf, y_clsf, test_size=0.3)
for depth in [2,4,8,None]:
    rf = RandomForest(n_estimators=50,max_depth=depth)
    rf.fit(X_train,y_train)
    preds = rf.predict(X_test)
    print(depth,accuracy_score(y_test,preds))

2 0.8133333333333334
4 0.8633333333333333
8 0.8633333333333333
None 0.8533333333333334


## Starting from 4 depth metrics dont change significantly

# How parametr max feature effect?

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X_noisy, y_noisy, test_size=0.3)
for mf in [0.3, 0.5, 1.0]:
    rf = RandomForest(n_estimators=50, max_features=mf)
    rf.fit(X_train, y_train)
    preds = rf.predict(X_test)
    print(mf, accuracy_score(y_test, preds))


0.3 0.6966666666666667
0.5 0.6966666666666667
1.0 0.6966666666666667


## In the third experiment, changing the max_features parameter did not lead to noticeable changes in quality metrics. This is due to the specifics of the algorithm implementation: for the classification task, the number of features used is fixed and does not depend on the value of max_features.

# Comparing RF with single DT

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X_noisy, y_noisy, test_size=0.3)
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt_preds = dt.predict(X_test)

rf = RandomForest(n_estimators=50)
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)

print("Tree:", accuracy_score(y_test, dt_preds))
print("RF:", accuracy_score(y_test, rf_preds))


Tree: 0.6666666666666666
RF: 0.73


## We can see the difference between them. RF more stable because of number of tree. It reduces risk to overfit and variance

# How RF handle noisy data?

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X_clsf, y_clsf, test_size=0.3)
scores = []
for seed in range(10):
    rf = RandomForest(n_estimators=30, random_state=seed)
    rf.fit(X_train, y_train)
    preds = rf.predict(X_test)
    scores.append(accuracy_score(y_test, preds))

print("mean:", np.mean(scores))
print("std:", np.std(scores))


mean: 0.865
std: 0.028057480682025293


## RF has 0.865 average accuracy which is pretty well. While std is 0.02 showing its stability for random state

# Comparing RF and DT in regression

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X_reg, y_reg, test_size=0.3)
rf = RandomForest(
    regression=True,
    n_estimators=50,
    max_depth=None,
    random_state=42
)

rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)

print("RF MAE:", mean_absolute_error(y_test, rf_preds))
print("RF MSE:", mean_squared_error(y_test, rf_preds))

dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)
dt_preds = dt.predict(X_test)

print("DT MAE:", mean_absolute_error(y_test, dt_preds))
print("DT MSE:", mean_squared_error(y_test, dt_preds))


RF MAE: 23.76913003739627
RF MSE: 873.2022438193327
DT MAE: 22.135846779935033
DT MSE: 774.2636379055491


## Metrics shows RF with this parameters is worse than DT. Bias in RF increase, although variance reduced. Rf doesnt guarantee better metrics without proper parameters 