In [11]:
import numpy as np
import pandas as pd
from RandomForestRegressor import RandomForestRegressor

#for simplicity, subsamples of features will not bee taken at each split of a tree in the forest

In [12]:
from sklearn.datasets import make_regression

X, y = make_regression(
    n_samples=1000,        # Number of samples
    n_features=10,          # Number of features
    n_informative=10,       # Number of features that are actually useful
    noise=10.0,            # Amount of noise in the output
    random_state=42,       # For reproducibility
    coef=False             # Don't return coefficients (we'll interpret ourselves)
)

df = pd.DataFrame(X)
df['y'] = y

df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,y
0,1.575876,0.891863,-0.726774,-0.047799,-0.657035,-1.113295,-0.877617,0.994558,-0.299696,-0.76176,-59.095196
1,0.535459,1.266661,-1.779875,1.20901,-0.113158,-0.971062,1.064171,-0.555273,0.74132,-0.987523,95.699008
2,-0.686279,0.157128,0.145836,-1.432671,-0.666849,0.17619,0.585299,0.197917,-0.404362,-0.607472,-107.883232
3,0.925282,0.395914,0.676357,0.168651,1.574147,-0.117172,-0.037687,-0.345907,0.895796,-0.837381,49.233736
4,-0.558922,1.073632,1.565524,0.377212,0.13297,-1.026515,-0.06575,-0.700121,-1.523187,1.195047,144.75538


In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
def r2_score(y_true, y_pred):
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    y_mean = np.mean(y_true)
    sse = np.sum((y_true - y_pred) ** 2)
    sst = np.sum((y_true - y_mean) ** 2)
    r2 = 1 - (sse / sst)
    return r2

In [15]:
clf = RandomForestRegressor(num_trees=20)
clf.fit(X_test,y_test)

preds = clf.predict(X_test)

df_fin = pd.DataFrame(X_test)
df_fin['y'] = y_test
df_fin['preds'] = preds

r2 = r2_score(y_test, preds)

rmse = np.sqrt(np.mean((preds - y_test) ** 2))
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f'R2 Score: {r2:.2f}')

df_fin.head()

Root Mean Squared Error: 25.89
R2 Score: 0.96


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,y,preds
0,0.997266,0.108014,-0.066449,1.18189,0.957994,-0.906649,-0.358079,0.663546,0.381426,-1.026081,33.89365,42.367938
1,0.106183,-0.386113,-0.057837,-0.637809,0.107984,1.015221,1.127788,2.606859,0.059692,0.193406,85.482613,68.117867
2,0.31111,-0.731632,2.015275,-1.889649,0.049308,1.890441,1.290644,0.76884,0.363116,-0.608227,-8.229056,-39.16657
3,0.021093,0.257373,-0.067277,-1.131067,0.432353,-0.630846,-1.074661,-0.717182,-1.321556,-2.032354,-301.248114,-254.574075
4,0.87399,1.983634,0.182994,0.078439,0.386586,0.01188,-0.834271,1.359298,-0.656624,-0.14714,34.686744,28.189096


In [16]:
from sklearn.ensemble import RandomForestRegressor

clf = RandomForestRegressor(n_estimators=20)

clf.fit(X_test,y_test)

preds = clf.predict(X_test)

df_fin = pd.DataFrame(X_test)
df_fin['y'] = y_test
df_fin['preds'] = preds

r2 = r2_score(y_test, preds)

rmse = np.sqrt(np.mean((preds - y_test) ** 2))
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f'R2 Score: {r2:.2f}')

df_fin.head()

Root Mean Squared Error: 25.23
R2 Score: 0.96


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,y,preds
0,0.997266,0.108014,-0.066449,1.18189,0.957994,-0.906649,-0.358079,0.663546,0.381426,-1.026081,33.89365,31.539365
1,0.106183,-0.386113,-0.057837,-0.637809,0.107984,1.015221,1.127788,2.606859,0.059692,0.193406,85.482613,74.674436
2,0.31111,-0.731632,2.015275,-1.889649,0.049308,1.890441,1.290644,0.76884,0.363116,-0.608227,-8.229056,-40.466085
3,0.021093,0.257373,-0.067277,-1.131067,0.432353,-0.630846,-1.074661,-0.717182,-1.321556,-2.032354,-301.248114,-255.957943
4,0.87399,1.983634,0.182994,0.078439,0.386586,0.01188,-0.834271,1.359298,-0.656624,-0.14714,34.686744,9.310426
