In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from RegressionTree import RegressionTree

In [11]:
from sklearn.datasets import make_regression

# Build a synthetic regression problem: 1000 samples with 5 features, all of
# them informative, Gaussian noise of scale 10 on the target, and a fixed seed
# so the same data is generated on every run. coef=False means only (X, y) is
# returned, without the underlying linear coefficients.
X, y = make_regression(
    n_samples=1000, n_features=5, n_informative=5,
    noise=10.0, random_state=42, coef=False,
)

# Tabular view for inspection: feature columns are labelled 0..4 and the
# target is appended as column 'y'.
df = pd.DataFrame(X).assign(y=y)

df.head()

Unnamed: 0,0,1,2,3,4,y
0,2.056544,0.606851,0.482688,-1.130888,0.420094,70.771314
1,-0.799192,-0.645964,-0.182896,-0.482744,1.374876,-51.604236
2,1.076007,-0.796026,-0.751969,0.021312,-0.319054,-37.000448
3,-0.103255,-0.828497,1.489863,-1.643189,-1.600904,-83.535099
4,-2.063403,0.503252,-0.645572,-0.31735,-1.661083,-83.400394


In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# identical across runs so both trees below are scored on the same rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
def r2_score(y_true, y_pred):
    """Return the coefficient of determination (R^2) of a set of predictions.

    Computes ``1 - SSE / SST`` where SSE is the sum of squared residuals
    ``(y_true - y_pred) ** 2`` and SST is the sum of squared deviations of
    ``y_true`` from its own mean.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``.

    Returns
    -------
    float
        The R^2 score. Special cases:

        * empty ``y_true``: ``nan`` (the score is undefined);
        * ``y_true`` with zero variance (SST == 0): ``1.0`` if the
          predictions are exact, otherwise ``0.0``, instead of dividing by
          zero and returning ``nan``/``-inf``.
    """
    # Cast to float so integer inputs (in particular unsigned dtypes) cannot
    # wrap around when the residuals are computed.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Nothing to score; return nan explicitly rather than via 0/0 warnings.
    if y_true.size == 0:
        return float("nan")

    sse = np.sum((y_true - y_pred) ** 2)
    sst = np.sum((y_true - np.mean(y_true)) ** 2)

    # A constant target makes the ratio undefined; fall back to a finite
    # score rather than emitting a divide-by-zero RuntimeWarning.
    if sst == 0:
        return 1.0 if sse == 0 else 0.0

    return 1 - (sse / sst)

In [14]:
# Fit the project's own RegressionTree (imported from the local
# RegressionTree module) on the training split and predict the held-out rows.
tree = RegressionTree(max_depth=200)
tree.fit(X_train, y_train)
preds = tree.predict(X_test)

# Report the test-set error: RMSE in target units and R^2 from the helper
# defined earlier in this notebook.
rmse = np.sqrt(np.mean(np.square(preds - y_test)))
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f'R2 Score: {r2_score(y_test,preds)}')

# Put the test features, true targets and predictions side by side so
# individual rows can be inspected.
res_df = pd.DataFrame(X_test).assign(y=y_test, preds=preds)

res_df.head()

Root Mean Squared Error: 32.82
R2 Score: 0.7187965337824684


Unnamed: 0,0,1,2,3,4,y,preds
0,-0.543425,-0.032753,-0.573662,-0.712846,-0.546859,-49.53926,-64.540736
1,-0.357029,-1.517874,-0.293967,0.890383,-0.071599,-58.226259,-85.45373
2,1.807197,1.450928,-0.077443,-1.682659,-1.195294,47.171144,70.787631
3,0.534817,-0.886681,-0.307808,1.228981,-0.838586,-4.063442,-0.610955
4,0.65045,1.401599,0.522514,-1.50308,0.418398,56.864649,102.964081


In [15]:
from sklearn.tree import DecisionTreeRegressor

# Baseline: scikit-learn's DecisionTreeRegressor with the same depth cap as
# the custom tree, trained and scored on the same split.
#
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at every split even with the default "best" splitter, so ties
# between equally good splits could otherwise be broken differently from run
# to run and the reported metrics would not be reproducible.
tree2 = DecisionTreeRegressor(max_depth=200, random_state=42)

tree2.fit(X_train,y_train)

# NOTE: preds, rmse and res_df are rebound here, replacing the values
# computed for the custom tree in the previous cell.
preds = tree2.predict(X_test)

# Report the test-set error: RMSE in target units and R^2 from the helper
# defined earlier in this notebook.
rmse = np.sqrt(np.mean((preds - y_test) ** 2))
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f'R2 Score: {r2_score(y_test,preds)}')

# Test features, true targets and predictions side by side for inspection.
res_df = pd.DataFrame(X_test)
res_df['y'] = y_test
res_df['preds'] = preds

res_df.head()

Root Mean Squared Error: 34.19
R2 Score: 0.694846181140979


Unnamed: 0,0,1,2,3,4,y,preds
0,-0.543425,-0.032753,-0.573662,-0.712846,-0.546859,-49.53926,-64.540736
1,-0.357029,-1.517874,-0.293967,0.890383,-0.071599,-58.226259,-95.11616
2,1.807197,1.450928,-0.077443,-1.682659,-1.195294,47.171144,70.787631
3,0.534817,-0.886681,-0.307808,1.228981,-0.838586,-4.063442,-0.610955
4,0.65045,1.401599,0.522514,-1.50308,0.418398,56.864649,101.472375
