In [1]:
import numpy as np
import pandas as pd
from RandomForestClassifier import RandomForestClassifier

# For simplicity, subsamples of features will not be taken at each split of a tree in the forest.

In [2]:
from sklearn.datasets import make_classification

# Synthetic binary-classification data set.
# 3 of the 5 features are informative and none are redundant; with scikit-learn's
# default of no repeated features, the remaining 2 columns are random noise.
X, y = make_classification(
    n_samples=1500,          # 1500 samples
    n_features=5,            # 5 features total
    n_informative=3,         # 3 features are informative
    n_redundant=0,           # No redundant features
    n_classes=2,             # Binary classification
    n_clusters_per_class=2,  # 2 clusters per class
    flip_y=0.05,             # 5% of samples get a randomly assigned label (label noise)
    class_sep=1.5,           # Moderate class separation
    random_state=42          # Reproducibility
)

# Tabular view for inspection: feature columns keep their integer labels 0-4,
# and the target is appended as column 'y'. X and y themselves are left untouched.
df = pd.DataFrame(X).assign(y=y)

df.head()

Unnamed: 0,0,1,2,3,4,y
0,-1.482984,-0.966151,-1.876928,1.496683,-0.466759,0
1,-0.297867,2.132347,3.20749,2.034843,-0.630846,1
2,0.70266,-2.733089,-0.37391,-0.991398,0.38575,0
3,0.014149,0.015662,-2.879848,0.565224,1.452118,0
4,-0.80228,-2.394589,-0.983142,0.919953,0.246084,1


In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# identical across runs so both classifiers below are scored on the same rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Fit the RandomForestClassifier imported from the local module (50 trees)
# on the training split, then predict the held-out split.
clf = RandomForestClassifier(num_trees=50)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)

# Test features side by side with the true label ('y') and the prediction ('preds').
df_fin = pd.DataFrame(X_test).assign(y=y_test, preds=preds)

# Accuracy = fraction of test samples whose prediction equals the true label.
accuracy = np.mean(preds == y_test)
print(f'Accuracy: {accuracy}')

df_fin.head()

Accuracy: 0.93


Unnamed: 0,0,1,2,3,4,y,preds
0,0.518793,1.569475,-1.627176,1.939362,-0.998307,1,1
1,-0.549789,-2.161441,-0.988953,-1.378465,-0.65295,0,0
2,1.25535,-1.790209,-1.445198,-1.530221,1.540441,0,0
3,0.73918,-2.353526,-0.93159,0.527609,-2.050357,0,0
4,-0.104358,0.600644,0.601581,1.130654,0.450156,1,1


In [5]:
# Import scikit-learn's forest under an alias. The bare name `RandomForestClassifier`
# is already bound to the class imported from the local module in the first cell;
# rebinding it here would make the earlier cell that calls
# `RandomForestClassifier(num_trees=50)` raise a TypeError if it were re-run afterwards.
from sklearn.ensemble import RandomForestClassifier as SklearnRandomForestClassifier

# Reference model: scikit-learn's random forest with the same number of trees.
# random_state pins the bootstrap / feature sampling so the reported accuracy is
# reproducible, consistent with the seed used for data generation and splitting.
clf = SklearnRandomForestClassifier(n_estimators=50, random_state=42)

clf.fit(X_train, y_train)

preds = clf.predict(X_test)

# Test features side by side with the true label ('y') and the prediction ('preds').
df_fin = pd.DataFrame(X_test)
df_fin['y'] = y_test
df_fin['preds'] = preds

# Accuracy = fraction of test samples whose prediction equals the true label.
print(f'Accuracy: {np.mean(preds == y_test)}')

df_fin.head()

Accuracy: 0.9366666666666666


Unnamed: 0,0,1,2,3,4,y,preds
0,0.518793,1.569475,-1.627176,1.939362,-0.998307,1,1
1,-0.549789,-2.161441,-0.988953,-1.378465,-0.65295,0,0
2,1.25535,-1.790209,-1.445198,-1.530221,1.540441,0,0
3,0.73918,-2.353526,-0.93159,0.527609,-2.050357,0,0
4,-0.104358,0.600644,0.601581,1.130654,0.450156,1,1
