In [15]:
import sys
import os

# Add the project root directory to sys.path (presumably so the `decision_tree`
# package imported in the next cell can be found — confirm).
#
# The original joined os.getcwd() with an absolute path; os.path.join discards
# every component before an absolute one, so the cwd never contributed. The
# location can now be overridden with the DECISION_TREES_PROJECT_ROOT environment
# variable; when it is unset or empty, the original machine-specific path is used.
project_root = os.path.abspath(
    os.environ.get("DECISION_TREES_PROJECT_ROOT")
    or "/Users/adamboda/Documents/python/decision_trees_from_scratch"
)
# Guard against duplicate entries when the cell is re-run.
if project_root not in sys.path:
    sys.path.append(project_root)

In [16]:
import pandas as pd
import numpy as np
from decision_tree.decision_tree import DecisionTree

In [72]:
class RandomForestClassifier():
    """
    Ensemble classifier that trains several `DecisionTree` base learners on
    row samples of the training data and predicts by majority vote.

    Parameters
    ----------
    max_depth, min_information_gain, min_samples_per_leaf, feature_selection
        Passed positionally, in this order, to every `DecisionTree` constructor.
    n_learners : int
        Number of base learners (trees) to train.
    bootstrap_sample_size : int
        Number of rows drawn (with replacement) for each learner. `None` means
        "as many rows as the training data has".
    bootstrap : bool
        If True every learner gets its own sample drawn with replacement.
        If False every learner is trained on the full, unsampled training data.
    oob_score : bool
        Stored on the instance only; no method of this class uses it yet.
    random_state : int, optional
        Seed for the row sampling. When omitted, NumPy's global random state is
        used (so `np.random.seed(...)` still controls the sampling).
    """

    def __init__(self,
                 max_depth: int,
                 min_information_gain: float,
                 min_samples_per_leaf: int,
                 feature_selection: str,
                 n_learners: int,
                 bootstrap_sample_size: int,
                 bootstrap: bool,
                 oob_score: bool,
                 random_state=None
                ):
        self.max_depth = max_depth
        self.min_information_gain = min_information_gain
        self.min_samples_per_leaf = min_samples_per_leaf
        self.feature_selection = feature_selection
        self.n_learners = n_learners
        self.bootstrap_sample_size = bootstrap_sample_size
        self.bootstrap = bootstrap
        self.oob_score = oob_score
        self.random_state = random_state
        # A dedicated generator when seeded; otherwise fall back to the module-level
        # NumPy functions so the unseeded behaviour matches the previous version.
        self._rng = np.random.RandomState(random_state) if random_state is not None else np.random
        self.trained_base_learners = []

    def create_bootstrap_data(self, X: pd.DataFrame, Y) -> tuple:
        """
        Build one (features, labels) sample per base learner.

        Parameters
        ----------
        X : pd.DataFrame
            Training features, one row per observation.
        Y : pd.DataFrame or pd.Series
            Training labels aligned row-by-row with `X`.

        Returns
        -------
        tuple
            `(samples_X, samples_Y)`: two lists of length `n_learners` whose
            i-th entries contain the same sampled rows of `X` and `Y`.

        Raises
        ------
        ValueError
            If `X` and `Y` do not have the same number of rows.
        """
        n_rows = X.shape[0]
        if len(Y) != n_rows:
            raise ValueError(
                f"X and Y must have the same number of rows, got {n_rows} and {len(Y)}"
            )

        sample_size = n_rows if self.bootstrap_sample_size is None else self.bootstrap_sample_size

        bootstrap_samples_X = []
        bootstrap_samples_Y = []

        for _ in range(self.n_learners):
            if self.bootstrap:
                sample_row_idx = self._rng.choice(n_rows, size=sample_size, replace=True)
            else:
                # No resampling requested: every learner sees each row exactly once.
                sample_row_idx = np.arange(n_rows)
            # Row-only positional indexing works for both DataFrame and Series labels.
            bootstrap_samples_X.append(X.iloc[sample_row_idx])
            bootstrap_samples_Y.append(Y.iloc[sample_row_idx])

        return bootstrap_samples_X, bootstrap_samples_Y

    def train(self, train_X: pd.DataFrame, train_Y) -> None:
        """
        Train one decision tree per sample and store them in `trained_base_learners`.

        Any previously trained learners are replaced, so calling `train` again
        re-fits the forest instead of growing it. The stored list is only
        swapped once every tree has been trained.
        """
        bootstrap_samples_X, bootstrap_samples_Y = self.create_bootstrap_data(train_X, train_Y)

        fitted_learners = []
        for sample_X, sample_Y in zip(bootstrap_samples_X, bootstrap_samples_Y):
            base_learner = DecisionTree(self.max_depth, self.min_information_gain, self.min_samples_per_leaf, self.feature_selection)
            base_learner.train(sample_X, sample_Y)
            fitted_learners.append(base_learner)

        self.trained_base_learners = fitted_learners

    def predict_one_sample(self, row_data: pd.Series):
        """
        Predict the class of a single row by majority vote over the base learners.

        Each learner votes for the class with the highest value in the mapping
        returned by its `predict_prob_one_sample`. Ties between classes are
        resolved in favour of the class that received its first vote earliest,
        which keeps the result independent of hash/set ordering.

        Raises
        ------
        ValueError
            If no base learner has been trained yet.
        """
        if not self.trained_base_learners:
            raise ValueError("No trained base learners: call train() before predicting")

        vote_counts = {}
        for base_learner in self.trained_base_learners:
            predicted_probabilities = base_learner.predict_prob_one_sample(row_data)
            predicted_class = max(predicted_probabilities, key=predicted_probabilities.get)
            vote_counts[predicted_class] = vote_counts.get(predicted_class, 0) + 1

        # dicts preserve insertion order and max() returns the first maximum,
        # so the earliest-voted class wins a tie.
        return max(vote_counts, key=vote_counts.get)

    def predict(self, data: pd.DataFrame) -> pd.Series:
        """
        Predict a class for every row of `data`.

        Returns a Series indexed like `data` holding the majority-vote class per row.
        """
        return data.apply(self.predict_one_sample, axis=1)


In [73]:
# Forest configuration used for the run below, one hyperparameter per line.
classifier = RandomForestClassifier(
    max_depth=15,
    min_information_gain=1e-6,
    min_samples_per_leaf=5,
    feature_selection='log',
    n_learners=10,
    bootstrap_sample_size=100,
    bootstrap=True,
    oob_score=True,
)

In [19]:
# Explicit dtypes for the numeric columns of the training file.
train_dtypes = {
    'PassengerId': int,
    'Survived': int,
    'Pclass': int,
    'Age': float,
    'SibSp': int,
    'Parch': int,
    'Fare': float,
}
train_data = pd.read_csv('./train.csv').astype(train_dtypes)

# Everything except the label and these text columns is used as a feature.
# NOTE(review): PassengerId stays in the feature set — confirm that is intended.
train_excluded_columns = ["Survived", "Name", "Sex", "Ticket", "Cabin", "Embarked"]
X_train = train_data.loc[:, ~train_data.columns.isin(train_excluded_columns)]

# Labels are kept as a single-column DataFrame rather than a Series.
Y_train = train_data[["Survived"]]

X_train.shape, Y_train.shape

((891, 6), (891, 1))

In [33]:
# Explicit dtypes for the numeric columns of the test file.
test_dtypes = {
    'PassengerId': int,
    'Pclass': int,
    'Age': float,
    'SibSp': int,
    'Parch': int,
    'Fare': float,
}
test_data = pd.read_csv('test.csv').astype(test_dtypes)

# Same exclusion list as for the training features; the isin() mask simply
# skips any listed name that is not a column of this frame.
test_excluded_columns = ["Survived", "Name", "Sex", "Ticket", "Cabin", "Embarked"]
test_X = test_data.loc[:, ~test_data.columns.isin(test_excluded_columns)]

test_X

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
0,892,3,34.5,0,0,7.8292
1,893,3,47.0,1,0,7.0000
2,894,2,62.0,0,0,9.6875
3,895,3,27.0,0,0,8.6625
4,896,3,22.0,1,1,12.2875
...,...,...,...,...,...,...
413,1305,3,,0,0,8.0500
414,1306,1,39.0,0,0,108.9000
415,1307,3,38.5,0,0,7.2500
416,1308,3,,0,0,8.0500


In [74]:
# Fit the forest on the training features and labels loaded above.
classifier.train(X_train, Y_train)

In [75]:
# Predict a class label for each row of test_X.
predictions = classifier.predict(test_X)

# Bare expression so the notebook displays the predictions.
predictions

0      1
1      1
2      1
3      1
4      1
      ..
413    1
414    0
415    1
416    1
417    1
Length: 418, dtype: int64