In [39]:
import numpy as np
import pandas as pd
from sklearn.utils import resample
from scipy.stats import mode
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from collections import Counter
import copy
import seaborn as sns

Results = {"Model":[],"Accuracy":[]}

## DecisionTree

In [40]:

def gini(y):
    hist = np.bincount(y)
    ps = hist / len(y)
    return 1 - np.sum(ps**2)

def entropy(y):
    hist = np.bincount(y)
    ps = hist / len(y)
    return -np.sum([p * np.log2(p) for p in ps if p > 0])

def misclassification_rate(y):
    hist = np.bincount(y)
    ps = hist / len(y)
    return 1 - np.max(ps)

class Node:
    def __init__(self, predicted_class):
        self.predicted_class = predicted_class
        self.feature_index = 0
        self.threshold = 0
        self.left = None
        self.right = None



class DecisionTree:
    def __init__(self, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1):
        self.criterion = criteria[criterion]
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.root = None

    def fit(self, X, y):
        self.n_classes = len(set(y))
        self.n_features = X.shape[1]
        self.root = self.grow_tree(X, y)

    def predict(self, X):
        return np.array([self.predict(inputs) for inputs in X])

    def best_split(self, X, y):
        m = y.size
        if m <= 1:
            return None, None

        # Compute the baseline criterion score
        best_score = self.criterion(y)
        best_idx, best_thr = None, None

        # Loop through all features
        for idx in range(self.n_features):
            thresholds, classes = zip(*sorted(zip(X[:, idx], y)))
            num_left = [0] * self.n_classes
            num_right = y.tolist().copy()

            for i in range(1, m):
                c = classes[i - 1]
                num_left[c] += 1
                num_right[c] -= 1
                if thresholds[i] == thresholds[i - 1]:
                    continue

                if i < self.min_samples_leaf or m - i < self.min_samples_leaf:
                    continue

                # Ensure no negatives in num_left and num_right
                num_left = [max(0, val) for val in num_left]
                num_right = [max(0, val) for val in num_right]

                score = (i * self.criterion(num_left) + (m - i) * self.criterion(num_right)) / m
                if score < best_score:
                    best_score = score
                    best_idx = idx
                    best_thr = (thresholds[i] + thresholds[i - 1]) / 2  # midpoint

        return best_idx, best_thr

    def grow_tree(self, X, y, depth=0):
        num_samples_per_class = [np.sum(y == i) for i in range(self.n_classes)]
        predicted_class = np.argmax(num_samples_per_class)

        node = Node(predicted_class=predicted_class)

        if depth < self.max_depth:
            idx, thr = self.best_split(X, y)
            if idx is not None:
                indices_left = X[:, idx] < thr
                X_left, y_left = X[indices_left], y[indices_left]
                X_right, y_right = X[~indices_left], y[~indices_left]
                node.feature_index = idx
                node.threshold = thr
                node.left = self.grow_tree(X_left, y_left, depth + 1)
                node.right = self.grow_tree(X_right, y_right, depth + 1)
        return node

    def predict(self, X):
        return np.array([self.pred(inputs) for inputs in X])

    def pred(self, inputs):
        node = self.root
        while node.left:
            if inputs[node.feature_index] < node.threshold:
                node = node.left
            else:
                node = node.right
        return node.predicted_class

    def boost_predict(self, X):
        return [self._predict(inputs) for inputs in X]

    def _predict(self, inputs):
        node = self.root
        while node.left:
            if inputs[node.feature_index] < node.threshold:
                node = node.left
            else:
                node = node.right
        return node.predicted_class



In [41]:

titanic = sns.load_dataset('titanic')

titanic = titanic.drop(['who', 'adult_male', 'deck', 'embark_town', 'alive', 'alone'], axis=1)

titanic['age'] = titanic['age'].fillna(titanic['age'].median())

titanic['embarked'] = titanic['embarked'].fillna(titanic['embarked'].mode()[0])

label_encoder = LabelEncoder()
titanic['sex'] = label_encoder.fit_transform(titanic['sex'])
titanic['embarked'] = label_encoder.fit_transform(titanic['embarked'])
titanic['class'] = label_encoder.fit_transform(titanic['class'])
titanic = titanic.dropna(subset=['fare'])

train_data, test_data = train_test_split(titanic, test_size=0.2, random_state=1)

X_train = train_data.drop('survived', axis=1)
y_train = train_data['survived']
X_test = test_data.drop('survived', axis=1)
y_test = test_data['survived']

X_train = X_train.to_numpy()
y_train = y_train.to_numpy()
X_test = X_test.to_numpy()
y_test = y_test.to_numpy()


In [42]:
# Create a DecisionTree model
criteria = {'gini': gini, 'entropy': entropy, 'misclassification_rate': misclassification_rate}

model = DecisionTree(criterion='gini', max_depth=3, min_samples_split=2, min_samples_leaf=1)

# Fit the model to the training data
model.fit(X_train, y_train)

# Make predictions on the test data
predictions = model.predict(X_test)

# Calculate the accuracy of the model
accuracy = (predictions == y_test).mean()
print(f'Accuracy: {accuracy*100:.2f}%')
Results['Model'].append("DecisionTreeClassifier")
Results['Accuracy'].append(accuracy)


Accuracy: 59.78%


## RandomForest

In [43]:


class RandomForest:
    def __init__(self, base_estimator, n_estimators=100, max_features='sqrt'):
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.estimators_ = []

    def _bootstrap_sample(self, X, y):
        n_samples = X.shape[0]
        X_, y_ = resample(X, y, replace=True, n_samples=n_samples)
        return X_, y_

    def _feature_sampling(self, X, max_features):
        n_features = X.shape[1]
        if max_features == 'sqrt':
            max_features = int(np.sqrt(n_features))
        if max_features == 'log2':
            max_features = int(np.log2(n_features))
        if isinstance(max_features, int):
            idxs = np.random.choice(n_features, max_features, replace=False)
        return X[:, idxs], idxs

    def fit(self, X, y):
        self.estimators_ = []
        for _ in range(self.n_estimators):
            estimator = self.base_estimator
            X_sample, y_sample = self._bootstrap_sample(X, y)
            X_sample, self.feat_idxs = self._feature_sampling(X_sample, self.max_features)
            estimator.fit(X_sample, y_sample)
            self.estimators_.append(estimator)

    def predict(self, X):
        X = X[:, self.feat_idxs]
        pred = np.zeros((X.shape[0], len(self.estimators_)))
        for i, estimator in enumerate(self.estimators_):
            pred[:, i] = estimator.predict(X)
        return np.array([np.bincount(row).argmax() for row in pred.astype(int)])


In [44]:
titanic = sns.load_dataset('titanic')

# Drop the 'who', 'adult_male', 'deck', 'embark_town', 'alive', 'alone' columns
titanic = titanic.drop(['who', 'adult_male', 'deck', 'embark_town', 'alive', 'alone'], axis=1)

# Fill missing 'age' values with the median age
titanic['age'] = titanic['age'].fillna(titanic['age'].median())

# Fill missing 'embarked' values with the most common port
titanic['embarked'] = titanic['embarked'].fillna(titanic['embarked'].mode()[0])

# Convert 'sex', 'class', and 'embarked' to numerical values
label_encoder = LabelEncoder()
titanic['sex'] = label_encoder.fit_transform(titanic['sex'])
titanic['embarked'] = label_encoder.fit_transform(titanic['embarked'])
titanic['class'] = titanic['class'].cat.codes

# Drop rows with missing 'fare'
titanic = titanic.dropna(subset=['fare'])

# Split the data into a training set and a test set
train_data, test_data = train_test_split(titanic, test_size=0.2, random_state=1)

# Separate the 'survived' column from the rest of the data
X_train = train_data.drop('survived', axis=1)
y_train = train_data['survived']
X_test = test_data.drop('survived', axis=1)
y_test = test_data['survived']

# Convert to numpy arrays
X_train = X_train.to_numpy()
y_train = y_train.to_numpy()
X_test = X_test.to_numpy()
y_test = y_test.to_numpy()

base_estimator = DecisionTree(criterion='gini', max_depth=3, min_samples_split=2, min_samples_leaf=1)

# Create a RandomForest model
model = RandomForest(base_estimator=base_estimator, n_estimators=100, max_features='sqrt')

# Fit the model to the training data
model.fit(X_train, y_train)

# Make predictions on the test data
predictions = model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy*100:.2f}%')

Results['Model'].append("RandomForestClassifier")
Results['Accuracy'].append(accuracy)

Accuracy: 59.22%


## AdaBoost

In [45]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
from copy import deepcopy

class AdaBoost:
    def __init__(self, base_estimator, n_estimators=50, learning_rate=1.):
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate

    def fit(self, X, y):
        n_samples = X.shape[0]
        unique_values = np.unique(y)
        self.n_classes = len(unique_values)
        sample_weight = np.ones(n_samples) / n_samples

        self.estimators_ = []
        self.estimator_weights_ = []
        self.estimator_errors_ = []

        for iboost in range(self.n_estimators):
            sample_weight, estimator, estimator_weight, estimator_error = self._boost(
                iboost, X, y, sample_weight
            )
            if estimator_error == 0:
                break

            self.estimators_.append(deepcopy(estimator))
            self.estimator_weights_.append(estimator_weight)
            self.estimator_errors_.append(estimator_error)

            sample_weight /= sample_weight.sum()

    def _boost(self, iboost, X, y, sample_weight):
        indices = np.random.choice(np.arange(X.shape[0]), size=X.shape[0], p=sample_weight)
        X_sample, y_sample = X[indices], y[indices]

        estimator = deepcopy(self.base_estimator)
        estimator.fit(X_sample, y_sample)

        y_predict = estimator.boost_predict(X)
        incorrect = y_predict != y
        estimator_error = np.mean(np.average(incorrect, weights=sample_weight))

        estimator_weight = self.learning_rate * (np.log((1. - estimator_error) / estimator_error) + np.log(self.n_classes - 1))

        sample_weight *= np.exp(estimator_weight * incorrect)

        return sample_weight, estimator, estimator_weight, estimator_error

    def predict(self, X):
        predictions = np.array([estimator.boost_predict(X) for estimator in self.estimators_])
        avg_pred = np.average(predictions, axis=0, weights=self.estimator_weights_)
        return np.sign(avg_pred)




In [37]:
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from copy import deepcopy

# Your AdaBoost class goes here

# Load the dataset
titanic = sns.load_dataset('titanic')

# Drop the 'who', 'adult_male', 'deck', 'embark_town', 'alive', 'alone' columns
titanic = titanic.drop(['who', 'adult_male', 'deck', 'embark_town', 'alive', 'alone'], axis=1)

# Fill missing 'age' values with the median age
titanic['age'] = titanic['age'].fillna(titanic['age'].median())

# Fill missing 'embarked' values with the most common port
titanic['embarked'] = titanic['embarked'].fillna(titanic['embarked'].mode()[0])

# Convert 'sex', 'embarked', and 'class' to numerical values
label_encoder = LabelEncoder()
titanic['sex'] = label_encoder.fit_transform(titanic['sex'])
titanic['class'] = label_encoder.fit_transform(titanic['class'])
titanic['embarked'] = label_encoder.fit_transform(titanic['embarked'])

# Drop rows with missing 'fare'
titanic = titanic.dropna(subset=['fare'])

# Split the data into a training set and a test set
train_data, test_data = train_test_split(titanic, test_size=0.2, random_state=1)

# Separate the 'survived' column from the rest of the data
X_train = train_data.drop('survived', axis=1).values
y_train = train_data['survived'].values
X_test = test_data.drop('survived', axis=1).values
y_test = test_data['survived'].values

# Set base estimator as a decision tree
base_estimator = DecisionTree(max_depth=1)

# Create an AdaBoost model
model = AdaBoost(base_estimator=base_estimator, n_estimators=50, learning_rate=1.0)

# Fit the model to the training data
model.fit(X_train, y_train)

# Make predictions on the test data
predictions = model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy*100:.2f}%')
Results['Model'].append("AdaBoostClassifier")
Results['Accuracy'].append(accuracy)

Accuracy: 40.78%


## Results

In [38]:
# results..
res = pd.DataFrame(Results)
res

Unnamed: 0,Model,Accuracy
0,DecisionTreeClassifier,0.597765
1,RandomForestClassifier,0.592179
2,AdaBoostClassifier,0.407821
