## Implementation of Decision Tree

In [1]:
import numpy as np

class DecisionTreeClassifier:
    def __init__(self, criterion='gini', max_depth=10, min_samples_split=2, min_samples_leaf=1):
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.tree = None

    def fit(self, X, y):
        self.tree = self.build_tree(X, y, depth=0)

    def predict(self, X):
        return np.array([self.predict_tree(x, self.tree) for x in X])

    def gini_criterion(self, y):
        _, counts = np.unique(y, return_counts=True)
        prob = counts / len(y)
        return 1 - np.sum(prob ** 2)

    def entropy_criterion(self, y):
        _, counts = np.unique(y, return_counts=True)
        prob = counts / len(y)
        return -np.sum(prob * np.log2(prob))

    def misclassification_rate_criterion(self, y):
        _, counts = np.unique(y, return_counts=True)
        prob = counts / len(y)
        return 1 - np.max(prob)

    def split(self, X, y, feature_index, threshold):
        l_mask = X[:, feature_index] <= threshold
        r_mask = ~l_mask
        return X[l_mask], X[r_mask], y[l_mask], y[r_mask]

    def build_tree(self, X, y, depth):
        no_samples, no_features = X.shape
        no_classes = len(np.unique(y))
        if no_classes == 1:
            return y[0]
        if depth == self.max_depth:
            return np.argmax(np.bincount(y))
        if no_samples < self.min_samples_split:
            return np.argmax(np.bincount(y))

        best_criterion_value = -1
        best_criterion = None
        best_feature_index = None
        best_threshold = None

        for feature_index in range(no_features):
            unique_values = np.unique(X[:, feature_index])
            thresholds = (unique_values[:-1] + unique_values[1:]) / 2
            for threshold in thresholds:
                X_left, X_right, y_left, y_right = self.split(X, y, feature_index, threshold)
                if len(y_left) < self.min_samples_leaf or len(y_right) < self.min_samples_leaf:
                    continue

                if self.criterion == 'gini':
                    criterion_value = self.gini_criterion(y_left) * len(y_left) / no_samples + \
                                      self.gini_criterion(y_right) * len(y_right) / no_samples
                elif self.criterion == 'entropy':
                    criterion_value = self.entropy_criterion(y_left) * len(y_left) / no_samples + \
                                      self.entropy_criterion(y_right) * len(y_right) / no_samples
                elif self.criterion == 'misclassification':
                    criterion_value = self.misclassification_rate_criterion(y_left) * len(y_left) / no_samples + \
                                      self.misclassification_rate_criterion(y_right) * len(y_right) / no_samples

                if criterion_value > best_criterion_value:
                    best_criterion_value = criterion_value
                    best_criterion = (feature_index, threshold)
                    best_feature_index = feature_index
                    best_threshold = threshold

        if best_criterion_value == -1:
            return np.argmax(np.bincount(y))

        X_left, X_right, y_left, y_right = self.split(X, y, best_feature_index, best_threshold)

        left_subtree = self.build_tree(X_left, y_left, depth + 1)
        right_subtree = self.build_tree(X_right, y_right, depth + 1)

        return (best_feature_index, best_threshold, left_subtree, right_subtree)

    def predict_tree(self, x, node):
        if not isinstance(node, tuple):
            return node
        
        feature_index, threshold, left_subtree, right_subtree = node

        if x[feature_index] <= threshold:
            return self.predict_tree(x, left_subtree)
        else:
            return self.predict_tree(x, right_subtree)


## Data Preprocessing for Titanic Dataset

In [2]:
import numpy as np
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from IPython.display import Image as PImage
from subprocess import check_call
from PIL import Image, ImageDraw, ImageFont

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

PassengerId = test['PassengerId']

train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [3]:
original_train = train.copy()

full_data = [train, test]


train['Has_Cabin'] = train["Cabin"].apply(lambda x: 0 if type(x) == float else 1)
test['Has_Cabin'] = test["Cabin"].apply(lambda x: 0 if type(x) == float else 1)


for dataset in full_data:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

for dataset in full_data:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1

for dataset in full_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('S')

for dataset in full_data:
    dataset['Fare'] = dataset['Fare'].fillna(train['Fare'].median())


for dataset in full_data:
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_null_count = dataset['Age'].isnull().sum()
    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)

    dataset.loc[np.isnan(dataset['Age']), 'Age'] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)


def get_title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)

    if title_search:
        return title_search.group(1)
    return ""

for dataset in full_data:
    dataset['Title'] = dataset['Name'].apply(get_title)

for dataset in full_data:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

for dataset in full_data:

    dataset['Sex'] = dataset['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
    

    title_mapping = {"Mr": 1, "Master": 2, "Mrs": 3, "Miss": 4, "Rare": 5}
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)


    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
    

    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] 						        = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] 							        = 3
    dataset['Fare'] = dataset['Fare'].astype(int)
    

    dataset.loc[ dataset['Age'] <= 16, 'Age'] 					       = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[ dataset['Age'] > 64, 'Age'] ;


In [4]:
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']
train = train.drop(drop_elements, axis = 1)
test  = test.drop(drop_elements, axis = 1)

In [5]:
from sklearn.model_selection import train_test_split

y = train['Survived'].values
X = train.drop('Survived', axis=1).values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=6739)

Decision_Tree = DecisionTreeClassifier(criterion='gini', max_depth=10, min_samples_split=2, min_samples_leaf=1)
Decision_Tree.fit(X_train, y_train)


y_pred = Decision_Tree.predict(X_test)

Accuracy_DT_gini = accuracy_score(y_test, y_pred)

from sklearn.metrics import accuracy_score
print('Accuracy for Decision Tree with Gini: {:.2f}%'.format(Accuracy_DT_gini*100))

Accuracy for Decision Tree with Gini: 73.18%


In [6]:
from sklearn.model_selection import train_test_split

y = train['Survived'].values
X = train.drop('Survived', axis=1).values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=6739)

Decision_Tree = DecisionTreeClassifier(criterion='entropy', max_depth=10, min_samples_split=2, min_samples_leaf=1)
Decision_Tree.fit(X_train, y_train)


y_pred = Decision_Tree.predict(X_test)

Accuracy_DT_entropy = accuracy_score(y_test, y_pred)

from sklearn.metrics import accuracy_score
print('Accuracy for Decision Tree with Entropy: {:.2f}%'.format(Accuracy_DT_entropy*100))

Accuracy for Decision Tree with Entropy: 70.95%


In [7]:
from sklearn.model_selection import train_test_split

y = train['Survived'].values
X = train.drop('Survived', axis=1).values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=6739)

Decision_Tree = DecisionTreeClassifier(criterion='misclassification', max_depth=10, min_samples_split=2, min_samples_leaf=1)
Decision_Tree.fit(X_train, y_train)


y_pred = Decision_Tree.predict(X_test)

Accuracy_DT_misclassification = accuracy_score(y_test, y_pred)

from sklearn.metrics import accuracy_score
print('Accuracy for Decision Tree with Misclassification: {:.2f}%'.format(Accuracy_DT_misclassification*100))


Accuracy for Decision Tree with Misclassification: 63.69%


## Implementation of Random Forest

In [8]:
import numpy as np

class RandomForestClassifier:
    def __init__(self, main_classifier, no_trees, min_features):
        self.main_classifier = main_classifier
        self.no_trees = no_trees
        self.min_features = min_features
        self.trees = []

    def fit(self, X, y):
        num_features = X.shape[1]

        for _ in range(self.no_trees):
            idx = np.random.choice(len(y), len(y), replace=True)
            X_subset, y_subset = X[idx], y[idx]

            num_selected_features = np.random.randint(self.min_features, num_features + 1)
            selected_feature_indices = np.random.choice(num_features, num_selected_features, replace=False)

            tree = self.main_classifier(max_depth=None, min_samples_split=2, min_samples_leaf=1)
            tree.fit(X_subset[:, selected_feature_indices], y_subset)
            
            self.trees.append((selected_feature_indices, tree))

    def predict(self, X):
        num_samples = X.shape[0]
        num_classes = len(np.unique(y))

        predictions = np.zeros((num_samples, num_classes))

        for features_indices, tree in self.trees:
            tree_predictions = tree.predict(X[:, features_indices])
            predictions[np.arange(num_samples), tree_predictions] += 1

        final_predictions = np.argmax(predictions, axis=1)

        return final_predictions


In [9]:
from sklearn.model_selection import train_test_split

y = train['Survived'].values
X = train.drop('Survived', axis=1).values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=6739)

Random_Forest = RandomForestClassifier(main_classifier=DecisionTreeClassifier, no_trees=10, min_features=6)
Random_Forest.fit(X_train, y_train)


y_pred = Random_Forest.predict(X_test)

Accuracy_RF = accuracy_score(y_test, y_pred)

from sklearn.metrics import accuracy_score
print('Accuracy for Random Forest:{:.2f}%'.format(Accuracy_RF*100))

Accuracy for Random Forest:79.33%


## Implementation of Ada Boost

In [10]:
import numpy as np

class AdaBoost:
    def __init__(self, weak_learner, num_learners, learning_rate):
        self.weak_learner = weak_learner
        self.num_learners = num_learners
        self.learning_rate = learning_rate


    def fit(self, X, y):
        num_samples = X.shape[0]
        weights = np.ones(num_samples) / num_samples 

        self.m_arr = []  
        self.al_arr = [] 

        for _ in range(self.num_learners):
            ml = self.weak_learner()
            ml.fit(X, y)
            y_pred = ml.predict(X) 

            error = np.mean(np.abs(y_pred - y) / 2 * weights) / np.mean(weights)

            if error > 0.5:
                break
            alpha = self.learning_rate * np.log((1 - error) / error)
            self.m_arr.append(ml)
            self.al_arr.append(alpha)

            weights *= np.exp(-alpha * y * y_pred)
            weights /= np.sum(weights)


    def predict(self, X):
        n_samples = X.shape[0]
        y_pred = np.zeros(n_samples)

        for i in range(len(self.m_arr)):
            ml = self.m_arr[i]
            alpha = self.al_arr[i]
            y_pred += alpha * ml.predict(X)

        return np.sign(y_pred)


In [11]:
from sklearn.model_selection import train_test_split

y = train['Survived'].values
X = train.drop('Survived', axis=1).values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=6739)

Ada_Boost = AdaBoost(weak_learner=DecisionTreeClassifier, num_learners=50, learning_rate=0.1)
Ada_Boost.fit(X_train, y_train)


y_pred = Ada_Boost.predict(X_test)

Accuracy_AB = accuracy_score(y_test, y_pred)

from sklearn.metrics import accuracy_score
print('Accuracy for Ada Boost: {:.2f}%'.format(Accuracy_AB*100))

Accuracy for Ada Boost: 73.18%


In [12]:
print('Accuracy for Decision Tree with Gini: {:.2f}%'.format(Accuracy_DT_gini*100))
print('Accuracy for Decision Tree with Entropy: {:.2f}%'.format(Accuracy_DT_entropy*100))
print('Accuracy for Decision Tree with Misclassification: {:.2f}%'.format(Accuracy_DT_misclassification*100))
print("Accuracy for Random Forest: {:.2f}%".format(Accuracy_RF * 100))
print("Accuracy for Ada Boost: {:.2f}%".format(Accuracy_AB * 100))

Accuracy for Decision Tree with Gini: 73.18%
Accuracy for Decision Tree with Entropy: 70.95%
Accuracy for Decision Tree with Misclassification: 63.69%
Accuracy for Random Forest: 79.33%
Accuracy for Ada Boost: 73.18%
