In [1]:
import random
import math
import re

import numpy as np
import pandas as pd

from cart import Tree

from sklearn.datasets import fetch_openml
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

### Random data

In [2]:
N = 1000
# The synthetic data below is drawn from NumPy's global RNG, which
# `random.seed` does not touch. Seed both so the cell is reproducible.
random.seed(42)
np.random.seed(42)

# Two uniform features in [0, 1) and an independent random binary label.
data = np.random.rand(N, 2)
y = np.random.randint(2, size=(N, 1))

# Append the label as the last column so features and target are sliced together.
data = np.hstack([data, y])

# First `delim` rows are the training set, the remainder is the test set.
delim = 600
train = np.array(data[:delim])
test = np.array(data[delim:])
X_train = train[:, 0:-1]
y_train = train[:, -1]
X_test = test[:, 0:-1]
y_test = test[:, -1]

In [3]:
# Build both estimators, then fit them on the same training split
# (custom implementation first, scikit-learn second).
custom_tree, sklearn_tree = Tree(), DecisionTreeClassifier()
custom_tree.fit(X_train, y_train)
sklearn_tree.fit(X_train, y_train)

# Predict on the held-out rows and score each set of predictions.
custom_ypred, sklearn_ypred = (
    estimator.predict(X_test) for estimator in (custom_tree, sklearn_tree)
)
custom_rocauc, sklearn_rocauc = (
    roc_auc_score(y_test, predicted) for predicted in (custom_ypred, sklearn_ypred)
)

print(custom_rocauc, sklearn_rocauc)

0.5086404803350236 0.4639622593910038


### Titanic dataset

In [15]:
def get_marital_status(name):
    """Map the honorific contained in a passenger name to an integer code.

    The check is a plain, case-sensitive substring search over the whole name:

    * 2 if the name contains 'Mrs', 'Lady' or 'Countess';
    * otherwise 1 if it contains 'Miss', 'Mlle', 'Mme' or 'Ms';
    * otherwise 0.
    """
    code_two_markers = ('Mrs', 'Lady', 'Countess')
    code_one_markers = ('Miss', 'Mlle', 'Mme', 'Ms')

    # The first group takes precedence when markers from both groups are present.
    if any(marker in name for marker in code_two_markers):
        return 2
    if any(marker in name for marker in code_one_markers):
        return 1
    return 0


def get_cabin(x, default='n'):
    """Return the first ASCII letter found in a cabin string.

    Parameters
    ----------
    x : str
        Cabin string such as ``'C85'`` or ``'F G73'``.
    default : str, optional
        Value returned when ``x`` contains no ASCII letter at all (for
        example ``''`` or ``'123'``). Defaults to ``'n'``.

    Returns
    -------
    str
        The first letter in ``x``, or ``default`` if there is none.

    Notes
    -----
    The previous implementation stripped non-letters and indexed ``[0]``,
    which raised ``IndexError`` for letter-free input and returned a space
    when a space preceded the first letter.
    """
    first_letter = re.search('[A-Za-z]', x)
    if first_letter is None:
        return default
    return first_letter.group(0)


def _cabin_number_parity(cabin):
    """Return parity (0 or 1) of the digits left in `cabin`, or -1 if none remain.

    Letters and spaces are stripped and the remainder is parsed as a single
    integer, so for a multi-cabin string the digits are concatenated.
    """
    digits = re.sub('[A-Za-z ]', '', cabin)
    if len(digits) == 0:
        return -1
    return int(digits) % 2


def prepare_data(X):
    """Build the engineered feature frame from the raw Titanic passenger table.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw frame containing the columns ``name``, ``age``, ``pclass``,
        ``fare``, ``sex``, ``embarked``, ``cabin``, ``sibsp``, ``parch``,
        ``ticket``, ``boat``, ``home.dest`` and ``body``. It is not modified;
        all work happens on a copy.

    Returns
    -------
    pandas.DataFrame
        The copy with derived columns added, categorical columns one-hot
        encoded via ``pd.get_dummies``, and ``name``, ``ticket`` and
        ``pclass`` dropped.

    Notes
    -----
    NOTE(review): ``boat`` and ``body`` look like information recorded after
    the sinking and therefore probably leak the survival target - confirm
    before trusting any score obtained with these features.
    """
    X = X.copy()

    X['namelength'] = X['name'].map(len)

    # Title is the text between ', ' and the first '.'.
    # Assumes names look like 'Surname, Title. Given names' - TODO confirm.
    X['title'] = X['name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]
    stat_min = 10  # titles seen fewer than this many times are pooled into 'misc'
    title_is_rare = X['title'].value_counts() < stat_min
    X['title'] = X['title'].apply(lambda t: 'misc' if title_is_rare.loc[t] else t)
    X = pd.get_dummies(X, columns=['title'], prefix='title')

    X['maritalstatus'] = X['name'].map(get_marital_status)
    X['is_mr'] = (X['maritalstatus'] == 0).astype(int)
    X['is_miss'] = (X['maritalstatus'] == 1).astype(int)
    X['is_mrs'] = (X['maritalstatus'] == 2).astype(int)

    # Remember which ages were observed before imputing them.
    X['age_known'] = X['age'].notnull().astype(int)
    # Impute with the per-class median. `transform` keeps the original index,
    # unlike `groupby.apply`, whose result is keyed by group on pandas >= 2.0
    # and can no longer be assigned back as a column.
    X['age'] = X['age'].fillna(X.groupby('pclass')['age'].transform('median'))

    X['fare'] = X['fare'].fillna(X.groupby('pclass')['fare'].transform('median'))

    X['sex'] = (X['sex'] == 'male').astype(int)

    # Assign instead of a chained `inplace=True` call, which does not update
    # the frame when pandas copy-on-write is active.
    X['embarked'] = X['embarked'].fillna('S')
    X = pd.get_dummies(X, columns=['embarked'], prefix='emb')

    # 'n' marks a missing cabin.
    X['cabin'] = X['cabin'].map(lambda x: 'n' if pd.isna(x) else x)
    # lb / rb flag cabins whose number is even / odd; both are 0 without a number.
    cabin_parity = X['cabin'].map(_cabin_number_parity)
    X['lb'] = (cabin_parity == 0).astype(int)
    X['rb'] = (cabin_parity == 1).astype(int)
    X['cabin'] = X['cabin'].map(get_cabin)
    X = pd.get_dummies(X, columns=['cabin'], prefix='cabin')

    X['familysize'] = X['sibsp'] + X['parch']

    X['1class'] = (X['pclass'] == 1).astype(int)
    X['2class'] = (X['pclass'] == 2).astype(int)
    X['3class'] = (X['pclass'] == 3).astype(int)
    X = X.drop(columns='pclass')

    # Quartile bins are computed on the raw values, before the log transform.
    X['farebin'] = pd.qcut(x=X['fare'], q=4, labels=False)
    X['agebin'] = pd.qcut(x=X['age'], q=4, labels=False)

    X['age'] = np.log1p(X['age'])
    X['fare'] = np.log1p(X['fare'])

    # Helper column of ones: summing it per group yields group sizes.
    X['pass'] = 1
    X = X.join(X.groupby('ticket')['pass'].sum().rename('passinticket'), on='ticket')
    X = X.join(X.groupby('ticket')['fare'].median().rename('ticketfare'), on='ticket')
    X['ticketfare'] = X['ticketfare'].divide(X['passinticket'])
    X['familyfare'] = X['fare'].divide(X['familysize'] + 1)

    # Second family-size estimate: number of passengers sharing a surname.
    X['familyname'] = X['name'].map(lambda x: x.split(',')[0])
    X = X.join(X.groupby('familyname')['pass'].sum().rename('familysize_1'), on='familyname')
    X['familyfare_1'] = X['fare'].divide(X['familysize_1'] + 1)

    X = pd.get_dummies(X, columns=['familyname'], prefix='fname')

    X = X.drop(columns='pass')

    # Ticket type = first character of the ticket string.
    X['ttype'] = X['ticket'].str[0]
    X = pd.get_dummies(X, columns=['ttype'], prefix='ttype')

    X = pd.get_dummies(X, columns=['familysize'], prefix='famsize')
    X = pd.get_dummies(X, columns=['parch'], prefix='parch')
    X = pd.get_dummies(X, columns=['sibsp'], prefix='sibsp')

    X = X.drop(columns=['name', 'ticket'])

    X['boat'] = X['boat'].fillna('Unknown')
    X = pd.get_dummies(X, columns=['boat'], prefix='boat')

    X['home.dest'] = X['home.dest'].fillna('Unknown')
    X = pd.get_dummies(X, columns=['home.dest'], prefix='home')

    X['body'] = X['body'].fillna(0)

    return X


def scorer(classifier, X, y):
    """Fit `classifier` on a fixed hold-out split and return its ROC AUC.

    The data is split 80/20 with shuffling and ``random_state=42``. The
    classifier is fitted on the larger part, and the score is
    ``roc_auc_score`` of ``classifier.predict`` on the smaller part.
    """
    split = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)
    features_fit, features_eval, target_fit, target_eval = split

    classifier.fit(features_fit, target_fit)
    return roc_auc_score(target_eval, classifier.predict(features_eval))

In [16]:
# Download the Titanic table from OpenML as a feature frame plus target series.
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)

# Engineer features, standardise every column, and hand plain arrays to the models.
scaler = StandardScaler()
X = np.array(scaler.fit_transform(prepare_data(X)))
y = np.array(y)

# Score scikit-learn's tree first, then the custom one, on the same split.
decisiontree, mytree = DecisionTreeClassifier(), Tree()
sklearn_score = scorer(decisiontree, X, y)
my_score = scorer(mytree, X, y)
print(my_score, sklearn_score)

0.9472104519774012 0.9745762711864407
