In [1]:
import numpy as np
from collections import Counter

class Leaf():
    """Terminal tree node that always predicts its most frequent training label.

    Parameters
    ----------
    y : iterable of hashable labels
        Labels of the training samples that ended up in this leaf.
        Must contain at least one label.

    Raises
    ------
    ValueError
        If ``y`` is empty (there is no label to predict).
    """
    def __init__(self, y):
        counts = Counter(y)
        if not counts:
            raise ValueError("Leaf requires at least one label")
        # max() returns the first maximal key, so ties resolve to the label
        # that appears first in ``y``.
        self.label = max(counts, key=counts.get)

    def predict(self, _):
        """Return the stored majority label; the sample argument is ignored."""
        return self.label
    
    
class Node():
    """Internal decision-tree node holding one ``feature <= threshold`` test.

    The split is chosen by a "balanced majority" score rather than Gini or
    entropy: for each candidate threshold the size of the majority class is
    counted on each side, the smaller of the two counts is the threshold's
    score, and the threshold/feature with the highest score wins (first one
    on ties).
    """
    def __init__(self):
        self.threshold = None  # split value; samples with feature <= threshold go left
        self.num = None        # index of the feature column used by the split
        self.rleaf = None      # right child (Node or Leaf)
        self.lleaf = None      # left child (Node or Leaf)

    def predict(self, X):
        """Route a single sample (1-D feature vector) to a child and return its label."""
        if X[self.num] <= self.threshold:
            return self.lleaf.predict(X)
        return self.rleaf.predict(X)

    @staticmethod
    def _majority_count(labels):
        """Return the number of samples in the most frequent class (0 if empty)."""
        if labels.size == 0:
            return 0
        return max(Counter(labels).values())

    @staticmethod
    def _build_child(X, y):
        """Return a Leaf for single-class labels, otherwise a fitted sub-Node."""
        if len(np.unique(y)) <= 1:
            return Leaf(y)
        child = Node()
        child.fit(X, y)
        return child

    def fit(self, X, y):
        """Choose the best split for ``X``/``y`` and recursively build both children.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix; values must be orderable against each other.
        y : array-like with n_samples entries
            Class labels. They are kept in their own array, so their dtype is
            not coerced to the dtype of ``X`` (and vice versa).

        Raises
        ------
        ValueError
            If there are no samples, no features, or ``X`` and ``y`` disagree
            on the number of samples.
        """
        X = np.asarray(X)
        y = np.asarray(y).reshape(-1)
        if y.size == 0:
            raise ValueError("Node.fit requires at least one sample")
        if X.ndim != 2 or X.shape[1] == 0 or X.shape[0] != y.size:
            raise ValueError("X must be 2-D with at least one feature and one row per label")

        num_features = X.shape[1]
        best_threshold = np.zeros(num_features)
        best_score = np.zeros(num_features)

        # Try to find the best partitioning over all features
        for feat in range(num_features):
            column = X[:, feat]
            candidates = np.unique(column)
            scores = np.zeros(len(candidates))

            for k, value in enumerate(candidates):
                # Majority-class size on each side; the WORST (smaller) side
                # is the score of this threshold.
                left_count = self._majority_count(y[column <= value])
                right_count = self._majority_count(y[column > value])
                scores[k] = min(left_count, right_count)

            # Keep the BEST threshold of this feature (argmax = first on ties)
            k_best = int(np.argmax(scores))
            best_threshold[feat] = candidates[k_best]
            best_score[feat] = scores[k_best]

        # Best feature overall (first one on ties)
        self.num = int(np.argmax(best_score))
        self.threshold = best_threshold[self.num]

        # Divide data
        split_column = X[:, self.num]
        goes_left = split_column <= self.threshold
        goes_right = split_column > self.threshold
        X_l, y_l = X[goes_left], y[goes_left]
        X_r, y_r = X[goes_right], y[goes_right]

        if (y_l.size == 0) or (y_r.size == 0):
            # No threshold separates the samples (e.g. identical feature rows
            # with different labels): stop here and predict this node's
            # majority class on both sides.
            self.lleaf = Leaf(y)
            self.rleaf = Leaf(y)
            return

        self.lleaf = self._build_child(X_l, y_l)
        self.rleaf = self._build_child(X_r, y_r)
        return
    

class Tree():
    """Decision-tree classifier facade that delegates all work to a root ``Node``."""

    def __init__(self):
        self.root = Node()

    def fit(self, X, y):
        """Fit the root node on feature matrix ``X`` and labels ``y``; returns ``None``."""
        self.root.fit(X, y)

    def predict(self, X):
        """Predict one label per row of ``X``.

        Each row ``X[i, :]`` is passed to the root node; results are written
        into a float array created by ``np.zeros``, so labels are converted
        to float on assignment.
        """
        n_samples = X.shape[0]
        predictions = np.zeros(n_samples)
        for row in range(n_samples):
            predictions[row] = self.root.predict(X[row, :])
        return predictions

In [2]:
import random
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score


N = 1000
# The samples below are drawn from NumPy's global generator, which
# random.seed() does not touch, so NumPy has to be seeded as well for the
# cell to be reproducible.
random.seed(42)
np.random.seed(42)
data = np.random.rand(N, 2)
y = np.random.randint(2, size=(N, 1))

# Append the label as the last column: shape (N, 3).
data = np.hstack([data, y])

In [3]:
delim = 600  # the first `delim` rows are used for training, the rest for testing
train, test = np.array(data[:delim]), np.array(data[delim:])
# The last column holds the label; every column before it is a feature.
X_train, y_train = train[:, :-1], train[:, -1]
X_test, y_test = test[:, :-1], test[:, -1]

In [5]:
# Fit both classifiers on the same training rows (custom tree first).
custom_tree, sklearn_tree = Tree(), DecisionTreeClassifier()
custom_tree.fit(X_train, y_train)
sklearn_tree.fit(X_train, y_train)

# Hard-label predictions for the held-out rows.
custom_ypred, sklearn_ypred = custom_tree.predict(X_test), sklearn_tree.predict(X_test)

# ROC AUC computed from the predicted labels of each model.
custom_rocauc, sklearn_rocauc = (
    roc_auc_score(y_test, custom_ypred),
    roc_auc_score(y_test, sklearn_ypred),
)

print(custom_rocauc, sklearn_rocauc)

0.5259935768767564 0.502408671216379


In [7]:
import math
import re
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml

# Load version 1 of the "titanic" dataset as a (features frame, target) pair.
# NOTE(review): presumably this downloads from OpenML on first use — confirm
# network access / caching before relying on Run All.
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)

def get_marital_status(name):
    """Encode the honorific found in a passenger name as 0, 1 or 2.

    Returns 2 when the name contains 'Mrs', 'Lady' or 'Countess'; otherwise
    1 when it contains 'Miss', 'Mlle', 'Mme' or 'Ms'; otherwise 0. Matching
    is a case-sensitive substring search anywhere in ``name``.
    """
    def mentions(*tokens):
        # True as soon as one of the tokens occurs in the name.
        return any(name.find(token) != -1 for token in tokens)

    if mentions('Mrs', 'Lady', 'Countess'):
        return 2
    if mentions('Miss', 'Mlle', 'Mme', 'Ms'):
        return 1
    return 0

def get_cabin(x):
    """Return the first character of ``x`` after removing everything except ASCII letters and spaces.

    Raises ``IndexError`` when nothing is left after the removal.
    """
    letters_and_spaces = re.sub('[^A-Za-z ]', '', x)
    first_char = letters_and_spaces[0]
    return first_char

def prepare_data(X):
    """Build the engineered feature frame from the raw passenger frame.

    Works on a copy, so the caller's frame is not modified.

    Parameters
    ----------
    X : pandas.DataFrame
        Must provide the columns read below: ``name``, ``age``, ``pclass``,
        ``fare``, ``sex``, ``embarked``, ``cabin``, ``sibsp``, ``parch`` and
        ``ticket``. Any other column is passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        Frame with the derived features added. ``name``, ``ticket`` and
        ``pclass`` are dropped; ``title``, ``embarked``, ``cabin``,
        ``familyname``, ``ttype``, ``familysize``, ``parch`` and ``sibsp``
        are replaced by their one-hot (dummy) columns.
    """
    def _cabin_number_parity(cabin):
        """Return ``int(digits) % 2`` for the non-letter part of ``cabin``, or None if it is empty."""
        digits = re.sub('[A-Za-z ]', '', cabin)
        return int(digits) % 2 if len(digits) > 0 else None

    X = X.copy()
    X['namelength'] = X['name'].map(len)

    # Title = text between ", " and the first "."; titles seen fewer than
    # `stat_min` times are merged into 'misc' before one-hot encoding.
    X['title'] = X['name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]
    stat_min = 10
    title_names = (X['title'].value_counts() < stat_min)
    X['title'] = X['title'].apply(lambda x: 'misc' if title_names.loc[x] == True else x)
    X = pd.get_dummies(X, columns = ['title'], prefix='title')

    X['maritalstatus'] = X['name'].map(get_marital_status)
    X['is_mr'] = X['maritalstatus'].map(lambda x: 1 if x == 0 else 0)
    X['is_miss'] = X['maritalstatus'].map(lambda x: 1 if x == 1 else 0)
    X['is_mrs'] = X['maritalstatus'].map(lambda x: 1 if x == 2 else 0)

    # Remember which ages were observed before imputing them.
    X['age_known'] = X['age'].isnull() == False
    X['age_known'] = X['age_known'].map(lambda x: 1 if x else 0)
    # Impute with the per-class median. transform() keeps the original row
    # index, whereas groupby(...).apply(...) may prepend the group key to the
    # index, which breaks assigning the result back to the column.
    X['age'] = X['age'].fillna(X.groupby('pclass')['age'].transform('median'))

    X['fare'] = X['fare'].fillna(X.groupby('pclass')['fare'].transform('median'))

    X['sex'] = X['sex'].map(lambda x: 1 if x == 'male' else 0)

    # Assign the result back instead of filling the selected column in place:
    # an in-place fill on X['embarked'] may act on a temporary copy and leave
    # X unchanged.
    X['embarked'] = X['embarked'].fillna(value = 'S')
    X = pd.get_dummies(X, columns = ['embarked'], prefix='emb')

    # Missing cabins become 'n'; lb/rb flag an even/odd cabin number, and the
    # cabin itself is reduced to its first letter before one-hot encoding.
    X['cabin'] = X['cabin'].map(lambda x: 'n' if pd.isna(x) else x)
    X['lb'] = X['cabin'].map(lambda x: 1 if _cabin_number_parity(x) == 0 else 0)
    X['rb'] = X['cabin'].map(lambda x: 1 if _cabin_number_parity(x) == 1 else 0)
    X['cabin'] = X['cabin'].map(get_cabin)
    X = pd.get_dummies(X, columns = ['cabin'], prefix='cabin')

    X['familysize'] = X['sibsp'] + X['parch']

    X['1class'] = X['pclass'].map(lambda x: 1 if x == 1 else 0)
    X['2class'] = X['pclass'].map(lambda x: 1 if x == 2 else 0)
    X['3class'] = X['pclass'].map(lambda x: 1 if x == 3 else 0)
    X = X.drop('pclass', axis = 1)
    # Quartile bins are computed on the raw values, before the log transform.
    X['farebin'] = pd.qcut(x = X['fare'], q = 4, labels = False)
    X['agebin'] = pd.qcut(x = X['age'], q = 4, labels = False)

    X['age'] = X['age'].map(np.log1p)
    X['fare'] = X['fare'].map(np.log1p)

    # 'pass' is a helper column of ones used to count passengers per group.
    # Note that 'fare' is already log1p-transformed in everything below.
    X['pass'] = 1
    X = X.join(X.groupby('ticket')['pass'].sum().rename('passinticket'), on = 'ticket')
    X = X.join(X.groupby('ticket')['fare'].median().rename('ticketfare'), on ='ticket')
    X['ticketfare'] = X['ticketfare'].divide(X['passinticket'])
    X['familyfare'] = X['fare'].divide(X['familysize'] + 1)
    X['familyname'] = X['name'].map(lambda x: x.split(',')[0])
    X = X.join(X.groupby('familyname')['pass'].sum().rename('familysize_1'), on = 'familyname')
    X['familyfare_1'] = X['fare'].divide(X['familysize_1'] + 1)
    X = pd.get_dummies(X, columns = ['familyname'], prefix='fname')
    X = X.drop('pass', axis = 1)

    # Ticket type = first character of the ticket string.
    X['ttype'] = X['ticket'].str[0]
    X = pd.get_dummies(X, columns = ['ttype'], prefix='ttype')

    X = pd.get_dummies(X, columns = ['familysize'], prefix='famsize')
    X = pd.get_dummies(X, columns = ['parch'], prefix='parch')
    X = pd.get_dummies(X, columns = ['sibsp'], prefix='sibsp')

    X = X.drop(['name', 'ticket'], axis = 1)

    return X

# Engineer features, drop the two raw columns that are not used as features,
# and zero-fill whatever is still missing.
# NOTE(review): every other raw column passes through to the model here —
# confirm none of them leaks the target.
X = prepare_data(X).drop(columns=['boat', 'home.dest']).fillna(0)
scaler = StandardScaler()
X = np.array(scaler.fit_transform(X))
y = np.array(y)

In [10]:
def scorer(classifier, X, y, iter = 5):
    """Return the mean ROC AUC of ``classifier`` over ``iter`` random 80/20 splits.

    On every round the data is re-split with shuffling, the classifier is
    refit on the training part, and ``roc_auc_score`` is computed from its
    ``predict`` output on the test part.
    """
    total = 0
    for _ in range(iter):
        fit_X, eval_X, fit_y, eval_y = train_test_split(X, y, test_size=0.2, shuffle=True)
        classifier.fit(fit_X, fit_y)
        predicted = classifier.predict(eval_X)
        total = total + roc_auc_score(eval_y, predicted)
    return total / iter

# Score scikit-learn's tree first, then the custom one, each on its own
# sequence of random splits.
decisiontree, mytree = DecisionTreeClassifier(), Tree()
sklearn_score, my_score = scorer(decisiontree, X, y), scorer(mytree, X, y)
print(my_score, sklearn_score)

0.6942570098048889 0.7841662575091981
