In [69]:
import numpy as np

class LLeaf:
    """Terminal node of the tree: always predicts one stored class label."""

    def __init__(self):
        # Filled in by fit(); stays None until then.
        self.label = None

    def fit(self, X, y):
        """Store the first element of ``y`` as this leaf's label.

        ``X`` is not read; it is accepted so the leaf has the same
        ``fit(X, y)`` signature as the splitting nodes in this file.
        """
        first_label = y[0]
        self.label = first_label

    def predict(self, X):
        """Return the stored label; ``X`` is not read."""
        return self.label

def calc_gini(y):
    """Return how many times the most frequent value occurs in ``y``.

    Despite the name, the result is a raw count, not a Gini impurity and
    not a probability.  An empty array yields 0.
    """
    if y.size != 0:
        return max(int(np.count_nonzero(y == value)) for value in np.unique(y))
    return 0
    
class CLeaf():
    """Internal (splitting) node of the decision tree.

    Samples whose feature number ``num`` is ``<= threshold`` are routed to
    ``lleaf``; all other samples go to ``rleaf``.  Each child is either
    another ``CLeaf`` or a terminal ``LLeaf``.
    """

    def __init__(self):
        self.threshold = None  # split value, set by fit()
        self.num = None        # column index of the split feature, set by fit()
        self.rleaf = None      # child for samples with feature > threshold
        self.lleaf = None      # child for samples with feature <= threshold

    def predict(self, X):
        """Predict the label of a single sample.

        ``X`` is one feature vector, indexed by feature number.
        """
        if X[self.num] <= self.threshold:
            return self.lleaf.predict(X)
        return self.rleaf.predict(X)

    @staticmethod
    def _grow_child(X, y):
        """Fit and return the node for one side of a split.

        A side that holds at most one distinct label becomes an ``LLeaf``;
        otherwise it is split further by a new ``CLeaf``.
        """
        child = LLeaf() if len(np.unique(y)) <= 1 else CLeaf()
        child.fit(X, y)
        return child

    def fit(self, X, y):
        """Choose a split for ``(X, y)`` and recursively fit both children.

        ``X`` is a 2-D array (rows are samples), ``y`` a 1-D array of labels.

        Split search: for every feature, every unique value of that feature
        is tried as a threshold.  A candidate is scored with
        ``min(calc_gini(left labels), calc_gini(right labels))`` and the
        highest score wins; ties go to the smallest threshold and then to
        the lowest feature number.

        If the winning split leaves one side empty, both children become
        leaves that predict the most frequent label of ``y`` (the smallest
        such label on a tie).
        """
        features = X.shape[1]
        # Labels are appended as the last column so rows and labels are
        # filtered together.
        data = np.hstack([X, y.reshape(-1, 1)])

        # Fallback prediction used when no split can separate the samples.
        labels, label_counts = np.unique(y, return_counts=True)
        majority_label = labels[np.argmax(label_counts)]

        thr = np.zeros(features)
        gini = np.zeros(features)
        for i in range(features):
            column = data[:, i]
            candidates = np.unique(column)
            coef = np.zeros(len(candidates))
            for k, j in enumerate(candidates):
                g_l = calc_gini(data[column <= j][:, -1])
                g_r = calc_gini(data[column > j][:, -1])
                coef[k] = min(g_l, g_r)
            # argmax returns the first maximum, i.e. the smallest threshold
            # among equally good candidates.
            best = np.argmax(coef)
            thr[i] = candidates[best]
            gini[i] = coef[best]

        # Best feature overall; the first one wins on ties.
        self.num = np.argmax(gini)
        self.threshold = thr[self.num]

        data_l = data[data[:, self.num] <= self.threshold]
        data_r = data[data[:, self.num] > self.threshold]
        X_l, y_l = data_l[:, 0:-1], data_l[:, -1]
        X_r, y_r = data_r[:, 0:-1], data_r[:, -1]

        if (y_l.size == 0) or (y_r.size == 0):
            # The chosen threshold does not separate anything (this is what
            # happens when all rows have identical feature values), so stop
            # here and predict the majority label on both sides.
            self.lleaf = LLeaf()
            self.rleaf = LLeaf()
            self.lleaf.fit(np.array([0]), np.array([majority_label]))
            self.rleaf.fit(np.array([0]), np.array([majority_label]))
            return

        self.lleaf = self._grow_child(X_l, y_l)
        self.rleaf = self._grow_child(X_r, y_r)
        return
    

class CTree:
    """Decision-tree classifier: a thin wrapper around a root ``CLeaf`` node."""

    def __init__(self):
        self.root = CLeaf()

    def fit(self, X, y):
        """Train the tree by fitting the root node on ``X`` and ``y``."""
        self.root.fit(X, y)

    def predict(self, X):
        """Predict one label per row of the 2-D array ``X``.

        Returns a 1-D float array with ``X.shape[0]`` entries.
        """
        n_samples = X.shape[0]
        predictions = [self.root.predict(X[row, :]) for row in range(n_samples)]
        return np.array(predictions, dtype=float)

In [70]:
# Toy XOR-style dataset: the label is 1 when both coordinates have the same sign.
X = np.array([
    [1, 1],
    [-1, 1],
    [1, -1],
    [-1, -1],
])
y = np.array([1, 0, 0, 1])
X.shape

(4, 2)

In [71]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

In [72]:
# Hold-out split by row label: rows whose index is below `delim` are used for
# training, the remaining rows for testing.  The last column is the target.
test_values = pd.read_csv('test_values.csv', index_col = 0)
delim = 350
train, test = (
    np.array(test_values[row_mask])
    for row_mask in (test_values.index < delim, test_values.index >= delim)
)
X_train, y_train = train[:, :-1], train[:, -1]
X_test, y_test = test[:, :-1], test[:, -1]

In [73]:
# Fit the hand-written tree and the scikit-learn reference on the same split
# and compare ROC AUC computed from their hard label predictions.
myTree = CTree()
myTree.fit(X_train, y_train)
y_my = myTree.predict(X_test)
ra_my = roc_auc_score(y_test, y_my)

rTree = DecisionTreeClassifier()
rTree.fit(X_train, y_train)
y_r = rTree.predict(X_test)
ra_r = roc_auc_score(y_test, y_r)

print(ra_my, ra_r)
# Test samples that the hand-written tree misclassified.
print(X_test[y_test != y_my])

0.9615384615384616 1.0
[[-0.1  0. ]
 [-0.2  0. ]
 [-0.6  0. ]
 [-0.7  0. ]]


In [74]:
import math
import re
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load both Titanic files keyed by PassengerId.  The target column is removed
# from the training frame, and the two index objects are kept so the combined
# frame can be split back into train and test parts later.
data_train = pd.read_csv('train.csv', index_col = 'PassengerId')
train_index = data_train.index
y_train = data_train.pop('Survived')

data_test = pd.read_csv('test.csv', index_col = 'PassengerId')
test_index = data_test.index

# Train rows first, test rows after them.
data = pd.concat([data_train, data_test])

def get_marital_status(name):
    """Encode a passenger name as 0, 1 or 2 by case-sensitive substring search.

    Returns 2 if the name contains 'Mrs', 'Lady' or 'Countess'; otherwise 1
    if it contains 'Miss', 'Mlle', 'Mme' or 'Ms'; otherwise 0.
    """
    mrs_like = ('Mrs', 'Lady', 'Countess')
    miss_like = ('Miss', 'Mlle', 'Mme', 'Ms')
    if any(marker in name for marker in mrs_like):
        return 2
    if any(marker in name for marker in miss_like):
        return 1
    return 0

def get_cabin(x):
    """Return the first ASCII letter found in the cabin string ``x``.

    Digits, spaces and any other characters are skipped.  When the string
    contains no letter at all, 'N' is returned; this is the same placeholder
    ``prepare_data`` substitutes for a missing cabin.  Previously such input
    raised ``IndexError``, and a space preceding the first letter was
    returned as the result.
    """
    first_letter = re.search('[A-Za-z]', x)
    if first_letter is None:
        return 'N'
    return first_letter.group(0)

def prepare_data(X):
    """Build the model feature table from the raw passenger frame.

    Works on a copy of ``X``.  Columns read: Name, Age, Pclass, Fare, Sex,
    Embarked, Cabin, SibSp, Parch, Ticket.  The returned frame no longer
    contains Name, Ticket, Pclass or the categorical columns that were
    one-hot encoded along the way.

    Missing values: Age and Fare are filled with the median of the
    passenger's Pclass, Embarked with 'S', Cabin with 'N'.
    """
    X = X.copy()
    X['NameLength'] = X['Name'].map(len)

    # Title is the text between ', ' and the first '.' of the name; titles
    # that occur fewer than `stat_min` times are merged into 'Misc'.
    X['Title'] = X['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]
    stat_min = 10
    is_rare_title = (X['Title'].value_counts() < stat_min)
    X['Title'] = X['Title'].apply(lambda title: 'Misc' if is_rare_title.loc[title] else title)
    X = pd.get_dummies(X, columns = ['Title'], prefix='Title')

    X['MaritalStatus'] = X['Name'].map(get_marital_status)
    X['is_Mr'] = X['MaritalStatus'].map(lambda x: 1 if x == 0 else 0)
    X['is_Miss'] = X['MaritalStatus'].map(lambda x: 1 if x == 1 else 0)
    X['is_Mrs'] = X['MaritalStatus'].map(lambda x: 1 if x == 2 else 0)

    # Remember which ages were observed before filling the gaps.
    X['Age_known'] = X['Age'].notnull().map(lambda known: 1 if known else 0)
    # transform() keeps the original row index, so the result can be assigned
    # back directly; groupby(...).apply(...) may prepend the group key to the
    # index (pandas >= 2), which breaks the assignment.
    X['Age'] = X.groupby('Pclass')['Age'].transform(lambda s: s.fillna(s.median()))

    X['Fare'] = X.groupby('Pclass')['Fare'].transform(lambda s: s.fillna(s.median()))

    X['Sex'] = X['Sex'].map(lambda x: 1 if x == 'male' else 0)

    # Plain assignment instead of an in-place fillna on a column selection,
    # which does not reliably write through to the frame.
    X['Embarked'] = X['Embarked'].fillna('S')
    X = pd.get_dummies(X, columns = ['Embarked'], prefix='Emb')

    X['Cabin'] = X['Cabin'].map(lambda x: 'N' if pd.isna(x) else x)

    def _cabin_number_parity_flag(cabin, remainder):
        # Digits left after removing letters and spaces are read as one
        # number; the flag is 1 when that number exists and
        # number % 2 == remainder.
        digits = re.sub('[A-Za-z ]', '', cabin)
        return 1 if ((len(digits) > 0) and (int(digits) % 2 == remainder)) else 0

    # LB marks an even cabin number, RB an odd one (presumably the two sides
    # of the ship -- TODO confirm the intended meaning).
    X['LB'] = X['Cabin'].map(lambda cabin: _cabin_number_parity_flag(cabin, 0))
    X['RB'] = X['Cabin'].map(lambda cabin: _cabin_number_parity_flag(cabin, 1))
    X['Cabin'] = X['Cabin'].map(get_cabin)
    X = pd.get_dummies(X, columns = ['Cabin'], prefix='Cabin')

    X['FamilySize'] = X['SibSp'] + X['Parch']

    X['1class'] = X['Pclass'].map(lambda x: 1 if x == 1 else 0)
    X['2class'] = X['Pclass'].map(lambda x: 1 if x == 2 else 0)
    X['3class'] = X['Pclass'].map(lambda x: 1 if x == 3 else 0)
    X = X.drop('Pclass', axis = 1)

    # Quartile bins are taken on the raw values, before the log transform.
    X['FareBin'] = pd.qcut(x = X['Fare'], q = 4, labels = False)
    X['AgeBin'] = pd.qcut(x = X['Age'], q = 4, labels = False)

    X['Age'] = X['Age'].map(np.log1p)
    X['Fare'] = X['Fare'].map(np.log1p)

    # 'Pass' is a helper column of ones used to count rows per group.
    X['Pass'] = 1
    X = X.join(X.groupby('Ticket')['Pass'].sum().rename('PassInTicket'), on = 'Ticket')
    # Median of the (already log-transformed) fare per ticket, divided by the
    # number of passengers sharing the ticket.
    X = X.join(X.groupby('Ticket')['Fare'].median().rename('TicketFare'), on ='Ticket')
    X['TicketFare'] = X['TicketFare'].divide(X['PassInTicket'])
    X['FamilyFare'] = X['Fare'].divide(X['FamilySize'] + 1)
    # Family name is the part of the name before the first comma.
    X['FamilyName'] = X['Name'].map(lambda x: x.split(',')[0])
    X = X.join(X.groupby('FamilyName')['Pass'].sum().rename('FamilySize_1'), on = 'FamilyName')
    X['FamilyFare_1'] = X['Fare'].divide(X['FamilySize_1'] + 1)
    X = pd.get_dummies(X, columns = ['FamilyName'], prefix='FName')
    X = X.drop('Pass', axis = 1)

    # Ticket type is the first character of the ticket string.
    X['Ttype'] = X['Ticket'].str[0]
    X = pd.get_dummies(X, columns = ['Ttype'], prefix='Ttype')

    X = pd.get_dummies(X, columns = ['FamilySize'], prefix='FamSize')
    X = pd.get_dummies(X, columns = ['Parch'], prefix='Parch')
    X = pd.get_dummies(X, columns = ['SibSp'], prefix='SibSp')

    X = X.drop(['Name', 'Ticket'], axis = 1)

    return X

# Features are engineered on train and test together, then split back using
# the PassengerId labels saved when the files were loaded.  Selecting by label
# does not rely on every train id being smaller than every test id, which the
# previous comparison against train_index[-1] silently assumed.
X = prepare_data(data)
X_train = X.loc[train_index].copy()
X_test = X.loc[test_index].copy()

# The scaler is fitted on the training rows only and reused for the test rows.
scaler = StandardScaler()
x_train = np.asarray(scaler.fit_transform(X_train))
x_test = scaler.transform(X_test)

y_train = np.array(y_train)


In [75]:
def mycv(cl, X, y, iter = 5):
    """Average ROC AUC of ``cl`` over ``iter`` random 80/20 train/test splits.

    Parameters:
        cl: estimator with ``fit(X, y)`` and ``predict(X)``; it is refitted
            on every split.
        X, y: features and labels to split.  Earlier versions ignored these
            arguments and always split the module-level ``x_train`` and
            ``y_train``.
        iter: number of random splits to average over.

    Returns the mean of ``roc_auc_score(y_true, cl.predict(...))`` across
    the splits.  The splits are not seeded, so results vary between calls.
    """
    score = 0
    for _ in range(iter):
        x_fit, x_val, y_fit, y_val = train_test_split(X, y, test_size = 0.2, shuffle = True)
        cl.fit(x_fit, y_fit)
        score += roc_auc_score(y_val, cl.predict(x_val))
    return score / iter

# Cross-validated comparison of the scikit-learn tree (t_r) and the
# hand-written tree (t_my) on the scaled Titanic training features.
t_r = DecisionTreeClassifier()
t_my = CTree()
rscore = mycv(t_r, x_train, y_train)
myscore = mycv(t_my, x_train, y_train)
# Printed order: hand-written tree first, scikit-learn tree second.
print(myscore, rscore)

0.7363923843351726 0.8090656888584743
