In [1]:
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler 
from sklearn.metrics import roc_auc_score


In [2]:
def get_error(pred, y):
    return sum(pred != y) / len(y)

In [3]:
# Реализуем класс узла

class Node:
    
    def __init__(self, index, t, true_branch, false_branch):
        self.index = index  # индекс признака, по которому ведется сравнение с порогом в этом узле
        self.t = t  # значение порога
        self.true_branch = true_branch  # поддерево, удовлетворяющее условию в узле
        self.false_branch = false_branch  # поддерево, не удовлетворяющее условию в узле

In [4]:
# И класс терминального узла (листа)

class Leaf:
    
    def __init__(self, data, labels, sample_weights):
        self.data = data
        self.labels = labels
        self.sample_weights = sample_weights
        self.prediction = self.predict()
        
    def predict(self):
        # подсчет количества объектов разных классов
        classes = {}  # сформируем словарь "класс: количество объектов"
        for num, label in enumerate(self.labels):
            if label not in classes:
                classes[label] = 0
            classes[label] += self.sample_weights[num]
            
        # найдем класс, количество объектов которого будет максимальным в этом листе и вернем его    
        prediction = max(classes, key=classes.get)
        return prediction        

In [5]:
# ИЗМЕНЕНИЯ: дерево организуем тоже классом
# Класс дерева

class Tree:
    
    # ИЗМЕНЕНИЯ: здесь указаны параметры для останова
    def __init__(self,
                 max_tree_depth_stop=np.inf,
                 max_leaf_num_stop=np.inf,
                 min_leaf_object_stop=1):
        self.max_depth = max_tree_depth_stop
        self.nodes = []
        self.leaves = []
        self.depth = 0
        self.max_leaves = max_leaf_num_stop
        self.min_objects = min_leaf_object_stop
        self.tree = None
        
    # Расчет критерия Джини
    def gini(self,
             labels,
             sample_weights):
        #  подсчет количества объектов разных классов
        classes = {}
        
        #################################
        for num, label in enumerate(labels):
            if label not in classes:
                classes[label] = 0
            classes[label] += sample_weights[num]
        

        #  расчет критерия
        impurity = 1
        for label in classes:
                                ###################
            p = classes[label] / sum(sample_weights)
            impurity -= p ** 2

        return impurity
     
    # Расчет прироста
    def gain(self,
             left_labels,
             right_labels,
             root_gini,
             true_sw,
             false_sw):

        # доля выборки, ушедшая в левое поддерево
        p = float(left_labels.shape[0]) / (left_labels.shape[0] + right_labels.shape[0])
        
                                                     #######                                     ##########
        return root_gini - p * self.gini(left_labels,true_sw) - (1 - p) * self.gini(right_labels,false_sw)
    
    # Разбиение датасета в узле
    def split(self,
              data,
              labels,
              column_index,
                  ###################
              t, sample_weights):
                ##################

        left = np.where(data[:, column_index] <= t)
        right = np.where(data[:, column_index] > t)

        true_data = data[left]
        false_data = data[right]

        true_labels = labels[left]
        false_labels = labels[right]
        ###############################
        true_sw = sample_weights[left]
        false_sw = sample_weights[right]
        ###############################
        return true_data, false_data, true_labels, false_labels, true_sw, false_sw 
    
    # Нахождение наилучшего разбиения
    def find_best_split(self,
                        data,
                        labels,
                        ################
                        sample_weights):
                        ################
        
        #  обозначим минимальное количество объектов в узле
        min_samples_leaf =  self.min_objects

                                      ##############
        root_gini = self.gini(labels, sample_weights)
                                      ##############
        
        best_gain = 0
        best_t = None
        best_index = None

        n_features = data.shape[1]

        for index in range(n_features):
            # будем проверять только уникальные значения признака, исключая повторения
            t_values = np.unique(data[:, index])

            for t in t_values:

                                                                  ######### #########                            ##############
                true_data, false_data, true_labels, false_labels, true_sw, false_sw = self.split(data, labels, index, t, sample_weights)
                                                                  #########  #########                          ##############
 
                    
                #  пропускаем разбиения, в которых в узле остается менее min_leaf_object_stop
                if len(true_data) < min_samples_leaf or len(false_data) < min_samples_leaf:
                    continue
                    
                                                                               ############## gini split gain
                current_gain = self.gain(true_labels, false_labels, root_gini, true_sw, false_sw)

                #  выбираем порог, на котором получается максимальный прирост качества
                if current_gain > best_gain:
                    best_gain, best_t, best_index = current_gain, t, index

        return best_gain, best_t, best_index
    
    # Построение дерева с помощью рекурсивной функции
    def build_tree(self,
                   data,
                   labels,
                   ################################
                   sample_weights
                   ################################
                   ):
                                                            ###############
        gain, t, index = self.find_best_split(data, labels, sample_weights)
                                                            ###############
        
 
        #  Базовый случай 2 - прекращаем рекурсию, когда достигли максимальной глубины дерева
        if self.depth > self.max_depth:
            self.leaves.append(Leaf(data, labels, sample_weights))
            return Leaf(data, labels, sample_weights)
        
        #  Базовый случай 3 - прекращаем рекурсию, когда достигли максимального количества листьев
        if len(self.leaves) >= self.max_leaves - 1 or self.depth >= self.max_leaves - 1:
            self.leaves.append(Leaf(data, labels, sample_weights))
            return Leaf(data, labels, sample_weights)
        
        #  Базовый случай 4 - прекращаем рекурсию, когда достигли минимального количества объектов в листе
        if len(data) <= self.min_objects:
            self.leaves.append(Leaf(data, labels, sample_weights))
            return Leaf(data, labels, sample_weights)
        
         #  Базовый случай 1 - прекращаем рекурсию, когда нет прироста в качества
        if gain == 0:
            self.leaves.append(Leaf(data, labels, sample_weights))
            return Leaf(data, labels, sample_weights)

        self.depth += 1
        
                                                          ######### #########                            ##############
        true_data, false_data, true_labels, false_labels, true_sw, false_sw = self.split(data, labels, index, t, sample_weights)
                                                          #########  #########                          ##############
        
        
        # Рекурсивно строим два поддерева
        true_branch = self.build_tree(true_data, true_labels , true_sw)
        false_branch = self.build_tree(false_data, false_labels, false_sw)

        # Возвращаем класс узла со всеми поддеревьями, то есть целого дерева
        self.nodes.append(Node(index, t, true_branch, false_branch))
        return Node(index, t, true_branch, false_branch)
    
    def classify_object(self,
                        obj,
                        node):

        #  Останавливаем рекурсию, если достигли листа
        if isinstance(node, Leaf):
            answer = node.prediction
            return answer

        if obj[node.index] <= node.t:
            return self.classify_object(obj, node.true_branch)
        else:
            return self.classify_object(obj, node.false_branch)
                                
    def fit(self, data, labels, sample_weights=None):
        ################
        if sample_weights is None:
            sample_weights=np.ones(len(labels))
                                   
        ###############
        labels = labels.flatten()
        #########
        self.tree = self.build_tree(data, labels, sample_weights)
        return self
    
    def predict(self, data):
    
        classes = []
        for obj in data:
            prediction = self.classify_object(obj, self.tree)
            classes.append(prediction)
        return classes

In [6]:
def adaboost(X, y, N, depth):

    # Размер выборки
    n_objects = len(X)

    # Запишем количество классов в переменную
    n_classes = len(np.unique((y)))

    # Начальные веса объектов
    w = np.ones(n_objects) / n_objects
    #w[[i for i, n in enumerate(y) if n == 1]]=1

    # Деревья с весами будем записывать в список
    models = []

    for n in range(N):
        # Зададим дерево и обучим его
        #clf = DecisionTreeClassifier(max_depth=depth)
        clf = Tree(max_tree_depth_stop=depth, max_leaf_num_stop=np.inf, min_leaf_object_stop=1)
        clf.fit(X, y, sample_weights=w)

        predictions = clf.predict(X)
        error = get_error(predictions, y)
        
        # отбросим дерево, если его ошибка больше 0.5
        # Запишем условие в общем виде (применимо к небинарным классификаторам)
        if error >= 1 - 1/n_classes: 
            continue

        # Обработаем граничные значения ошибок
        if error == 0:
            error += 1e-10
        elif error == 1:
            error -= 1e-10
        # Вычислим вес для дерева
        alpha = 0.5 * np.log((1 - error) / error)


        # Найдем индексы неправильно классифицированных элементов
        wrong_mask = predictions != y

        # Увеличим веса для неправильно классифицированных элементов
        w[wrong_mask] *= np.exp(alpha)
        # Уменьшаем веса для правильно классифицированных элементов
        w[~wrong_mask] *= np.exp(-alpha)

        # Нормализуем веса
        w /= w.sum()

        # Добавим дерево с весом в список
        models.append((alpha, clf))
    
    return models

In [7]:
def predict(X, models):
    
    n_classes = 2
    n_objects = len(X)
    
    # вначале обозначим предсказание нулевым массивом
    y_pred = np.ones((n_objects, n_classes))
    
    for alpha, clf in models:
        prediction = clf.predict(X)
        # Для каждого предсказания будем прибавлять alpha к
        # элементу с индексом предсказанного класса
        y_pred[range(n_objects), prediction] += alpha
    
    # выберем индексы с максимальными суммарными весами -
    # получим предсказанные алгоритмом классы
    y_pred = np.argmax(y_pred, axis=1)
    
    return y_pred

## Data

### Default dataset 

In [8]:
X_train = pd.read_csv('train.csv')

In [9]:
X_train = X_train.loc[:, X_train.columns != 'Id']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_train.loc[:, X_train.columns != 'choose'], X_train[['choose']], random_state=42)

In [11]:
X_train.columns

Index(['age', 'years_of_experience', 'lesson_price', 'qualification',
       'physics', 'chemistry', 'biology', 'english', 'geography', 'history',
       'mean_exam_points'],
      dtype='object')

### extended dataset

In [12]:
X_train2 = pd.read_csv('train.csv')

In [13]:
X_train2 = X_train2.loc[:, X_train2.columns != 'Id']

In [14]:
X_train2.columns

Index(['age', 'years_of_experience', 'lesson_price', 'qualification',
       'physics', 'chemistry', 'biology', 'english', 'geography', 'history',
       'mean_exam_points', 'choose'],
      dtype='object')

In [15]:
X_train2['price_per_point']=X_train2['lesson_price']/X_train2['mean_exam_points']

In [16]:
X_train2['price_per_qualification']=X_train2['lesson_price']/X_train2['qualification']

In [17]:
X_train2['exp_per_price']=X_train2['years_of_experience']/X_train2['lesson_price']

In [18]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_train2.loc[:, X_train2.columns != 'choose'], X_train2[['choose']], random_state=42)

In [19]:
X_train2.columns

Index(['age', 'years_of_experience', 'lesson_price', 'qualification',
       'physics', 'chemistry', 'biology', 'english', 'geography', 'history',
       'mean_exam_points', 'price_per_point', 'price_per_qualification',
       'exp_per_price'],
      dtype='object')

## Sampling

In [20]:
ros = RandomOverSampler(random_state=42)

In [21]:
rus = RandomUnderSampler(random_state=42)

In [22]:
X_ros, y_ros = ros.fit_resample(X_train, y_train)

In [23]:
X_rus, y_rus = rus.fit_resample(X_train, y_train)

In [24]:
X_ros2, y_ros2 = ros.fit_resample(X_train2, y_train2)

In [25]:
X_rus2, y_rus2 = rus.fit_resample(X_train2, y_train2)

In [26]:
X_ros, y_ros = X_ros.to_numpy(), y_ros.to_numpy().flatten()

In [27]:
X_rus, y_rus = X_rus.to_numpy(), y_rus.to_numpy().flatten()

In [28]:
X_ros2, y_ros2 = X_ros2.to_numpy(), y_ros2.to_numpy().flatten()

In [29]:
X_rus2, y_rus2 = X_rus2.to_numpy(), y_rus2.to_numpy().flatten()

In [30]:
X_test, y_test = X_test.to_numpy(), y_test.to_numpy().flatten()

## Forecasting

In [None]:
#testtesttest
results2 = []
for N in range(10,25):
    for depth in range(3,20):
        models = adaboost(X_rus, y_rus, N, depth)
        results2.append((N,depth,roc_auc_score(y_rus, predict(X_rus, models)),roc_auc_score(y_test, predict(X_test, models))))
        print((N,depth,roc_auc_score(y_rus, predict(X_rus, models)),roc_auc_score(y_test, predict(X_test, models))))

In [35]:
res2 = sorted(results2, key = lambda x: x[3])

In [36]:
res2[-5:]

[(19, 14, 0.7742927429274292, 0.7629341001618678),
 (16, 11, 0.7730627306273062, 0.7633142443714132),
 (19, 11, 0.7816728167281674, 0.7657851817334576),
 (20, 11, 0.7816728167281674, 0.7657851817334576),
 (18, 11, 0.7718327183271831, 0.766361529405994)]

In [70]:
res2

[(10, 6, 0.6838868388683886, 0.6716044538186099),
 (11, 6, 0.6838868388683886, 0.6716044538186099),
 (12, 6, 0.6838868388683886, 0.6716044538186099),
 (13, 6, 0.6838868388683886, 0.6716044538186099),
 (14, 6, 0.6838868388683886, 0.6716044538186099),
 (15, 6, 0.6838868388683886, 0.6716044538186099),
 (16, 6, 0.6838868388683886, 0.6716044538186099),
 (17, 6, 0.6838868388683886, 0.6716044538186099),
 (18, 6, 0.6838868388683886, 0.6716044538186099),
 (19, 6, 0.6838868388683886, 0.6716044538186099),
 (20, 6, 0.6838868388683886, 0.6716044538186099),
 (10, 7, 0.6857318573185732, 0.6724107274243392),
 (10, 8, 0.6857318573185732, 0.6724107274243392),
 (11, 7, 0.6857318573185732, 0.6724107274243392),
 (11, 8, 0.6857318573185732, 0.6724107274243392),
 (12, 7, 0.6857318573185732, 0.6724107274243392),
 (12, 8, 0.6857318573185732, 0.6724107274243392),
 (13, 7, 0.6857318573185732, 0.6724107274243392),
 (13, 8, 0.6857318573185732, 0.6724107274243392),
 (14, 7, 0.6857318573185732, 0.6724107274243392),


### Обучение итоговой модели на андерсемплиновых данных

In [49]:
X = pd.read_csv('train.csv')

In [50]:
y = X[['choose']]

In [51]:
X = X.loc[:, X.columns != 'Id']

In [52]:
X = X.loc[:, X.columns != 'choose']

In [54]:
X, y = X.to_numpy(), y.to_numpy().flatten()

In [55]:
X_rus, y_rus = rus.fit_resample(X, y)

In [72]:
models = adaboost(X_rus, y_rus, 40, 11)

roc_auc_score(y_rus, predict(X_rus, models))

0.7542831379621281

### Построение прогноза

In [75]:
X_new = pd.read_csv('test.csv')

In [78]:
X_new

Unnamed: 0,Id,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history,mean_exam_points
0,10000,32.0,2.0,2700.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,90.0
1,10001,35.0,6.0,1800.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,71.0
2,10002,44.0,2.0,1200.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,45.0
3,10003,44.0,4.0,2950.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,92.0
4,10004,38.0,3.0,1400.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,58.0
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,19995,44.0,3.0,1850.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,68.0
9996,19996,45.0,3.0,2450.0,2.0,1.0,1.0,0.0,1.0,0.0,0.0,72.0
9997,19997,44.0,2.0,1250.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,63.0
9998,19998,51.0,5.0,1000.0,2.0,1.0,0.0,1.0,0.0,0.0,0.0,64.0


In [77]:
ID = X_new[['Id']]

In [81]:
X_new = X_new.iloc[:,1:]

In [82]:
X_new

Unnamed: 0,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history,mean_exam_points
0,32.0,2.0,2700.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,90.0
1,35.0,6.0,1800.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,71.0
2,44.0,2.0,1200.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,45.0
3,44.0,4.0,2950.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,92.0
4,38.0,3.0,1400.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,58.0
...,...,...,...,...,...,...,...,...,...,...,...
9995,44.0,3.0,1850.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,68.0
9996,45.0,3.0,2450.0,2.0,1.0,1.0,0.0,1.0,0.0,0.0,72.0
9997,44.0,2.0,1250.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,63.0
9998,51.0,5.0,1000.0,2.0,1.0,0.0,1.0,0.0,0.0,0.0,64.0


In [101]:
y_pred = predict(X_new.to_numpy(), models)

In [102]:
y_pred.shape

(10000,)

In [103]:
X_new.shape

(10000, 11)

In [104]:
y_pred = y_pred.reshape(-1,1)

In [114]:
ID['choose']=y_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ID['choose']=y_pred


In [116]:
ID.to_csv('predictions.csv',index=False)