In [36]:
import matplotlib.pyplot as plt
import random

from matplotlib.colors import ListedColormap
from sklearn import datasets
from sklearn.tree import DecisionTreeRegressor 

import numpy as np
import pandas as pd
from sklearn import model_selection

Реализуем градиентный бустинг на основе Descision Tree из sklearn

In [424]:
class GradientBoosting:
    
    def __init__(self, n_trees, coefs,
                 max_depth=4,
                 max_leaf_nodes=200,
                 min_leaf=1,
                 eta = 0.7,
                 algorythm = 'sklearn'):
        self.n_trees = n_trees #число деревьев в ансабле
        self.max_depth = max_depth #ограничение максимальной глубины деревьев в ансабле
        self.max_leaf_nodes = max_leaf_nodes #ограничение максимального числа листьев в деревьях
        self.min_leaf = min_leaf #ограничние минимльного количества объектов в листе
        self.eta =eta # шаг градиентного бустинга
        self.algorythm = algorythm #алгоритм реализации решающего дерева
        self.trees_list = [] #список деревьев в ансабле
        self.coefs = coefs # список коэфицинтов 
        self.trees_predicts=[]
        self.bootstrap = []
        self.sample_indexes = None

    
    
    def predict(self, X):
        # Реализуемый алгоритм градиентного бустинга будет инициализироваться нулевыми значениями,
        # поэтому все деревья из списка trees_list уже являются дополнительными и при предсказании прибавляются с шагом eta
        return np.array([sum([self.eta* coef * alg.predict([x])[0] for alg, coef in zip(self.trees_list, self.coefs)]) for x in X])
    
    def mean_squared_error(self, y_real, prediction):
        return (sum((y_real - prediction)**2)) / len(y_real)

    def bias(self, y, z):
        return (y - z)
    
    def fit(self, X_train, y_train):
    
        # Деревья будем записывать в список
        self.trees_list = []
    
        for i in range(self.n_trees):
            tree = DecisionTreeRegressor(max_depth=self.max_depth,  random_state=42)
           
          
            if len(self.trees_list) == 0:
                # обучаем первое дерево на обучающей выборке
                tree.fit(X_train, y_train)
               
            else:
                # Получим ответы на текущей композиции
                target = self.predict(X_train)
                # алгоритмы начиная со второго обучаем на сдвиг
                bais = self.bias(y_train, target)
                tree.fit(X_train,bais)
                
            
            self.trees_list.append(tree)

        return self


In [425]:
def r_2(y_pred, y_true):
    numerator = ((y_true - y_pred) ** 2).sum(axis=0, dtype=np.float64)
    denominator = ((y_true - np.average(y_true)) ** 2).sum(axis=0,
                                                          dtype=np.float64)
    return 1 - (numerator / denominator)

Проверим работоспособность алгоритма на искусственных данных

In [426]:
n_samples = 10000
data, target, coef = datasets.make_regression(
    n_samples=n_samples,
    n_features=5,
    n_targets=1,
    noise=10,
    coef=True,
    random_state=69
)

In [427]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(data, target, test_size=0.25)

In [428]:
n_trees=20
coefs = [1] * n_trees

In [429]:
alg=GradientBoosting(n_trees=n_trees, max_depth=4, coefs=coefs)

In [430]:
alg.fit(X_train,y_train)

<__main__.GradientBoosting at 0x1ab22527d48>

In [431]:
train_target_pred = alg.predict(X_train)

In [432]:
r2_train = r_2(train_target_pred, y_train)
r2_train

0.9799150372988128

In [433]:
test_target_pred = alg.predict(X_test)

In [434]:
r2_test = r_2(test_target_pred, y_test)
r2_test

0.9729062066532215

Вывод: алгоритм дает хороший результат на искуственных данных

### Работаем с основным датасетом

In [599]:
#функция печати корреляционной матрицы
def print_corr_matrix(corr, v_max=0.7):
    mask = np.triu(np.ones_like(corr, dtype=np.bool))

    f, ax = plt.subplots(figsize=(11, 9))

    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    sns.heatmap(corr, mask=mask, cmap=cmap, vmax=v_max, center=0,
                square=True, linewidths=.5, cbar_kws={"shrink": .5})

In [600]:
test_data_link =r'test_data/test.csv'
train_data_link = r'train_data/train.csv'
result_data_link = r'test_data/submission4.csv'

In [601]:
train_data_main = pd.read_csv(train_data_link)

In [602]:
train_data = train_data_main.copy()

In [603]:
mean_score_train=train_data.loc[0:,'mean_exam_points'].to_numpy()

In [604]:
train_data=train_data.drop(['mean_exam_points'], axis=1)

In [605]:
# функция генерации дополнительных признаков на основе существующих
def feature_generation (train_data_main):
    #train_data_main['log_lesson_price']=np.log(train_data_main['lesson_price'])
    #train_data_main['sqr_lesson_price']=np.sqrt(train_data_main['lesson_price'])
    train_data_main['qulification_total']=train_data_main['physics']+train_data_main['chemistry']+train_data_main['biology']+train_data_main['english']+train_data_main['geography']+train_data_main['history']
    train_data_main['is_expirienced']=(train_data_main['qulification_total']> 0).astype('int64')
    train_data_main['highly_qualified']=(train_data_main['qualification']> 2).astype('int64')
    train_data_main['is_old']=(train_data_main['age']>train_data_main['age'].quantile(.75)).astype('int64')
    train_data_main['is_young']=(train_data_main['age']<train_data_main['age'].quantile(.25)).astype('int64')
    train_data_main['qualification_in_physics']=train_data_main["physics"]*train_data_main['qualification']
    train_data_main['expirience_in_physics']=train_data_main["physics"]*train_data_main['years_of_experience']
    train_data_main['qual_vs_experience_in_ph']=train_data_main["expirience_in_physics"]*train_data_main['qualification_in_physics']
    train_data_main['qual_for_money']=train_data_main['lesson_price']*(np.exp(train_data['qualification']))
    train_data_main['is_expensive']=(train_data_main['lesson_price']>train_data_main['lesson_price'].quantile(.75)).astype('int64')
    train_data_main['is_chip']=(train_data_main['lesson_price']<train_data_main['lesson_price'].quantile(.25)).astype('int64')
    train_data_main=train_data_main.drop(['Id'], axis=1)

    return train_data_main

In [606]:
train_features=feature_generation(train_data).to_numpy()

In [607]:
train_features

array([[4.00000000e+01, 0.00000000e+00, 1.40000000e+03, ...,
        3.80559456e+03, 0.00000000e+00, 0.00000000e+00],
       [4.80000000e+01, 4.00000000e+00, 2.85000000e+03, ...,
        5.72437802e+04, 1.00000000e+00, 0.00000000e+00],
       [3.90000000e+01, 0.00000000e+00, 1.20000000e+03, ...,
        3.26193819e+03, 0.00000000e+00, 1.00000000e+00],
       ...,
       [3.40000000e+01, 1.00000000e+00, 1.25000000e+03, ...,
        3.39785229e+03, 0.00000000e+00, 1.00000000e+00],
       [3.30000000e+01, 3.00000000e+00, 1.10000000e+03, ...,
        2.99011001e+03, 0.00000000e+00, 1.00000000e+00],
       [3.50000000e+01, 0.00000000e+00, 1.45000000e+03, ...,
        3.94150865e+03, 0.00000000e+00, 0.00000000e+00]])

In [608]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(train_features, mean_score_train, test_size=0.25)

In [701]:
n_trees=8
coefs = [1] * n_trees

In [702]:
model_GB = GradientBoosting(n_trees=n_trees, max_depth=4, coefs=coefs)

In [703]:
model_GB.fit(X_train, y_train)

<__main__.GradientBoosting at 0x1ab22a30608>

In [704]:
y_train_pred = model_GB.predict(X_train)

In [705]:
r2_train = r_2(y_train_pred, y_train)
r2_train

0.7902196482250665

In [706]:
y_test_pred = model_GB.predict(X_test)

In [707]:
r2_test = r_2(y_test_pred, y_test)
r2_test

0.7688581849495012

Обучим модель на всем объеме данных train

In [637]:
model_GB.fit(train_features, mean_score_train)

<__main__.GradientBoosting at 0x1ab22a0dac8>

### Предсказание на тестовых данных

Обработка данных и генерирование признаков

In [507]:
test_data= pd.read_csv(test_data_link)
test_data

Unnamed: 0,Id,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history
0,10000,46.0,3.0,1050.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,10001,43.0,3.0,1850.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10002,52.0,1.0,1550.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
3,10003,57.0,6.0,2900.0,3.0,1.0,0.0,1.0,0.0,0.0,0.0
4,10004,44.0,4.0,3150.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
9995,19995,42.0,0.0,1500.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
9996,19996,51.0,2.0,2200.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0
9997,19997,33.0,5.0,1100.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
9998,19998,48.0,0.0,1750.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0


In [508]:
train_features=feature_generation(test_data)
train_features

Unnamed: 0,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history,...,is_expirienced,highly_qualified,is_old,is_young,qualification_in_physics,expirience_in_physics,qual_vs_experience_in_ph,qual_for_money,is_expensive,is_chip
0,46.0,3.0,1050.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0.0,0.0,0.0,2854.195920,0,1
1,43.0,3.0,1850.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0.0,0.0,0.0,37158.243308,0,0
2,52.0,1.0,1550.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1,0,1,0,1.0,1.0,1.0,4213.336834,0,0
3,57.0,6.0,2900.0,3.0,1.0,0.0,1.0,0.0,0.0,0.0,...,1,1,1,0,3.0,6.0,18.0,7883.017303,1,0
4,44.0,4.0,3150.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1,1,0,0,3.0,4.0,12.0,8562.587760,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,42.0,0.0,1500.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0.0,0.0,0.0,11083.584148,0,0
9996,51.0,2.0,2200.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1,1,0,0,0.0,0.0,0.0,16255.923418,1,0
9997,33.0,5.0,1100.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0.0,0.0,0.0,2990.110011,0,1
9998,48.0,0.0,1750.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0.0,0.0,0.0,4756.993200,0,0


In [509]:
test_features=train_features.to_numpy()

In [510]:
test_target_pred = model_GB.predict(test_features)

In [511]:
submission = pd.DataFrame({"Id":[id for id in range (10000, 20000)], "mean_exam_points":np.rint(test_target_pred)})

In [512]:
submission

Unnamed: 0,Id,mean_exam_points
0,10000,54.0
1,10001,75.0
2,10002,47.0
3,10003,85.0
4,10004,86.0
...,...,...
9995,19995,55.0
9996,19996,71.0
9997,19997,55.0
9998,19998,59.0


In [513]:
submission.to_csv(result_data_link, index = False)

**После загрузки на Kaggle - результат 0.76534**