In [317]:
import matplotlib.pyplot as plt
import random

from matplotlib.colors import ListedColormap
from sklearn import datasets
from sklearn.tree import DecisionTreeRegressor 

import numpy as np
import pandas as pd

In [382]:
# Реализуем класс случайного леса
random.seed(42)


class RandomForestRegressor:

    def __init__(self, n_trees,
                 max_depth=100,
                 max_leaf_nodes=200,
                 min_leaf=1):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.max_leaf_nodes = max_leaf_nodes
        self.min_leaf = min_leaf
        self.trees = []
        self.trees_predicts=[]
        self.bootstrap = []
        self.sample_indexes = None


    def get_bootstrap(self, data, labels, N):
        n_samples = data.shape[0]
        indexes = []

        for i in range(N):
            random.seed(42)
            b_data = np.zeros(data.shape)
            b_labels = np.zeros(labels.shape)

            for j in range(n_samples):
                sample_index = random.randint(0, n_samples - 1)
                b_data[j] = data[sample_index]
                b_labels[j] = labels[sample_index]
                indexes.append(sample_index)
            self.bootstrap.append((b_data, b_labels))
            self.sample_indexes = np.unique(np.array(indexes))

        return self


    def fit(self, data, labels):

        self.get_bootstrap(data, labels, self.n_trees)

        for b_data, b_labels in self.bootstrap:
            new_tree = DecisionTreeRegressor(max_depth=self.max_depth, 
                                             max_leaf_nodes = self.max_leaf_nodes)
            new_tree.fit(b_data, b_labels)
            self.trees.append(new_tree)

        return self


    def predict(self, data):

        self.trees_predicts=[]
        for tree in self.trees:
            prediction = tree.predict(data)
            self.trees_predicts.append(prediction)
        target_pred = np.mean(np.vstack((self.trees_predicts)), axis=0)


        return target_pred



In [383]:
def r_2(y_pred, y_true):
    numerator = ((y_true - y_pred) ** 2).sum(axis=0, dtype=np.float64)
    denominator = ((y_true - np.average(y_true)) ** 2).sum(axis=0,
                                                          dtype=np.float64)
    return 1 - (numerator / denominator)

In [384]:
n_samples = 10000
data, target, coef = datasets.make_regression(
    n_samples=n_samples,
    n_features=5,
    n_targets=1,
    noise=10,
    coef=True,
    random_state=69
)

In [385]:
n_trees = 100
forest = RandomForestRegressor(n_trees = n_trees)
forest.fit(data, target)

<__main__.RandomForestRegressor at 0x26086c39708>

In [386]:
indexes=np.array([i for i in range(n_samples)])
test_indexes=indexes[~np.isin(indexes, forest.sample_indexes)]
train_indexes =indexes[np.isin(indexes, forest.sample_indexes)]
test_data, test_target=data[test_indexes], target[test_indexes]
train_data, train_target = data[train_indexes], target[train_indexes]

In [387]:
train_target_pred = forest.predict(train_data)
r2_train = r_2(train_target_pred, train_target)
r2_train

0.9597670225071878

In [388]:
test_target_pred = forest.predict(test_data)

In [389]:
r2_test = r_2(test_target_pred, test_target)
r2_test

0.932911446738784

Вывод: алгоритм дает хороший результат на искуственных данных

### Работаем с основным датасетом

In [399]:
#функция печати корреляционной матрицы
def print_corr_matrix(corr, v_max=0.7):
    mask = np.triu(np.ones_like(corr, dtype=np.bool))

    f, ax = plt.subplots(figsize=(11, 9))

    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    sns.heatmap(corr, mask=mask, cmap=cmap, vmax=v_max, center=0,
                square=True, linewidths=.5, cbar_kws={"shrink": .5})

In [400]:
test_data_link =r'test_data/test.csv'
train_data_link = r'train_data/train.csv'
result_data_link = r'test_data/submission2.csv'

In [401]:
train_data_main = pd.read_csv(train_data_link)

In [402]:
train_data = train_data_main.copy()

In [403]:
mean_score_train=train_data.loc[0:,'mean_exam_points'].to_numpy()

In [404]:
train_data=train_data.drop(['mean_exam_points'], axis=1)

In [405]:
# функция генерации дополнительных признаков на основе существующих
def feature_generation (train_data_main):
    #train_data_main['log_lesson_price']=np.log(train_data_main['lesson_price'])
    #train_data_main['sqr_lesson_price']=np.sqrt(train_data_main['lesson_price'])
    train_data_main['qulification_total']=train_data_main['physics']+train_data_main['chemistry']+train_data_main['biology']+train_data_main['english']+train_data_main['geography']+train_data_main['history']
    train_data_main['is_expirienced']=(train_data_main['qulification_total']> 0).astype('int64')
    train_data_main['highly_qualified']=(train_data_main['qualification']> 2).astype('int64')
    train_data_main['is_old']=(train_data_main['age']>train_data_main['age'].quantile(.75)).astype('int64')
    train_data_main['is_young']=(train_data_main['age']<train_data_main['age'].quantile(.25)).astype('int64')
    train_data_main['qualification_in_physics']=train_data_main["physics"]*train_data_main['qualification']
    train_data_main['expirience_in_physics']=train_data_main["physics"]*train_data_main['years_of_experience']
    train_data_main['qual_vs_experience_in_ph']=train_data_main["expirience_in_physics"]*train_data_main['qualification_in_physics']
    #train_data_main['qual_for_money']=train_data_main['lesson_price']*(np.exp(train_data['qualification']))
    train_data_main['is_expensive']=(train_data_main['lesson_price']>train_data_main['lesson_price'].quantile(.75)).astype('int64')
    train_data_main['is_chip']=(train_data_main['lesson_price']<train_data_main['lesson_price'].quantile(.25)).astype('int64')
    train_data_main=train_data_main.drop(['Id'], axis=1)

    return train_data_main

In [406]:
train_features=feature_generation(train_data).to_numpy()

In [407]:
train_features

array([[4.00e+01, 0.00e+00, 1.40e+03, ..., 0.00e+00, 0.00e+00, 0.00e+00],
       [4.80e+01, 4.00e+00, 2.85e+03, ..., 1.20e+01, 1.00e+00, 0.00e+00],
       [3.90e+01, 0.00e+00, 1.20e+03, ..., 0.00e+00, 0.00e+00, 1.00e+00],
       ...,
       [3.40e+01, 1.00e+00, 1.25e+03, ..., 1.00e+00, 0.00e+00, 1.00e+00],
       [3.30e+01, 3.00e+00, 1.10e+03, ..., 0.00e+00, 0.00e+00, 1.00e+00],
       [3.50e+01, 0.00e+00, 1.45e+03, ..., 0.00e+00, 0.00e+00, 0.00e+00]])

In [408]:
n_samples=train_features.shape[0]

In [409]:
n_trees = 100

In [410]:
model_rfr_main = RandomForestRegressor (n_trees = n_trees)

In [411]:
model_rfr_main.fit(train_features, mean_score_train)

<__main__.RandomForestRegressor at 0x260863f9508>

In [412]:
indexes=np.array([i for i in range(n_samples)])
test_indexes=indexes[~np.isin(indexes, model_rfr_main.sample_indexes)]
train_indexes =indexes[np.isin(indexes, model_rfr_main.sample_indexes)]
test_data, test_target=train_features[test_indexes], mean_score_train[test_indexes]
train_data, train_target = train_features[train_indexes], mean_score_train[train_indexes]

In [413]:
train_target_pred = model_rfr_main.predict(train_data)

In [414]:
r2_train = r_2(train_target_pred, train_target)
r2_train

0.8244666437692241

In [415]:
test_target_pred=model_rfr_main.predict(test_data)

In [416]:
r2_test = r_2(test_target_pred, test_target)
r2_test

0.7371253243026598

Имеет место ощутимое падение качества модели на тестовой выборке, что может говорить о переобучении модели. Попробуем настроить модель параметрами max_depth и max_leaf_nodes, min_leaf, чтобы устранить эффект переобучения

In [553]:
n_trees=200

In [554]:
model_rfr_tuned = RandomForestRegressor (n_trees = n_trees, max_depth=6, max_leaf_nodes=60, min_leaf=5)

In [555]:
model_rfr_tuned.fit(train_features, mean_score_train)

<__main__.RandomForestRegressor at 0x260afdedc08>

In [556]:
indexes=np.array([i for i in range(n_samples)])
test_indexes=indexes[~np.isin(indexes, model_rfr_tuned.sample_indexes)]
train_indexes =indexes[np.isin(indexes, model_rfr_tuned.sample_indexes)]
test_data, test_target=train_features[test_indexes], mean_score_train[test_indexes]
train_data, train_target = train_features[train_indexes], mean_score_train[train_indexes]

In [557]:
train_target_pred = model_rfr_tuned.predict(train_data)
r2_train = r_2(train_target_pred, train_target)
r2_train


0.7861292930587179

In [558]:
test_target_pred=model_rfr_tuned.predict(test_data)
r2_test = r_2(test_target_pred, test_target)
r2_test

0.7667435768089088

### Предсказание на тестовых данных

Обработка данных и генерирование признаков

In [559]:
test_data= pd.read_csv(test_data_link)
test_data

Unnamed: 0,Id,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history
0,10000,46.0,3.0,1050.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,10001,43.0,3.0,1850.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10002,52.0,1.0,1550.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
3,10003,57.0,6.0,2900.0,3.0,1.0,0.0,1.0,0.0,0.0,0.0
4,10004,44.0,4.0,3150.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
9995,19995,42.0,0.0,1500.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
9996,19996,51.0,2.0,2200.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0
9997,19997,33.0,5.0,1100.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
9998,19998,48.0,0.0,1750.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0


In [560]:
train_features=feature_generation(test_data)
train_features

Unnamed: 0,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history,qulification_total,is_expirienced,highly_qualified,is_old,is_young,qualification_in_physics,expirience_in_physics,qual_vs_experience_in_ph,is_expensive,is_chip
0,46.0,3.0,1050.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1,0,0,0,0.0,0.0,0.0,0,1
1,43.0,3.0,1850.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,0,0
2,52.0,1.0,1550.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1,0,1,0,1.0,1.0,1.0,0,0
3,57.0,6.0,2900.0,3.0,1.0,0.0,1.0,0.0,0.0,0.0,2.0,1,1,1,0,3.0,6.0,18.0,1,0
4,44.0,4.0,3150.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1,1,0,0,3.0,4.0,12.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,42.0,0.0,1500.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,0,0
9996,51.0,2.0,2200.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1,1,0,0,0.0,0.0,0.0,1,0
9997,33.0,5.0,1100.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,1,0.0,0.0,0.0,0,1
9998,48.0,0.0,1750.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,0,0


In [561]:
test_features=train_features.to_numpy()

In [562]:
test_target_pred = model_rfr_tuned.predict(test_features)

In [563]:
submission = pd.DataFrame({"Id":[id for id in range (10000, 20000)], "mean_exam_points":np.rint(test_target_pred)})

In [564]:
submission

Unnamed: 0,Id,mean_exam_points
0,10000,55.0
1,10001,63.0
2,10002,47.0
3,10003,92.0
4,10004,88.0
...,...,...
9995,19995,42.0
9996,19996,80.0
9997,19997,55.0
9998,19998,65.0


In [565]:
submission.to_csv(result_data_link, index = False)

**После загрузки на Kaggle - результат 0,76690**