In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the training set; 'train.csv' is resolved relative to the working directory.
df = pd.read_csv('train.csv')
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,Id,age,years_of_experience,lesson_price,qualification,physics,chemistry,biology,english,geography,history,mean_exam_points
0,0,40.0,0.0,1400.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,63.0
1,1,48.0,4.0,2850.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,86.0
2,2,39.0,0.0,1200.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,53.0
3,3,46.0,5.0,1400.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,56.0
4,4,43.0,1.0,1500.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,59.0


In [3]:
class Node:
    """Internal decision node: compares one feature against a threshold.

    Attributes:
        index: index of the feature that is compared with the threshold here.
        t: threshold value.
        true_branch: subtree for objects that satisfy the node condition.
        false_branch: subtree for objects that do not satisfy it.
    """

    def __init__(self, index, t, true_branch, false_branch):
        # The test performed at this node.
        self.index, self.t = index, t
        # Children reached when the test passes / fails.
        self.true_branch, self.false_branch = true_branch, false_branch

In [4]:
# Terminal node (leaf) class

class Leaf:
    """Terminal node holding the samples that reached it.

    Attributes:
        X: feature rows stored in the leaf.
        y: target values stored in the leaf.
        prediction: value returned for any object that lands here,
            computed as ``y.mean()``.
    """

    def __init__(self, X, y):
        self.X, self.y = X, y
        # The leaf answers with the mean of the targets it holds.
        self.prediction = y.mean()

In [5]:
# Impurity criterion used when splitting: variance of the targets
def crit(y):
    """Return the impurity of a node as ``y.var()``."""
    impurity = y.var()
    return impurity

In [6]:
# Quality (impurity reduction) of a split

def quality(left_y, right_y, current_crit):
    """Return how much a split reduces the criterion.

    Args:
        left_y: targets sent to the left (true) branch.
        right_y: targets sent to the right (false) branch.
        current_crit: criterion value of the node before splitting.

    Returns:
        ``current_crit`` minus the size-weighted criteria of the two parts.
    """
    n_left = left_y.shape[0]
    n_right = right_y.shape[0]

    # Share of the sample that went to the left subtree.
    p = float(n_left) / (n_left + n_right)

    left_term = p * crit(left_y)
    right_term = (1 - p) * crit(right_y)
    return current_crit - left_term - right_term

In [7]:
# Splitting the dataset at a node

def split(X, y, index, t):
    """Partition ``X`` and ``y`` by comparing feature ``index`` with ``t``.

    Rows whose feature value is ``<= t`` form the "true" part, rows whose
    value is ``> t`` form the "false" part.

    Returns:
        Tuple ``(true_X, false_X, true_y, false_y)``.
    """
    feature = X[:, index]
    passes = feature <= t
    fails = feature > t
    return X[passes], X[fails], y[passes], y[fails]

In [8]:
# Search for the best split of a node

def find_best_split(X, y, min_leaf=10):
    """Find the (feature, threshold) pair with the largest quality gain.

    Every feature and every distinct value of that feature is tried as a
    threshold. A candidate is rejected when either side of the split would
    contain fewer than ``min_leaf`` objects.

    Args:
        X: 2-D array of features, one row per object.
        y: targets aligned with the rows of ``X``.
        min_leaf: minimum number of objects allowed on each side of a split.

    Returns:
        Tuple ``(best_quality, best_t, best_index)``. When no admissible
        split improves the criterion the result is ``(0, None, None)``.
    """
    current_crit = crit(y)

    best_quality = 0
    best_t = None
    best_index = None

    # All features are candidates. The previous version also drew a random
    # feature subsample here but never used it; that dead call (which only
    # advanced the global NumPy RNG) has been removed.
    for index in range(X.shape[1]):
        column = X[:, index]

        # Try each distinct feature value once.
        for t in np.unique(column):
            # Same predicate as split(): "<= t" goes to the true side,
            # "> t" to the false side. Only y is sliced, so X is not copied
            # for every candidate threshold.
            goes_true = column <= t
            goes_false = column > t

            # Skip splits that leave fewer than min_leaf objects in a child.
            if (np.count_nonzero(goes_true) < min_leaf
                    or np.count_nonzero(goes_false) < min_leaf):
                continue

            current_quality = quality(y[goes_true], y[goes_false], current_crit)

            # Keep the threshold with the largest quality gain.
            if current_quality > best_quality:
                best_quality, best_t, best_index = current_quality, t, index

    return best_quality, best_t, best_index

In [9]:
# Recursive construction of the tree

def build_tree(X, y, max_depth=-1):
    """Recursively build a regression tree and return its root.

    Args:
        X: 2-D array of features, one row per object.
        y: targets aligned with the rows of ``X``.
        max_depth: remaining depth budget. It is decremented by one at each
            level and recursion stops when it equals exactly 1. The default
            ``-1`` never reaches 1, so the depth is then limited only by the
            absence of a quality gain.

    Returns:
        A ``Leaf`` when recursion stops, otherwise a ``Node`` whose branches
        are the recursively built subtrees.
    """
    # Depth budget exhausted: return before running the split search, whose
    # result would be thrown away anyway.
    if max_depth == 1:
        return Leaf(X, y)

    gain, t, index = find_best_split(X, y)

    # Base case: no split improves the criterion.
    if gain == 0:
        return Leaf(X, y)

    true_X, false_X, true_y, false_y = split(X, y, index, t)

    # Build both subtrees with one less level of depth available.
    true_branch = build_tree(true_X, true_y, max_depth - 1)
    false_branch = build_tree(false_X, false_y, max_depth - 1)

    # The returned node carries its whole subtree.
    return Node(index, t, true_branch, false_branch)

In [10]:
def predict_object(obj, node):
    """Return the tree's prediction for a single object.

    Starting at ``node``, follow ``true_branch`` when the object's feature
    ``node.index`` is ``<= node.t`` and ``false_branch`` otherwise, until a
    ``Leaf`` is reached; its ``prediction`` is the answer.
    """
    current = node
    # Walk down the tree until a leaf is reached.
    while not isinstance(current, Leaf):
        if obj[current.index] <= current.t:
            current = current.true_branch
        else:
            current = current.false_branch
    return current.prediction

In [11]:
def predict(X, tree):
    """Return a list with the tree's prediction for every object in ``X``."""
    answers = []
    for obj in X:
        answers.append(predict_object(obj, tree))
    return answers

In [12]:
np.random.seed(42)

def get_bootstrap(X, y, n_estimators):
    """Draw ``n_estimators`` bootstrap samples of ``(X, y)``.

    Each sample has the same shape as the input and is drawn row-wise with
    replacement using the global NumPy random state.

    Args:
        X: array of features, one row per object.
        y: targets aligned with the rows of ``X``.
        n_estimators: number of bootstrap samples to generate.

    Returns:
        List of ``(b_X, b_y)`` tuples of float64 arrays.
    """
    n_samples = X.shape[0]
    bootstrap = []

    for _ in range(n_estimators):
        if n_samples > 0:
            # randint's upper bound is exclusive, so it must be n_samples;
            # with n_samples - 1 the last row could never be drawn and a
            # one-row input raised ValueError.
            sample_indexes = np.random.randint(0, n_samples, size=n_samples)
        else:
            # Nothing to draw from; yields empty samples like the input.
            sample_indexes = np.zeros(0, dtype=np.intp)

        # Samples are returned as float64 arrays, as before.
        b_X = X[sample_indexes].astype(np.float64)
        b_y = y[sample_indexes].astype(np.float64)
        bootstrap.append((b_X, b_y))

    return bootstrap

In [13]:
def get_subsample(len_sample):
    """Pick ``int(sqrt(len_sample))`` distinct feature indices at random.

    The indices ``0 .. len_sample - 1`` are shuffled with the global NumPy
    random state and the required number is taken from the end of the
    shuffled list.

    Returns:
        List of feature indices (not the features themselves).
    """
    subsample_size = int(np.sqrt(len_sample))

    shuffled = list(range(len_sample))
    np.random.shuffle(shuffled)

    return [shuffled.pop() for _ in range(subsample_size)]

In [14]:
def random_forest(X, y, n_estimators, max_depth):
    """Build a forest: one tree per bootstrap sample of ``(X, y)``.

    Args:
        X: array of features, one row per object.
        y: targets aligned with the rows of ``X``.
        n_estimators: number of trees (and bootstrap samples).
        max_depth: depth argument forwarded to ``build_tree``.

    Returns:
        List of tree roots.
    """
    samples = get_bootstrap(X, y, n_estimators)
    return [build_tree(b_X, b_y, max_depth=max_depth) for b_X, b_y in samples]

In [15]:
# Forest prediction: average of the individual tree predictions

def forest_predict(forest, X):
    """Return, for every object in ``X``, the mean prediction over all trees.

    Args:
        forest: iterable of tree roots.
        X: objects to predict for.

    Returns:
        List with one averaged prediction per object.
    """
    # One list of predictions per tree.
    per_tree = [predict(X, tree) for tree in forest]

    # Transpose to one tuple of tree predictions per object and average it.
    return [np.mean(votes) for votes in zip(*per_tree)]

In [16]:
def np_X_y(df):
    """Split a DataFrame into a feature matrix and a target vector.

    The features are every column except 'Id' and 'mean_exam_points';
    the target is the 'mean_exam_points' column.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays.
    """
    features = df.drop(columns=['Id', 'mean_exam_points']).values
    target = df['mean_exam_points'].values
    return features, target

In [17]:
#X_train, X_test, y_train, y_test = train_test_split(*np_X_y(df), test_size = 0.3, random_state = 1)

In [18]:
#my_forest_1 = random_forest(X_train, y_train, 700,-1)
#train_answers = forest_predict(my_forest_1, X_train)
#test_answers = forest_predict(my_forest_1, X_test)

In [19]:
#X_train, y_train = np_X_y(df)

In [20]:
#test_data = pd.read_csv('test.csv')
#X_test = test_data.drop(['Id'], axis=1).values

In [21]:
#final_forest = random_forest(X_train, y_train, 600,-1)
#test_data['mean_exam_points'] = forest_predict(final_forest, X_test)

In [22]:
#test_data[['Id','mean_exam_points']].to_csv('AChernenko_predictions_3.csv',index=None)

бустинг

In [163]:
def mse(y,z):
    """Mean squared error between ``y`` and ``z``.

    The squared differences are averaged along axis 0 and the result is
    then summed.
    """
    diff = y - z
    per_output = (diff * diff).mean(axis=0)
    return per_output.sum()

In [164]:
def bias(y, z):
    """Return the residual ``y - z`` (used in ``fit`` as the next tree's target)."""
    residual = y - z
    return residual

In [276]:
def fit(X, y, max_depth):
    """Fit a gradient-boosting ensemble of regression trees.

    The running prediction starts at the mean of ``y``; every round fits a
    tree to the current residuals and adds its scaled output to the running
    prediction.

    Reads the module-level globals ``n_estimators`` (number of rounds) and
    ``learning_rate`` (shrinkage); both must be defined before calling.

    Args:
        X: 2-D array of features, one row per object.
        y: targets aligned with the rows of ``X``.
        max_depth: depth argument forwarded to ``build_tree``.

    Returns:
        List of fitted trees, in the order they were built.
    """
    # Initial model: a constant equal to the target mean.
    prediction = np.mean(y) * np.ones(y.shape)
    ensemble = []

    # Step multiplier. A per-round line search over this value was tried
    # earlier and is disabled, so it stays fixed at 1.
    gamma = 1

    for _ in range(n_estimators):
        residuals = bias(y, prediction)
        stage = build_tree(X, residuals, max_depth=max_depth)
        ensemble.append(stage)

        stage_output = np.array(predict(X, stage))
        prediction += learning_rate * gamma * stage_output

    return ensemble

In [277]:
def predict_gb(X, y_train, trees):
    """Predict with a boosted ensemble produced by ``fit``.

    Reads the module-level global ``learning_rate``.

    Args:
        X: 2-D array of objects to predict for.
        y_train: training targets; their mean is the initial prediction.
        trees: list of fitted trees.

    Returns:
        Array of shape ``(X.shape[0], 1)`` with the predictions.
    """
    n_rows = X.shape[0]
    column_shape = (n_rows, 1)

    # Start from the constant model (mean of the training targets).
    pred = np.ones(column_shape) * np.mean(y_train)

    # Add every tree's scaled contribution.
    for tree in trees:
        tree_output = np.array(predict(X, tree)).reshape(column_shape)
        pred += learning_rate * tree_output

    return pred

In [474]:
# Boosting hyperparameters.
# fit() reads learning_rate and n_estimators as module-level globals and
# predict_gb() reads learning_rate, so this cell must run before either call.
# max_depth is passed to fit() explicitly.
learning_rate = 0.4
n_estimators = 120
max_depth = 6

In [475]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(*np_X_y(df), test_size = 0.3, random_state = 7)


In [476]:
%%time
# Train the boosting ensemble on the training split; fit() returns the list of fitted trees.
gb_model = fit(X_train, y_train, max_depth)


Wall time: 1min 5s


In [477]:
# predict_gb() takes y_train in both calls because the mean of the training
# targets is the ensemble's initial prediction.
y_train_pred = predict_gb(X_train, y_train, gb_model)
y_test_pred = predict_gb(X_test, y_train, gb_model)

In [478]:
# Report R^2 on the training and held-out splits.
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
from sklearn.metrics import r2_score
print('train:', r2_score(y_train, y_train_pred))
print('test:', r2_score(y_test, y_test_pred))

train: 0.8434220671556153
test: 0.7645901518704858


## Старое

0.77538

lr: 0.4 n_est: 100 md: 5
train: 0.8178034134941936
test: 0.7749054392873778

In [460]:
# Same R^2 report as above, additionally printing the hyperparameters in use.
# NOTE(review): this cell's execution count (460) is lower than that of the
# cells above it (474-478), and the saved output below shows different
# hyperparameter values from those currently assigned; re-running prints the
# values bound at that moment.
from sklearn.metrics import r2_score
print('lr:', learning_rate, 'n_est:', n_estimators, 'md:', max_depth)
print('train:', r2_score(y_train, y_train_pred))
print('test:', r2_score(y_test, y_test_pred))

lr: 0.01 n_est: 600 md: 6
train: 0.7989839542928312
test: 0.7800470236460693


In [461]:
# Refit on the full labelled dataset (no hold-out) for the final submission.
# NOTE(review): this rebinds X_train / y_train (and X_test below), which the
# evaluation cells above also use; re-running those cells after this one
# would operate on different data.
X_train, y_train = np_X_y(df)

# Load the unlabelled test set; every column except 'Id' is used as a feature.
test_data = pd.read_csv('test.csv')
X_test = test_data.drop(['Id'], axis=1).values

# fit() and predict_gb() also read the globals learning_rate / n_estimators.
gb_model_fin = fit(X_train, y_train, max_depth)
test_data['mean_exam_points'] = predict_gb(X_test, y_train, gb_model_fin)

# Write only the Id and the predicted score to the submission file.
test_data[['Id','mean_exam_points']].to_csv('AChernenko_predictions_gb_7.csv',index=None)