In [1]:
from sklearn import ensemble, metrics
from sklearn.model_selection import cross_val_score, cross_validate, learning_curve

import pandas as pd
import numpy as np
import xgboost as xgb

In [2]:
# Render matplotlib figures inline; the plotting cells below call `pylab.*`
# without an explicit import, so they rely on this magic to provide `pylab`.
# NOTE(review): `%pylab` also star-imports numpy/pylab names into the
# namespace — consider `%matplotlib inline` plus explicit imports.
%pylab inline

In [3]:
# Load the bioresponse dataset; the first row of the CSV supplies column names.
bioresponce = pd.read_csv(
    'bioresponse.csv',
    sep=',',
    header=0,
)

In [4]:
# Preview the first rows of the loaded frame.
bioresponce.head()

In [None]:
# Target vector: the `Activity` column as a plain array.
bioresponce_target = bioresponce['Activity'].values

In [None]:
# Feature matrix: every column except the target. Dropping `Activity` by name
# (instead of slicing off column 0 by position) keeps this consistent with how
# the target is selected and prevents the target from leaking into the
# features if the column order of the CSV ever differs.
bioresponce_data = bioresponce.drop(columns='Activity')

## Модель RandomForestClassifier

#### Зависимость качества от количества деревьев

In [None]:
# Forest sizes to evaluate: a single tree, then 10..50 in steps of 5.
# Built from plain Python ints: `[1] + np.arange(...)` would broadcast and add
# 1 to every element (11, 16, ..., 51) instead of prepending 1.
n_trees = [1] + list(range(10, 55, 5))

In [None]:
%%time
scoring = []
for n_tree in n_trees:
    estimator = ensemble.RandomForestClassifier(n_estimators=n_tree, min_samples_split=5)
    score = cross_val_score(estimator, bioresponce_data, bioresponce_target, scoring = 'accuracy', cv=3)
    scoring.append(score)
scoring = np.asmatrix(scoring)

In [None]:
# Per-fold accuracy scores: one row per entry of n_trees, one column per CV fold.
scoring

In [None]:
# Fold-averaged accuracy of the random forest as a function of forest size.
pylab.plot(
    n_trees,
    scoring.mean(axis=1),
    label='RandomForest',
    marker='.',
)
pylab.title('Зависимость качества от количества деревьев')
pylab.xlabel('Количество деревьев')
pylab.ylabel('Качество обучения')
pylab.grid(True)
pylab.legend(loc='lower right')
pylab.show()

#### Сравнение с XGBoost: зависимость качества от количества деревьев

In [None]:
%%time
xgb_scoring = []
for n_tree in n_trees:
    xgb_estimator = xgb.XGBClassifier(learning_rate=0.1, max_depth=3, min_child_weight=3)
    score = cross_val_score(xgb_estimator, bioresponce_data, bioresponce_target, scoring = 'accuracy', cv = 3)
    xgb_scoring.append(score)
xgb_scoring = np.asmatrix(xgb_scoring)

In [None]:
# Per-fold accuracy scores of the XGBoost runs: one row per entry of n_trees,
# one column per CV fold.
xgb_scoring

In [None]:
# Compare fold-averaged accuracy of XGBoost and the random forest as the number
# of trees grows. The legend label previously read 'XGBost' (typo).
pylab.plot(n_trees, xgb_scoring.mean(axis=1), marker='.', label='XGBoost')
pylab.plot(n_trees, scoring.mean(axis=1), marker='.', label='RandomForest')
pylab.grid(True)
pylab.xlabel('Количество деревьев')
pylab.ylabel('Качество обучения')
pylab.title('Зависимость качества от количества деревьев')
pylab.legend(loc='lower right')
pylab.show()