# Обучаем первые классификаторы в sklearn

### Данные


По данным характеристикам молекулы требуется определить, будет ли дан биологический ответ (biological response).

Для демонстрации используется обучающая выборка из исходных данных bioresponse.csv, файл с данными прилагается.

### Готовим обучающую и тестовую выборки

In [1]:
import pandas as pd

# Load the whole table; the first CSV row supplies the column names and the
# separator is a comma (both are the read_csv defaults).
bioresponce = pd.read_csv('bioresponse.csv')

In [2]:
# Preview the first five rows (five is the default for head()).
bioresponce.head()

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,1,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,...,0,0,0,0,0,0,0,0,0,0
1,1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,...,1,1,1,1,0,1,0,0,1,0
2,1,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,...,0,0,0,0,0,0,0,0,0,0
3,1,0.0,0.538825,0.0,0.5,0.196344,0.72423,0.235606,0.288764,0.80511,...,0,0,0,0,0,0,0,0,0,0
4,0,0.1,0.517794,0.0,0.0,0.494734,0.781422,0.154361,0.303809,0.812646,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Target vector: the Activity column as a plain NumPy array.
y = bioresponce['Activity'].values

In [4]:
# Feature matrix: every column except the target.  Dropping the target by
# name (instead of slicing off "the first column") keeps Activity out of the
# features even if the column order of the CSV ever changes.
X = bioresponce.drop('Activity', axis=1)

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the fixed seed makes the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

### Строим модель и оцениваем качество

In [6]:
from sklearn.linear_model import LogisticRegression

In [7]:
# Baseline: logistic regression with library defaults, trained on the
# training split (fit() returns the estimator itself) and applied to the
# held-out rows.
model = LogisticRegression().fit(X_train, y_train)
preds = model.predict(X_test)

In [8]:
# Scratch check: inspect the container type returned by predict().
type(preds)

numpy.ndarray

In [9]:
# Scratch check: `//` is floor division, so the fractional part is discarded.
10 // 9

1

In [10]:
# Accuracy by hand: number of matching labels over the number of predictions.
n_correct = sum(preds == y_test)
print(n_correct / len(preds))

0.75605815832


In [11]:
# Same ratio with an explicit float denominator -- presumably kept as a guard
# for interpreters where `/` on integers truncates.
n_total = float(len(preds))
print(sum(preds == y_test) / n_total)

0.75605815832


In [12]:
from sklearn.metrics import accuracy_score

# Same metric via scikit-learn.  The signature is accuracy_score(y_true,
# y_pred): the value is unchanged for accuracy, but the documented order keeps
# this cell a safe template for metrics that are not symmetric.
print(accuracy_score(y_test, preds))

0.75605815832


### Качество на кросс-валидации

In [13]:
from sklearn.model_selection import cross_val_score

# Per-fold scores of the current `model` with 5-fold cross-validation on the
# training split.
fold_scores = cross_val_score(model, X_train, y_train, cv=5)
print(fold_scores)

[ 0.74404762  0.73956262  0.72310757  0.75099602  0.75896414]


In [14]:
# Average of the five fold scores (the cross-validation is recomputed here).
mean_cv_score = cross_val_score(model, X_train, y_train, cv=5).mean()
print(mean_cv_score)

0.743335594477


### Пробуем другие классификаторы

In [15]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


In [16]:
%%time

# Candidate classifiers, all with library defaults except the ensemble sizes.
models = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    LinearSVC(),
    RandomForestClassifier(n_estimators=100),
    GradientBoostingClassifier(n_estimators=100),
]

# Fit every candidate on the training split and report its hold-out accuracy
# followed by the estimator's repr.
# NOTE(review): the loop rebinds the notebook-level `model` and `preds`, so
# after this cell `model` is the last estimator in the list, not the logistic
# regression fitted earlier.
for model in models:
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    # accuracy_score takes (y_true, y_pred).
    print(accuracy_score(y_test, preds), model)

0.718901453958 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
0.707592891761 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
0.736672051696 LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
0.796445880452 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
 

## Опциональное задание:

Попробуйте разные классификаторы с разными параметрами и постарайтесь добиться максимального качества на тестовой выборке.

In [68]:
# Dump the configuration of every candidate, one blank line between entries.
for model in models:
    print(model, end=" \n\n")

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform') 

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best') 

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0) 

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
  

In [92]:
from sklearn import cross_validation, grid_search, metrics

parameters_grid_knn = {
    "n_neighbors" : [5, 6],
    "weights" : ['uniform', 'distance'],
    "p" : [1, 2, 3],
    "n_jobs" : [-1]  # распаралл поиск соседей на все доступ ядра
}

parameters_grid_tree = {
    "criterion" : ["gini", "entropy"],
    "max_depth" : [None, 5, 6]
}

parameters_grid_svc = {
    "multi_class" : ['ovr', 'crammer_singer'],
    "loss" : ['hinge', 'squared_hinge'],
}

parameters_grid_randForest = {
    "n_estimators" : [10, 15],
    "criterion" : ["gini", "entropy"]
}

parameters_grid_gbc = {
    "loss" : ['deviance', 'exponential'],
    "n_estimators" : [50, 100, 150],
    "criterion" : ['friedman_mse', 'mae']
    
}

In [70]:
cv = cross_validation.StratifiedShuffleSplit(y, n_iter=5, test_size=0.2, random_state=0)

In [93]:
grid_cv_1 = grid_search.GridSearchCV(models[0], parameters_grid_knn, scoring='accuracy', cv=cv)
grid_cv_2 = grid_search.GridSearchCV(models[1], parameters_grid_tree, scoring='accuracy', cv=cv)
grid_cv_3 = grid_search.GridSearchCV(models[2], parameters_grid_svc, scoring='accuracy', cv=cv)
grid_cv_4 = grid_search.GridSearchCV(models[3], parameters_grid_randForest, scoring='accuracy', cv=cv)
grid_cv_5 = grid_search.GridSearchCV(models[4], parameters_grid_gbc, scoring='accuracy', cv=cv)

In [72]:
# randomized_gr

In [94]:
r_grid_cv_1 = grid_search.RandomizedSearchCV(models[0], parameters_grid_knn, scoring='accuracy', cv=cv, n_iter=4, random_state=0)
r_grid_cv_2 = grid_search.RandomizedSearchCV(models[1], parameters_grid_tree, scoring='accuracy', cv=cv, n_iter=3, random_state=0)
r_grid_cv_3 = grid_search.RandomizedSearchCV(models[2], parameters_grid_svc, scoring='accuracy', cv=cv, n_iter=3, random_state=0)
r_grid_cv_4 = grid_search.RandomizedSearchCV(models[3], parameters_grid_randForest, scoring='accuracy', cv=cv, n_iter=3, random_state=0)
r_grid_cv_5 = grid_search.RandomizedSearchCV(models[4], parameters_grid_gbc, scoring='accuracy', cv=cv, n_iter=5, random_state=0)

In [77]:
# Accumulators for the best outcome of each exhaustive (`result`) and
# randomized (`rand_result`) search; two independent lists.
result, rand_result = [], []

1. KNeighborsClassifier

In [78]:
%%time
# Randomized search for k-NN: fit, echo the searcher's configuration, then
# record the winning estimator, its CV score and its parameters.
# NOTE(review): fitted on the full X, y rather than X_train / y_train, so the
# rows held out in X_test take part in model selection.
r_grid_cv_1.fit(X, y)
print(r_grid_cv_1, '\n')
a, b, c = (
    r_grid_cv_1.best_estimator_,
    r_grid_cv_1.best_score_,
    r_grid_cv_1.best_params_,
)
rand_result.append({"best_estimator": a, "best_score": b, "best_params": c})

RandomizedSearchCV(cv=StratifiedShuffleSplit(labels=[1 1 ..., 1 0], n_iter=5, test_size=0.2, random_state=0),
          error_score='raise',
          estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
          fit_params={}, iid=True, n_iter=4, n_jobs=1,
          param_distributions={'n_neighbors': [5, 6], 'weights': ['uniform', 'distance'], 'p': [1, 2, 3], 'n_jobs': [-1]},
          pre_dispatch='2*n_jobs', random_state=0, refit=True,
          scoring='accuracy', verbose=0) 

Wall time: 7min 55s


In [81]:
%%time
# Exhaustive search for k-NN: fit, echo the searcher's configuration, then
# record the winning estimator, its CV score and its parameters.
# NOTE(review): fitted on the full X, y rather than X_train / y_train, so the
# rows held out in X_test take part in model selection.
gr = grid_cv_1
gr.fit(X, y)
print(gr, '\n')
a, b, c = gr.best_estimator_, gr.best_score_, gr.best_params_
result.append({"best_estimator": a, "best_score": b, "best_params": c})

GridSearchCV(cv=StratifiedShuffleSplit(labels=[1 1 ..., 1 0], n_iter=5, test_size=0.2, random_state=0),
       error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [5, 6], 'weights': ['uniform', 'distance'], 'p': [1, 2, 3], 'n_jobs': [-1]},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0) 

Wall time: 13min 11s


2. DecisionTreeClassifier

In [87]:
%%time
# Randomized search for the decision tree: fit, echo the configuration and
# record the best estimator / score / parameters.
# NOTE(review): fitted on the full X, y, so X_test rows are not held out here.
gr = r_grid_cv_2
gr.fit(X, y)
print(gr, '\n')
a, b, c = gr.best_estimator_, gr.best_score_, gr.best_params_
rand_result.append({"best_estimator": a, "best_score": b, "best_params": c})

RandomizedSearchCV(cv=StratifiedShuffleSplit(labels=[1 1 ..., 1 0], n_iter=5, test_size=0.2, random_state=0),
          error_score='raise',
          estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          fit_params={}, iid=True, n_iter=3, n_jobs=1,
          param_distributions={'criterion': ['gini', 'entropy'], 'max_depth': [None, 5, 6]},
          pre_dispatch='2*n_jobs', random_state=0, refit=True,
          scoring='accuracy', verbose=0) 

Wall time: 8.82 s


In [88]:
%%time
# Exhaustive search for the decision tree: fit, echo the configuration and
# record the best estimator / score / parameters.
# NOTE(review): fitted on the full X, y, so X_test rows are not held out here.
gr = grid_cv_2
gr.fit(X, y)
print(gr, '\n')
a, b, c = gr.best_estimator_, gr.best_score_, gr.best_params_
result.append({"best_estimator": a, "best_score": b, "best_params": c})

GridSearchCV(cv=StratifiedShuffleSplit(labels=[1 1 ..., 1 0], n_iter=5, test_size=0.2, random_state=0),
       error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'criterion': ['gini', 'entropy'], 'max_depth': [None, 5, 6]},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0) 

Wall time: 23.8 s


3. LinearSVC

In [95]:
%%time
# Randomized search for the linear SVM: fit, echo the configuration and
# record the best estimator / score / parameters.
# NOTE(review): fitted on the full X, y, so X_test rows are not held out here.
gr = r_grid_cv_3
gr.fit(X, y)
print(gr, '\n')
a, b, c = gr.best_estimator_, gr.best_score_, gr.best_params_
rand_result.append({"best_estimator": a, "best_score": b, "best_params": c})

RandomizedSearchCV(cv=StratifiedShuffleSplit(labels=[1 1 ..., 1 0], n_iter=5, test_size=0.2, random_state=0),
          error_score='raise',
          estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          fit_params={}, iid=True, n_iter=3, n_jobs=1,
          param_distributions={'multi_class': ['ovr', 'crammer_singer'], 'loss': ['hinge', 'squared_hinge']},
          pre_dispatch='2*n_jobs', random_state=0, refit=True,
          scoring='accuracy', verbose=0) 

Wall time: 11min 45s


In [96]:
%%time
# Exhaustive search for the linear SVM: fit, echo the configuration and
# record the best estimator / score / parameters.
# NOTE(review): fitted on the full X, y, so X_test rows are not held out here.
gr = grid_cv_3
gr.fit(X, y)
print(gr, '\n')
a, b, c = gr.best_estimator_, gr.best_score_, gr.best_params_
result.append({"best_estimator": a, "best_score": b, "best_params": c})

GridSearchCV(cv=StratifiedShuffleSplit(labels=[1 1 ..., 1 0], n_iter=5, test_size=0.2, random_state=0),
       error_score='raise',
       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'multi_class': ['ovr', 'crammer_singer'], 'loss': ['hinge', 'squared_hinge']},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0) 

Wall time: 10min 58s


4. RandomForestClassifier

In [97]:
%%time
# Randomized search for the random forest: fit, echo the configuration and
# record the best estimator / score / parameters.
# NOTE(review): fitted on the full X, y, so X_test rows are not held out here.
gr = r_grid_cv_4
gr.fit(X, y)
print(gr, '\n')
a, b, c = gr.best_estimator_, gr.best_score_, gr.best_params_
rand_result.append({"best_estimator": a, "best_score": b, "best_params": c})

RandomizedSearchCV(cv=StratifiedShuffleSplit(labels=[1 1 ..., 1 0], n_iter=5, test_size=0.2, random_state=0),
          error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params={}, iid=True, n_iter=3, n_jobs=1,
          param_distributions={'n_estimators': [10, 15], 'criterion': ['gini', 'entropy']},
          pre_dispatch='2*n_jobs', random_state=0, refit=True,
          scoring='accuracy', verbose=0) 

Wall time: 8.46 s


In [98]:
%%time
# Exhaustive search for the random forest: fit, echo the configuration and
# record the best estimator / score / parameters.
# NOTE(review): fitted on the full X, y, so X_test rows are not held out here.
gr = grid_cv_4
gr.fit(X, y)
print(gr, '\n')
a, b, c = gr.best_estimator_, gr.best_score_, gr.best_params_
result.append({"best_estimator": a, "best_score": b, "best_params": c})

GridSearchCV(cv=StratifiedShuffleSplit(labels=[1 1 ..., 1 0], n_iter=5, test_size=0.2, random_state=0),
       error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [10, 15], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0) 

Wall time: 10.3 s


5. GradientBoostingClassifier

In [99]:
%%time
# Randomized search for gradient boosting: fit, echo the configuration and
# record the best estimator / score / parameters.  In the recorded session
# this cell was interrupted by hand before it finished, so nothing was
# appended for boosting.
# NOTE(review): fitted on the full X, y, so X_test rows are not held out here.
gr = r_grid_cv_5
gr.fit(X, y)
print(gr, '\n')
a, b, c = gr.best_estimator_, gr.best_score_, gr.best_params_
rand_result.append({"best_estimator": a, "best_score": b, "best_params": c})

KeyboardInterrupt: 

In [102]:
# Best CV accuracy of each exhaustive search, in the order the searches were
# run (k-NN, decision tree, linear SVM, random forest).
# The random forest comes out on top.
for entry in result:
    print(entry["best_score"])

0.7680426098535287
0.7856191744340879
0.7571238348868176
0.7898801597869507


In [103]:
# Best CV accuracy of each randomized search, in the order the searches were run.
# On the randomly sampled settings the single decision tree stood out, but
# overall the random forest still wins.
# Boosting might have scored higher, but there was no time left to run it.
for entry in rand_result:
    print(entry["best_score"])

0.755525965379494
0.7880159786950732
0.7478029294274301
0.7866844207723036


Проход по случайным узлам сетки (комбинациям параметров классификаторов) быстрее, чем перебор "в лоб".
По результатам можно прикинуть, какие параметры не имеет смысла перебирать (те, что в топовых вариантах совпадают),
и уже с меньшим размером сетки запускать перебор "в лоб".

Не ожидал, что бустинг — настолько трудоёмкая задача.
Оставлю на ночь.
(*) Днём пытался посчитать, но всё упало из-за косяков в библиотеках (не все комбинации атрибутов из спецификации поддерживаются).
Когда это обнаружил, оставалось мало времени.

In [56]:
#grid_cv.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=6, p=2,
           weights='distance')

In [57]:
#print(grid_cv.best_score_)
#print(grid_cv.best_params_)

0.7678650687971593
{'n_jobs': -1, 'n_neighbors': 6, 'weights': 'distance'}


In [60]:
# grid_cv.grid_scores_[:4]

### Randomized grid search

In [61]:
# Stand-alone randomized search over the k-NN grid: three sampled settings,
# fixed seed.  Inside the parentheses no backslash continuations are needed.
randomized_grid_cv = grid_search.RandomizedSearchCV(
    models[0],
    parameters_grid_knn,
    scoring='accuracy',
    cv=cv,
    n_iter=3,
    random_state=0,
)

In [62]:
#%%time
#randomized_grid_cv.fit(X,y)

Wall time: 1min 5s


RandomizedSearchCV(cv=StratifiedShuffleSplit(labels=[1 1 ..., 1 0], n_iter=3, test_size=0.2, random_state=0),
          error_score='raise',
          estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
          fit_params={}, iid=True, n_iter=3, n_jobs=1,
          param_distributions={'n_neighbors': [5, 6], 'weights': ['uniform', 'distance'], 'n_jobs': [-1]},
          pre_dispatch='2*n_jobs', random_state=0, refit=True,
          scoring='accuracy', verbose=0)

In [63]:
#randomized_grid_cv.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=6, p=2,
           weights='distance')

In [64]:
#print(randomized_grid_cv.best_score_)
#print(randomized_grid_cv.best_params_)

0.7678650687971593
{'weights': 'distance', 'n_neighbors': 6, 'n_jobs': -1}
