In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

import graphviz
import pydotplus
import io
from scipy import misc

In [2]:
all_dataset = pd.read_csv("data/arrhythmia.data", header=None)

In [3]:
chosen_column = [0] + [i for i in range(3, 15)] + [all_dataset.columns[-1]]
max_trash = 20

In [4]:
def clean_data(data):
    for col in data.columns:
        trash_row = (data[col] == "?")
        good_row = (data[col] != "?")
        if sum(trash_row) < max_trash:
            data = data[good_row]
        else:
            aver =  pd.to_numeric(data[good_row][col]).mean()
            data.loc[trash_row, col] = aver
        data[col] = pd.to_numeric(data[col])
        data[col] -= data[col].min()
    return data

In [5]:
def split_train_test(df):
    margin = int(0.8 * len(df))
    df = df.sample(frac=1)
    train = df.iloc[:margin]
    test = df.iloc[margin:]
    return train, test

In [6]:
def from_data_to_xy(data):
    x = np.array(data[data.columns[:-1]], dtype=float)
    y = np.array(data[data.columns[-1]], dtype=float)
    return x, y

In [7]:
def clf_acc(data, clf):
    train_data, test_data = split_train_test(data)
    train_x, train_y = from_data_to_xy(train_data)
    test_x, test_y = from_data_to_xy(test_data)
    clf_fit = clf.fit(train_x, train_y)
    y_pred = clf_fit.predict(test_x)
    acc = sum(y_pred == test_y)  / test_y.shape[0]
    return acc

In [8]:
dataset = all_dataset[chosen_column]
clean_dataset = clean_data(dataset)

  result = method(y)


In [9]:
def cv_accuracy(data, classifier, num_iter=100):
    accuracy = []
    for i in range(num_iter):
        acc = clf_acc(data, classifier)
        accuracy.append(acc)
    return accuracy

In [13]:
def show_tree(tree, features, path):
    f = io.StringIO()
    export_graphviz(tree, out_file=f, feature_names=features)
    pydotplus.graph_from_dot_data(f.getvalue()).write_png(path)
    #img = misc.imread(path)
    img = plt.imread(path)
    plt.rcParams["figure.figsize"] = (20, 20)
    plt.imshow(img)

In [None]:
## accuracy = 0.53 for full tree
## accuracy with min_samples_split=10 => 0.56
## accuracy with min_samples_split=10 => 0.59
##({'max_depth': 6, 'min_samples_split': 24}, 0.6175637393767706, 0.6741573033707865)
##({'max_depth': 7, 'min_samples_split': 18}, 0.6005665722379604, 0.6404494382022472)

In [31]:
def grid_search(data_, clf, param, scoring):
    train_data, test_data = split_train_test(data_)
    train_x, train_y = from_data_to_xy(train_data)
    test_x, test_y = from_data_to_xy(test_data)
    grid_clf = GridSearchCV(clf, param, cv=10, scoring=scoring, n_jobs=-1)
    grid_clf_fit = grid_clf.fit(train_x, train_y)
    model = grid_clf_fit.best_estimator_
    model_fit = model.fit(train_x, train_y)
    y_pred = model_fit.predict(test_x)
    acc = sum(y_pred == test_y)  / test_y.shape[0]
    return grid_clf_fit.best_params_, grid_clf_fit.best_score_, acc, model_fit

In [28]:
param_dt={'min_samples_split': range(2, 80, 2),
          'max_depth': range(1, 40, 2)}
clf_dt = DecisionTreeClassifier()
answer_dt = grid_search(clean_dataset, clf_dt, param_dt, 'accuracy')
print(answer_dt[:-1])
#show_tree(answer_dt[-1], chosen_column[:-1], "dec_tree_02.png")



({'max_depth': 5, 'min_samples_split': 26}, 0.6458923512747875, 0.5617977528089888)




In [None]:
##best boosting ({'learning_rate': 0.15, 'max_depth': 3}, 0.6260623229461756, 0.5955056179775281)  where n_estimators=20
##({'learning_rate': 0.1, 'max_depth': 3}, 0.6147308781869688, 0.651685393258427) for n_estimators=15, min_samples_split=5
#({'learning_rate': 0.1, 'max_depth': 3, 'min_samples_split': 6, 'n_estimators': 20}, 0.6118980169971672, 0.6629213483146067)

In [20]:
clf_boost = GradientBoostingClassifier(loss='deviance', min_samples_split=6, n_estimators=20)
param_boost = {"learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
              "max_depth": range(3, 12, 2)}
              #"n_estimators": [10, 15, 20]
              #"min_samples_split": [2, 4, 6, 8, 10]
answer_boost = grid_search(clean_dataset, clf_boost, param_boost, 'accuracy')
print(answer_boost[:-1])



({'learning_rate': 0.1, 'max_depth': 3, 'min_samples_split': 6, 'n_estimators': 20}, 0.6118980169971672, 0.6629213483146067)


In [None]:
##({'criterion': 'entropy', 'max_depth': 8, 'max_features': 'sqrt', 'n_estimators': 50}, 0.6713881019830028, 0.5617977)
##({'criterion': 'gini', 'max_depth': 12, 'max_features': 'log2', 'n_estimators': 75}, 0.6685552407932012, 0.6404494382022472)
##({'criterion': 'gini', 'max_depth': 8, 'max_features': 'auto', 'n_estimators': 75}, 0.6628895184135978, 0.6629213483146067)
##({'criterion': 'gini', 'max_depth': 8, 'max_features': 'log2', 'n_estimators': 30}, 0.6543909348441926, 0.6292134831460674)
##({'criterion': 'gini', 'max_depth': 10, 'max_features': 'auto', 'n_estimators': 80}, 0.6515580736543909, 0.6741573033707865)

In [37]:
clf_rf = RandomForestClassifier(bootstrap=True, criterion='gini')
param_rf = {
    'n_estimators': [30, 50, 70, 80, 100],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': range(2, 20, 2)
}
answer_rf = grid_search(clean_dataset, clf_rf, param_rf, 'accuracy')
print(answer_rf[:-1])



({'criterion': 'gini', 'max_depth': 10, 'max_features': 'auto', 'n_estimators': 80}, 0.6515580736543909, 0.6741573033707865)


In [None]:
1. Вирішуюче дерево
2. Бустінг 
3. Випадковий ліс

Оберіть два найбільш значущих регресори (будь-яким методом) та намалюйте області класифікації в R^2.

Для кожного методу проведіть підбір параметрів для потреби (для цього можна скористатися модулем sklearn.model_selection.GridSearchCV).
