# MLClass. "Прикладной анализ данных"
# Модуль "Машинное обучение с помощью Python"
<img src="../img/mlclass_logo.jpg" height="240" width="240">
## Авторы материала: преподаватель ФКН НИУ ВШЭ Кашницкий Юрий, магистрант ВМК МГУ Евгений Колмаков
Материал распространяется на условиях лицензии <a href="http://creativecommons.org/licenses/by-sa/4.0/">Creative Commons Attribution-Share Alike 4.0</a>. Можно использовать в любых целях, но с обязательным упоминанием автора курса и аффилиации.

# Урок 6. Нейронные сети. Бустинг. Смешивание алгоритмов. Стекинг.
## Часть 8. kNN с метрикой, определенной пользователем

In [1]:
%pylab inline
import numpy as np
from math import sqrt
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.grid_search import GridSearchCV
from load_car_insurance_with_region import load_train_and_test
from sklearn.metrics import roc_auc_score
from sklearn.cross_validation import StratifiedKFold

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Load the train/test data and convert every returned piece to a NumPy array
train_csv = "../data/car_insurance_train.csv"
test_csv = "../data/car_insurance_test.csv"
X_train, y, X_test = (np.array(part)
                      for part in load_train_and_test(train_csv, test_csv))

In [3]:
# Display the first ten rows of the training feature matrix.
X_train[:10]

array([[   2, 3200,   21],
       [   5, 6500,   12],
       [   2, 2100,    9],
       [   2, 2000,    4],
       [   2, 6100,   21],
       [   2, 5700,    6],
       [   2, 1800,   21],
       [   2,  500,    1],
       [   2, 1000,    3],
       [   2,  500,    1]])

In [None]:
def my_dist_metric(region_weight, brand_weight):
    """Build a weighted Euclidean-style distance between two samples.

    The returned callable takes two indexable samples ``x`` and ``y`` and
    returns the square root of a weighted sum of squared differences:

    * the feature at index 2 is weighted by ``region_weight``,
    * the feature at index 0 is weighted by ``brand_weight``,
    * the feature at index 1 gets the leftover weight
      ``1 - region_weight - brand_weight``.

    Parameters
    ----------
    region_weight : float
        Weight of the squared difference at index 2.
    brand_weight : float
        Weight of the squared difference at index 0.

    Returns
    -------
    callable
        ``dist(x, y) -> float``.
    """
    rest_weight = 1 - region_weight - brand_weight

    def dist(x, y):
        squared = (region_weight * (x[2] - y[2]) ** 2 +
                   brand_weight * (x[0] - y[0]) ** 2 +
                   rest_weight * (x[1] - y[1]) ** 2)
        # Float round-off can leave the leftover weight a hair below zero
        # (e.g. 1 - 0.8 - 0.2 == -5.55e-17), which makes the radicand
        # negative when only feature 1 differs; clamp so sqrt never raises.
        return sqrt(max(squared, 0.0))
    return dist

In [None]:
# Exhaustive search over the number of neighbours and the metric weights.
# For every combination the mean ROC AUC over 3 stratified folds is stored in
# `results`, at the same position as the combination in `params`.
results = []

# Build the weight grid from integer step counts: np.arange with a fractional
# step can pick up an extra endpoint through round-off, which would let
# reg_weight + brand_weight reach 1 and push the leftover weight to ~0 or below.
weight_step = 0.05
n_weight_steps = 20  # reg_weight takes the values 0, 0.05, ..., 0.95

params = [(neighbors,
           reg_step * weight_step,
           brand_step * weight_step,
           1 - reg_step * weight_step - brand_step * weight_step)
          for reg_step in range(n_weight_steps)
          for brand_step in range(n_weight_steps - reg_step)
          for neighbors in range(1, 12)]

# The fold split depends only on y, so build it once for all combinations.
folds = StratifiedKFold(y, n_folds=3,
                        random_state=42)

for (n_neighbors, reg_weight, brand_weight, comp_weight) in params:
    scores = []
    dist_func = my_dist_metric(reg_weight, brand_weight)

    knn = KNeighborsClassifier(n_neighbors=n_neighbors,
                               metric='pyfunc',
                               func=dist_func)

    for train_index, test_index in folds:
        x_train, x_test = X_train[train_index], X_train[test_index]
        y_train, y_test = y[train_index], y[test_index]
        knn.fit(x_train, y_train)
        # roc_auc_score expects (y_true, y_score): ground truth goes first.
        scores.append(roc_auc_score(y_test, knn.predict(x_test)))
    results.append(np.mean(scores))
            

In [None]:
# Best mean cross-validated score together with the parameter tuple
# stored at the same position in `params`
best_position = np.argmax(results)
max_auc = max(results)
(max_auc, params[best_position])

In [None]:
# Fit the final classifier on the whole training set.
# NOTE(review): n_neighbors=11 and the weights (.0, .05) are hard-coded;
# presumably they are the best combination found by the search - confirm.
final_metric = my_dist_metric(.0, .05)
knn = KNeighborsClassifier(n_neighbors=11, metric='pyfunc', func=final_metric)
knn.fit(X_train, y)

In [None]:
# Predict class labels for the test set
predicted_labels = knn.predict(X_test)

# Put the predictions into a one-column frame with 1-based row ids
# and write it out as a CSV file
row_ids = np.arange(1, X_test.shape[0] + 1)
predicted_df = pd.DataFrame(predicted_labels, index=row_ids, columns=["too_much"])
predicted_df.to_csv("../output/knn_car_insurance_custom_metric.csv", index_label="id")

# Optional check for those who have the true test labels
# (roc_auc_score takes the true labels first, then the predictions):
# expected_labels_df = pd.read_csv("../data/car_insurance_test_labels.csv",
#                                  header=0, index_col=0)
# expected_labels = expected_labels_df['too_much']
# print(roc_auc_score(expected_labels, predicted_labels))
