# MLClass. "Прикладной анализ данных". Курс "Машинное обучение с помощью Python".
<img src="../img/mlclass_logo.jpg" height="240" width="240">
## Авторы материала: преподаватель ФКН НИУ ВШЭ Кашницкий Юрий, магистрант ВМК МГУ Евгений Колмаков
Материал распространяется на условиях лицензии <a href="http://creativecommons.org/licenses/by-sa/4.0/">Creative Commons Attribution-Share Alike 4.0</a>. Можно использовать в любых целях, но с обязательным упоминанием автора курса и аффилиации.

## 1. One-hot encoding для kNN

In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.grid_search import GridSearchCV
from sklearn import preprocessing


def load_train_and_test(path_to_train, path_to_test, one_hot=False):
    """
    Load the train and test CSV files and encode their categorical columns.

    Both files are read with the first row as header and the first column as
    the index. The 'auto_brand' column and a 'region' column derived from
    'auto_number' are replaced by integer codes produced by label encoders
    fitted on the training data only. The 'auto_number' column is dropped
    from both frames, and 'too_much' is split off the training frame as the
    target.

    Parameters
    ----------
    path_to_train : path of the training CSV; must contain the columns
        'auto_brand', 'auto_number' and 'too_much'.
    path_to_test : path of the test CSV; must contain the columns
        'auto_brand' and 'auto_number'.
    one_hot : if True, the integer-coded 'region' and 'auto_brand' columns
        are expanded into indicator columns. Train and test are encoded
        together so both frames end up with the same set of columns.

    Returns
    -------
    (train_df, y, test_df) : training features, the 'too_much' column of the
        training data (a pandas Series), and test features.

    Raises
    ------
    ValueError
        If an auto number does not contain "RUS", or if the test data holds a
        brand/region that the encoders did not see in the training data
        (LabelEncoder.transform rejects unseen labels).
    """
    # read data into pandas data frames
    train_df = pd.read_csv(path_to_train, header=0, index_col=0)
    test_df = pd.read_csv(path_to_test, header=0, index_col=0)

    def extract_region(auto_number):
        """
        Return the run of digits that directly precedes the last "RUS".

        X796TH96RUS -> "96"
        E432XX77RUS -> "77"

        If that run is longer than 3 digits, "not-auto-num" is returned
        instead.
        """
        end = auto_number.rindex("RUS")
        start = end
        # Walk left over the digits. The `start > 0` guard stops at the
        # beginning of the string; without it the index would go negative
        # and wrap around to the end of the string.
        while start > 0 and auto_number[start - 1].isdigit():
            start -= 1
        region = auto_number[start:end]
        return region if len(region) <= 3 else "not-auto-num"

    # auto brand and region are categorical so we encode these columns
    # ex: "Volvo" -> 1, "Audi" -> 2 etc
    # Encoders are fitted on the training data only.
    auto_brand_encoder = preprocessing.LabelEncoder()
    auto_brand_encoder.fit(train_df['auto_brand'])

    # Extract the regions once per frame and reuse them for fit and transform.
    train_regions = train_df['auto_number'].apply(extract_region)
    test_regions = test_df['auto_number'].apply(extract_region)

    region_encoder = preprocessing.LabelEncoder()
    region_encoder.fit(train_regions)

    train_df['region'] = region_encoder.transform(train_regions)
    train_df['auto_brand'] = auto_brand_encoder.transform(train_df['auto_brand'])

    test_df['region'] = region_encoder.transform(test_regions)
    test_df['auto_brand'] = auto_brand_encoder.transform(test_df['auto_brand'])

    # target column of the training set (kept as a pandas Series)
    y = train_df['too_much']

    # we don't need some columns in the training/test set anymore
    train_df = train_df.drop(['auto_number', 'too_much'], axis=1)
    test_df = test_df.drop(['auto_number'], axis=1)

    # one-hot encoding region and auto_brand in both data frames
    if one_hot:
        # Remember the split point before the frames are stacked.
        n_train = train_df.shape[0]
        combined = pd.get_dummies(pd.concat((train_df, test_df)),
                                  columns=['region', 'auto_brand'])
        # Split back by position; .iloc ignores the (possibly repeated)
        # index labels of the stacked frame.
        train_df = combined.iloc[:n_train]
        test_df = combined.iloc[n_train:]

    return train_df, y, test_df


# Load the car-insurance data with one-hot encoded categorical columns
train_df, y, test_df = load_train_and_test(
    path_to_train="../data/car_insurance_train.csv",
    path_to_test="../data/car_insurance_test.csv",
    one_hot=True,
)

# Candidate values of k for the grid search: 3, 4, ..., 11
params = dict(n_neighbors=list(range(3, 12)))

# Grid searcher over kNN classifiers, scored by ROC AUC, run on 4 workers
best_clf = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=params,
    scoring="roc_auc",
    n_jobs=4,
    verbose=True,
)

# Run the search on the training features and their labels
best_clf.fit(train_df, y)

# Report the winning parameters and their cross-validation score
print("Best params:", best_clf.best_params_)
print("Best cross validation ROC AUC score", best_clf.best_score_)

# Predict labels for the test set with the refitted best estimator
# (the original note for this step reports a 0.741 AUC score)
predicted_labels = best_clf.predict(test_df)

# Store the predictions under ids 1..N and write them out as a csv file
predicted_df = pd.DataFrame(
    {"too_much": predicted_labels},
    index=np.arange(1, len(test_df) + 1),
)
predicted_df.to_csv("../output/knn_car_insurance.csv", index_label="id")

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=4)]: Done   1 jobs       | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  21 out of  27 | elapsed:    0.3s remaining:    0.1s
[Parallel(n_jobs=4)]: Done  27 out of  27 | elapsed:    0.3s finished


Best params: {'n_neighbors': 11}
Best cross validation ROC AUC score 0.741361490828


## 2. Выделение важных признаков с помощью Random forest

### Car insurance

In [2]:
from sklearn.ensemble import RandomForestClassifier

# Reload the data with integer-coded (not one-hot) categorical columns, so
# that each original feature gets a single importance value.
train_df, y, test_df = load_train_and_test("../data/car_insurance_train.csv",
                                "../data/car_insurance_test.csv", one_hot=False)

# A random forest is a randomized model; fixing the seed makes the printed
# importances the same on every run of this cell.
clf = RandomForestClassifier(random_state=42).fit(train_df, y)

# Print the importance of each feature, in column order
for imp, name in zip(clf.feature_importances_, train_df.columns.values):
    print('Importance of %s = %.3f' % (name, imp))

Importance of auto_brand = 0.075
Importance of compensated = 0.754
Importance of region = 0.171


## 4. Beeline

In [10]:
from sklearn.preprocessing import LabelBinarizer

# Only the first 1000 rows of each file are read
train_df = pd.read_csv('../data/beeline_train.csv', nrows=1000)
test_df = pd.read_csv('../data/beeline_test.csv', nrows=1000)

# Split the target off the training frame; drop the ID column from the test frame
y = train_df['y']
train_df = train_df.drop(['y'], axis=1)
test_df = test_df.drop(['ID'], axis=1)

# Binarize the target labels
lb = LabelBinarizer()
y = lb.fit_transform(y)

# Stack train and test, replace missing values with 0.0 and one-hot encode,
# so that both frames share the same columns; then split back by row count.
df = pd.get_dummies(pd.concat((train_df, test_df)).fillna(0.0))
train_df, test_df = df[:train_df.shape[0]], df[train_df.shape[0]:]

# Odd values of k from 1 to 29
params = {'n_neighbors': list(np.arange(1, 31, 2))}
best_clf = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=params,
    scoring="roc_auc",
    n_jobs=4,
    verbose=True,
)

# Run the search on the training features and the binarized labels
best_clf.fit(train_df, y)

# Report the winning parameters and their cross-validation score
print("Best params:", best_clf.best_params_)
print("Best cross validation score", best_clf.best_score_)

Fitting 3 folds for each of 15 candidates, totalling 45 fits


[Parallel(n_jobs=4)]: Done   1 jobs       | elapsed:    1.1s
[Parallel(n_jobs=4)]: Done  39 out of  45 | elapsed:   22.1s remaining:    3.4s
[Parallel(n_jobs=4)]: Done  45 out of  45 | elapsed:   26.2s finished


Best params: {'n_neighbors': 27}
Best cross validation score 0.558761585504
