### Классификация, деревья решений и метод ближайших соседей. Открытый курс машинного обучения ods.ai

In [29]:
%matplotlib inline
from matplotlib import pyplot as plt
plt.rcParams['figure.figsize'] = (10, 8)
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import collections
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [30]:
# Load the Titanic train and test tables from CSV.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('.../titanic_train.csv', '.../titanic_test.csv')
)

# Keep the target separately; the 'Survived' column is removed from the
# feature frame further down.
y_train = train_data.Survived

#### Функция для формирования csv-файла посылки на Kaggle:

In [31]:
def write_to_submission_file(predicted_labels, out_file, train_num=891,
                    target='Survived', index_label="PassengerId"):
    """Write predicted labels to a CSV file in Kaggle submission format.

    Rows are indexed with consecutive ids starting at ``train_num + 1``.

    Parameters
    ----------
    predicted_labels : array-like
        One predicted label per test row. Anything ``np.asarray`` can
        convert is accepted (ndarray, list, tuple, pandas Series).
    out_file : str, path object or writable text buffer
        Destination, passed straight to ``DataFrame.to_csv``.
    train_num : int, default 891
        Number of rows in the training set; the first written id is
        ``train_num + 1``.
    target : str, default 'Survived'
        Name of the column holding the predictions.
    index_label : str, default 'PassengerId'
        Header of the id column.
    """
    # np.asarray makes plain lists work (they have no .shape) and strips the
    # index of a pandas Series, so its values are written positionally instead
    # of being realigned against the new id index.
    labels = np.asarray(predicted_labels)
    first_id = train_num + 1
    predicted_df = pd.DataFrame(labels,
                                index=np.arange(first_id, first_id + labels.shape[0]),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

#### Предобработка данных

In [32]:
# Preview the first rows of the training table as loaded.
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [33]:
# Non-null count per column; columns below the row count have missing values.
train_data.count()

PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            714
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          204
Embarked       889
dtype: int64

In [34]:
# Sanity check: the training frame and the target have the same number of rows.
train_data.shape, y_train.shape

((891, 12), (891,))

Заменим пропуски: в числовых признаках Age и Fare — на медианные значения, в категориальном признаке Embarked — на значение 'S'.

In [35]:
# Imputation statistics are taken from the training set only and reused for
# the test set, so both sets are filled with the same values and nothing
# about the test distribution leaks into preprocessing.
age_median = train_data['Age'].median()
fare_median = train_data['Fare'].median()

# Assign the filled column back instead of calling fillna(inplace=True) on
# `df[col]`: the in-place call on a selected column is chained assignment,
# which emits a FutureWarning in recent pandas and does not modify the frame
# when Copy-on-Write is enabled.
train_data['Age'] = train_data['Age'].fillna(age_median)
test_data['Age'] = test_data['Age'].fillna(age_median)
train_data['Embarked'] = train_data['Embarked'].fillna('S')
test_data['Fare'] = test_data['Fare'].fillna(fare_median)

Кодируем категориальные признаки Pclass, Sex, SibSp, Parch и Embarked с помощью техники One-Hot-Encoding.

In [36]:
# (source column, dummy-column prefix) pairs, in the order the indicator
# columns are appended to the frame.
categorical_prefixes = [
    ('Pclass', 'PClass'),
    ('Sex', 'Sex'),
    ('SibSp', 'SibSp'),
    ('Parch', 'Parch'),
    ('Embarked', 'Embarked'),
]

# Append the one-hot indicator columns to the right of the existing columns.
# Train and test are encoded independently, so a category present in only one
# of them yields a column only in that frame.
train_data = pd.concat(
    [train_data] + [pd.get_dummies(train_data[source_col], prefix=dummy_prefix)
                    for source_col, dummy_prefix in categorical_prefixes],
    axis=1,
)
test_data = pd.concat(
    [test_data] + [pd.get_dummies(test_data[source_col], prefix=dummy_prefix)
                   for source_col, dummy_prefix in categorical_prefixes],
    axis=1,
)

In [37]:
# Preview the training table again, now with the indicator columns appended.
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,...,1,0,0,0,0,0,0,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,...,1,0,0,0,0,0,0,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,...,1,0,0,0,0,0,0,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,...,1,0,0,0,0,0,0,0,0,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,...,1,0,0,0,0,0,0,0,0,1


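Убираем целевой признак Survived, исходные категориальные столбцы (они уже закодированы) и нерелевантные столбцы Name, Ticket, Cabin и PassengerId.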

In [38]:
# Columns removed from both frames: the categorical originals (already
# one-hot encoded) plus the columns that are not used as features.
shared_dropped_columns = ['Pclass', 'Name', 'Sex', 'SibSp', 'Parch',
                          'Ticket', 'Cabin', 'Embarked', 'PassengerId']

# The target is present only in the training frame, so it is dropped there only.
train_data = train_data.drop(columns=['Survived'] + shared_dropped_columns)
test_data = test_data.drop(columns=shared_dropped_columns)

In [39]:
# Compare the feature-matrix shapes; the column counts must match before
# a model fitted on train can predict on test.
train_data.shape, test_data.shape

((891, 24), (418, 25))

In [40]:
# Columns that exist in the test frame but not in the training frame.
set(test_data.columns).difference(train_data.columns)

{'Parch_9'}

In [41]:
# Align the test matrix with the training matrix instead of dropping one
# hard-coded column: reindex removes dummy columns that exist only in test
# (here 'Parch_9'), adds any train-only dummy columns as all-zero, and puts
# the columns in the training order. Unlike an in-place drop, re-running this
# cell does not raise KeyError.
test_data = test_data.reindex(columns=train_data.columns, fill_value=0)


In [42]:
# Conventional names for the feature matrices; these are aliases of the
# prepared frames, not copies.
X_train, X_test = train_data, test_data

## 1. Дерево решений без настройки параметров

Обучаем на имеющейся выборке дерево решений (DecisionTreeClassifier) максимальной глубины 2. Используем параметр random_state=17 для воспроизводимости результатов.

In [43]:
# Baseline model: a depth-2 tree; random_state is fixed for reproducibility.
first_tree = DecisionTreeClassifier(max_depth=2, random_state=17)

In [45]:
# fit() returns the estimator itself, so fitted_tree and first_tree refer to
# the same fitted object.
fitted_tree = first_tree.fit(X_train, y_train)


Делаем предсказание на тестовой выборке

In [46]:
# Predict labels for the test set with the depth-2 tree.
predict_result = fitted_tree.predict(X_test)

Записываем предсказания в csv-файл посылки

In [47]:
# Save the depth-2 tree predictions as a submission CSV.
write_to_submission_file(predict_result, '.../titanic_predicted_df.csv')

## 2. Дерево решений с настройкой параметров

Обучаем на имеющейся выборке дерево решений (DecisionTreeClassifier). Используем random_state=17. Максимальную глубину и минимальное число элементов в листе настраиваем на 5-кратной кросс-валидации с помощью GridSearchCV.

In [48]:
# Base estimator for the grid search; the depth and leaf size are supplied by
# the parameter grid, so only the seed is fixed here.
second_tree = DecisionTreeClassifier(random_state=17)

In [49]:
# Search space: tree depth 1..20 and minimum leaf size 1..14.
tree_params = {
    'max_depth': list(np.arange(1, 21)),
    'min_samples_leaf': list(np.arange(1, 15)),
}
# Exhaustive search over the grid, scored with 5-fold cross-validation.
tree_grid = GridSearchCV(estimator=second_tree, param_grid=tree_params, cv=5)

In [50]:
# Run the grid search: every parameter combination is cross-validated on the
# training data.
tree_grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=17),
             param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                       13, 14, 15, 16, 17, 18, 19, 20],
                         'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                              12, 13, 14]})

Выводим лучшие параметры решающего дерева и соответствующую им среднюю долю верных ответов на кросс-валидации

In [51]:
# Best parameter combination and its mean cross-validated score.
tree_grid.best_params_, tree_grid.best_score_

({'max_depth': 7, 'min_samples_leaf': 6}, 0.8249325214989642)

Делаем предсказание на тестовой выборке

In [52]:
# predict() on the search object uses the best estimator, which GridSearchCV
# refits on the full training data by default (refit=True).
pred_result = tree_grid.predict(X_test)

In [105]:
# Save the tuned-tree predictions as a submission CSV.
write_to_submission_file(pred_result, '.../titanic_predicted_with_cv_df.csv')

## 3. Случайный лес

Оцениваем качество случайного леса на 5-кратной кросс-валидации без настройки параметров

In [53]:
# Untuned random forest: 100 trees, all CPU cores, fixed seed.
forest = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=17)

# Mean score over the 5 cross-validation folds.
forest_cv_scores = cross_val_score(forest, X_train, y_train, cv=5)
print(forest_cv_scores.mean())

0.8013935095097608


Настраиваем параметры случайного леса (max_depth и max_features) на 5-кратной кросс-валидации с помощью GridSearchCV

In [54]:
# Search space: tree depth 1..10 and 4..18 candidate features per split.
forest_params = dict(
    max_depth=range(1, 11),
    max_features=range(4, 19),
)
# 5-fold grid search over the forest parameters, parallelised over all cores.
forest_grid = GridSearchCV(estimator=forest, param_grid=forest_params,
                           cv=5, n_jobs=-1)

In [55]:
# Run the grid search for the random forest on the training data.
forest_grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(n_jobs=-1, random_state=17),
             n_jobs=-1,
             param_grid={'max_depth': range(1, 11),
                         'max_features': range(4, 19)})

In [56]:
# Best forest parameters and their mean cross-validated score.
forest_grid.best_params_, forest_grid.best_score_

({'max_depth': 10, 'max_features': 12}, 0.8361684765551441)

In [57]:
# Predict test labels with the best forest found by the grid search.
forest_pred_result = forest_grid.predict(X_test)

In [18]:
# Save the tuned-forest predictions as a submission CSV.
write_to_submission_file(forest_pred_result, '.../titanic_predicted_with_rf_df.csv')