In [2]:
%matplotlib inline

import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, precision_recall_curve, recall_score, roc_curve, roc_auc_score
from sklearn.model_selection import cross_val_predict, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import FeatureUnion, Pipeline

### Постановка задачи
    Необходимо создать классификатор, который будет предсказывать, выживет пассажир или нет.
    
    Имеется два класса объектов. Классификация бинарная.
    
    Рассмотрим признаки:
        1. PassengerId - уникальный ID пассажира
        2. Pclass - класс, к которому относился пассажир
        3. Name - имя пассажира
        4. Sex - пол
        5. Age - возраст
        6. SibSp - количество супругов и братьев/сестер пассажира на борту
        7. Parch - количество родителей и/или детей пассажира на борту
        8. Ticket - номер билета
        9. Fare - плата за проезд
        10. Cabin - номер каюты
        11. Embarked - порт посадки на корабль

In [3]:
def get_data(path, delimiter=','):
    """Load a delimited text file into a pandas DataFrame.

    Args:
        path: Location of the file, passed straight to ``pd.read_csv``.
        delimiter: Field separator; a comma by default.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    return pd.read_csv(path, delimiter=delimiter)

In [4]:
def draw_plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall against the decision threshold.

    Draws on the current matplotlib figure and does not call ``plt.show()``.

    Args:
        precisions: Precision values; the last element is dropped so the
            series lines up with ``thresholds``.
        recalls: Recall values; the last element is dropped likewise.
        thresholds: Decision thresholds used as x coordinates.
    """
    plt.plot(thresholds, precisions[:-1], 'b--', label='Precision')
    plt.plot(thresholds, recalls[:-1], 'g--', label='Recall')
    # Axis label was previously misspelled ('Thredhorld').
    plt.xlabel('Threshold')
    plt.legend(loc='center left')
    plt.ylim([0, 1])
    # Only scores in [-10, 10] are shown; anything outside is clipped from view.
    plt.xlim([-10, 10])

In [5]:
def draw_plot_roc_curve(fpr, tpr, label=None):
    """Draw a ROC curve plus the dashed diagonal on the current figure.

    Args:
        fpr: False positive rates (x coordinates).
        tpr: True positive rates (y coordinates).
        label: Optional legend label for the curve.
    """
    # The curve itself first, then the dashed black diagonal from (0, 0) to (1, 1).
    plt.plot(fpr, tpr, label=label, linewidth=2)
    diagonal = [0, 1]
    plt.plot(diagonal, diagonal, 'k--')

    # Both axes are limited to the unit interval.
    plt.axis([0, 1, 0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')

In [33]:
def process_initial_data(initial_data, labels=True, fit=None):
    """Turn a raw passenger frame into the feature matrix used by the models.

    The frame goes through ``pre_pipeline`` (frame-level cleaning and derived
    columns), then ``num_pipeline`` and ``cat_pipeline``; their outputs are
    stacked side by side.

    Args:
        initial_data: Raw DataFrame as loaded from the CSV file. It is not
            modified; processing happens on a copy.
        labels: When True, also return the ``Survived`` column.
        fit: Whether to (re)fit the pipelines on this data. ``None`` (default)
            follows ``labels``: labelled data fits the pipelines, unlabelled
            data is transformed with the already fitted ones so that the
            scaler statistics and one-hot columns match the training matrix.

    Returns:
        ``(X, y)`` when ``labels`` is True, otherwise just ``X``.
    """
    if fit is None:
        fit = labels

    # The pre-processing steps add and overwrite columns in place, so keep the
    # caller's frame intact.
    frame = initial_data.copy()

    if fit:
        data = pre_pipeline.fit_transform(frame)
        num_data = num_pipeline.fit_transform(data)
        cat_data = cat_pipeline.fit_transform(data)
    else:
        # Re-fitting on unlabelled (test) data would scale/encode it differently
        # from the data the model was trained on.
        data = pre_pipeline.transform(frame)
        num_data = num_pipeline.transform(data)
        cat_data = cat_pipeline.transform(data)

    # The categorical pipeline output is densified before stacking.
    X = np.concatenate((num_data, cat_data.toarray()), axis=1)
    if labels:
        return X, initial_data['Survived']
    return X

In [6]:
class DataFrameSelection(BaseEstimator, TransformerMixin):
    """Transformer that picks a fixed list of DataFrame columns as an array."""

    def __init__(self, attribute_names):
        # Names of the columns to extract in ``transform``.
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return the configured columns of ``X`` as a NumPy array."""
        selected_columns = X[self.attribute_names]
        return selected_columns.values

In [7]:
class AttributesDroping(BaseEstimator, TransformerMixin):
    """Transformer that removes a fixed list of columns from a DataFrame."""

    def __init__(self, attribute_names):
        # Column names that ``transform`` drops.
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a new frame without the configured columns."""
        columns_to_remove = self.attribute_names
        return X.drop(columns_to_remove, axis=1)

In [8]:
class PeopleStatusCreate(BaseEstimator, TransformerMixin):
    """Derive a ``people_status`` (title) column from the ``Name`` column.

    The title is the second segment of the name when split on commas and
    periods, with spaces removed (e.g. ``"Kelly, Mr. James"`` -> ``"Mr"``).
    Titles outside ``needed_labels`` are collapsed into
    ``label_for_replacing``.

    Args:
        needed_labels: Collection of titles to keep as they are.
        label_for_replacing: Title assigned to every other passenger.

    Note:
        ``X`` is modified in place (the column is added to it) and returned.
    """

    def __init__(self, needed_labels, label_for_replacing):
        self.needed_labels = needed_labels
        self.label_for_replacing = label_for_replacing

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Add ``people_status`` to ``X`` and collapse unwanted titles."""
        X = self.create_new_label(X)
        X = self.replace_useless_statuses(X)
        return X

    def create_new_label(self, X):
        """Write the raw title of every passenger into ``X['people_status']``."""
        # Built in one pass; growing a NumPy array with np.append per row
        # re-allocates the whole array every time.
        X['people_status'] = [self._extract_status(name) for name in X['Name'].values]
        return X

    def replace_useless_statuses(self, X):
        """Replace every title not listed in ``needed_labels``."""
        status = X['people_status']
        # Missing statuses are left alone, as in the value_counts()-based loop
        # this replaces (value_counts() skips NaN).
        useless = status.notna() & ~status.isin(self.needed_labels)
        X.loc[useless, 'people_status'] = self.label_for_replacing
        return X

    @staticmethod
    def _extract_status(name):
        """Return the title embedded in ``name``, or '' when it has none."""
        if not isinstance(name, str):
            return ''
        parts = re.split(r'[.,]+', name)
        # A name without a second segment has no title; '' then goes through
        # the normal "unknown title" replacement instead of raising IndexError.
        return parts[1].replace(' ', '') if len(parts) > 1 else ''

In [9]:
class ToNumericType(BaseEstimator, TransformerMixin):
    """Placeholder transformer that currently passes data through unchanged.

    NOTE(review): the class name suggests a dtype conversion was planned, but
    no conversion is implemented; confirm the intended behaviour.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` as received.

        Previously the body was the bare expression ``X``, so the method
        returned ``None``.
        """
        return X

In [10]:
class FareFiller(BaseEstimator, TransformerMixin):
    """Replace missing or zero fares with the mean fare of the passenger's class.

    Note:
        ``X`` is modified in place and returned.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Fill ``Fare`` where it is NaN or 0 and return ``X``."""
        fare = X['Fare']
        # Check NaN as well as 0: previously a frame with missing fares but no
        # zero fares was returned unfilled.
        if (fare.isnull() | (fare == 0)).any():
            self.fill_zero_values(X)
        return X

    def fill_zero_values(self, X):
        """Overwrite NaN/zero fares with the per-``Pclass`` mean fare.

        The mean is taken over the current ``Fare`` values of each class: NaN
        is skipped and zero fares are still counted, as before. Any ``Pclass``
        value present in ``X`` is supported, not only 1, 2 and 3.
        """
        class_mean_fare = X.groupby('Pclass')['Fare'].transform('mean')
        needs_fill = X['Fare'].isnull() | (X['Fare'] == 0)
        X['Fare'] = X['Fare'].mask(needs_fill, class_mean_fare)
        return X

In [11]:
class FareCatCreator(BaseEstimator, TransformerMixin):
    """Add a categorical ``fare_cat`` column obtained by binning ``Fare``.

    Note:
        ``X`` is modified in place and returned.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Bin ``Fare`` into five named ranges and store them in ``fare_cat``."""
        bins = [0, 10, 15, 50, 250, 1000]
        labels = ['very_lower', 'lower', 'medium', 'high', 'very_high']
        # include_lowest=True closes the first interval on the left, so a fare
        # of exactly 0 lands in 'very_lower' instead of becoming NaN.
        X['fare_cat'] = pd.cut(X['Fare'], bins=bins, labels=labels, include_lowest=True)
        return X

In [12]:
class EmbarkedFiller(BaseEstimator, TransformerMixin):
    """Fill missing ``Embarked`` values with the port code ``'S'``.

    Note:
        ``X`` is modified in place and returned.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Write 'S' into every row whose ``Embarked`` is missing."""
        missing_port = X['Embarked'].isnull()
        X.loc[missing_port, 'Embarked'] = 'S'
        return X

In [13]:
class AgeFiller(BaseEstimator, TransformerMixin):
    """Fill missing ages with the mean age of passengers sharing the same title.

    Requires a ``people_status`` column to be present in ``X``.

    Note:
        ``X`` is modified in place and returned.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Fill NaN values of ``Age`` and return ``X``."""
        if X['Age'].isnull().any():
            self.fill_zero_values(X)
        return X

    def fill_zero_values(self, X):
        """Replace NaN ages with the mean age of the row's ``people_status``.

        Despite the name (kept for compatibility) this fills *missing* ages,
        not zeros. Means are computed for every status present in ``X``; the
        previous hard-coded Mr/Miss/Mrs lookup raised ``KeyError`` for any
        other title.
        """
        status_mean_age = X.groupby('people_status')['Age'].transform('mean')
        X['Age'] = X['Age'].fillna(status_mean_age)
        return X

In [14]:
class AgeCatCreater(BaseEstimator, TransformerMixin):
    """Add a categorical ``age_cat`` column obtained by binning ``Age``.

    Note:
        ``X`` is modified in place and returned.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Bin ``Age`` into six named groups and store them in ``age_cat``."""
        age_bin_edges = [0, 10, 20, 30, 40, 60, 150]
        age_bin_names = ['child', 'teenager', 'young', 'medium', 'high', 'old']
        age_groups = pd.cut(X['Age'], bins=age_bin_edges, labels=age_bin_names)
        X['age_cat'] = age_groups
        return X

### 1. Исследование данных

In [15]:
# Load the raw training set; exploration works on a copy so `train` stays as loaded.
train = get_data('train.csv')
train_data = train.copy()

In [16]:
# Number of passengers in each ticket class.
train_data['Pclass'].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [17]:
# Column dtypes and non-null counts; shows which columns contain missing values.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [18]:
# Number of passengers per port of embarkation (missing values are not counted).
train_data['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [19]:
# Correlation of every numeric column with the target, strongest positive first.
# Numeric columns are selected explicitly: recent pandas versions raise on
# DataFrame.corr() when text columns (Name, Sex, ...) are present.
corr_matrix = train_data.select_dtypes(include='number').corr()
corr_matrix['Survived'].sort_values(ascending=False)

Survived       1.000000
Fare           0.257307
Parch          0.081629
PassengerId   -0.005007
SibSp         -0.035322
Age           -0.077221
Pclass        -0.338481
Name: Survived, dtype: float64

### Подготовка данных

####  Разделение признаков на числовые и категориальные (текстовые). Отделение меток.

In [20]:
# Labels to predict: the `Survived` column.
y_train = train_data['Survived']

In [21]:
# Re-check dtypes and missing values before building the preprocessing pipelines.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [22]:
# Frame-level preprocessing: every step takes a DataFrame and returns a DataFrame.
# Order matters: 'age_filler' reads the 'people_status' column that
# 'people_cat_creator' adds, and 'age_cat_creator' bins the ages after filling.
# Titles other than Mr/Miss/Mrs are mapped to 'Mr'.
pre_pipeline = Pipeline([
    ('fare_filler', FareFiller()),
    ('people_cat_creator', PeopleStatusCreate(['Mr', 'Miss', 'Mrs'], 'Mr')),
    ('embarked_filler', EmbarkedFiller()),
    ('age_filler', AgeFiller()),
    ('age_cat_creator', AgeCatCreater()),
#     ('data_droper', AttributesDroping(['PassengerId', 'Survived', 'Name', 'Cabin', 'Ticket', 'Age']))
])

In [23]:
# Numeric features: select SibSp, Parch and Fare, then standardise them.
num_pipeline = Pipeline([
    ('selector', DataFrameSelection(['SibSp', 'Parch', 'Fare'])),
    ('scaler', StandardScaler())
])

In [24]:
# Categorical features, one-hot encoded. Pclass is treated as a category here;
# 'people_status' and 'age_cat' are the columns added by pre_pipeline.
cat_pipeline = Pipeline([
    ('selector', DataFrameSelection(['Sex', 'Embarked', 'people_status', 'age_cat', 'Pclass'])),
    ('encoder', OneHotEncoder()),
])

### Выбор и обучение модели

#### Линейный классификатор

In [None]:
# Build the training matrix here so this section runs top-to-bottom on a fresh
# kernel: X_train was otherwise only created in the final training section below.
X_train, y_train = process_initial_data(train_data)

# 5-fold cross-validated accuracy of a linear classifier trained with SGD.
sgd_clf = SGDClassifier(random_state=42)
cross_val_score(
    sgd_clf, X_train, y_train, cv=5, scoring='accuracy', n_jobs=2
)

In [None]:
# Out-of-fold class predictions for every training row (5 folds).
y_train_pred = cross_val_predict(
    sgd_clf, X_train, y_train, cv=5, n_jobs=2
)

In [None]:
# Macro-averaged precision and recall of the SGD out-of-fold predictions.
print(
    f'Precision: {precision_score(y_train, y_train_pred, average="macro")}\nRecall: {recall_score(y_train, y_train_pred, average="macro")}')

In [None]:
# Macro-averaged F1 score of the SGD out-of-fold predictions.
f1_score(y_train, y_train_pred, average='macro')

In [None]:
# Out-of-fold decision-function scores (instead of class labels), used below
# for the precision/recall and ROC curves.
y_scores = cross_val_predict(
    sgd_clf, X_train, y_train, cv=5,
    method='decision_function'
)

In [None]:
# Precision and recall at each candidate decision threshold.
precisions, recalls, thresholds = precision_recall_curve(y_train, y_scores)

In [None]:
# Plot precision and recall against the threshold for the SGD classifier.
draw_plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.show()

In [None]:
# ROC curve of the SGD classifier from its out-of-fold decision scores.
# NOTE(review): this rebinds `thresholds`, replacing the precision/recall
# thresholds computed earlier.
fpr, tpr, thresholds = roc_curve(y_train, y_scores)

draw_plot_roc_curve(fpr, tpr)
plt.show()

#### Случайный лес

In [None]:
# Random forest with default hyper-parameters; collect out-of-fold class
# probabilities (one column per class) for the ROC analysis.
rnf_clf = RandomForestClassifier(random_state=42)
y_proba_predict = cross_val_predict(
    rnf_clf, X_train, y_train, cv=5, n_jobs=3,
    method='predict_proba'
)

In [None]:
# Use the second probability column as the score for the ROC curve.
y_scores_forest = y_proba_predict[:, 1]
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train, y_scores_forest)

In [None]:
# Overlay both ROC curves: SGD as a dotted blue line, the forest as a solid line.
plt.plot(fpr, tpr, 'b:', label='SGD')
draw_plot_roc_curve(fpr_forest, tpr_forest, 'Random Forest')
plt.legend(loc='lower right')
plt.show()


In [None]:
# Area under the random forest ROC curve.
roc_auc_score(y_train, y_scores_forest)

In [None]:
# Out-of-fold class predictions of the default random forest.
y_train_pred_forest = cross_val_predict(
    rnf_clf, X_train, y_train, cv=5, n_jobs=3
)

In [None]:
# Macro-averaged precision and recall of the default random forest.
print(
    f'Precision: {precision_score(y_train, y_train_pred_forest, average="macro")}\nRecall: {recall_score(y_train, y_train_pred_forest, average="macro")}')

In [None]:
# Hyper-parameter search for the random forest: one grid with bootstrapping
# (the default) and one without.
# NOTE(review): max_features up to 17 assumes X_train has at least 17 columns — confirm.
grid_params = [
    {'n_estimators': [15, 20, 25], 'max_features': [12, 15, 17], 'max_depth': [5, 7, 10]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}
]

# Score with accuracy: the previous 'neg_mean_squared_error' is a regression
# metric. On 0/1 labels it equals minus the error rate, so the ranking of the
# candidates is unchanged, but the reported scores are now readable.
grid_search = GridSearchCV(rnf_clf, grid_params, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

In [None]:
# Parameter combination with the best cross-validated score.
grid_search.best_params_

In [None]:
# Random forest with hand-picked hyper-parameters.
# NOTE(review): the values are hard-coded rather than read from grid_search.best_params_.
# random_state is fixed so repeated runs give the same forest, as for the other models.
rdm_forest_clf = RandomForestClassifier(n_estimators=15, max_depth=7, max_features=15, random_state=42)

In [None]:
# Out-of-fold class predictions of the tuned random forest.
y_train_pred_forest_2 = cross_val_predict(
    rdm_forest_clf, X_train, y_train, cv=5, n_jobs=3
)

In [None]:
# Macro-averaged precision and recall of the tuned random forest.
print(
    f'Precision: {precision_score(y_train, y_train_pred_forest_2, average="macro")}\nRecall: {recall_score(y_train, y_train_pred_forest_2, average="macro")}')

### Обучение модели

In [27]:
# Reload the training data and build the feature matrix and labels for the final fit.
train_set = get_data('train.csv')
X_train, y_train = process_initial_data(train_set)

In [28]:
# Final model: random forest with the chosen hyper-parameters, fitted on the
# full training set. random_state is fixed so the fitted model and its
# predictions are reproducible across runs.
rdm_forest_clf = RandomForestClassifier(n_estimators=15, max_depth=7, max_features=15, random_state=42)
rdm_forest_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=7, max_features=15, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=15,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

### Обработка тестовых данных

In [37]:
# Load the test set and preview its first rows.
test_set = get_data('test.csv')
test_set.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [36]:
# Feature matrix for the test set; labels=False because it has no Survived column.
X_test = process_initial_data(test_set, labels=False)

In [31]:
# Predicted class for every test passenger.
y_test_pred = rdm_forest_clf.predict(X_test)