In [2]:
import matplotlib as mpl
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, log_loss
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC, LinearSVC
from xgboost import XGBClassifier
%matplotlib inline

In [3]:
# Load the training and test CSV files, then stack them into a single frame
# (fresh 0..n-1 index, columns kept in their original order).
train, test = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))
features = pd.concat(
    [train, test],
    sort=False,
    ignore_index=True,
)

In [4]:
# Preview the first rows of the combined train + test frame.
features.head(20)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0.0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0.0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0.0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1.0,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1.0,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [5]:
# Number of missing values in each column.
features.isna().sum()

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

In [6]:
# Drop Cabin because too many of its values are missing.
features.drop(columns=['Cabin'], inplace=True)

In [7]:
# Drop Ticket because the ticket number will not be used as a feature.
features.drop(columns=['Ticket'], inplace=True)

In [9]:
# Fill the missing port of embarkation and ticket fare with each column's mode.
# The filled Series is assigned back to the frame rather than calling
# fillna(inplace=True) on an attribute-accessed column: that chained in-place
# form emits a FutureWarning in pandas 2.2 and leaves the frame unchanged
# once Copy-on-Write is enabled.
features['Embarked'] = features['Embarked'].fillna(features['Embarked'].mode()[0])
features['Fare'] = features['Fare'].fillna(features['Fare'].mode()[0])

In [8]:
# Fill missing ages (age matters a lot for the target) with the median age of
# passengers that share the same sex and passenger class.
grp = features.groupby(['Sex', 'Pclass'])
# transform('median') returns a Series aligned to the original row index, so it
# can be used directly as the fill value. groupby.apply is avoided here because
# with the pandas >= 2.0 default (group_keys=True) its result carries the group
# keys in the index and no longer aligns with the frame on assignment.
features['Age'] = features['Age'].fillna(grp['Age'].transform('median'))
# Fallback for any (Sex, Pclass) group whose ages are all missing: use the
# overall median. Note that median must be *called*; passing the bound method
# would fill the gaps with a method object instead of a number.
features['Age'] = features['Age'].fillna(features['Age'].median())

In [11]:
# New features: family size (parents/children + siblings/spouses aboard) and a
# flag telling whether the passenger travels alone.
features['Family'] = features['Parch'] + features['SibSp']
features['Is_Alone'] = features['Family'].eq(0)

In [11]:
# New feature: the passenger's form of address (Mr, Mrs, etc.).
def _salutation(full_name):
    """Return the second comma-separated field of the name, cut at its first period and stripped."""
    after_comma = full_name.split(',')[1]
    return after_comma.split('.')[0].strip()


features['Salutation'] = features['Name'].map(_salutation)
features['Salutation'].nunique()

18

In [12]:
# New feature: length of the name, on the idea that higher-class passengers
# carry more titles and therefore have longer names.
features['Name_len'] = features['Name'].map(len)

In [13]:
# One-hot encode the categorical columns (dropping the first level of each)
# and append the indicator columns to the frame.
dummy_specs = [('Embarked', 'Emb'), ('Salutation', 'Title'), ('Pclass', 'Class')]
dummy_frames = [
    pd.get_dummies(features[column], prefix=prefix, drop_first=True)
    for column, prefix in dummy_specs
]
features = pd.concat([features, *dummy_frames], axis=1)

In [14]:
# Sex is the only remaining text column and nothing else in the notebook
# encodes it, so on a fresh Restart & Run All the classifiers would receive
# 'male'/'female' strings and fail. Encode it here (female=0, male=1); the
# guard keeps the cell safe to re-run once the column is already numeric.
if not pd.api.types.is_numeric_dtype(features['Sex']):
    features['Sex'] = features['Sex'].map({'female': 0, 'male': 1})

# Drop identifiers and the raw columns that have been replaced by engineered
# or one-hot encoded features.
features.drop(
    columns=['PassengerId', 'Pclass', 'Name', 'Salutation', 'Embarked', 'SibSp', 'Parch'],
    inplace=True,
)

In [17]:
# Number of rows that came from the training file.
len_train = len(train)

In [20]:
# Split the engineered frame back into its training and test rows by position.
train_m, test_m = features.iloc[:len_train], features.iloc[len_train:]

In [21]:
# Keep the target separately and remove it from both predictor frames.
y = train_m['Survived']
train_m = train_m.drop(columns=['Survived'])
test_m = test_m.drop(columns=['Survived'])

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
# Split the labelled data into a training part and a held-out evaluation part.
X_train, X_test, y_train, y_test = train_test_split(
    train_m,
    y,
    test_size=.33,
    random_state=42,
)

In [24]:
# Fit a random forest as a first check of how a model performs on the
# prepared data, and report its accuracy on the held-out split.
clf = RandomForestClassifier(
    criterion='entropy',
    n_estimators=300,
    min_samples_split=5,
    min_samples_leaf=1,
    # 'auto' was removed in scikit-learn 1.3; for classifiers it meant 'sqrt',
    # so this keeps the same behaviour and runs on current versions.
    max_features='sqrt',
    oob_score=True,
    random_state=1,
    n_jobs=-1,
)
clf.fit(X_train, y_train)
# float() keeps the printed value a plain number even if score() returns a
# NumPy scalar (whose repr is "np.float64(...)" under NumPy 2).
print(f"RF Accuracy: {round(float(clf.score(X_test, y_test)) * 100, 2)}%")

RF Accuracy: 82.71%


In [25]:
# The result is decent, so hyperparameter tuning is worth a try; first record
# a 10-fold cross-validated accuracy on the training split.
result_rf = cross_val_score(
    estimator=clf, X=X_train, y=y_train, scoring='accuracy', cv=10
)
print(
    'The cross validated score for Random forest is:',
    round(100 * result_rf.mean(), 2),
)

The cross validated score for Random forest is: 81.87


In [42]:
# Hyperparameter search over tree depth, number of trees and the minimum
# number of samples required to split a node.
n = [300, 500, 700]
depth = [10, 20, 30]
prmtrs = [
    {
        'max_depth': depth,
        'n_estimators': n,
        'min_samples_split': range(4, 7),
    }
]
frst = GridSearchCV(clf, prmtrs, scoring='accuracy', cv=2)
frst.fit(X_train, y_train)
# Cross-validate the whole search procedure (5 outer folds) on float-cast features.
scores_clf = cross_val_score(
    frst,
    X_train.astype(float),
    y_train,
    cv=5,
    scoring='accuracy',
)

In [44]:
# Take the best estimator found by the grid search and fit it on the training
# split; fit() returns the estimator itself, so `model` is the same object.
best_clf = frst.best_estimator_
best_clf.fit(X_train, y_train)
model = best_clf

In [45]:
# Show the hyperparameter combination selected by the grid search.
frst.best_params_

{'max_depth': 10, 'min_samples_split': 6, 'n_estimators': 500}

In [46]:
# 10-fold cross-validated accuracy of the tuned model on the training split.
result_rf = cross_val_score(
    estimator=model, X=X_train, y=y_train, scoring='accuracy', cv=10
)

In [47]:
# The score improved after tuning.
print(
    'The cross validated score for Random forest is:',
    round(100 * result_rf.mean(), 2),
)

The cross validated score for Random forest is: 82.55


In [55]:
# Predict on the test rows and write the Kaggle submission file.
result = model.predict(test_m)
output = pd.DataFrame(
    {
        'PassengerId': test['PassengerId'],
        'Survived': result,
    }
)
# Kaggle expects integer labels, so cast the predictions to int.
output['Survived'] = output['Survived'].astype(int)
output.to_csv('submission.csv', index=False)