In [None]:
import pandas
import math
import re
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
import matplotlib.pyplot as plt
%matplotlib inline
pandas.set_option('display.max_columns', None)

In [None]:
# Training passengers: keep their ids, then split the target off the features.
data_train = pandas.read_csv('train.csv', index_col='PassengerId')
train_index = data_train.index
# pop() returns the 'Survived' column and removes it from data_train in one step.
y_train = data_train.pop('Survived')

# Test passengers (no 'Survived' column is read from this file).
data_test = pandas.read_csv('test.csv', index_col='PassengerId')
test_index = data_test.index

# Stack train on top of test so feature engineering sees every passenger at once.
data = pandas.concat([data_train, data_test])
data.head()

In [None]:
def print_nan(X):
    """Print a short missing-value report for the DataFrame X.

    The first line printed is the total number of rows. After that, one line
    is printed for every column whose non-null count is below that total,
    giving the non-null count and the non-null share as a percentage with
    three decimals. Nothing is returned.
    """
    total_rows = X.shape[0]
    non_null = X.count()
    print('Full length:', total_rows)
    # Only columns with at least one missing entry are reported.
    incomplete = non_null[non_null < total_rows]
    for column, filled in incomplete.items():
        share = '%.3f' % (100*filled / total_rows)
        print(column, ': Non NaN num:', filled, ', Not NaN:', share, '%')

def get_marital_status(name):
    """Map the honorific found in a passenger name to the code 0, 1 or 2.

    Returns 2 if the name contains 'Mrs', 'Lady' or 'Countess'; otherwise 1 if
    it contains 'Miss', 'Mlle', 'Mme' or 'Ms'; otherwise 0. The check is a
    case-sensitive substring search over the whole name, and the first group
    wins when markers from both groups are present.
    """
    mrs_like = ('Mrs', 'Lady', 'Countess')
    miss_like = ('Miss', 'Mlle', 'Mme', 'Ms')
    if any(marker in name for marker in mrs_like):
        return 2
    if any(marker in name for marker in miss_like):
        return 1
    return 0

def get_cabin(x, default='N'):
    """Return the first ASCII letter of a cabin string, e.g. 'C85' -> 'C'.

    Parameters
    ----------
    x : str
        Cabin label such as 'C85', 'F G73' or the placeholder 'N'.
    default : str, optional
        Value returned when `x` contains no ASCII letter (empty string or
        digits only). Defaults to 'N', the placeholder prepare_data substitutes
        for a missing cabin.

    Returns
    -------
    str
        A single letter, or `default` when none is present.
    """
    # Search for a letter directly instead of stripping characters and taking
    # index 0: that raises IndexError when nothing is left and returns ' ' for
    # a string whose first kept character is a space.
    match = re.search('[A-Za-z]', x)
    return match.group(0) if match else default

def _cabin_number_parity(cabin):
    """Return the parity (0 or 1) of the digits in a cabin string, or -1 if it has none."""
    # All digits are concatenated before conversion, so for a multi-cabin label
    # such as 'C23 C25 C27' the parity is that of the last cabin number.
    digits = re.sub('[^0-9]', '', cabin)
    return int(digits) % 2 if digits else -1


def prepare_data(X):
    """Build the numeric feature table from the raw passenger table.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain the columns 'Name', 'Age', 'Pclass', 'Fare', 'Sex',
        'Embarked', 'Cabin', 'SibSp', 'Parch' and 'Ticket'. The frame is
        copied first, so the caller's object is not modified.

    Returns
    -------
    pandas.DataFrame
        Same rows and index as `X`, with engineered and one-hot encoded
        columns; 'Name', 'Ticket', 'Pclass' and the other encoded source
        columns are removed.

    Notes
    -----
    Group statistics (medians, counts, quartile bins, rare-title threshold)
    are computed over whatever rows are passed in.
    """
    X = X.copy()

    X['NameLength'] = X['Name'].map(len)

    # Title = text between ', ' and the first '.' in the name. Titles seen
    # fewer than stat_min times are pooled into 'Misc'.
    X['Title'] = X['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]
    stat_min = 10
    title_counts = X['Title'].value_counts()
    rare_titles = title_counts[title_counts < stat_min].index
    X['Title'] = X['Title'].mask(X['Title'].isin(rare_titles), 'Misc')
    X = pandas.get_dummies(X, columns=['Title'], prefix='Title')

    # Honorific code from get_marital_status plus one indicator per code.
    # is_Mr marks code 0, i.e. every name that matched neither marker group.
    X['MaritalStatus'] = X['Name'].map(get_marital_status)
    X['is_Mr'] = (X['MaritalStatus'] == 0).astype(int)
    X['is_Miss'] = (X['MaritalStatus'] == 1).astype(int)
    X['is_Mrs'] = (X['MaritalStatus'] == 2).astype(int)

    # Remember which ages were present, then impute Age and Fare with the
    # median of the passenger's class. transform() returns values aligned to
    # X's own index; GroupBy.apply may return a result indexed by
    # (Pclass, original index), which cannot be assigned back to the column.
    X['Age_known'] = X['Age'].notna().astype(int)
    X['Age'] = X['Age'].fillna(X.groupby('Pclass')['Age'].transform('median'))
    X['Fare'] = X['Fare'].fillna(X.groupby('Pclass')['Fare'].transform('median'))

    X['Sex'] = (X['Sex'] == 'male').astype(int)

    # Assign the filled column back: an in-place fillna on a column selection
    # is a chained operation that may act on a temporary and leave X unchanged.
    X['Embarked'] = X['Embarked'].fillna('S')
    X = pandas.get_dummies(X, columns=['Embarked'], prefix='Emb')

    # Missing cabins become the placeholder 'N'. LB is 1 for an even cabin
    # number, RB for an odd one; both are 0 when the cabin has no digits.
    # NOTE(review): LB/RB presumably stand for the two sides of the ship - confirm.
    X['Cabin'] = X['Cabin'].fillna('N')
    cabin_parity = X['Cabin'].map(_cabin_number_parity)
    X['LB'] = (cabin_parity == 0).astype(int)
    X['RB'] = (cabin_parity == 1).astype(int)
    X['Cabin'] = X['Cabin'].map(get_cabin)
    X = pandas.get_dummies(X, columns=['Cabin'], prefix='Cabin')

    X['FamilySize'] = X['SibSp'] + X['Parch']

    # One indicator per passenger class, then drop the original column.
    for pclass in (1, 2, 3):
        X['%dclass' % pclass] = (X['Pclass'] == pclass).astype(int)
    X = X.drop('Pclass', axis=1)

    # Quartile bins are taken on the raw values, before the log transform.
    X['FareBin'] = pandas.qcut(x=X['Fare'], q=4, labels=False)
    X['AgeBin'] = pandas.qcut(x=X['Age'], q=4, labels=False)

    X['Age'] = np.log1p(X['Age'])
    X['Fare'] = np.log1p(X['Fare'])

    # Per-ticket and per-surname aggregates.
    # NOTE(review): 'Fare' is already log1p-transformed here, so the three
    # ratios below divide a log fare by a head count - confirm this is intended.
    X['PassInTicket'] = X['Ticket'].map(X['Ticket'].value_counts())
    X['TicketFare'] = X.groupby('Ticket')['Fare'].transform('median') / X['PassInTicket']
    X['FamilyFare'] = X['Fare'] / (X['FamilySize'] + 1)
    X['FamilyName'] = X['Name'].str.split(',').str[0]
    # FamilySize_1 counts all rows sharing the surname, the passenger included.
    X['FamilySize_1'] = X['FamilyName'].map(X['FamilyName'].value_counts())
    X['FamilyFare_1'] = X['Fare'] / (X['FamilySize_1'] + 1)
    X = pandas.get_dummies(X, columns=['FamilyName'], prefix='FName')

    # First character of the ticket string as a categorical feature.
    X['Ttype'] = X['Ticket'].str[0]
    X = pandas.get_dummies(X, columns=['Ttype'], prefix='Ttype')

    X = pandas.get_dummies(X, columns=['FamilySize'], prefix='FamSize')
    X = pandas.get_dummies(X, columns=['Parch'], prefix='Parch')
    X = pandas.get_dummies(X, columns=['SibSp'], prefix='SibSp')

    X = X.drop(['Name', 'Ticket'], axis=1)

    return X

# Engineer features on train and test together so both parts end up with the
# same set of one-hot columns.
X = prepare_data(data)

# Select each part by its own PassengerId labels rather than by comparing the
# index with the last training id: that comparison is only correct when ids are
# sorted and every training id is smaller than every test id. Label selection
# also keeps X_train in the same row order as y_train.
X_train = X.loc[train_index].copy()
X_test = X.loc[test_index].copy()

# Fit the scaler on the training rows only and reuse it for the test rows.
scaler = StandardScaler()
x_train = scaler.fit_transform(X_train)
x_test = scaler.transform(X_test)

X[:5]

In [None]:
# Shuffled, stratified 5-fold splitter; the later model cells score with this same `cv`.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

gbc = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.2,
    subsample=0.5,
    max_features=None,
    random_state=42,
)

# Mean ROC AUC over the folds.
gbc_scores = cross_val_score(estimator=gbc, X=x_train, y=y_train, cv=cv, scoring='roc_auc').mean()
print(f"{gbc_scores:.3f}")

In [None]:
# Refit the gradient boosting model on the full training matrix.
titanic = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.2,
    subsample=0.5,
    max_features=None,
    random_state=42,
)
titanic.fit(x_train, y_train)

# Predictions for the test rows, indexed by the test frame's index, written as a CSV.
y_test = pandas.DataFrame({'Survived': titanic.predict(x_test)}, index=X_test.index)
y_test.to_csv('titanic_predict.csv')
y_test[:3]

# Feature importances of the fitted model, largest first (the cell's displayed value).
pandas.Series(titanic.feature_importances_, index=X_train.columns).sort_values(ascending=False)

In [None]:
from sklearn.decomposition import PCA

# Keep every component: with a small fixed n_components the fitted components
# may explain less than dispmax, in which case the count asked for below does
# not exist and indexing explained_variance_ratio_ runs past its end.
# NOTE(review): this fits on the unscaled X_train, not the standardised x_train - confirm.
pca = PCA()
pca.fit(X_train)
dispmax = 0.99

# cumulative[k] is the share of variance explained by the first k + 1 components.
cumulative = np.cumsum(pca.explained_variance_ratio_)
# Smallest number of leading components whose cumulative share reaches dispmax;
# clamped to the number of components in case rounding keeps the total just below it.
i = min(int(np.searchsorted(cumulative, dispmax)) + 1, len(cumulative))
disp = cumulative[i - 1]
print('Для описания', dispmax, 'дисперсии требуется', i, 'компонент')

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

In [None]:
# Logistic regression with C = 0.01, scored by mean ROC AUC over the shared CV folds.
lr = LogisticRegression(C=0.01)
lr_scores = cross_val_score(estimator=lr, X=x_train, y=y_train, cv=cv, scoring='roc_auc').mean()
print(f"{lr_scores:.3f}")

In [None]:
# Class-weighted SVC with C = 10, scored by mean ROC AUC over the shared CV folds.
svm = SVC(random_state=42, C=10, class_weight='balanced')
svm_scores = cross_val_score(estimator=svm, X=x_train, y=y_train, cv=cv, scoring='roc_auc').mean()
print(f"{svm_scores:.3f}")

In [None]:
# Shallow, class-weighted decision tree scored by mean ROC AUC over the shared CV folds.
# random_state is fixed because the tree permutes features at every split even
# when max_features is None, so without a seed the score can change between runs.
dt = DecisionTreeClassifier(
    class_weight='balanced',
    max_depth=5,
    max_features=None,
    min_samples_leaf=0.01,
    min_samples_split=0.05,
    random_state=42,
)
dt_scores = np.mean(cross_val_score(estimator=dt, X=x_train, y=y_train, cv=cv, scoring='roc_auc'))
print("%.3f" % dt_scores)

In [None]:
# AdaBoost with 100 estimators, scored by mean ROC AUC over the shared CV folds.
ada = AdaBoostClassifier(random_state=42, n_estimators=100, learning_rate=0.3)
ada_scores = cross_val_score(estimator=ada, X=x_train, y=y_train, cv=cv, scoring='roc_auc').mean()
print(f"{ada_scores:.3f}")

In [None]:
# Random forest with 50 trees, scored by mean ROC AUC over the shared CV folds.
# NOTE(review): this cell scores on the unscaled X_train frame, whereas the
# other model cells use the standardised x_train - confirm this is intended.
rf = RandomForestClassifier(
    random_state=42,
    n_estimators=50,
    class_weight='balanced_subsample',
)
rf_scores = cross_val_score(estimator=rf, X=X_train, y=y_train, cv=cv, scoring='roc_auc').mean()
print(f"{rf_scores:.3f}")

In [None]:
from sklearn.ensemble import VotingClassifier

# Base estimators for the soft-voting ensemble. SVC needs probability = True
# because soft voting averages predicted class probabilities.
GB = GradientBoostingClassifier(n_estimators=200, learning_rate=0.2, subsample=0.5, max_features=None, random_state=42)
LR = LogisticRegression(C=0.01)
SV = SVC(random_state=42, C=10, class_weight='balanced', probability=True)
ADA = AdaBoostClassifier(random_state=42, n_estimators=100, learning_rate=0.3)
# random_state is fixed so the tree, and therefore the ensemble score and the
# written predictions, are the same on every run: the tree permutes features
# at each split even when max_features is None.
DT = DecisionTreeClassifier(
    class_weight='balanced',
    max_depth=5,
    max_features=None,
    min_samples_leaf=0.01,
    min_samples_split=0.05,
    random_state=42,
)
RF = RandomForestClassifier(random_state=42, n_estimators=50, class_weight='balanced_subsample')

VC = VotingClassifier(
    estimators=[('gbc', GB), ('lr', LR), ('svc', SV), ('ada', ADA), ('dt', DT), ('rf', RF)],
    voting='soft',
    n_jobs=-1,
)

# Mean ROC AUC of the ensemble over shuffled, stratified 5-fold CV.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
vc_scores = np.mean(cross_val_score(estimator=VC, X=x_train, y=y_train, cv=cv, scoring='roc_auc'))
print("%.3f" % vc_scores)

# Refit on the full training matrix and write the test predictions, indexed by
# the test frame's index.
VC = VC.fit(x_train, y_train)
y_test = pandas.DataFrame({'Survived': VC.predict(x_test)}, index=X_test.index)
y_test.to_csv('titanic_predict_voting.csv')