# original

https://www.kaggle.com/yuanxuan/titanic-random-forest-82-78/notebook

# points

- almost all are one hot encoding
- columns: Index(['Survived', 'Age', 'Fare', 'Name_Len', 'Age_Null_Flag',
       'Cabin_num_(1.999, 28.667]', 'Cabin_num_(28.667, 65.667]',
       'Cabin_num_(65.667, 148.0]', 'Ticket_Len', 'Pclass_3', 'Pclass_1',
       'Pclass_2', 'Sex_male', 'Sex_female', 'Embarked_S', 'Embarked_C',
       'Embarked_Q', 'Ticket_Lett_A', 'Ticket_Lett_P', 'Ticket_Lett_S',
       'Ticket_Lett_1', 'Ticket_Lett_3', 'Ticket_Lett_2', 'Ticket_Lett_C',
       'Ticket_Lett_Low_ticket', 'Ticket_Lett_Other_ticket', 'Cabin_Letter_n',
       'Cabin_Letter_C', 'Cabin_Letter_E', 'Cabin_Letter_G', 'Cabin_Letter_D',
       'Cabin_Letter_A', 'Cabin_Letter_B', 'Cabin_Letter_F', 'Name_Title_Mr.',
       'Name_Title_Mrs.', 'Name_Title_Miss.', 'Name_Title_Master.',
       'Name_Title_Rev.', 'Name_Title_Dr.', 'Name_Title_Ms.',
       'Name_Title_Col.', 'Fam_Size_Nuclear', 'Fam_Size_Solo', 'Fam_Size_Big'],
      dtype='object')

	variable	importance
12	Sex_female	0.111215
11	Sex_male	0.109769
33	Name_Title_Mr.	0.109746
1	Fare	0.088209
2	Name_Len	0.087904
0	Age	0.078651
8	Pclass_3	0.043268
35	Name_Title_Miss.	0.031292
7	Ticket_Len	0.031079
34	Name_Title_Mrs.	0.028852
25	Cabin_Letter_n	0.027893
43	Fam_Size_Big	0.025199
41	Fam_Size_Nuclear	0.022704
9	Pclass_1	0.021810
19	Ticket_Lett_1	0.017999
20	Ticket_Lett_3	0.012902
10	Pclass_2	0.012345
36	Name_Title_Master.	0.012098
23	Ticket_Lett_Low_ticket	0.011723
13	Embarked_S	0.011546


In [1]:
import time

# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# train, test, validate
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix

# models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

import xgboost as xgb
from xgboost.sklearn import XGBClassifier


# scaling
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer

# decomposition
from sklearn.decomposition import PCA
from sklearn.decomposition import NMF

# feature engineering
from sklearn.preprocessing import PolynomialFeatures

# feature selection
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE

In [2]:
# Utility function to report best scores
def report(results, n_top=3):
    """Utility function to report best scores
    """
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Rank: {0}".format(i))
            print("Score: {0:f} (std: {1:f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Pars: {0}".format(results['params'][candidate]))
            print("")
            

def report2(results, n_top=3):
    """Utility function to report best scores
    """
    print("Rank|Score(std)|Params", list(results['params'][0].keys()))
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("{0}|".format(i), end="")
            print("{0:f}(std:{1:f})|".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]), end="")
            print("{0}".format(list(results['params'][candidate].values())))

In [3]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

def names(train, test):
    for i in [train, test]:
        i['Name_Len'] = i['Name'].apply(lambda x: len(x))
        i['Name_Title'] = i['Name'].apply(lambda x: x.split(',')[1]).apply(lambda x: x.split()[0])
        del i['Name']
    return train, test

def age_impute(train, test):
    for i in [train, test]:
        i['Age_Null_Flag'] = i['Age'].apply(lambda x: 1 if pd.isnull(x) else 0)
        data = train.groupby(['Name_Title', 'Pclass'])['Age']
        i['Age'] = data.transform(lambda x: x.fillna(x.mean()))
    return train, test


def fam_size(train, test):
    for i in [train, test]:
        i['Fam_Size'] = np.where((i['SibSp']+i['Parch']) == 0 , 'Solo',
                           np.where((i['SibSp']+i['Parch']) <= 3,'Nuclear', 'Big'))
        del i['SibSp']
        del i['Parch']
    return train, test


def ticket_grouped(train, test):
    for i in [train, test]:
        i['Ticket_Lett'] = i['Ticket'].apply(lambda x: str(x)[0])
        i['Ticket_Lett'] = i['Ticket_Lett'].apply(lambda x: str(x))
        i['Ticket_Lett'] = np.where((i['Ticket_Lett']).isin(['1', '2', '3', 'S', 'P', 'C', 'A']), i['Ticket_Lett'],
                                   np.where((i['Ticket_Lett']).isin(['W', '4', '7', '6', 'L', '5', '8']),
                                            'Low_ticket', 'Other_ticket'))
        i['Ticket_Len'] = i['Ticket'].apply(lambda x: len(x))
        del i['Ticket']
    return train, test


def cabin(train, test):
    for i in [train, test]:
        i['Cabin_Letter'] = i['Cabin'].apply(lambda x: str(x)[0])
        del i['Cabin']
    return train, test

def cabin_num(train, test):
    for i in [train, test]:
        i['Cabin_num1'] = i['Cabin'].apply(lambda x: str(x).split(' ')[-1][1:])
        i['Cabin_num1'].replace('an', np.NaN, inplace = True)
        i['Cabin_num1'] = i['Cabin_num1'].apply(lambda x: int(x) if not pd.isnull(x) and x != '' else np.NaN)
        i['Cabin_num'] = pd.qcut(train['Cabin_num1'],3)
    train = pd.concat((train, pd.get_dummies(train['Cabin_num'], prefix = 'Cabin_num')), axis = 1)
    test = pd.concat((test, pd.get_dummies(test['Cabin_num'], prefix = 'Cabin_num')), axis = 1)
    del train['Cabin_num']
    del test['Cabin_num']
    del train['Cabin_num1']
    del test['Cabin_num1']
    return train, test


def embarked_impute(train, test):
    for i in [train, test]:
        i['Embarked'] = i['Embarked'].fillna('S')
    return train, test

def dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked', 'Ticket_Lett', 'Cabin_Letter', 'Name_Title', 'Fam_Size']):
    for column in columns:
        train[column] = train[column].apply(lambda x: str(x))
        test[column] = test[column].apply(lambda x: str(x))
        good_cols = [column+'_'+i for i in train[column].unique() if i in test[column].unique()]
        train = pd.concat((train, pd.get_dummies(train[column], prefix = column)[good_cols]), axis = 1)
        test = pd.concat((test, pd.get_dummies(test[column], prefix = column)[good_cols]), axis = 1)
        del train[column]
        del test[column]
    return train, test

def drop(train, test, bye = ['PassengerId']):
    for i in [train, test]:
        for z in bye:
            del i[z]
    return train, test


train = pd.read_csv(os.path.join('../input', 'train.csv'))
test = pd.read_csv(os.path.join('../input', 'test.csv'))
train, test = names(train, test)
train, test = age_impute(train, test)
train, test = cabin_num(train, test)
train, test = cabin(train, test)
train, test = embarked_impute(train, test)
train, test = fam_size(train, test)
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = ticket_grouped(train, test)
train, test = dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked', 'Ticket_Lett',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = drop(train, test)

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(max_features='auto', oob_score=True, random_state=1, n_jobs=3)

param_grid = { "criterion" : ["gini", "entropy"], 
              "min_samples_leaf" : [1, 5, 10], 
              "min_samples_split" : [2, 4, 10, 12, 16],
              "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf, param_grid=param_grid, scoring='accuracy', 
                  cv=3, n_jobs=6, verbose=1)

gs = gs.fit(train.iloc[:, 1:], train.iloc[:, 0])

print(gs.best_score_)
print(gs.best_params_)
print(gs.cv_results_)

Fitting 3 folds for each of 150 candidates, totalling 450 fits


[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    7.1s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:   35.0s
[Parallel(n_jobs=6)]: Done 438 tasks      | elapsed:  1.4min
[Parallel(n_jobs=6)]: Done 450 out of 450 | elapsed:  1.4min finished


0.8383838383838383
{'criterion': 'gini', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 700}
{'mean_fit_time': array([0.15842684, 0.20610189, 0.77891509, 1.3159349 , 1.81066648,
       0.15871294, 0.21484613, 0.73826059, 1.30018926, 1.84731388,
       0.16110396, 0.21298226, 0.74133595, 1.26565758, 1.74319816,
       0.15601206, 0.20452531, 0.71357481, 1.2697436 , 1.79620226,
       0.16151484, 0.20147006, 0.73195982, 1.28742313, 1.71358395,
       0.15810053, 0.21529174, 0.70337979, 1.25526365, 1.74323511,
       0.17875409, 0.20810652, 0.77733835, 1.246116  , 1.71341729,
       0.16111787, 0.21901266, 0.74701532, 1.20088951, 1.68733319,
       0.16582712, 0.21258068, 0.74446193, 1.3210837 , 1.65187192,
       0.16184139, 0.21935638, 0.68294748, 1.18026821, 1.67313457,
       0.16449817, 0.2005802 , 0.72505887, 1.20647383, 1.70750117,
       0.16628273, 0.21318165, 0.78740104, 1.23061347, 1.77697086,
       0.16222143, 0.2145226 , 0.75279522, 1.28087473, 1.66462541,
 

In [9]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=700,
                             min_samples_split=10,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 1:], train.iloc[:, 0])
print("%.4f" % rf.oob_score_)

0.8294


----

In [10]:
print("best parameters:", gs.best_params_)
print("Mean cross-validated score of the best_estimator: ", gs.best_score_)
print("")
report2(gs.cv_results_, n_top=10)

best parameters: {'criterion': 'gini', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 700}
Mean cross-validated score of the best_estimator:  0.8383838383838383

Rank|Score(std)|Params ['criterion', 'min_samples_leaf', 'min_samples_split', 'n_estimators']
1|0.838384(std:0.018027)|['gini', 1, 10, 700]
1|0.838384(std:0.016495)|['gini', 1, 10, 1000]
1|0.838384(std:0.009912)|['entropy', 1, 4, 400]
1|0.838384(std:0.009912)|['entropy', 1, 12, 50]
5|0.837262(std:0.013561)|['gini', 1, 4, 50]
5|0.837262(std:0.004199)|['gini', 1, 4, 100]
5|0.837262(std:0.011111)|['gini', 1, 4, 700]
5|0.837262(std:0.016798)|['entropy', 1, 12, 1000]
9|0.836139(std:0.011446)|['gini', 1, 4, 400]
9|0.836139(std:0.009655)|['gini', 1, 4, 1000]
9|0.836139(std:0.022050)|['entropy', 1, 4, 100]
9|0.836139(std:0.009655)|['entropy', 1, 4, 700]
9|0.836139(std:0.019888)|['entropy', 1, 16, 100]


In [11]:
train.columns

Index(['Survived', 'Age', 'Fare', 'Name_Len', 'Age_Null_Flag',
       'Cabin_num_(1.999, 28.667]', 'Cabin_num_(28.667, 65.667]',
       'Cabin_num_(65.667, 148.0]', 'Ticket_Len', 'Pclass_3', 'Pclass_1',
       'Pclass_2', 'Sex_male', 'Sex_female', 'Embarked_S', 'Embarked_C',
       'Embarked_Q', 'Ticket_Lett_A', 'Ticket_Lett_P', 'Ticket_Lett_S',
       'Ticket_Lett_1', 'Ticket_Lett_3', 'Ticket_Lett_2', 'Ticket_Lett_C',
       'Ticket_Lett_Low_ticket', 'Ticket_Lett_Other_ticket', 'Cabin_Letter_n',
       'Cabin_Letter_C', 'Cabin_Letter_E', 'Cabin_Letter_G', 'Cabin_Letter_D',
       'Cabin_Letter_A', 'Cabin_Letter_B', 'Cabin_Letter_F', 'Name_Title_Mr.',
       'Name_Title_Mrs.', 'Name_Title_Miss.', 'Name_Title_Master.',
       'Name_Title_Rev.', 'Name_Title_Dr.', 'Name_Title_Ms.',
       'Name_Title_Col.', 'Fam_Size_Nuclear', 'Fam_Size_Solo', 'Fam_Size_Big'],
      dtype='object')

In [14]:
pd.concat((pd.DataFrame(train.columns, columns = ['variable']), 
           pd.DataFrame(gs.best_estimator_.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)

Unnamed: 0,variable,importance
12,Sex_male,0.111215
11,Pclass_2,0.109769
33,Cabin_Letter_F,0.109746
1,Age,0.088209
2,Fare,0.087904
0,Survived,0.078651
8,Ticket_Len,0.043268
35,Name_Title_Mrs.,0.031292
7,"Cabin_num_(65.667, 148.0]",0.031079
34,Name_Title_Mr.,0.028852


In [18]:
# get passenger id
_test = pd.read_csv(os.path.join('../input', 'test.csv'))

y_pred = gs.best_estimator_.predict(test).astype(int)

submission = pd.DataFrame({
        "PassengerId": _test["PassengerId"].astype(int),
        "Survived": y_pred
    })
submission.to_csv('../output/_titanic-random-forest-82-78.csv', index=False)

----

# original

https://www.kaggle.com/konstantinmasich/titanic-0-82-0-83/

# points



----

# original

https://www.kaggle.com/francksylla/titanic-machine-learning-from-disaster

# points