Go to <a href=#bookmark>my bookmark</a>

In [63]:
import time

# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# train, test, validate
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix

# models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# scaling
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer

# decomposition
from sklearn.decomposition import PCA
from sklearn.decomposition import NMF

# feature engineering
from sklearn.preprocessing import PolynomialFeatures

# feature selection
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE

In [64]:
train_df = pd.read_csv('../input/train.csv')
test_df = pd.read_csv('../input/test.csv')
combine = [train_df, test_df]

In [65]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [66]:
# del Ticket, Cabin columns
train_df = train_df.drop(['Ticket', 'Cabin'], axis=1)
test_df = test_df.drop(['Ticket', 'Cabin'], axis=1)
combine = [train_df, test_df]

# add title

In [67]:
# add title

for dataset in combine:
    dataset['Title'] = dataset.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
train_df.Title.value_counts()

Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Col           2
Major         2
Mlle          2
Mme           1
Lady          1
Countess      1
Sir           1
Don           1
Ms            1
Jonkheer      1
Capt          1
Name: Title, dtype: int64

## try one hote encoding without delete rare title

In [68]:
# try one hote encoding without delete rare title
#
# concat train and test data. and apply get_dummies for Title. 
# then split to original size. also drop Survived column from test_df
print(train_df.shape, test_df.shape)

# concat train and test data
#   test_df's Survived column is filled with NaN
train_test_df = pd.concat((train_df, test_df))

print(train_test_df.shape)

# apply get_dummies for Title
train_test_df = pd.get_dummies(train_test_df, columns=["Title"])

#train_test_df.head()
train_df = train_test_df.iloc[:train_df.shape[0]]
test_df = train_test_df.iloc[train_df.shape[0]:]

# drop added Survived column from test_df
test_df = test_df.drop("Survived", axis=1)
print(train_df.shape, test_df.shape)

combine = [train_df, test_df]

(891, 11) (418, 10)
(1309, 11)
(891, 28) (418, 27)


In [69]:
test_df.head()

Unnamed: 0,Age,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Title_Capt,...,Title_Major,Title_Master,Title_Miss,Title_Mlle,Title_Mme,Title_Mr,Title_Mrs,Title_Ms,Title_Rev,Title_Sir
0,34.5,Q,7.8292,"Kelly, Mr. James",0,892,3,male,0,0,...,0,0,0,0,0,1,0,0,0,0
1,47.0,S,7.0,"Wilkes, Mrs. James (Ellen Needs)",0,893,3,female,1,0,...,0,0,0,0,0,0,1,0,0,0
2,62.0,Q,9.6875,"Myles, Mr. Thomas Francis",0,894,2,male,0,0,...,0,0,0,0,0,1,0,0,0,0
3,27.0,S,8.6625,"Wirz, Mr. Albert",0,895,3,male,0,0,...,0,0,0,0,0,1,0,0,0,0
4,22.0,S,12.2875,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,896,3,female,1,0,...,0,0,0,0,0,0,1,0,0,0


## del rare title and map value

# drop Name, PassengerId

In [70]:
train_df = train_df.drop(['Name', 'PassengerId'], axis=1)
test_df = test_df.drop(['Name'], axis=1)
combine = [train_df, test_df]
train_df.head()

Unnamed: 0,Age,Embarked,Fare,Parch,Pclass,Sex,SibSp,Survived,Title_Capt,Title_Col,...,Title_Major,Title_Master,Title_Miss,Title_Mlle,Title_Mme,Title_Mr,Title_Mrs,Title_Ms,Title_Rev,Title_Sir
0,22.0,S,7.25,0,3,male,1,0.0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,38.0,C,71.2833,0,1,female,1,1.0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,26.0,S,7.925,0,3,female,0,1.0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,35.0,S,53.1,0,1,female,1,1.0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,35.0,S,8.05,0,3,male,0,0.0,0,0,...,0,0,0,0,0,1,0,0,0,0


# map value to Sex 

In [71]:
for dataset in combine:
    dataset["Sex"] = dataset["Sex"].map({'female':1, 'male':0}).astype(int)
train_df.head()

Unnamed: 0,Age,Embarked,Fare,Parch,Pclass,Sex,SibSp,Survived,Title_Capt,Title_Col,...,Title_Major,Title_Master,Title_Miss,Title_Mlle,Title_Mme,Title_Mr,Title_Mrs,Title_Ms,Title_Rev,Title_Sir
0,22.0,S,7.25,0,3,0,1,0.0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,38.0,C,71.2833,0,1,1,1,1.0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,26.0,S,7.925,0,3,1,0,1.0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,35.0,S,53.1,0,1,1,1,1.0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,35.0,S,8.05,0,3,0,0,0.0,0,0,...,0,0,0,0,0,1,0,0,0,0


# fill na value

In [72]:
train_df.isna().any()

Age                True
Embarked           True
Fare              False
Parch             False
Pclass            False
Sex               False
SibSp             False
Survived          False
Title_Capt        False
Title_Col         False
Title_Countess    False
Title_Don         False
Title_Dona        False
Title_Dr          False
Title_Jonkheer    False
Title_Lady        False
Title_Major       False
Title_Master      False
Title_Miss        False
Title_Mlle        False
Title_Mme         False
Title_Mr          False
Title_Mrs         False
Title_Ms          False
Title_Rev         False
Title_Sir         False
dtype: bool

In [73]:
test_df.isna().any()

Age                True
Embarked          False
Fare               True
Parch             False
PassengerId       False
Pclass            False
Sex               False
SibSp             False
Title_Capt        False
Title_Col         False
Title_Countess    False
Title_Don         False
Title_Dona        False
Title_Dr          False
Title_Jonkheer    False
Title_Lady        False
Title_Major       False
Title_Master      False
Title_Miss        False
Title_Mlle        False
Title_Mme         False
Title_Mr          False
Title_Mrs         False
Title_Ms          False
Title_Rev         False
Title_Sir         False
dtype: bool

## fill na of Age

In [74]:
guess_ages = np.zeros((2,3))

for dataset in combine:
    for i in range(0, 2):
        for j in range(0, 3):
            guess_df = dataset[(dataset['Sex'] == i) & \
                                  (dataset['Pclass'] == j+1)]['Age'].dropna()

            # age_mean = guess_df.mean()
            # age_std = guess_df.std()
            # age_guess = rnd.uniform(age_mean - age_std, age_mean + age_std)

            age_guess = guess_df.median()

            # Convert random age float to nearest .5 age
            guess_ages[i,j] = int( age_guess/0.5 + 0.5 ) * 0.5
            
    for i in range(0, 2):
        for j in range(0, 3):
            dataset.loc[ (dataset.Age.isnull()) & (dataset.Sex == i) & (dataset.Pclass == j+1),\
                    'Age'] = guess_ages[i,j]

    dataset['Age'] = dataset['Age'].astype(int)

train_df.head()

Unnamed: 0,Age,Embarked,Fare,Parch,Pclass,Sex,SibSp,Survived,Title_Capt,Title_Col,...,Title_Major,Title_Master,Title_Miss,Title_Mlle,Title_Mme,Title_Mr,Title_Mrs,Title_Ms,Title_Rev,Title_Sir
0,22,S,7.25,0,3,0,1,0.0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,38,C,71.2833,0,1,1,1,1.0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,26,S,7.925,0,3,1,0,1.0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,35,S,53.1,0,1,1,1,1.0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,35,S,8.05,0,3,0,0,0.0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [75]:
train_df.isna().any()

Age               False
Embarked           True
Fare              False
Parch             False
Pclass            False
Sex               False
SibSp             False
Survived          False
Title_Capt        False
Title_Col         False
Title_Countess    False
Title_Don         False
Title_Dona        False
Title_Dr          False
Title_Jonkheer    False
Title_Lady        False
Title_Major       False
Title_Master      False
Title_Miss        False
Title_Mlle        False
Title_Mme         False
Title_Mr          False
Title_Mrs         False
Title_Ms          False
Title_Rev         False
Title_Sir         False
dtype: bool

### tried keep Age feature and don't add AgeBand numerical feature
if both are there, it is duplicate information

### 2018/03/17 tried Age instead of Age band. But AgeBand is better score for almost all models.
svc score was same of little bit better.
random forest score became worse.
so AgeBand is better

## add age band

In [76]:
train_df['AgeBand'] = pd.cut(train_df['Age'], 5)
train_df.head()

Unnamed: 0,Age,Embarked,Fare,Parch,Pclass,Sex,SibSp,Survived,Title_Capt,Title_Col,...,Title_Master,Title_Miss,Title_Mlle,Title_Mme,Title_Mr,Title_Mrs,Title_Ms,Title_Rev,Title_Sir,AgeBand
0,22,S,7.25,0,3,0,1,0.0,0,0,...,0,0,0,0,1,0,0,0,0,"(16.0, 32.0]"
1,38,C,71.2833,0,1,1,1,1.0,0,0,...,0,0,0,0,0,1,0,0,0,"(32.0, 48.0]"
2,26,S,7.925,0,3,1,0,1.0,0,0,...,0,1,0,0,0,0,0,0,0,"(16.0, 32.0]"
3,35,S,53.1,0,1,1,1,1.0,0,0,...,0,0,0,0,0,1,0,0,0,"(32.0, 48.0]"
4,35,S,8.05,0,3,0,0,0.0,0,0,...,0,0,0,0,1,0,0,0,0,"(32.0, 48.0]"


## Overwrite AgeBand number on Age. means, drop Age and AgeBand text column

In [77]:
for dataset in combine:
    dataset.loc[ dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[ dataset['Age'] > 64, 'Age'] = 4
train_df = train_df.drop(['AgeBand'], axis=1)
combine = [train_df, test_df]
train_df.head()

Unnamed: 0,Age,Embarked,Fare,Parch,Pclass,Sex,SibSp,Survived,Title_Capt,Title_Col,...,Title_Major,Title_Master,Title_Miss,Title_Mlle,Title_Mme,Title_Mr,Title_Mrs,Title_Ms,Title_Rev,Title_Sir
0,1,S,7.25,0,3,0,1,0.0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,2,C,71.2833,0,1,1,1,1.0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,1,S,7.925,0,3,1,0,1.0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,2,S,53.1,0,1,1,1,1.0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,2,S,8.05,0,3,0,0,0.0,0,0,...,0,0,0,0,0,1,0,0,0,0


# Create new feature

In [78]:
for dataset in combine:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

for dataset in combine:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1

for dataset in combine:
    dataset['Age*Class'] = dataset.Age * dataset.Pclass
    

train_df.head()

Unnamed: 0,Age,Embarked,Fare,Parch,Pclass,Sex,SibSp,Survived,Title_Capt,Title_Col,...,Title_Mlle,Title_Mme,Title_Mr,Title_Mrs,Title_Ms,Title_Rev,Title_Sir,FamilySize,IsAlone,Age*Class
0,1,S,7.25,0,3,0,1,0.0,0,0,...,0,0,1,0,0,0,0,2,0,3
1,2,C,71.2833,0,1,1,1,1.0,0,0,...,0,0,0,1,0,0,0,2,0,2
2,1,S,7.925,0,3,1,0,1.0,0,0,...,0,0,0,0,0,0,0,1,1,3
3,2,S,53.1,0,1,1,1,1.0,0,0,...,0,0,0,1,0,0,0,2,0,2
4,2,S,8.05,0,3,0,0,0.0,0,0,...,0,0,1,0,0,0,0,1,1,6


# select family related feature
Parch, SibSp, FaimilySize, IsAlone

2018/03/18 Parch and SibSp only was best for almost all models

In [79]:
# keep Parch, SibSp only. this was best amoung familly related features

train_df = train_df.drop(['FamilySize', 'IsAlone'], axis=1)
test_df = test_df.drop(['FamilySize', 'IsAlone'], axis=1)
combine = [train_df, test_df]

train_df.head()

Unnamed: 0,Age,Embarked,Fare,Parch,Pclass,Sex,SibSp,Survived,Title_Capt,Title_Col,...,Title_Master,Title_Miss,Title_Mlle,Title_Mme,Title_Mr,Title_Mrs,Title_Ms,Title_Rev,Title_Sir,Age*Class
0,1,S,7.25,0,3,0,1,0.0,0,0,...,0,0,0,0,1,0,0,0,0,3
1,2,C,71.2833,0,1,1,1,1.0,0,0,...,0,0,0,0,0,1,0,0,0,2
2,1,S,7.925,0,3,1,0,1.0,0,0,...,0,1,0,0,0,0,0,0,0,3
3,2,S,53.1,0,1,1,1,1.0,0,0,...,0,0,0,0,0,1,0,0,0,2
4,2,S,8.05,0,3,0,0,0.0,0,0,...,0,0,0,0,1,0,0,0,0,6


# fill missing Embarked 

In [80]:
freq_port = train_df.Embarked.dropna().mode()[0]
for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].fillna(freq_port)


# Converting Embarked categorical feature to numeric

# try one hot encoding for Embarked categorical feature
2018/03/18 this is better than using converting categorical to numeric

In [81]:
# try one hote encoding for Embarked
#
# concat train and test data. and apply get_dummies for Title. 
# then split to original size. also drop Survived column from test_df
print(train_df.shape, test_df.shape)

# concat train and test data
#   test_df's Survived column is filled with NaN
train_test_df = pd.concat((train_df, test_df))

print(train_test_df.shape)

# apply get_dummies for Title
train_test_df = pd.get_dummies(train_test_df, columns=["Embarked"])

#train_test_df.head()
train_df = train_test_df.iloc[:train_df.shape[0]]
test_df = train_test_df.iloc[train_df.shape[0]:]

# drop added Survived column from test_df
test_df = test_df.drop("Survived", axis=1)
print(train_df.shape, test_df.shape)

combine = [train_df, test_df]

train_df.head()

(891, 27) (418, 27)
(1309, 28)
(891, 30) (418, 29)


Unnamed: 0,Age,Age*Class,Fare,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Title_Capt,...,Title_Mlle,Title_Mme,Title_Mr,Title_Mrs,Title_Ms,Title_Rev,Title_Sir,Embarked_C,Embarked_Q,Embarked_S
0,1,3,7.25,0,,3,0,1,0.0,0,...,0,0,1,0,0,0,0,0,0,1
1,2,2,71.2833,0,,1,1,1,1.0,0,...,0,0,0,1,0,0,0,1,0,0
2,1,3,7.925,0,,3,1,0,1.0,0,...,0,0,0,0,0,0,0,0,0,1
3,2,2,53.1,0,,1,1,1,1.0,0,...,0,0,0,1,0,0,0,0,0,1
4,2,6,8.05,0,,3,0,0,0.0,0,...,0,0,1,0,0,0,0,0,0,1


# fill na of test data Fare

In [82]:
test_df['Fare'].fillna(test_df['Fare'].dropna().median(), inplace=True)
test_df.head()

Unnamed: 0,Age,Age*Class,Fare,Parch,PassengerId,Pclass,Sex,SibSp,Title_Capt,Title_Col,...,Title_Mlle,Title_Mme,Title_Mr,Title_Mrs,Title_Ms,Title_Rev,Title_Sir,Embarked_C,Embarked_Q,Embarked_S
0,2,6,7.8292,0,892.0,3,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
1,2,6,7.0,0,893.0,3,1,1,0,0,...,0,0,0,1,0,0,0,0,0,1
2,3,6,9.6875,0,894.0,2,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
3,1,3,8.6625,0,895.0,3,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
4,1,3,12.2875,1,896.0,3,1,1,0,0,...,0,0,0,1,0,0,0,0,0,1


# make Fareband feature

In [83]:
train_df['FareBand'] = pd.qcut(train_df['Fare'], 4)

for dataset in combine:
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)

train_df = train_df.drop(['FareBand'], axis=1)

combine = [train_df, test_df]
    
train_df.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,Age,Age*Class,Fare,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Title_Capt,...,Title_Mlle,Title_Mme,Title_Mr,Title_Mrs,Title_Ms,Title_Rev,Title_Sir,Embarked_C,Embarked_Q,Embarked_S
0,1,3,0,0,,3,0,1,0.0,0,...,0,0,1,0,0,0,0,0,0,1
1,2,2,3,0,,1,1,1,1.0,0,...,0,0,0,1,0,0,0,1,0,0
2,1,3,1,0,,3,1,0,1.0,0,...,0,0,0,0,0,0,0,0,0,1
3,2,2,3,0,,1,1,1,1.0,0,...,0,0,0,1,0,0,0,0,0,1
4,2,6,1,0,,3,0,0,0.0,0,...,0,0,1,0,0,0,0,0,0,1
5,1,3,1,0,,3,0,0,0.0,0,...,0,0,1,0,0,0,0,0,1,0
6,3,3,3,0,,1,0,0,0.0,0,...,0,0,1,0,0,0,0,0,0,1
7,0,0,2,1,,3,0,3,0.0,0,...,0,0,0,0,0,0,0,0,0,1
8,1,3,1,2,,3,1,0,1.0,0,...,0,0,0,1,0,0,0,0,0,1
9,0,0,2,0,,2,1,1,1.0,0,...,0,0,0,1,0,0,0,1,0,0


## try more fare band number

- no difference

## keep Fare feature and add FareBand numerical feature¶

- not good result

----

# model and estimate

In [84]:
train_df.head()

Unnamed: 0,Age,Age*Class,Fare,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Title_Capt,...,Title_Mlle,Title_Mme,Title_Mr,Title_Mrs,Title_Ms,Title_Rev,Title_Sir,Embarked_C,Embarked_Q,Embarked_S
0,1,3,0,0,,3,0,1,0.0,0,...,0,0,1,0,0,0,0,0,0,1
1,2,2,3,0,,1,1,1,1.0,0,...,0,0,0,1,0,0,0,1,0,0
2,1,3,1,0,,3,1,0,1.0,0,...,0,0,0,0,0,0,0,0,0,1
3,2,2,3,0,,1,1,1,1.0,0,...,0,0,0,1,0,0,0,0,0,1
4,2,6,1,0,,3,0,0,0.0,0,...,0,0,1,0,0,0,0,0,0,1


In [85]:
train_df[train_df <  0].any()

Age               False
Age*Class         False
Fare              False
Parch             False
PassengerId       False
Pclass            False
Sex               False
SibSp             False
Survived          False
Title_Capt        False
Title_Col         False
Title_Countess    False
Title_Don         False
Title_Dona        False
Title_Dr          False
Title_Jonkheer    False
Title_Lady        False
Title_Major       False
Title_Master      False
Title_Miss        False
Title_Mlle        False
Title_Mme         False
Title_Mr          False
Title_Mrs         False
Title_Ms          False
Title_Rev         False
Title_Sir         False
Embarked_C        False
Embarked_Q        False
Embarked_S        False
dtype: bool

In [86]:
train_df["Parch"].value_counts()

0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: Parch, dtype: int64

----

# make train, test data set from train.csv

In [87]:
X_train, X_test, y_train, y_test = train_test_split(X_train_df, y_train_df, test_size=0.33, random_state=42)
print("train/test data shape", X_train.shape, X_test.shape)

train/test data shape (596, 28) (295, 28)


----
# for submit, copy the best model and export output csv


In [88]:
# grid search for pipeline
param_grid = {'svc__C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
              'svc__gamma': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]}

pipe = make_pipeline(RobustScaler(), SVC())

grid_search = GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=3)
grid_search.fit(X_train, y_train)

print("Mean cross-validated score of the best_estimator: ", grid_search.best_score_)
print("best parameters:", grid_search.best_params_)
print("test: ", grid_search.score(X_test, y_test))

Mean cross-validated score of the best_estimator:  0.8171140939597316
best parameters: {'svc__C': 10, 'svc__gamma': 0.01}
test:  0.8440677966101695


In [92]:
grid_search.fit(X_train_df, y_train_df)
print("Mean cross-validated score of the best_estimator: ", grid_search.best_score_)
print("best parameters:", grid_search.best_params_)


test_df_noid = test_df.drop("PassengerId", axis=1).copy()
y_pred = grid_search.predict(test_df_noid).astype(int)

submission = pd.DataFrame({
        "PassengerId": test_df["PassengerId"].astype(int),
        "Survived": y_pred
    })
submission.to_csv('../output/submission_robustscaler_svc.csv', index=False)

Mean cross-validated score of the best_estimator:  0.8316498316498316
best parameters: {'svc__C': 100, 'svc__gamma': 0.01}
