In [1]:
import pprint, itertools
import numpy as np
import pandas as pd
import scipy as sp
from sklearn import neighbors, metrics, model_selection
from matplotlib import pyplot as plt

In [2]:
train = pd.read_csv('data_preprocess_fill_train.csv', sep='|', index_col='PassengerId')
test = pd.read_csv('data_preprocess_fill_test.csv', sep='|', index_col='PassengerId')
survived_train = pd.read_csv('survived_train.csv', sep='|', index_col='PassengerId')
survived_test = pd.read_csv('survived_test.csv', sep='|', index_col='PassengerId')

In [3]:
train.shape[0], survived_train.shape[0]

(891, 891)

In [4]:
scores = dict()

In [5]:
train.head()

Unnamed: 0_level_0,Pclass,Name,Nickname,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Level,Bridge,Cabin
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,3,"Braund, Mr. Owen Harris",Braund,1,22,1,0,A/5 21171,7,2,F,F,F G73
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",Cumings,0,38,1,0,PC 17599,71,0,C,C8,C85
3,3,"Heikkinen, Miss. Laina",Heikkinen,0,26,0,0,STON/O2. 3101282,7,2,E,E1,E101
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",Futrelle,0,35,1,0,113803,53,2,C,C1,C123
5,3,"Allen, Mr. William Henry",Allen,1,35,0,0,373450,8,2,B,B5,B51 B53 B55


### <font color="brown">Preprocess Level</font>

In [6]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

le.fit(list(train['Level'].unique()))

print train['Level'].unique()
print le.classes_
print le.transform(train['Level'])[:10]
print le.transform(test['Level'])[:10] 
print (train['Level'] == le.inverse_transform(le.transform(train['Level']))).all()

new_train_level = le.transform(train['Level'])
new_test_level = le.transform(test['Level'])

['F' 'C' 'E' 'B' 'G' 'A' 'D' 'T']
['A' 'B' 'C' 'D' 'E' 'F' 'G' 'T']
[5 2 4 2 1 5 4 5 4 1]
[3 1 1 5 3 3 5 2 5 2]
True


In [7]:
train_index = train['Level'].index
test_index = test['Level'].index

train['Level'] = pd.Series(new_train_level, index=train_index)
test['Level'] = pd.Series(new_test_level, index=test_index)

train.head(), test.head()

(             Pclass                                               Name  \
 PassengerId                                                              
 1                 3                            Braund, Mr. Owen Harris   
 2                 1  Cumings, Mrs. John Bradley (Florence Briggs Th...   
 3                 3                             Heikkinen, Miss. Laina   
 4                 1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   
 5                 3                           Allen, Mr. William Henry   
 
               Nickname  Sex  Age  SibSp  Parch            Ticket  Fare  \
 PassengerId                                                              
 1               Braund    1   22      1      0         A/5 21171     7   
 2              Cumings    0   38      1      0          PC 17599    71   
 3            Heikkinen    0   26      0      0  STON/O2. 3101282     7   
 4             Futrelle    0   35      1      0            113803    53   
 5                Allen

### <font color="brown">Score du Classifieur KNN</font>

In [10]:
train_without_name_nickname_ticket_bridge_cabin = train.drop(['Name', 'Nickname', 'Ticket', 'Bridge', 'Cabin'], axis=1)

test_without_name_nickname_ticket_bridge_cabin = test.drop(['Name', 'Nickname', 'Ticket', 'Bridge', 'Cabin'], axis=1)

train_without_name_nickname_ticket_bridge_cabin.shape[0], survived_train.shape[0]

(891, 891)

In [11]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(train_without_name_nickname_ticket_bridge_cabin.values, survived_train.values.flatten())
score_without_name_nickname_ticket_bridge_cabin = neigh.score(test_without_name_nickname_ticket_bridge_cabin.values, survived_test.values.flatten())
print('score_without_name_nickname_ticket_bridge_cabin = ' + str(score_without_name_nickname_ticket_bridge_cabin))

scores['without_name_nickname_ticket_bridge_cabin'] = score_without_name_nickname_ticket_bridge_cabin

score_without_name_nickname_ticket_bridge_cabin = 0.645933014354


### <font color="brown">Meilleur score du Classifieur KNN par validation croisée stratifiée</font>

In [12]:
train_without_name_nickname_ticket_bridge_cabin.values

array([[ 3,  1, 22, ...,  7,  2,  5],
       [ 1,  0, 38, ..., 71,  0,  2],
       [ 3,  0, 26, ...,  7,  2,  4],
       ..., 
       [ 3,  0, 29, ..., 23,  2,  0],
       [ 1,  1, 26, ..., 30,  0,  2],
       [ 3,  1, 32, ...,  7,  1,  1]])

In [13]:
test_without_name_nickname_ticket_bridge_cabin.values

array([[  3.    ,   1.    ,  34.    , ...,   7.8292,   1.    ,   3.    ],
       [  3.    ,   0.    ,  47.    , ...,   7.    ,   2.    ,   1.    ],
       [  2.    ,   1.    ,  62.    , ...,   9.6875,   1.    ,   1.    ],
       ..., 
       [  3.    ,   1.    ,  38.    , ...,   7.25  ,   2.    ,   3.    ],
       [  3.    ,   1.    ,  30.    , ...,   8.05  ,   2.    ,   5.    ],
       [  3.    ,   1.    ,  30.    , ...,  22.3583,   0.    ,   2.    ]])

In [14]:
survived_train.values.flatten()

array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1,

In [15]:
print('\nClassifieur kNN par validation croisée stratifiée sur 5 folds avec optimisation de l\'hyperparamètre k')
# Choisir un score à optimiser, ici l'accuracy (proportion de prédictions correctes)
estimator = neighbors.KNeighborsClassifier()
param_grid={'n_neighbors':[3, 5, 7, 9, 11, 13, 15]}
score = 'accuracy'
# Créer un classifieur kNN avec recherche d'hyperparamètre par validation croisée
clf = model_selection.GridSearchCV(
	estimator, # un classifieur kNN
	param_grid, # hyperparamètres à tester
	cv=5, # nombre de folds de validation croisée
	scoring=score # score à optimiser
	)


Classifieur kNN par validation croisée stratifiée sur 5 folds avec optimisation de l'hyperparamètre k


In [16]:
clf.fit(train_without_name_nickname_ticket_bridge_cabin.values, survived_train.values.flatten())
survived_predict_test = clf.predict(test_without_name_nickname_ticket_bridge_cabin.values)

In [17]:
clf.best_params_

{'n_neighbors': 5}

In [18]:
clf.best_score_

0.70707070707070707

In [19]:
clf.cv_results_

{'mean_fit_time': array([ 0.00094185,  0.00046663,  0.00041981,  0.00050025,  0.00069122,
         0.0005302 ,  0.00065818]),
 'mean_score_time': array([ 0.00278521,  0.0013968 ,  0.00144343,  0.00163488,  0.00223799,
         0.0023181 ,  0.00212083]),
 'mean_test_score': array([ 0.70258137,  0.70707071,  0.69921437,  0.70482604,  0.70258137,
         0.70145903,  0.6969697 ]),
 'mean_train_score': array([ 0.83810835,  0.80556739,  0.78395983,  0.77021231,  0.76123339,
         0.74776442,  0.74130965]),
 'param_n_neighbors': masked_array(data = [3 5 7 9 11 13 15],
              mask = [False False False False False False False],
        fill_value = ?),
 'params': ({'n_neighbors': 3},
  {'n_neighbors': 5},
  {'n_neighbors': 7},
  {'n_neighbors': 9},
  {'n_neighbors': 11},
  {'n_neighbors': 13},
  {'n_neighbors': 15}),
 'rank_test_score': array([3, 1, 6, 2, 3, 5, 7], dtype=int32),
 'split0_test_score': array([ 0.63687151,  0.65363128,  0.63687151,  0.63128492,  0.62569832,
         0.

In [20]:
np.sum(survived_test.values.flatten() == survived_predict_test)

283

In [21]:
survived_predict_test.shape[0]

418

In [22]:
metrics.accuracy_score(survived_test.values.flatten(), survived_predict_test)

0.67703349282296654

In [23]:
283./418

0.6770334928229665

### <font color="brown">Transform Name Nickname to vector</font>

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
train['Nickname'] = count_vect.fit_transform(train['Nickname'])

In [25]:
train.head()

Unnamed: 0_level_0,Pclass,Name,Nickname,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Level,Bridge,Cabin
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,3,"Braund, Mr. Owen Harris","(0, 76)\t1\n (1, 145)\t1\n (2, 266)\t1\n ...",1,22,1,0,A/5 21171,7,2,5,F,F G73
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...","(0, 76)\t1\n (1, 145)\t1\n (2, 266)\t1\n ...",0,38,1,0,PC 17599,71,0,2,C8,C85
3,3,"Heikkinen, Miss. Laina","(0, 76)\t1\n (1, 145)\t1\n (2, 266)\t1\n ...",0,26,0,0,STON/O2. 3101282,7,2,4,E1,E101
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)","(0, 76)\t1\n (1, 145)\t1\n (2, 266)\t1\n ...",0,35,1,0,113803,53,2,2,C1,C123
5,3,"Allen, Mr. William Henry","(0, 76)\t1\n (1, 145)\t1\n (2, 266)\t1\n ...",1,35,0,0,373450,8,2,1,B5,B51 B53 B55


In [26]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
train['Name'] = count_vect.fit_transform(train['Name'])

In [27]:
train.head()

Unnamed: 0_level_0,Pclass,Name,Nickname,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Level,Bridge,Cabin
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,3,"(0, 580)\t1\n (0, 1096)\t1\n (0, 1012)\t1\...","(0, 76)\t1\n (1, 145)\t1\n (2, 266)\t1\n ...",1,22,1,0,A/5 21171,7,2,5,F,F G73
2,1,"(0, 580)\t1\n (0, 1096)\t1\n (0, 1012)\t1\...","(0, 76)\t1\n (1, 145)\t1\n (2, 266)\t1\n ...",0,38,1,0,PC 17599,71,0,2,C8,C85
3,3,"(0, 580)\t1\n (0, 1096)\t1\n (0, 1012)\t1\...","(0, 76)\t1\n (1, 145)\t1\n (2, 266)\t1\n ...",0,26,0,0,STON/O2. 3101282,7,2,4,E1,E101
4,1,"(0, 580)\t1\n (0, 1096)\t1\n (0, 1012)\t1\...","(0, 76)\t1\n (1, 145)\t1\n (2, 266)\t1\n ...",0,35,1,0,113803,53,2,2,C1,C123
5,3,"(0, 580)\t1\n (0, 1096)\t1\n (0, 1012)\t1\...","(0, 76)\t1\n (1, 145)\t1\n (2, 266)\t1\n ...",1,35,0,0,373450,8,2,1,B5,B51 B53 B55


### <font color="brown">Preprocess Bridge</font>

In [33]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

le.fit(list(train['Bridge'].unique())+list(test['Bridge'].unique()))

print train['Bridge'].unique()
print le.classes_
print le.transform(train['Bridge'])[:10]
print le.transform(test['Bridge'])[:10] 
print (train['Bridge'] == le.inverse_transform(le.transform(train['Bridge']))).all()

new_train_bridge = le.transform(train['Bridge'])
new_test_bridge = le.transform(test['Bridge'])

[43 23 34 16 11 45 37 44 10 47  2 25 30 35  4 17 13  0 46 28  9 20 27  8  1
 36  3 26 29 14 15  7  5 19 24 22 39 32 21 18 48 38 33 41 40 31 12]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48]
[43 23 34 16 11 45 37 44 34 10]
[25 11  8 43 28 28 43 16 42 16]
True


In [34]:
train_index = train['Bridge'].index
test_index = test['Bridge'].index

train['Bridge'] = pd.Series(new_train_bridge, index=train_index)
test['Bridge'] = pd.Series(new_test_bridge, index=test_index)

train.head(), test.head()

(             Pclass                                               Name  \
 PassengerId                                                              
 1                 3    (0, 580)\t1\n  (0, 1096)\t1\n  (0, 1012)\t1\...   
 2                 1    (0, 580)\t1\n  (0, 1096)\t1\n  (0, 1012)\t1\...   
 3                 3    (0, 580)\t1\n  (0, 1096)\t1\n  (0, 1012)\t1\...   
 4                 1    (0, 580)\t1\n  (0, 1096)\t1\n  (0, 1012)\t1\...   
 5                 3    (0, 580)\t1\n  (0, 1096)\t1\n  (0, 1012)\t1\...   
 
                                                       Nickname  Sex  Age  \
 PassengerId                                                                
 1              (0, 76)\t1\n  (1, 145)\t1\n  (2, 266)\t1\n  ...    1   22   
 2              (0, 76)\t1\n  (1, 145)\t1\n  (2, 266)\t1\n  ...    0   38   
 3              (0, 76)\t1\n  (1, 145)\t1\n  (2, 266)\t1\n  ...    0   26   
 4              (0, 76)\t1\n  (1, 145)\t1\n  (2, 266)\t1\n  ...    0   35   
 5         

In [35]:
train_without_ticket_cabin = train.drop(['Ticket', 'Cabin'], axis=1)

test_without_ticket_cabin = test.drop(['Ticket', 'Cabin'], axis=1)

train_without_ticket_cabin.shape[0], survived_train.shape[0]

(891, 891)

In [41]:
train_without_ticket_cabin.head()

Unnamed: 0_level_0,Pclass,Name,Nickname,Sex,Age,SibSp,Parch,Fare,Embarked,Level,Bridge
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,3,"(0, 580)\t1\n (0, 1096)\t1\n (0, 1012)\t1\...","(0, 76)\t1\n (1, 145)\t1\n (2, 266)\t1\n ...",1,22,1,0,7,2,5,43
2,1,"(0, 580)\t1\n (0, 1096)\t1\n (0, 1012)\t1\...","(0, 76)\t1\n (1, 145)\t1\n (2, 266)\t1\n ...",0,38,1,0,71,0,2,23
3,3,"(0, 580)\t1\n (0, 1096)\t1\n (0, 1012)\t1\...","(0, 76)\t1\n (1, 145)\t1\n (2, 266)\t1\n ...",0,26,0,0,7,2,4,34
4,1,"(0, 580)\t1\n (0, 1096)\t1\n (0, 1012)\t1\...","(0, 76)\t1\n (1, 145)\t1\n (2, 266)\t1\n ...",0,35,1,0,53,2,2,16
5,3,"(0, 580)\t1\n (0, 1096)\t1\n (0, 1012)\t1\...","(0, 76)\t1\n (1, 145)\t1\n (2, 266)\t1\n ...",1,35,0,0,8,2,1,11


In [40]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(train_without_ticket_cabin.values, survived_train.values.flatten())

ValueError: setting an array element with a sequence.

In [37]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(train_without_ticket_cabin.values, survived_train.values.flatten())

ValueError: setting an array element with a sequence.

In [36]:
score_without_ticket_cabin = neigh.score(test_without_ticket_cabin.values, survived_test.values.flatten())
print('score_without_name_nickname_ticket_bridge_cabin = ' + str(score_without_ticket_cabin))

scores['without_name_nickname_ticket_bridge_cabin'] = score_without_ticket_cabin

ValueError: setting an array element with a sequence.