In [1]:
import pprint, itertools
import numpy as np
import pandas as pd
import scipy as sp
from sklearn import neighbors, metrics, model_selection
from matplotlib import pyplot as plt

In [2]:
# Load the feature tables (the `clean_*` files) and their survival labels.
# Every file is pipe-delimited and indexed by PassengerId.
CSV_OPTIONS = dict(sep='|', index_col='PassengerId')
train = pd.read_csv('clean_train_embarked_age_drop_cabin_bridge_name_ticket.csv', **CSV_OPTIONS)
test = pd.read_csv('clean_test_embarked_age_drop_cabin_bridge_name_ticket.csv', **CSV_OPTIONS)
survived_train = pd.read_csv('survived_train_final.csv', **CSV_OPTIONS)
survived_test = pd.read_csv('survived_test_final.csv', **CSV_OPTIONS)

In [3]:
# Accumulates one accuracy per experiment, keyed by a descriptive label.
scores = {}

### <font color="brown">Set without sex and Embarked Drop cabin name ticket bridge</font>

In [4]:
# Feature subset that leaves out Sex and Embarked (both are still
# string-valued at this point in the notebook).
columns_without_sex_embarked = ['Age', 'Pclass', 'SibSp', 'Parch', 'Fare']
train_without_sex_embarked = train[columns_without_sex_embarked]
test_without_sex_embarked = test[columns_without_sex_embarked]

In [5]:
# Frame shape next to the label row count. Not rendered: a notebook cell only
# shows the value of its last expression.
train_without_sex_embarked.shape, survived_train.shape[0]

# Tuple of the raw array and the bound `keys` method (the method is not
# called). Also discarded.
train_without_sex_embarked.values, train_without_sex_embarked.keys

# Last expression of the cell, so this is what appears as the output.
type(train_without_sex_embarked)

pandas.core.frame.DataFrame

### <font color="brown">Score du Classifieur KNN</font>

In [6]:
from sklearn.neighbors import KNeighborsClassifier

# 3-nearest-neighbour baseline on the subset without Sex and Embarked.
# `fit` returns the estimator itself, so creation and training are chained.
neigh = KNeighborsClassifier(n_neighbors=3).fit(
    train_without_sex_embarked.values,
    survived_train.values.flatten(),
)
score_without_sex_embarked = neigh.score(
    test_without_sex_embarked.values,
    survived_test.values.flatten(),
)
print('score_without_sex_embarked = %s' % score_without_sex_embarked)

# Keep the accuracy for the final comparison table.
scores['Drop cabin name ticket bridge Without sex embarked'] = score_without_sex_embarked

score_without_sex_embarked = 0.570996978852


In [7]:
# Store the accuracy under its experiment label. The previous cell already
# ends with this same assignment, so this one is redundant but harmless.
scores['Drop cabin name ticket bridge Without sex embarked'] = score_without_sex_embarked

### <font color="brown">Set only with Age and Pclass</font>

In [8]:
# Smallest feature subset: only Age and Pclass.
columns_age_pclass = ['Age', 'Pclass']
train_age_pclass = train[columns_age_pclass]
test_age_pclass = test[columns_age_pclass]

# Inspection expressions; only the final one is rendered as the cell output.
(train_age_pclass.shape, survived_train.shape[0])

(train_age_pclass.values, train_age_pclass.keys)
type(train_age_pclass)

pandas.core.frame.DataFrame

### <font color="brown">Score du Classifieur KNN</font>

In [9]:
from sklearn.neighbors import KNeighborsClassifier

# 3-nearest-neighbour classifier restricted to Age and Pclass.
# `fit` returns the estimator itself, so creation and training are chained.
neigh = KNeighborsClassifier(n_neighbors=3).fit(
    train_age_pclass.values,
    survived_train.values.flatten(),
)
score_age_pclass = neigh.score(
    test_age_pclass.values,
    survived_test.values.flatten(),
)
print('score_age_pclass = %s' % score_age_pclass)

score_age_pclass = 0.546827794562


In [10]:
# Store the Age/Pclass accuracy in the shared `scores` dict.
scores['Drop cabin name ticket bridge only age pclass'] = score_age_pclass

In [11]:
# Pretty-print the accuracies collected so far.
pprint.pprint(scores)

{'Drop cabin name ticket bridge Without sex embarked': 0.57099697885196377,
 'Drop cabin name ticket bridge only age pclass': 0.54682779456193353}


### <font color="brown">Preprocess Sex</font>

In [12]:
from sklearn import preprocessing

# Map the Sex strings to integer codes. LabelEncoder sorts its classes, so
# 'female' becomes 0 and 'male' becomes 1 (see the printed `classes_`).
le = preprocessing.LabelEncoder()
le.fit(['male', 'female'])

# Encode the training column once and reuse it for the checks below.
new_train_sex = le.transform(train['Sex'])

# Single-argument print(...) calls behave the same on Python 2 and Python 3.
print(le.classes_)
print(new_train_sex[:10])
# Round-trip check: decoding the codes must give back the original strings.
# The extra parentheses matter: on Python 3, `print (a == b).all()` would call
# `.all()` on the return value of print.
print((train['Sex'] == le.inverse_transform(new_train_sex)).all())

new_test_sex = le.transform(test['Sex'])

['female' 'male']
[1 0 0 0 1 1 1 0 0 0]
True


In [13]:
# A column shares its frame's index, so take the index from the frame itself.
train_index = train.index
test_index = test.index

# Replace the string column with its integer codes, aligned on PassengerId.
# This overwrites Sex in place: re-running the encoding cell after this one
# would feed integers to an encoder fitted on strings.
train['Sex'] = pd.Series(new_train_sex, index=train_index)
test['Sex'] = pd.Series(new_test_sex, index=test_index)

(train.head(), test.head())

(             Pclass  Sex  Age  SibSp  Parch  Fare Embarked
 PassengerId                                               
 1                 3    1   22      1      0     7        S
 2                 1    0   38      1      0    71        C
 3                 3    0   26      0      0     7        S
 4                 1    0   35      1      0    53        S
 5                 3    1   35      0      0     8        S,
              Pclass  Sex  Age  SibSp  Parch  Fare Embarked
 PassengerId                                               
 892               3    1   34      0      0     7        Q
 893               3    0   47      1      0     7        S
 894               2    1   62      0      0     9        Q
 895               3    1   27      0      0     8        S
 896               3    0   22      1      1    12        S)

### <font color="brown">Preprocess Embarked</font>

In [14]:
# Re-fit the same encoder on the Embarked codes; the earlier Sex mapping held
# by `le` is replaced. Classes are sorted, giving C=0, Q=1, S=2.
le.fit(['S', 'C', 'Q'])

# Single-argument print(...) calls behave the same on Python 2 and Python 3.
print(train['Embarked'].unique())
print(le.classes_)

# Encode each column once and reuse the result for the previews below.
new_train_embarked = le.transform(train['Embarked'])
print(new_train_embarked[:10])
new_test_embarked = le.transform(test['Embarked'])
print(new_test_embarked[:10])

# Round-trip check: decoding the codes must give back the original strings.
print((train['Embarked'] == le.inverse_transform(new_train_embarked)).all())

['S' 'C' 'Q']
['C' 'Q' 'S']
[2 0 2 2 2 2 2 2 0 2]
[1 2 1 2 2 2 1 2 0 2]
True


In [15]:
# Overwrite Embarked with its integer codes, aligned on the frame index
# (a column shares the index of its frame).
train_index = train.index
train['Embarked'] = pd.Series(new_train_embarked, index=train_index)
# Not rendered: only the last expression of a cell is displayed.
train.head()

test_index = test.index
test['Embarked'] = pd.Series(new_test_embarked, index=test_index)
test.head()

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
892,3,1,34,0,0,7,1
893,3,0,47,1,0,7,2
894,2,1,62,0,0,9,1
895,3,1,27,0,0,8,2
896,3,0,22,1,1,12,2


In [16]:
# First rows of `train`, now that Sex and Embarked hold integer codes.
train.head()

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,3,1,22,1,0,7,2
2,1,0,38,1,0,71,0
3,3,0,26,0,0,7,2
4,1,0,35,1,0,53,2
5,3,1,35,0,0,8,2


### <font color="brown">Set without embarked</font>

In [17]:
# Every feature except Embarked (Sex is integer-encoded by now).
columns_without_embarked = ['Age', 'Pclass', 'SibSp', 'Parch', 'Fare', 'Sex']
train_without_embarked = train[columns_without_embarked]
test_without_embarked = test[columns_without_embarked]

# Inspection expressions; only the final one is rendered as the cell output.
train_without_embarked.shape, survived_train.shape[0]

train_without_embarked.values, train_without_embarked.keys
# Fixed copy-paste slip: this used to inspect `train_age_pclass` instead of
# the frame built in this cell.
type(train_without_embarked)

pandas.core.frame.DataFrame

In [18]:
# Preview of the subset without Embarked.
train_without_embarked.head()

Unnamed: 0_level_0,Age,Pclass,SibSp,Parch,Fare,Sex
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,22,3,1,0,7,1
2,38,1,1,0,71,0
3,26,3,0,0,7,0
4,35,1,1,0,53,0
5,35,3,0,0,8,1


In [19]:
from sklearn.neighbors import KNeighborsClassifier
# Fresh, unfitted 3-neighbour classifier; it is fitted in the following cell.
neigh = KNeighborsClassifier(n_neighbors=3)
# Cell output: the raw feature matrix that will be passed to `fit`.
train_without_embarked.values

array([[22,  3,  1,  0,  7,  1],
       [38,  1,  1,  0, 71,  0],
       [26,  3,  0,  0,  7,  0],
       ..., 
       [19,  1,  0,  0, 30,  0],
       [26,  1,  0,  0, 30,  1],
       [32,  3,  0,  0,  7,  1]])

In [20]:
# Train on every feature except Embarked, then score on the matching test
# columns (`score` reports mean accuracy).
neigh.fit(
    train_without_embarked.values,
    survived_train.values.flatten(),
)
score_without_embarked = neigh.score(
    test_without_embarked.values,
    survived_test.values.flatten(),
)
print('score_without_embarked = %s' % score_without_embarked)

score_without_embarked = 0.622356495468


In [21]:
# Store the accuracy of the "all features except Embarked" run.
scores['Drop cabin name ticket bridge without embarked'] = score_without_embarked

### <font color="brown">Set without sex</font>

In [22]:
# Every feature except Sex (Embarked is integer-encoded by now).
columns_without_sex = ['Age', 'Pclass', 'SibSp', 'Parch', 'Fare', 'Embarked']
train_without_sex = train[columns_without_sex]
test_without_sex = test[columns_without_sex]

# Inspection expressions; only the final one is rendered as the cell output.
train_without_sex.shape, survived_train.shape[0]

train_without_sex.values, train_without_sex.keys
# Fixed copy-paste slip: this used to inspect `train_age_pclass` instead of
# the frame built in this cell.
type(train_without_sex)

pandas.core.frame.DataFrame

In [23]:
# Preview of the subset without Sex.
train_without_sex.head()

Unnamed: 0_level_0,Age,Pclass,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,22,3,1,0,7,2
2,38,1,1,0,71,0
3,26,3,0,0,7,2
4,35,1,1,0,53,2
5,35,3,0,0,8,2


In [24]:
from sklearn.neighbors import KNeighborsClassifier
# Fresh, unfitted 3-neighbour classifier; it is fitted in the following cell.
neigh = KNeighborsClassifier(n_neighbors=3)
# Cell output: the raw feature matrix that will be passed to `fit`.
train_without_sex.values

array([[22,  3,  1,  0,  7,  2],
       [38,  1,  1,  0, 71,  0],
       [26,  3,  0,  0,  7,  2],
       ..., 
       [19,  1,  0,  0, 30,  2],
       [26,  1,  0,  0, 30,  0],
       [32,  3,  0,  0,  7,  1]])

In [25]:
# Train on every feature except Sex, then score on the matching test columns
# (`score` reports mean accuracy).
neigh.fit(
    train_without_sex.values,
    survived_train.values.flatten(),
)
score_without_sex = neigh.score(
    test_without_sex.values,
    survived_test.values.flatten(),
)
print('score_without_sex = %s' % score_without_sex)

score_without_sex = 0.577039274924


In [26]:
# Store the accuracy of the "all features except Sex" run.
scores['Drop cabin name ticket bridge without sex'] = score_without_sex

### <font color="brown">Set with all</font>

In [27]:
# Reminder of the full feature frame before fitting on all columns.
train.head()

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,3,1,22,1,0,7,2
2,1,0,38,1,0,71,0
3,3,0,26,0,0,7,2
4,1,0,35,1,0,53,2
5,3,1,35,0,0,8,2


In [28]:
# All seven features, reordered so that Age comes first.
columns_all = ['Age', 'Pclass', 'SibSp', 'Parch', 'Fare', 'Sex', 'Embarked']
train_age = train[columns_all]
test_age = test[columns_all]

# Inspection expressions; only the final one is rendered as the cell output.
(train_age.shape, survived_train.shape[0])

(train_age.values, train_age.keys)
type(train_age)

pandas.core.frame.DataFrame

In [29]:
from sklearn.neighbors import KNeighborsClassifier
# Fresh, unfitted 3-neighbour classifier; it is fitted in the following cell.
neigh = KNeighborsClassifier(n_neighbors=3)
# Cell output: the full `train` frame in its own column order (Pclass first),
# not the reordered `train_age` frame.
train.values

array([[ 3,  1, 22, ...,  0,  7,  2],
       [ 1,  0, 38, ...,  0, 71,  0],
       [ 3,  0, 26, ...,  0,  7,  2],
       ..., 
       [ 1,  0, 19, ...,  0, 30,  2],
       [ 1,  1, 26, ...,  0, 30,  0],
       [ 3,  1, 32, ...,  0,  7,  1]])

In [30]:
# Fit on every column of the encoded frames (`train`/`test` are used directly
# here) and report the accuracy on the test set.
neigh.fit(
    train.values,
    survived_train.values.flatten(),
)
score_drop = neigh.score(
    test.values,
    survived_test.values.flatten(),
)
print('score_drop = %s' % score_drop)

score_drop = 0.604229607251


In [31]:
# Store the accuracy of the run that uses all features.
scores['Drop cabin name ticket bridge'] = score_drop

In [32]:
# Pretty-print the five accuracies collected above for side-by-side comparison.
pprint.pprint(scores)

{'Drop cabin name ticket bridge': 0.60422960725075525,
 'Drop cabin name ticket bridge Without sex embarked': 0.57099697885196377,
 'Drop cabin name ticket bridge only age pclass': 0.54682779456193353,
 'Drop cabin name ticket bridge without embarked': 0.62235649546827798,
 'Drop cabin name ticket bridge without sex': 0.57703927492447127}


## <font color="purple">KNN Classifier sur toutes les sous-Combinaisons train</font>

In [33]:
# Column names available for building feature combinations.
train.columns.values

array(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype=object)

In [34]:
# Preview what itertools.combinations yields: every pair of columns, then the
# single combination that contains all of them.
liste = list(train.columns.values)
# '%d %s' gives the same text as the Python 2 statement `print i, list(e)`
# and also runs on Python 3.
for i, e in enumerate(itertools.combinations(liste, 2)):
    print('%d %s' % (i, list(e)))
# Use the real column count instead of a hard-coded 7.
for i, e in enumerate(itertools.combinations(liste, len(liste))):
    print('%d %s' % (i, list(e)))
print(len(liste))

0 ['Pclass', 'Sex']
1 ['Pclass', 'Age']
2 ['Pclass', 'SibSp']
3 ['Pclass', 'Parch']
4 ['Pclass', 'Fare']
5 ['Pclass', 'Embarked']
6 ['Sex', 'Age']
7 ['Sex', 'SibSp']
8 ['Sex', 'Parch']
9 ['Sex', 'Fare']
10 ['Sex', 'Embarked']
11 ['Age', 'SibSp']
12 ['Age', 'Parch']
13 ['Age', 'Fare']
14 ['Age', 'Embarked']
15 ['SibSp', 'Parch']
16 ['SibSp', 'Fare']
17 ['SibSp', 'Embarked']
18 ['Parch', 'Fare']
19 ['Parch', 'Embarked']
20 ['Fare', 'Embarked']
0 ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
7


In [35]:
def scores_samples(max_neighbors, train_global, test_global, survived_train_global, survived_test_global):
    """Score a KNN classifier on every feature subset of two or more columns.

    For each combination of at least two columns of ``train_global`` and each
    neighbour count ``n`` in ``range(3, max_neighbors)``, a
    ``KNeighborsClassifier`` is fitted on the training subset and scored on
    the same columns of ``test_global``.

    Parameters
    ----------
    max_neighbors : int
        Exclusive upper bound for ``n_neighbors``. Values of 3 or less
        evaluate nothing and produce an empty dict.
    train_global, test_global : pandas.DataFrame
        Feature tables. ``test_global`` is indexed with the column names of
        ``train_global``, so it must contain all of them.
    survived_train_global, survived_test_global : pandas objects
        Label tables; they are read through ``.values.flatten()``.

    Returns
    -------
    dict
        Maps ``(size, columns, n)`` to the value returned by
        ``KNeighborsClassifier.score``, where ``columns`` is the tuple of
        column names and ``size`` is its length.
    """
    # The labels do not depend on the subset or on n: flatten them only once.
    labels_train = survived_train_global.values.flatten()
    labels_test = survived_test_global.values.flatten()

    scores = dict()
    columns = list(train_global.columns.values)
    for size in range(2, len(columns) + 1):
        for sample in itertools.combinations(columns, size):
            selected = list(sample)
            # Extract the arrays once per subset rather than once per n.
            features_train = train_global[selected].values
            features_test = test_global[selected].values
            for n in range(3, max_neighbors):
                neigh = KNeighborsClassifier(n_neighbors=n)
                neigh.fit(features_train, labels_train)
                scores[size, sample, n] = neigh.score(features_test, labels_test)
    return scores

In [36]:
# Bundle the four data sets so they can be star-unpacked into scores_samples.
all_sets = (train, test, survived_train, survived_test)
# With max_neighbors=4 only n_neighbors=3 is evaluated, on every column subset.
scores_samples(4, *all_sets)

{(2, ('Age', 'Embarked'), 3): 0.54380664652567978,
 (2, ('Age', 'Fare'), 3): 0.59818731117824775,
 (2, ('Age', 'Parch'), 3): 0.51359516616314205,
 (2, ('Age', 'SibSp'), 3): 0.54078549848942603,
 (2, ('Fare', 'Embarked'), 3): 0.5226586102719033,
 (2, ('Parch', 'Embarked'), 3): 0.61329305135951662,
 (2, ('Parch', 'Fare'), 3): 0.58912386706948638,
 (2, ('Pclass', 'Age'), 3): 0.54682779456193353,
 (2, ('Pclass', 'Embarked'), 3): 0.58610271903323263,
 (2, ('Pclass', 'Fare'), 3): 0.55891238670694865,
 (2, ('Pclass', 'Parch'), 3): 0.63141993957703924,
 (2, ('Pclass', 'Sex'), 3): 1.0,
 (2, ('Pclass', 'SibSp'), 3): 0.52870090634441091,
 (2, ('Sex', 'Age'), 3): 0.78247734138972813,
 (2, ('Sex', 'Embarked'), 3): 0.7643504531722054,
 (2, ('Sex', 'Fare'), 3): 0.86404833836858008,
 (2, ('Sex', 'Parch'), 3): 0.99697885196374625,
 (2, ('Sex', 'SibSp'), 3): 0.98187311178247738,
 (2, ('SibSp', 'Embarked'), 3): 0.59214501510574014,
 (2, ('SibSp', 'Fare'), 3): 0.54682779456193353,
 (2, ('SibSp', 'Parch'),

## <font color="purple">Pourquoi score = 1 ?? </font>

In [37]:
# Evaluate the grid once and reuse it; it was previously recomputed three times.
sample_scores = scores_samples(4, *all_sets)
# list(...) is needed on Python 3, where dict.values() is a view that
# np.array would wrap as a single object instead of an array of scores.
score_values = np.array(list(sample_scores.values()))
mean_scores = score_values.mean()
max_scores = score_values.max()
# Same text as the Python 2 statement `print mean_scores, max_scores`.
print('%s %s' % (mean_scores, max_scores))
# .items() works on Python 2 and 3 (iteritems is Python 2 only).
for key, value in sample_scores.items():
    # Both checks are kept on purpose: when the best score is exactly 1 a
    # subset matches both and is printed twice, as in the recorded output.
    if value == 1:
        print(key)
    if value == max_scores:
        print(key)

0.673011077543 1.0
(2, ('Pclass', 'Sex'), 3)
(2, ('Pclass', 'Sex'), 3)
(3, ('Pclass', 'Sex', 'Embarked'), 3)
(3, ('Pclass', 'Sex', 'Embarked'), 3)


In [38]:
# Reproduce in isolation the subset that reached a perfect score in the grid.
columns_sex_pclass = ['Sex', 'Pclass']
train_sex_pclass = train[columns_sex_pclass]
test_sex_pclass = test[columns_sex_pclass]

from sklearn.neighbors import KNeighborsClassifier
# `fit` returns the estimator itself, so creation and training are chained.
neigh = KNeighborsClassifier(n_neighbors=3).fit(
    train_sex_pclass.values,
    survived_train.values.flatten(),
)
score_sex_pclass = neigh.score(
    test_sex_pclass.values,
    survived_test.values.flatten(),
)
print('score_sex_pclass = %s' % score_sex_pclass)

score_sex_pclass = 1.0


In [39]:
# Confirm the training subset is a DataFrame before concatenating it below.
type(train_sex_pclass)

pandas.core.frame.DataFrame

In [40]:
# Same check for the test subset.
type(test_sex_pclass)

pandas.core.frame.DataFrame

In [41]:
# Stack train and test rows for the labels, then for the two features, and
# put the Survived column next to the features (aligned on the index).
survived_sex_pclass = pd.concat([survived_train, survived_test])
sex_pclass = pd.concat(
    [pd.concat([train_sex_pclass, test_sex_pclass]), survived_sex_pclass],
    axis=1,
)

In [42]:
# Preview the combined Sex / Pclass / Survived table.
sex_pclass.head()

Unnamed: 0_level_0,Sex,Pclass,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,3,0
2,0,1,1
3,0,3,1
4,0,1,1
5,1,3,0


In [43]:
# Count rows that repeat an earlier (Sex, Pclass, Survived) row (True)
# against first occurrences (False).
sex_pclass.duplicated().value_counts()

True     1031
False      12
dtype: int64

#### <font color="blue">Le score = 1 car le data set restreint aux colonnes Sex et Pclass, avec sa sortie Survived, ne forme qu'un ensemble de 12 combinaisons possibles, et celles-ci se répètent forcément !</font>

In [44]:
# Number of passengers for each (Sex, Pclass, Survived) combination.
sex_pclass.groupby(['Sex', 'Pclass', 'Survived']).size()

Sex  Pclass  Survived
0    1       0             3
             1           128
     2       0             6
             1            97
     3       0            55
             1            97
1    1       0           111
             1            40
     2       0           143
             1            15
     3       0           310
             1            38
dtype: int64