In [109]:
import pprint, itertools
import numpy as np
import pandas as pd
import scipy as sp
from sklearn import neighbors, metrics, model_selection
from matplotlib import pyplot as plt

In [110]:
survived_train = pd.read_csv('survived_train.csv', sep='|', index_col='PassengerId')
survived_test = pd.read_csv('survived_test.csv', sep='|', index_col='PassengerId')
train = pd.read_csv('data_train.csv', sep='|', index_col='PassengerId')
test = pd.read_csv('data_test.csv', sep='|', index_col='PassengerId')

In [111]:
survived_train.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
1,0
2,1
3,1
4,1
5,0


In [112]:
survived_test.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,1
894,0
895,0
896,1


In [113]:
train.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Bridge
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,C8
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,C1
5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,


In [114]:
test.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Bridge
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,


# <font color="red">Vii) Modelisation</font>

### <font color="purple">Impact des supressions des nan</font>
#### <font color="blue">Supprimons les lignes de train qui contiennent des NaN</font>
#### <font color="blue">Cela concerne Age, Embarked, Cabin</font> 

In [115]:
clean_train_embarked = train[train['Embarked'].notnull()]

str(round(float(clean_train_embarked.shape[0])/train.shape[0]*100)) + ' %'

'100.0 %'

In [116]:
clean_test_embarked = test[test['Embarked'].notnull()]

str(round(float(clean_test_embarked.shape[0])/test.shape[0]*100)) + ' %'

'100.0 %'

#### <font color="blue">Le nettoyage avec le Embarked ne supprime pas beaucoup de données.</font>

In [117]:
clean_train_age = train[train['Age'].notnull()]

str(round(float(clean_train_age.shape[0])/train.shape[0]*100)) + ' %'

'80.0 %'

In [118]:
clean_test_age = test[test['Age'].notnull()]

str(round(float(clean_test_age.shape[0])/test.shape[0]*100)) + ' %'

'79.0 %'

#### <font color="blue">Le nettoyage avec l'âge supprime tout de même 20% des données</font> 

In [119]:
clean_train_cabin = train[train['Cabin'].notnull()]

str(round(float(clean_train_cabin.shape[0])/train.shape[0]*100)) + ' %'

'23.0 %'

In [120]:
clean_test_cabin = test[test['Cabin'].notnull()]

float(clean_test_cabin.shape[0])/test.shape[0]*100

21.770334928229666

#### <font color="blue">Le nettoyage avec Cabin ne garde que 20% des données</font>

## <font color="purple">Nettoyage</font>

### <font color="brown">Clean avec Age et Embarked</font>

In [121]:
clean_train_embarked_age = train[train['Age'].notnull() & train['Embarked'].notnull()]

str(round(float(clean_train_embarked_age.shape[0])/train.shape[0]*100)) + ' %'

'80.0 %'

In [122]:
clean_test_embarked_age = test[test['Age'].notnull() & test['Embarked'].notnull()]

str(round(float(clean_test_embarked_age.shape[0])/test.shape[0]*100)) + ' %'

'79.0 %'

#### <font color="blue">Le nettoyage du train avec l'age et l'embarked correspond au nettoyage du train avec l'age uniquement</font> 
#### <font color="blue">Les nan embarked sont donc quasiment tous inclus dans les nan de l'age</font> 

In [123]:
index = pd.Series(clean_train_embarked.index.isin(clean_train_age.index))

index.value_counts()/index.shape[0]*100

True     80.089989
False    19.910011
dtype: float64

In [124]:
index = pd.Series(clean_test_embarked.index.isin(clean_test_age.index))

index.value_counts()/index.shape[0]*100

True     79.425837
False    20.574163
dtype: float64

#### <font color="blue">oui les nan embarked sont à 80% inclus dans les nan Age</font> 
#### <font color="blue">on va donc conserver ce nettoyage et on verra plutard pour les numéros de cabines</font> 

In [125]:
clean_train_embarked_age.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Bridge
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,C8
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,C1
5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,


### <font color="brown">Drop avec Cabin, Bridge, Name et Ticket</font>

In [126]:
train['Bridge'] = train['Cabin'].str.slice(0,2)
test['Bridge'] = test['Cabin'].str.slice(0,2)

In [127]:
clean_train_embarked_age = clean_train_embarked_age.drop(['Cabin', 'Bridge'], axis=1)