# Faire du Stacking avec Titanic 

On va faire notre premier stacking ensemble. Voyons comment cela peut améliorer notre score sur le dataset du Titanic 

1. Importez les librairies usuelles 

In [14]:
import numpy as np
import pandas as pd 
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns

2. Importez le dataset dans un DataFrame 

In [15]:
dataset = pd.read_csv("titanic.csv")

dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


3. Remplacez la colonne 

```name```
```Ticket```

Par le nombre de caractères qu'il y a dans la cellule 

In [16]:
dataset["Ticket"] = dataset["Ticket"].apply(len)
dataset["Name"] = dataset["Name"].apply(len)

dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,23,male,22.0,1,0,9,7.25,,S
1,2,1,1,51,female,38.0,1,0,8,71.2833,C85,C
2,3,1,3,22,female,26.0,0,0,16,7.925,,S
3,4,1,1,44,female,35.0,1,0,6,53.1,C123,S
4,5,0,3,24,male,35.0,0,0,6,8.05,,S


4. Créez une nouvelle colonne ```FamilySize``` qui sera la somme des colonnes  ```SibSp``` & ```Parch```

In [17]:
dataset["FamilySize"] = dataset["SibSp"] + dataset["Parch"]
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize
0,1,0,3,23,male,22.0,1,0,9,7.25,,S,1
1,2,1,1,51,female,38.0,1,0,8,71.2833,C85,C,1
2,3,1,3,22,female,26.0,0,0,16,7.925,,S,0
3,4,1,1,44,female,35.0,1,0,6,53.1,C123,S,1
4,5,0,3,24,male,35.0,0,0,6,8.05,,S,0


5. Créez une nouvelle colonne ```IsAlone``` qui indique si la personne a une famille ou non 

In [20]:
dataset["IsAlone"] = dataset["FamilySize"].apply(lambda x:0 if x>0 else 1)
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,IsAlone
0,1,0,3,23,male,22.0,1,0,9,7.25,NoCabin,S,1,0
1,2,1,1,51,female,38.0,1,0,8,71.2833,C,C,1,0
2,3,1,3,22,female,26.0,0,0,16,7.925,NoCabin,S,0,1
3,4,1,1,44,female,35.0,1,0,6,53.1,C,S,1,0
4,5,0,3,24,male,35.0,0,0,6,8.05,NoCabin,S,0,1


6. Dans la colonne ```Cabin``` Faites en sorte de ne garder uniquement la première lettre de la cabine

In [21]:
dataset["Cabin"] = dataset["Cabin"].apply(lambda x:x[0] if type(x) == str else "NoCabin")
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,IsAlone
0,1,0,3,23,male,22.0,1,0,9,7.25,N,S,1,0
1,2,1,1,51,female,38.0,1,0,8,71.2833,C,C,1,0
2,3,1,3,22,female,26.0,0,0,16,7.925,N,S,0,1
3,4,1,1,44,female,35.0,1,0,6,53.1,C,S,1,0
4,5,0,3,24,male,35.0,0,0,6,8.05,N,S,0,1


7. Faites une interpolation linéaire pour remplacer les valeurs manquantes dans la colonne ```Age```

In [22]:
dataset["Age"] = dataset["Age"].interpolate()
dataset.isnull().any()

PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age            False
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin          False
Embarked        True
FamilySize     False
IsAlone        False
dtype: bool

8. Comptez le nombre de NaN qu'il y a dans la colonne Embarked et adoptez la stratégie qui vous semble le plus adapté pour gérer ces valeurs 

In [23]:
dataset.isnull().sum()/len(dataset)

PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
Age            0.000000
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
Cabin          0.000000
Embarked       0.002245
FamilySize     0.000000
IsAlone        0.000000
dtype: float64

In [24]:
dataset = dataset.dropna()

In [25]:
dataset.isnull().any()

PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age            False
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin          False
Embarked       False
FamilySize     False
IsAlone        False
dtype: bool

9. Dummyfiez les variables catégoriques 

In [26]:
dataset = pd.get_dummies(dataset, drop_first=True)
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,FamilySize,...,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_N,Cabin_T,Embarked_Q,Embarked_S
0,1,0,3,23,22.0,1,0,9,7.25,1,...,0,0,0,0,0,0,1,0,0,1
1,2,1,1,51,38.0,1,0,8,71.2833,1,...,0,1,0,0,0,0,0,0,0,0
2,3,1,3,22,26.0,0,0,16,7.925,0,...,0,0,0,0,0,0,1,0,0,1
3,4,1,1,44,35.0,1,0,6,53.1,1,...,0,1,0,0,0,0,0,0,0,1
4,5,0,3,24,35.0,0,0,6,8.05,0,...,0,0,0,0,0,0,1,0,0,1


10. Séparez votre dataset en X & y qui sont respectivement les variables explicatives et la variable cible 

In [27]:
X = dataset.iloc[:,2:]
y = dataset.iloc[:, dataset.columns=="Survived"]

11. Faites un train_test_split

In [28]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y)

12. Normalisez les données

In [29]:


from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
X_train = sc_x.fit_transform(X_train)
X_test = sc_x.transform(X_test)

13. Faites une première prédiction grâce à une régression logistique et regardez votre score 

In [30]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
print(classifier.score(X_train, y_train))
print(classifier.score(X_test, y_test))

0.8048048048048048
0.8251121076233184


14. Faites une prédiction pour tout votre dataset X et concaténez ces prédiction dans un dataframe qu'on appellera X_new 

In [40]:
prediction_train = classifier.predict_proba(X_train)[:,0]
X_train_new = pd.concat([pd.DataFrame(X_train), pd.DataFrame(prediction_train)], axis=1)
prediction_test =  classifier.predict_proba(X_test)[:,0]
X_test_new = pd.concat([pd.DataFrame(X_test), pd.DataFrame(prediction_test)], axis=1)
X_train_new.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,0.1
0,-1.508859,0.291333,-0.006712,-0.478697,-0.46724,0.480171,0.425288,-0.564459,0.830876,-1.298262,...,-0.278693,-0.197488,5.063596,-0.123466,-0.077732,-1.75667,-0.038778,-0.302991,-1.606439,0.021717
1,-1.508859,-0.234773,1.437281,0.419366,-0.46724,-0.640602,0.406077,0.042859,-1.203549,0.770261,...,-0.278693,-0.197488,5.063596,-0.123466,-0.077732,-1.75667,-0.038778,-0.302991,0.622495,0.56948
2,-0.323835,-0.02433,2.881274,-0.478697,-0.46724,1.227352,-0.44138,-0.564459,0.830876,0.770261,...,-0.278693,-0.197488,-0.197488,-0.123466,-0.077732,0.569259,-0.038778,-0.302991,0.622495,0.957865
3,-1.508859,-0.8661,1.112383,-0.478697,-0.46724,-0.267011,-0.105384,-0.564459,0.830876,0.770261,...,3.588175,-0.197488,-0.197488,-0.123466,-0.077732,-1.75667,-0.038778,-0.302991,0.622495,0.812744
4,0.861188,-0.02433,-0.800909,-0.478697,-0.46724,-0.267011,-0.492247,-0.564459,0.830876,0.770261,...,-0.278693,-0.197488,-0.197488,-0.123466,-0.077732,0.569259,-0.038778,-0.302991,0.622495,0.883002


15. Importez Adaboost et entrainez votre nouveau modèle sur ```X_new```

In [41]:
#from sklearn.ensemble import AdaBoostClassifier
lg = LogisticRegression()
lg.fit(X_train_new, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

16. Regardez votre nouveau score 

In [42]:
lg.score(X_test_new, y_test)

0.7533632286995515