In [39]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats
import seaborn as sns
from scipy.stats import chi2_contingency
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score

In [59]:
# Read the train and test CSV files; both paths are relative to the
# notebook's working directory.
train_clean, test_clean = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train_clean.csv', 'data/test_clean.csv')
)

# Les femmes ayant embarqué sur le Titanic avaient-elles plus de chances de survie que les hommes ?

## H0 : il n'existe pas de corrélation entre le sexe du passager et la survie (les deux variables sont indépendantes)
## H1 : il existe une corrélation entre le sexe du passager et la survie

In [3]:
# Preview the first five rows to see which columns are available.
train_clean.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


In [41]:
# One-hot encode the non-numeric columns so that every column can enter the
# correlation matrices and contingency tables computed below.
train_clean = train_clean.pipe(pd.get_dummies)


In [5]:
# Pairwise Pearson (linear) correlation between all columns;
# 'pearson' is the default method of DataFrame.corr.
train_clean.corr()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
PassengerId,1.0,-0.005007,-0.035144,0.034212,-0.057527,-0.001652,0.012658,-0.042939,0.042939,-0.001205,-0.033606,0.022204
Survived,-0.005007,1.0,-0.338481,-0.06491,-0.035322,0.081629,0.257307,0.543351,-0.543351,0.16824,0.00365,-0.149683
Pclass,-0.035144,-0.338481,1.0,-0.339898,0.083081,0.018443,-0.5495,-0.1319,0.1319,-0.243292,0.221009,0.074053
Age,0.034212,-0.06491,-0.339898,1.0,-0.233296,-0.172482,0.096688,-0.081163,0.081163,0.030248,-0.031415,-0.006729
SibSp,-0.057527,-0.035322,0.083081,-0.233296,1.0,0.414838,0.159651,0.114631,-0.114631,-0.059528,-0.026354,0.068734
Parch,-0.001652,0.081629,0.018443,-0.172482,0.414838,1.0,0.216225,0.245489,-0.245489,-0.011069,-0.081228,0.060814
Fare,0.012658,0.257307,-0.5495,0.096688,0.159651,0.216225,1.0,0.182333,-0.182333,0.269335,-0.117216,-0.162184
Sex_female,-0.042939,0.543351,-0.1319,-0.081163,0.114631,0.245489,0.182333,1.0,-1.0,0.082853,0.074115,-0.119224
Sex_male,0.042939,-0.543351,0.1319,0.081163,-0.114631,-0.245489,-0.182333,-1.0,1.0,-0.082853,-0.074115,0.119224
Embarked_C,-0.001205,0.16824,-0.243292,0.030248,-0.059528,-0.011069,0.269335,0.082853,-0.082853,1.0,-0.148258,-0.782742


In [6]:
# Pairwise Spearman (rank-based) correlation between all columns.
train_clean.corr('spearman')

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
PassengerId,1.0,-0.005007,-0.034091,0.034584,-0.061161,0.001235,-0.013975,-0.042939,0.042939,-0.001205,-0.033606,0.022204
Survived,-0.005007,1.0,-0.339668,-0.037004,0.088879,0.138266,0.323736,0.543351,-0.543351,0.16824,0.00365,-0.149683
Pclass,-0.034091,-0.339668,1.0,-0.317406,-0.043019,-0.022801,-0.688032,-0.135775,0.135775,-0.220995,0.229763,0.049011
Age,0.034584,-0.037004,-0.317406,1.0,-0.145316,-0.217488,0.126006,-0.067663,0.067663,0.019634,-0.02095,-0.004016
SibSp,-0.061161,0.088879,-0.043019,-0.145316,1.0,0.450014,0.447113,0.195204,-0.195204,0.014188,-0.048537,0.018115
Parch,0.001235,0.138266,-0.022801,-0.217488,0.450014,1.0,0.410074,0.254512,-0.254512,0.023115,-0.09925,0.042209
Fare,-0.013975,0.323736,-0.688032,0.126006,0.447113,0.410074,1.0,0.259593,-0.259593,0.204104,-0.232149,-0.032712
Sex_female,-0.042939,0.543351,-0.135775,-0.067663,0.195204,0.254512,0.259593,1.0,-1.0,0.082853,0.074115,-0.119224
Sex_male,0.042939,-0.543351,0.135775,0.067663,-0.195204,-0.254512,-0.259593,-1.0,1.0,-0.082853,-0.074115,0.119224
Embarked_C,-0.001205,0.16824,-0.220995,0.019634,0.014188,0.023115,0.204104,0.082853,-0.082853,1.0,-0.148258,-0.782742


In [7]:
# Same Spearman matrix, displayed with two significant digits and a
# diverging colour gradient so strong correlations stand out.
(
    train_clean
    .corr(method='spearman')
    .style
    .format("{:.2}")
    .background_gradient(cmap=plt.get_cmap('coolwarm'))
)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
PassengerId,1.0,-0.005,-0.034,0.035,-0.061,0.0012,-0.014,-0.043,0.043,-0.0012,-0.034,0.022
Survived,-0.005,1.0,-0.34,-0.037,0.089,0.14,0.32,0.54,-0.54,0.17,0.0037,-0.15
Pclass,-0.034,-0.34,1.0,-0.32,-0.043,-0.023,-0.69,-0.14,0.14,-0.22,0.23,0.049
Age,0.035,-0.037,-0.32,1.0,-0.15,-0.22,0.13,-0.068,0.068,0.02,-0.021,-0.004
SibSp,-0.061,0.089,-0.043,-0.15,1.0,0.45,0.45,0.2,-0.2,0.014,-0.049,0.018
Parch,0.0012,0.14,-0.023,-0.22,0.45,1.0,0.41,0.25,-0.25,0.023,-0.099,0.042
Fare,-0.014,0.32,-0.69,0.13,0.45,0.41,1.0,0.26,-0.26,0.2,-0.23,-0.033
Sex_female,-0.043,0.54,-0.14,-0.068,0.2,0.25,0.26,1.0,-1.0,0.083,0.074,-0.12
Sex_male,0.043,-0.54,0.14,0.068,-0.2,-0.25,-0.26,-1.0,1.0,-0.083,-0.074,0.12
Embarked_C,-0.0012,0.17,-0.22,0.02,0.014,0.023,0.2,0.083,-0.083,1.0,-0.15,-0.78


In [8]:
# Chi-square test of independence between sex (Sex_female) and survival.
# The contingency table is built WITHOUT margins: chi2_contingency expects
# only the observed cell counts. Including the "All" totals row/column would
# treat the totals as extra categories and distort both the statistic and
# the degrees of freedom.
contingence = pd.crosstab(train_clean['Sex_female'], train_clean['Survived'])
chi2, p, dof, ex = chi2_contingency(contingence)

alpha = 0.05  # significance level (5 %)
print('significance=%.3f, p=%.3f' % (alpha, p))
if p <= alpha:
    print('Dependent (reject H0)')
else:
    print('Independent (fail to reject H0)')

significance=0.050, p=0.000
Dependent (reject H0)


Analyse : la p-value (affichée 0.000, donc inférieure à 0.001) est en dessous du seuil de 0.05 ; nous rejetons H0 : le sexe du passager et sa survie lors du naufrage ne sont pas indépendants.

In [9]:
# Fisher's exact test on the sex (Sex_male) x survival contingency table,
# used as a cross-check of the chi-square result.
# fisher_exact is called through the public scipy.stats namespace: the
# private scipy.stats.stats module is deprecated.
contingence2 = pd.crosstab(train_clean['Sex_male'], train_clean['Survived'])
oddsratio, pvalue = scipy.stats.fisher_exact(contingence2)
pvalue

6.46392156458115e-60

Le test exact de Fisher (p-value ≈ 6.5e-60) confirme la dépendance des deux variables.

# Avoir moins de 18 ans donne-t-il plus de chance de  survie lors du naufrage du Titanic ?

## H0 : Il n'existe pas de corrélation entre la survie et le fait d'être âgé de moins de 18 ans, les variables sont indépendantes.<br>
## H1 : Il existe une corrélation entre la survie et le fait d'être âgé de moins de 18 ans

In [10]:
# Recode Age in place as a binary flag: 0 when the age is strictly above 18,
# 1 otherwise. Vectorised equivalent of a row-by-row loop.
#
# Details deliberately kept:
# - the comparison is written as "not (> 18)", so a missing age (NaN) ends
#   up flagged 1;
# - the result is cast back to the column's original dtype.
#
# NOTE(review): the threshold is inclusive (an age of exactly 18 is flagged 1)
# whereas the section heading asks about passengers under 18 -- confirm.
# NOTE(review): this overwrites the Age column, so re-running the cell without
# reloading the data flags every row as 1.
age_years = train_clean['Age']
train_clean['Age'] = (~(age_years > 18)).astype(age_years.dtype)

In [11]:
# Chi-square test of independence between the binary age flag (stored in the
# Age column) and survival.
contingence3 = pd.crosstab(train_clean['Age'], train_clean['Survived'])
chi2, p, dof, ex = chi2_contingency(contingence3)

alpha = 0.05  # significance level (5 %)
print('significance=%.3f, p=%.3f' % (alpha, p))
if p <= alpha:
    print('Dependent (reject H0)')
else:
    print('Independent (fail to reject H0)')

significance=0.050, p=0.002
Dependent (reject H0)


La p-value est de 0.002, soit en dessous de notre seuil alpha de 0.05 : nous rejetons H0 et concluons que les deux variables sont liées.
Nous confirmons cela avec un test exact de Fisher.

In [12]:
# Fisher's exact test (two-sided, the SciPy default) on the age-flag x
# survival table, as a cross-check of the chi-square result.
contingence3 = pd.crosstab(index=train_clean['Age'], columns=train_clean['Survived'])
oddsratio, pvalue = scipy.stats.fisher_exact(contingence3, alternative='two-sided')
pvalue

0.0022911292216911162

La dépendance des 2 variables est confirmée

# Le prix du billet a-t-il eu une incidence sur la survie des passagers?


## H0 : Le prix du billet n'a pas eu d'incidence sur la survie
## H1 : Le prix du billet a eu une incidence sur la survie

In [14]:
# Chi-square test of independence between ticket price and survival.
# Fare is continuous, so cross-tabulating the raw values would create one row
# per distinct price with very few passengers in each cell, which breaks the
# expected-frequency assumption of the chi-square test. The fare is therefore
# binned into quartiles first (duplicates='drop' guards against repeated bin
# edges).
fare_quartile = pd.qcut(train_clean['Fare'], q=4, duplicates='drop')
contingence4 = pd.crosstab(fare_quartile, train_clean['Survived'])
chi2, p, dof, ex = chi2_contingency(contingence4)

alpha = 0.05  # significance level (5 %)
print('significance=%.3f, p=%.3f' % (alpha, p))
if p <= alpha:
    print('Dependent (reject H0)')
else:
    print('Independent (fail to reject H0)')

significance=0.050, p=0.000
Dependent (reject H0)


La p-value est affichée à 0.000 (donc inférieure à 0.001), soit en dessous de notre seuil alpha de 0.05 : nous rejetons H0 et concluons que les deux variables sont liées.


In [15]:
# Quick look at the fare distribution: first values, then mean and median,
# printed one per line.
fare = train_clean['Fare']
print(fare.head(), fare.mean(), fare.median(), sep='\n')


0     7.2500
1    71.2833
2     7.9250
3    53.1000
4     8.0500
Name: Fare, dtype: float64
32.2042079685746
14.4542


In [56]:
# Recode Fare in place as a binary flag: 0 when the fare is strictly above 32,
# 1 otherwise. Vectorised equivalent of a row-by-row loop.
#
# Details deliberately kept:
# - the comparison is written as "not (> 32)", so a missing fare (NaN) ends
#   up flagged 1;
# - the result is cast back to the column's original dtype.
#
# NOTE(review): 32 is presumably the rounded mean fare -- confirm, and consider
# naming it as a constant.
# NOTE(review): this overwrites the Fare column, so re-running the cell without
# reloading the data flags every row as 1.
fare_amount = train_clean['Fare']
train_clean['Fare'] = (~(fare_amount > 32)).astype(fare_amount.dtype)

In [57]:
# Fisher's exact test (two-sided, the SciPy default) on the fare-flag x
# survival table; margins are left out so only observed counts are tested.
contingence4 = pd.crosstab(
    index=train_clean['Fare'],
    columns=train_clean['Survived'],
    margins=False,
)
oddsratio, pvalue = scipy.stats.fisher_exact(contingence4, alternative='two-sided')
pvalue

8.871807500023419e-13

Le test exact de Fisher confirme le lien entre le tarif du billet et la survie du passager.

## Conclusion : nous allons pouvoir étudier plus en détail l'influence de ces variables sur la prédiction de la survie des passagers