# Régression logistique
"Utilisation de la bibliothèque scikit-learn"

In [26]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [11]:
# Load the training set; the path is relative to the notebook's working directory.
dataset = pd.read_csv('data/train.csv')
# Print the (rows, columns) shape, then preview the first rows as the cell output.
print(dataset.shape)
dataset.head()

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [14]:
# Keep only the columns used as model features. The explicit .copy() makes
# `train` an independent frame, so later column assignments act on `train` alone.
train = dataset[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin']].copy()
train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin
0,3,male,22.0,1,0,A/5 21171,7.25,
1,1,female,38.0,1,0,PC 17599,71.2833,C85
2,3,female,26.0,0,0,STON/O2. 3101282,7.925,
3,1,female,35.0,1,0,113803,53.1,C123
4,3,male,35.0,0,0,373450,8.05,


In [17]:
# Lookup tables used to turn the text columns into numbers.
# They are kept as separate names because the test-set cell reuses them.
sex_map = {'male': 1, 'female': 0}
ticket_freq_map = train['Ticket'].value_counts().to_dict()  # ticket -> occurrences in train
cabin_freq_map = train['Cabin'].value_counts().to_dict()    # cabin  -> occurrences in train

# Apply the three encodings in one pass, then replace every remaining NaN
# (values absent from a map, missing cells in the numeric columns) with 0.
train = train.assign(
    Sex=train['Sex'].map(sex_map),
    Ticket=train['Ticket'].map(ticket_freq_map),
    Cabin=train['Cabin'].map(cabin_freq_map),
).fillna(0)

In [20]:
# Feature matrix: the encoded `train` frame converted to a NumPy array.
X_train = train.to_numpy()
print(X_train.shape)
print(X_train[:5])

# Target vector: the `Survived` column of the originally loaded dataset.
Y_train = dataset['Survived'].to_numpy()
print(Y_train[:5])

(891, 8)
[[ 3.      1.     22.      1.      0.      1.      7.25    0.    ]
 [ 1.      0.     38.      1.      0.      1.     71.2833  1.    ]
 [ 3.      0.     26.      0.      0.      1.      7.925   0.    ]
 [ 1.      0.     35.      1.      0.      2.     53.1     2.    ]
 [ 3.      1.     35.      0.      0.      1.      8.05    0.    ]]
[0 1 1 1 0]


In [28]:
# Learn the per-column standardization parameters on the training matrix and
# standardize that same matrix in a single step.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)

In [30]:
# Fit a logistic regression (constructor defaults, no arguments passed) on the
# standardized training features and the `Survived` labels.
lr_model = LogisticRegression()
lr_model.fit(X_scaled, Y_train)

In [33]:
def compute_accuracy(y_predict, y_train):
    """Print the rounded percentage of correct and of wrong predictions.

    Parameters
    ----------
    y_predict : array-like
        Predicted labels.
    y_train : array-like
        Reference labels; must have the same shape as ``y_predict``.

    Returns
    -------
    None
        The two percentages are printed, not returned.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape, or if they are empty
        (a percentage over zero samples is undefined).
    """
    predicted = np.asarray(y_predict)
    expected = np.asarray(y_train)

    # An index loop over len(y_train) would silently ignore surplus predictions;
    # refuse mismatched inputs instead of reporting a misleading score.
    if predicted.shape != expected.shape:
        raise ValueError(
            f"shape mismatch: y_predict has shape {predicted.shape}, "
            f"y_train has shape {expected.shape}"
        )

    n = expected.size
    if n == 0:
        raise ValueError("cannot compute accuracy on empty input")

    # Element-wise comparison replaces the explicit Python loop.
    num_wrong = int(np.count_nonzero(predicted != expected))
    error_pct = (num_wrong / n) * 100

    print(f"Accuracy: {np.round(100.0 - error_pct)}")
    print(f"Wrong: {np.round(error_pct)}")

In [38]:
# Predict on the same standardized matrix the model was fitted on, so any score
# derived from `y_pred` is a training-set score, not a held-out estimate.
y_pred = lr_model.predict(X_scaled)

In [40]:
# Print the share of correct / wrong predictions against the training labels.
compute_accuracy(y_pred, Y_train)

Accuracy: 81.0
Wrong: 19.0


# Compétition sur Kaggle
- on va charger le jeu de données de test
- on va prédire les survivants à partir de ces données
- on va soumettre les prédictions sur Kaggle

In [57]:
# Load the test set; the path is relative to the notebook's working directory.
test_dataset = pd.read_csv('data/test.csv')
# Print the (rows, columns) shape, then preview the first rows as the cell output.
print(test_dataset.shape)
test_dataset.head()

(418, 11)


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [59]:
# Select the same feature columns, in the same order, as were used for `train`.
# The explicit .copy() makes `test` an independent frame for the assignments below.
test = test_dataset[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin']].copy()
test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin
0,3,male,34.5,0,0,330911,7.8292,
1,3,female,47.0,1,0,363272,7.0,
2,2,male,62.0,0,0,240276,9.6875,
3,3,male,27.0,0,0,315154,8.6625,
4,3,female,22.0,1,1,3101298,12.2875,


In [62]:
# Encode the test frame with the lookup tables built from the training data.
# Tickets or cabins absent from those tables map to NaN and, together with any
# other missing cell, end up as 0 after fillna.
test = test.assign(
    Sex=test['Sex'].map(sex_map),
    Ticket=test['Ticket'].map(ticket_freq_map),
    Cabin=test['Cabin'].map(cabin_freq_map),
).fillna(0)

In [65]:
# Feature matrix for the test set: the encoded `test` frame as a NumPy array.
X_test = test.to_numpy()
print(X_test.shape)
print(X_test[:5])

(418, 8)
[[ 3.      1.     34.5     0.      0.      0.      7.8292  0.    ]
 [ 3.      0.     47.      1.      0.      0.      7.      0.    ]
 [ 2.      1.     62.      0.      0.      0.      9.6875  0.    ]
 [ 3.      1.     27.      0.      0.      0.      8.6625  0.    ]
 [ 3.      0.     22.      1.      1.      1.     12.2875  0.    ]]


In [68]:
# Standardize the test features with `scaler`, the StandardScaler fitted on the
# training matrix. Fitting a second scaler on the test set would standardize it
# with a different mean/variance than the one the model's coefficients were
# learned under, so the inputs would not be on the scale the model expects.
X_test_scaled = scaler.transform(X_test)

Y_pred = lr_model.predict(X_test_scaled)

# Submission file: one row per test passenger with its predicted label.
res = pd.DataFrame({
    'PassengerId': test_dataset['PassengerId'],
    'Survived': Y_pred,
})
res.to_csv("results/logireg_scikitlearn_submission.csv", index=False)