# Building a Logistic Regression Model

In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [4]:
train = pd.read_csv("data/train_cleaned_data.csv")
train.head(10)

Unnamed: 0.1,Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,male,Q,S
0,0,1,0,3,22.0,1,0,7.25,1,0,1
1,1,2,1,1,38.0,1,0,71.2833,0,0,0
2,2,3,1,3,26.0,0,0,7.925,0,0,1
3,3,4,1,1,35.0,1,0,53.1,0,0,1
4,4,5,0,3,35.0,0,0,8.05,1,0,1
5,5,6,0,3,24.0,0,0,8.4583,1,1,0
6,6,7,0,1,54.0,0,0,51.8625,1,0,1
7,7,8,0,3,2.0,3,1,21.075,1,0,1
8,8,9,1,3,27.0,0,2,11.1333,0,0,1
9,9,10,1,2,14.0,1,0,30.0708,0,0,0


## Train Test Split

In [5]:
x_train, x_test, y_train, y_test = train_test_split(train.drop('Survived', axis=1), 
                                                    train.Survived, test_size=0.30, random_state=101)

In [6]:
x_train.head(10)

Unnamed: 0.1,Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,male,Q,S
806,807,808,3,18.0,0,0,7.775,0,0,1
650,651,652,2,18.0,0,1,23.0,0,0,1
2,2,3,3,26.0,0,0,7.925,0,0,1
689,690,691,1,31.0,1,0,57.0,1,0,1
195,196,197,3,24.0,0,0,7.75,1,1,0
886,888,889,3,24.0,1,2,23.45,0,0,1
257,258,259,1,35.0,0,0,512.3292,0,0,0
39,39,40,3,14.0,1,0,11.2417,0,0,0
234,235,236,3,24.0,0,0,7.55,0,0,1
298,299,300,1,50.0,0,1,247.5208,0,0,0


In [7]:
x_test.head(10)

Unnamed: 0.1,Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,male,Q,S
510,511,512,3,24.0,0,0,8.05,1,0,1
612,613,614,3,24.0,0,0,7.75,1,1,0
614,615,616,2,24.0,1,2,65.0,0,0,1
336,337,338,1,41.0,0,0,134.5,0,0,0
717,718,719,3,24.0,0,0,15.5,1,1,0
151,152,153,3,55.5,0,0,8.05,1,0,1
825,826,827,3,24.0,0,0,56.4958,1,0,1
417,418,419,2,30.0,0,0,13.0,1,0,1
613,614,615,3,35.0,0,0,8.05,1,0,1
240,241,242,3,24.0,1,0,15.5,0,1,0


In [8]:
y_train.head(10)

806    0
650    1
2      1
689    1
195    0
886    0
257    1
39     1
234    0
298    1
Name: Survived, dtype: int64

In [9]:
y_test.head(10)

510    0
612    0
614    1
336    1
717    0
151    0
825    0
417    0
613    0
240    1
Name: Survived, dtype: int64

## Training and Predicting

In [11]:
logmodel = LogisticRegression()
logmodel.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [12]:
predictions = logmodel.predict(x_test)

In [32]:
x_test_comparison =  x_test.copy()
x_test_comparison.head(10)

Unnamed: 0.1,Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,male,Q,S
510,511,512,3,24.0,0,0,8.05,1,0,1
612,613,614,3,24.0,0,0,7.75,1,1,0
614,615,616,2,24.0,1,2,65.0,0,0,1
336,337,338,1,41.0,0,0,134.5,0,0,0
717,718,719,3,24.0,0,0,15.5,1,1,0
151,152,153,3,55.5,0,0,8.05,1,0,1
825,826,827,3,24.0,0,0,56.4958,1,0,1
417,418,419,2,30.0,0,0,13.0,1,0,1
613,614,615,3,35.0,0,0,8.05,1,0,1
240,241,242,3,24.0,1,0,15.5,0,1,0


In [33]:
list_predictions = list(predictions)
df_predictions = pd.Series(list_predictions, index=x_test_comparison.index)
x_test_comparison['Prediction'] = df_predictions
x_test_comparison['Survived'] = y_test
x_test_comparison.head(10)

Unnamed: 0.1,Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,male,Q,S,Prediction,Survived
510,511,512,3,24.0,0,0,8.05,1,0,1,0,0
612,613,614,3,24.0,0,0,7.75,1,1,0,0,0
614,615,616,2,24.0,1,2,65.0,0,0,1,1,1
336,337,338,1,41.0,0,0,134.5,0,0,0,1,1
717,718,719,3,24.0,0,0,15.5,1,1,0,0,0
151,152,153,3,55.5,0,0,8.05,1,0,1,0,0
825,826,827,3,24.0,0,0,56.4958,1,0,1,0,0
417,418,419,2,30.0,0,0,13.0,1,0,1,0,0
613,614,615,3,35.0,0,0,8.05,1,0,1,0,0
240,241,242,3,24.0,1,0,15.5,0,1,0,1,1


In [35]:
# Incorrect predictions
x_test_comparison[x_test_comparison.Survived != x_test_comparison.Prediction].head(10)

Unnamed: 0.1,Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,male,Q,S,Prediction,Survived
723,724,725,1,27.0,1,0,53.1,1,0,1,0,1
138,139,140,1,24.0,0,0,79.2,1,0,0,1,0
656,657,658,3,32.0,1,1,15.5,0,1,0,1,0
621,622,623,3,20.0,1,1,15.7417,1,0,0,0,1
728,729,730,3,25.0,1,0,7.925,0,0,1,1,0
815,816,817,3,23.0,0,0,7.925,0,0,1,1,0
429,430,431,1,28.0,0,0,26.55,1,0,1,0,1
428,429,430,3,32.0,0,0,8.05,1,0,1,0,1
787,788,789,3,1.0,1,2,20.575,1,0,1,0,1
700,701,702,1,35.0,0,0,26.2875,1,0,1,0,1


In [36]:
# Correctly predicted results
x_test_comparison[x_test_comparison.Survived == x_test_comparison.Prediction].head(10)

Unnamed: 0.1,Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,male,Q,S,Prediction,Survived
510,511,512,3,24.0,0,0,8.05,1,0,1,0,0
612,613,614,3,24.0,0,0,7.75,1,1,0,0,0
614,615,616,2,24.0,1,2,65.0,0,0,1,1,1
336,337,338,1,41.0,0,0,134.5,0,0,0,1,1
717,718,719,3,24.0,0,0,15.5,1,1,0,0,0
151,152,153,3,55.5,0,0,8.05,1,0,1,0,0
825,826,827,3,24.0,0,0,56.4958,1,0,1,0,0
417,418,419,2,30.0,0,0,13.0,1,0,1,0,0
613,614,615,3,35.0,0,0,8.05,1,0,1,0,0
240,241,242,3,24.0,1,0,15.5,0,1,0,1,1


## Evaluation 

In [44]:
print(classification_report(y_test, predictions))

             precision    recall  f1-score   support

          0       0.81      0.91      0.86       163
          1       0.83      0.66      0.74       104

avg / total       0.82      0.82      0.81       267

