### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

### Reading csv file

In [19]:
# Load the passenger data
passengers = pd.read_csv('passengers.csv')
passengers.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Clean the data

In [21]:
# Update sex column to numerical
sex = np.where(passengers.Sex == 'female', 1, 0)
passengers['SexNumeric'] = sex
passengers.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,SexNumeric
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0


In [22]:
# Fill the nan values in the age column
passengers['Age'].fillna(passengers.Age.mean(), inplace = True)
passengers.Age.isna().any()

False

In [23]:
# Create a first class column
print(passengers.Pclass.value_counts())
first_class = np.where(passengers.Pclass == 1, 1, 0)
passengers['FirstClass'] = first_class

3    491
1    216
2    184
Name: Pclass, dtype: int64


In [24]:
# Create a second class column
second_class = np.where(passengers.Pclass == 2, 1, 0)
passengers['SecondClass'] = second_class
passengers.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,SexNumeric,FirstClass,SecondClass
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,1,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1,0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,1,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,0,0


### Select and Split Data

In [33]:
# Select the desired features
features = passengers[['Age', 'SexNumeric', 'FirstClass', 'SecondClass']]
survival = passengers['Survived']

In [34]:
# Perform train, test, split
X_train, X_test, y_train, y_test = train_test_split(features, survival, test_size = 0.2)

### Normalize training and testing features

In [35]:
# Scale the feature data so it has mean = 0 and standard deviation = 1
scaler = StandardScaler()
scaler.fit_transform(X_train)
scaler.transform(X_test)

array([[-0.21911916, -0.74470474, -0.57302549,  1.99474835],
       [-0.76302766, -0.74470474, -0.57302549, -0.50131637],
       [ 0.79099664,  1.34281406,  1.74512307, -0.50131637],
       [-2.23935075, -0.74470474, -0.57302549, -0.50131637],
       [-0.00939444, -0.74470474,  1.74512307, -0.50131637],
       [-0.68532645, -0.74470474, -0.57302549, -0.50131637],
       [ 0.0916857 , -0.74470474,  1.74512307, -0.50131637],
       [ 0.40249056, -0.74470474,  1.74512307, -0.50131637],
       [-0.91843009,  1.34281406, -0.57302549,  1.99474835],
       [-0.99613131, -0.74470474, -0.57302549, -0.50131637],
       [ 1.56800878,  1.34281406, -0.57302549,  1.99474835],
       [-0.76302766, -0.74470474, -0.57302549, -0.50131637],
       [-0.60762523, -0.74470474, -0.57302549, -0.50131637],
       [-0.52992402, -0.74470474,  1.74512307, -0.50131637],
       [ 1.10180149,  1.34281406, -0.57302549,  1.99474835],
       [ 1.17950271,  1.34281406, -0.57302549,  1.99474835],
       [ 1.33490514,  1.

### Create and Evaluate Model

In [36]:
# Create and train the model
reg = LogisticRegression()
reg.fit(X_train, y_train)

LogisticRegression()

In [37]:
# Score the model on the train data
print('Training Score:', reg.score(X_train, y_train))

Training Score: 0.8033707865168539


In [38]:
# Score the model on the test data
print('Test Score:', reg.score(X_test, y_test))

Test Score: 0.770949720670391


In [39]:
# Analyze the coefficients
reg.coef_

array([[-0.02733845,  2.51271492,  2.14656427,  1.16535561]])

In [15]:
# Sample passenger features
Jack = np.array([0.0,20.0,0.0,0.0])
Rose = np.array([1.0,17.0,1.0,0.0])
You = np.array([0.0,54.0,0.0,0.0])

In [40]:
# Combine passenger arrays
sample_passengers = np.array([Jack, Rose, You])
sample_passengers

array([[ 0., 20.,  0.,  0.],
       [ 1., 17.,  1.,  0.],
       [ 0., 54.,  0.,  0.]])

In [41]:
# Scale the sample passenger features
sample_passengers = scaler.transform(sample_passengers)
sample_passengers

array([[ -2.31705196,  41.00567135,  -0.57302549,  -0.50131637],
       [ -2.23935075,  34.74311493,   1.74512307,  -0.50131637],
       [ -2.31705196, 111.9813107 ,  -0.57302549,  -0.50131637]])

In [42]:
# Make survival predictions!
reg.predict(sample_passengers)

array([1, 1, 1], dtype=int64)

In [43]:
reg.predict_proba(sample_passengers)

array([[0., 1.],
       [0., 1.],
       [0., 1.]])