In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
# Load the passenger data. This step reads the Sex, Age and Pclass columns.
passengers = pd.read_csv("passengers.csv")

# Encode Sex numerically: male -> 0, female -> 1.
# Any value that is not a key of the mapping becomes NaN.
sex = {'male': 0, 'female': 1}
passengers['Sex'] = passengers['Sex'].map(sex)

# Fill the NaN values in the Age column with the mean age.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on `passengers.Age`: the chained in-place form
# operates on an intermediate Series, which raises a FutureWarning in
# pandas 2.x and leaves `passengers` unchanged under Copy-on-Write.
passengers['Age'] = passengers['Age'].fillna(passengers['Age'].mean())

# Indicator columns for passenger class: 1 when Pclass matches, else 0.
# Rows with any other Pclass value get 0 in both columns.
passengers['FirstClass'] = (passengers['Pclass'] == 1).astype(int)
passengers['SecondClass'] = (passengers['Pclass'] == 2).astype(int)

# Target labels and the four predictor columns used by the model
survival = passengers['Survived']
features = passengers[['Sex', 'Age', 'FirstClass', 'SecondClass']]

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    features,
    survival,
    test_size=0.2,
    random_state=5,
)

# Standardize the features: the scaler is fitted on the training rows only,
# then the same fitted transform is applied to both the train and test sets
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Create the logistic regression classifier and fit it on the scaled
# training data (fit returns the estimator itself)
model = LogisticRegression().fit(X_train, y_train)

# Score the model on the train data
train_score = model.score(X_train, y_train)
print(train_score)

# Score the model on the test data
test_score = model.score(X_test, y_test)
print(test_score)

# Analyze the coefficients.
# Each coefficient is paired with its feature name taken from
# `features.columns` rather than a second hand-typed list, so the labels
# always follow the columns (and column order) the model was trained on.
coeffs = model.coef_
print(coeffs)
print(list(zip(features.columns, coeffs[0])))
# Sample passenger features, in the same column order as `features`:
# [Sex, Age, FirstClass, SecondClass]
Jack = np.array([0.0, 20.0, 0.0, 0.0])
Rose = np.array([1.0, 17.0, 1.0, 0.0])
You = np.array([0.0, 30.0, 0.0, 1.0])

# Combine the passengers into a DataFrame, one row per passenger, labelled
# with the training column names. The scaler was fitted on a DataFrame, so
# handing it a bare ndarray makes scikit-learn warn that X has no valid
# feature names and leaves the column order unchecked.
sample_passengers = pd.DataFrame([Jack, Rose, You], columns=features.columns)

# Scale the sample passenger features with the already-fitted scaler.
# transform returns a plain ndarray, matching what the model was trained on.
sample_passengers = scaler.transform(sample_passengers)
print(sample_passengers)

# Make survival predictions: the predicted class per passenger, then the
# per-class probabilities for each passenger
survive = model.predict(sample_passengers)
print(survive)
survive_proba = model.predict_proba(sample_passengers)
print(survive_proba)

0.7935393258426966
0.8156424581005587
[[ 1.24819705 -0.41868712  0.97133716  0.50296137]]
[('Sex', 1.2481970530040352), ('Age', -0.4186871178453579), ('FirstClass', 0.9713371618994415), ('SecondClass', 0.5029613749441112)]
[[-7.46985188e-01 -7.61277679e-01 -5.77350269e-01 -4.90324503e-01]
 [ 1.33871463e+00 -9.89927839e-01  1.73205081e+00 -4.90324503e-01]
 [-7.46985188e-01  8.89520402e-04 -5.77350269e-01  2.03946569e+00]]
[0 1 0]
[[0.88778036 0.11221964]
 [0.05345198 0.94654802]
 [0.75305808 0.24694192]]
