In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# Read the passenger records from the hosted CSV file into a DataFrame
url = "https://drive.google.com/uc?id=19pnvsM08PoaRsk3XXYle0ZhdSH0uSVF0"
passengers = pd.read_csv(filepath_or_buffer=url)

# Preview the first five rows as the cell output
passengers.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Encode sex numerically: 1 for female, 0 for male
# (Series.map leaves any value missing from the dictionary as NaN)
sex_codes = {'female': 1, 'male': 0}
passengers['Sex_num'] = passengers['Sex'].map(sex_codes)

# Replace missing ages with the mean of the ages that are present
mean_age = passengers['Age'].mean()
passengers['Age'] = passengers['Age'].fillna(mean_age)

# Indicator columns for ticket class: 1 when Pclass matches, 0 otherwise.
# A passenger in any other class gets 0 in both columns.
passengers['FirstClass'] = passengers['Pclass'].eq(1).astype('int64')
passengers['SecondClass'] = passengers['Pclass'].eq(2).astype('int64')


In [None]:
# Pick the predictor columns and the target column
feature_columns = ['Sex_num', 'Age', 'FirstClass', 'SecondClass']
features = passengers[feature_columns]
survival = passengers['Survived']

# Split into 80% training rows and 20% test rows; the fixed random_state
# makes the split repeatable between runs
x_train, x_test, y_train, y_test = train_test_split(
    features,
    survival,
    train_size=0.8,
    test_size=0.2,
    random_state=6,
)

# Standardize the features to mean = 0 and standard deviation = 1.
# The scaler learns its statistics from the training rows only and the same
# transformation is then applied to both the training and the test rows.
scaler = StandardScaler()
scaler.fit(x_train)
train_features = scaler.transform(x_train)
test_features = scaler.transform(x_test)

In [None]:
# Create and train the model on the standardized training features.
# The sample passengers scored later in this notebook are passed through
# scaler.transform before prediction, so the model has to be fit on inputs
# in that same scaled space; fitting on the raw x_train would make those
# predictions meaningless.
model = LogisticRegression()
model.fit(train_features, y_train)

# Score the model (mean accuracy) on the scaled train data
print(model.score(train_features, y_train))

# Score the model (mean accuracy) on the scaled test data
print(model.score(test_features, y_test))

# Analyze the coefficients; they follow the feature column order
# Sex_num, Age, FirstClass, SecondClass. Because the inputs are standardized,
# their magnitudes can be compared with one another.
print(model.coef_)

0.7823033707865169
0.8491620111731844
[[ 2.37348078 -0.03030018  2.00165386  1.01696998]]


In [None]:
# Hypothetical passengers to score; each vector is ordered as
# [Sex_num, Age, FirstClass, SecondClass]
Jack = np.array([0.0, 20.0, 0.0, 0.0])
Rose = np.array([1.0, 17.0, 1.0, 0.0])
You = np.array([0.0, 29.0, 1.0, 0.0])

# Put the samples in a DataFrame labelled with the feature names
sample_columns = ['Sex_num', 'Age', 'FirstClass', 'SecondClass']
sample_frame = pd.DataFrame(data=[Jack, Rose, You], columns=sample_columns)

# Apply the already-fitted scaler to the samples; the scaled values are what
# the prediction cell reads from sample_passengers
sample_passengers = scaler.transform(sample_frame)

In [None]:
# Predicted class label for each sample passenger (row order: Jack, Rose, You)
predicted_labels = model.predict(sample_passengers)
print(predicted_labels)

# Per-class probabilities for each sample passenger, one row per passenger
# and one column per class in the order of model.classes_
predicted_probabilities = model.predict_proba(sample_passengers)
print(predicted_probabilities)

[0 1 0]
[[0.99087938 0.00912062]
 [0.00735399 0.99264601]
 [0.51718743 0.48281257]]


