In [None]:
# For preprocessing, see notebooks/TitanicDatasetPreprocessing.ipynb

In [None]:
# --- Training a Logistic Regression Classifier --- #

In [8]:
import pandas as pd

In [10]:
# Load the preprocessed Titanic data from disk, then preview its first five rows
titanic_df = pd.read_csv(filepath_or_buffer='datasets/titanic/processed.csv')

titanic_df.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,1,2,1,2.0,1,1,26.0,0,0,1
1,0,3,1,21.0,0,0,7.925,0,0,1
2,0,3,1,44.0,0,0,8.05,0,0,1
3,1,3,0,22.0,0,0,7.75,0,0,1
4,0,3,0,45.0,1,4,27.9,0,0,1


In [11]:
# Inspect the dimensions of the loaded frame as (rows, columns)
titanic_df.shape

(712, 10)

In [14]:
# Hold out a fraction of the original dataset to evaluate the model against unseen data
from sklearn.model_selection import train_test_split

# X holds the features, i.e., every dataframe column except the target
X = titanic_df.drop(columns='Survived')

# Y holds the target column that we want to predict
Y = titanic_df['Survived']

# train_test_split shuffles the rows before splitting (so no prior shuffle is required)
# test_size=0.2 holds out 20% of the data for test/evaluation
# random_state pins the shuffle so the split - and every metric derived from it -
# is the same on each Restart & Run All instead of changing from run to run
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [15]:
# Dimensions of the training features and of the training labels
x_train.shape, y_train.shape

((569, 9), (569,))

In [16]:
# Dimensions of the held-out test features and of the test labels
x_test.shape, y_test.shape

((143, 9), (143,))

In [19]:
# Estimators are high-level objects that make it easy to build and train ML models for prediction;
# here we use the LogisticRegression estimator
from sklearn.linear_model import LogisticRegression

# Configure the estimator first, then train it as a separate step:
#   penalty - regularizes the model by penalizing overly complex fits, giving a more robust
#             model that is better for prediction; 'l2' (lowercase L + digit 2, not "twelve")
#             is the default
#   C       - strength of the regularization (smaller values indicate stronger regularization)
#   solver  - algorithm used in the optimization problem; liblinear works well with small datasets
logistic_model = LogisticRegression(penalty='l2', C=1.0, solver='liblinear')

# fit() trains on the training features and labels; its return value is kept as the model
logistic_model = logistic_model.fit(x_train, y_train)

In [20]:
# Use the trained model to predict a label for every row of the held-out test features
y_pred = logistic_model.predict(x_test)

In [21]:
# --- Calculating Accuracy, Precision, and Recall for the Classification Model --- #

In [22]:
# Put the actual test labels and the model's predictions side by side in one frame
# so that they can be compared row by row
pred_results = pd.DataFrame(data=dict(y_test=y_test, y_pred=y_pred))

In [23]:
# Preview the first few actual-vs-predicted pairs
pred_results.head()

Unnamed: 0,y_test,y_pred
240,0,0
572,0,0
508,1,1
595,1,0
173,1,1


In [29]:
# Tabulate the predicted labels (rows) against the actual labels (columns)
# to form a confusion matrix
titanic_crosstab = pd.crosstab(
    index=pred_results['y_pred'],
    columns=pred_results['y_test'],
)
titanic_crosstab

y_test,0,1
y_pred,Unnamed: 1_level_1,Unnamed: 2_level_1
0,60,27
1,14,42


In [26]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [30]:
# Measure model performance using formal metrics, reporting each one as it is computed

# Accuracy: how many of the predicted values the model got right
# For binary classification we should see > 50% because 50% is what you get guessing at random
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print("accuracy_score : ", acc)

# Precision: how many of the passengers predicted to survive actually did survive
prec = precision_score(y_true=y_test, y_pred=y_pred)
print("precision_score : ", prec)

# Recall: how many of the actual survivors the model accurately predicted
# Low recall suggests many false negatives, where the model thought a passenger
# did not survive but the passenger did
recall = recall_score(y_true=y_test, y_pred=y_pred)
print("recall_score : ", recall)

accuracy_score :  0.7132867132867133
precision_score :  0.75
recall_score :  0.6086956521739131


In [31]:
# Show how the metrics work under the hood by calculating them manually
# The crosstab has predicted labels as rows and actual labels as columns,
# so every cell is addressed explicitly as .loc[predicted, actual]
TP = titanic_crosstab.loc[1, 1] # predicted survived, did survive
TN = titanic_crosstab.loc[0, 0] # predicted did not survive, did not survive
FP = titanic_crosstab.loc[1, 0] # predicted survived, did not survive
FN = titanic_crosstab.loc[0, 1] # predicted did not survive, did survive

In [32]:
# Accuracy = correct predictions / all predictions
correct_predictions = TP + TN
total_predictions = TP + FP + TN + FN
accuracy_score_verified = correct_predictions / total_predictions

accuracy_score_verified

0.7132867132867133

In [35]:
# Precision = true positives / every passenger the model predicted as a survivor
predicted_survivors = TP + FP
precision_score_survived = TP / predicted_survivors

precision_score_survived

0.75

In [36]:
# Recall = true positives / every passenger who actually survived
actual_survivors = TP + FN
recall_score_survived = TP / actual_survivors

recall_score_survived

0.6086956521739131

In [None]:
# --- Defining Helper Functions to Train and Evaluate Classification Models --- #
# See notebooks/MultipleClassificationModels_Titanic.ipynb