In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import numpy as np

# comparing logistic regression models using different features

# fixed seed so the shuffled folds are reproducible
SEED = 42

# import and prep data
df = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv')
df['Male'] = df['Sex'] == 'male' # boolean feature column derived from 'Sex'

# instantiate KFold object
# an integer random_state makes every kf.split() call return the same shuffled
# folds, so all feature matrices below are scored on identical splits and a
# rerun of the notebook reproduces the same metrics
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

# three different feature matrices, all with same target
X1 = df[['Pclass', 'Male', 'Age', 'Siblings/Spouses', 'Parents/Children', 'Fare']].values
X2 = df[['Pclass', 'Male', 'Age']].values
X3 = df[['Fare', 'Age']].values
y = df['Survived'].values

# writing function to score the model 
def score_model(X, y, kf):
    """Cross-validate a LogisticRegression on (X, y) and report the mean metrics.

    A fresh LogisticRegression is fitted on the training rows of every split
    produced by ``kf`` and evaluated on that split's test rows. The mean of
    each metric over all splits is printed and returned.

    Parameters
    ----------
    X : feature matrix, indexable with the integer index arrays from ``kf``
    y : target array, indexed the same way as ``X``
    kf : splitter whose ``split(X, y)`` yields ``(train_index, test_index)`` pairs

    Returns
    -------
    dict
        Maps each printed label ("accuracy", "precision", "recall",
        "f1 score") to the mean of that metric across the splits.
    """
    # label -> scoring function; insertion order is also the print order
    metrics = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1 score": f1_score,
    }
    fold_scores = {label: [] for label in metrics} # one list of per-fold values per metric

    # y is forwarded to split() so splitters that need the target (e.g. stratified
    # ones) can be passed as well; plain KFold splits by row position only
    for train_index, test_index in kf.split(X, y):
        model = LogisticRegression() # new, unfitted model for every fold
        model.fit(X[train_index], y[train_index])
        y_test = y[test_index]
        y_pred = model.predict(X[test_index])
        for label, metric in metrics.items():
            fold_scores[label].append(metric(y_test, y_pred))

    # the mean over the folds is the cross-validated value of each metric
    mean_scores = {label: np.mean(values) for label, values in fold_scores.items()}
    for label, mean in mean_scores.items():
        print(f"{label}:", mean)
    return mean_scores

# report title paired with the feature matrix it describes
candidates = [
    ("Logistic Regression with all features", X1), # 1st feature matrix
    ("Logistic Regression with Pclass, Sex & Age features", X2), # 2nd feature matrix
    ("Logistic Regression with Fare & Age features", X3), # 3rd feature matrix
]

# run 'score_model' on every candidate, using the same target and KFold object
for title, features in candidates:
    print(title)
    score_model(features, y, kf)
    print() # blank line after each report

# The first two models have almost identical scores. The third model has lower scores for all four metrics. The first two are thus much better options than the third. This makes sense since the third model doesn’t have access to the sex of the passenger. Our expectation is that women were more likely to survive, so the passenger's sex is a very valuable predictor.
# Since the first two models have equivalent results, it makes sense to choose the simpler model, the one that uses the Pclass, Sex & Age features.


Logistic Regression with all features
accuracy: 0.8026915508157177
precision: 0.7678677650001359
recall: 0.7018155905127736
f1 score: 0.7325793728095118

Logistic Regression with Pclass, Sex & Age features
accuracy: 0.7982035167904526
precision: 0.7554665492957746
recall: 0.7081643724051816
f1 score: 0.729040996985256

Logistic Regression with Fare & Age features
accuracy: 0.6539198882752492
precision: 0.644768115942029
recall: 0.23062118949442892
f1 score: 0.33924564811574187



In [4]:
# With the best feature set chosen, train one final model on all of the data
# (fit() hands back the fitted estimator, so it can be assigned directly)
model = LogisticRegression().fit(X2, y)

# Predict for two example passengers; values follow X2's column order (Pclass, Male, Age)
passengers = [
    [3, False, 25], # passenger class 3, female, 25 years old
    [3, True, 25], # passenger class 3, male, 25 years old
]
for passenger in passengers:
    print(model.predict([passenger])) # predict() takes a 2D input, hence the extra brackets

# code and comments by github.com/alandavidgrunberg


[1]
[0]
