In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, KFold, cross_val_score
import numpy as np

df = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv')
df['Male'] = df['Sex'] == 'male'
X = df[['Pclass', 'Male', 'Age', 'Siblings/Spouses', 'Parents/Children', 'Fare']].values
y = df['Survived'].values

#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y) # no set random state, so different shuffle each time we run the code.

# building the model
model = LogisticRegression()
model.fit(X_train, y_train)

# evaluating the model
y_pred = model.predict(X_test)
print("*Metrics change each run*")
print(f"accuracy: {accuracy_score(y_test, y_pred):.4f}") # f-string formatting
print(f"precision: {precision_score(y_test, y_pred):.4f}") # ':.4f' = float with 4 digits after decimal point
print(f"recall: {recall_score(y_test, y_pred):.4f}")
print(f"f1 score: {f1_score(y_test, y_pred):.4f}")
# You can see that each time we run it, we get different values for the metrics, depending on how lucky or unlucky we were in which datapoints ended up in the test set. 

# The accuracy ranges from 0.79 to 0.84, the precision from 0.75 to 0.81 and the recall from 0.63 to 0.75. Wide range of possible values that could output, so hard to know how reliable our metrics are. 


*Metrics change each run*
accuracy: 0.8288
precision: 0.7647
recall: 0.7831
f1 score: 0.7738


In [3]:
## k-fold cross validation

# k-fold cross validation addresses this issue by breaking our dataset into multiple training and testing set chunks. Each datapoint will appear in the testing set one time. By averaging out the results from each training/testing chunk, we can be more confident in our metrics. 

# lets start with just 2 features and a much smaller dataset
print('___ k-fold cross validation with just 2 features, 9 datapoints ___')
print()

X = df[['Age', 'Fare']].values[:9] # 'Age' & 'Fare' features selected, first 9 datapoints
y = df['Survived'].values[:9] # target values, first 9 datapoints 

kf = KFold(n_splits=3, shuffle=True, random_state=42) # instantiate 'Kfold()' object, will split data into 3 chunks, 'shuffle' randomizes order, 'random_state' holds the shuffle
chunk_generator = kf.split(X) # 'split()' method creates the splits, outputs a generator. pass features 'X', assign generator to 'chunk_generator' 
print("*k-fold chunks*")
print("[training set]             [testing set]")
for chunk in chunk_generator: # for loop to output the 3 chunks from the generator
    print(chunk) # shows indices, not actual data values. each chunk has 6 datapoints in training set and 3 datapoints in testing set
print()

chunk_list = list(kf.split(X))  # 'split()' method creates the splits, outputs a generator. pass features 'X', turn generator into list, assign list to 'chunk_list'
first_chunk = chunk_list[0] # pull out first chunk from list of chunks. contains an array of training indices and an array of testing indices 
train_indices, test_indices = first_chunk # unpack the two arrays to seperate variables
print("*First chunk*")
print("training set indices:", train_indices)
print("test set indices:", test_indices)
print()


___ k-fold cross validation with just 2 features, 9 datapoints ___

*k-fold chunks*
[training set]             [testing set]
(array([0, 2, 3, 4, 6, 8]), array([1, 5, 7]))
(array([1, 3, 4, 5, 6, 7]), array([0, 2, 8]))
(array([0, 1, 2, 5, 7, 8]), array([3, 4, 6]))

*First chunk*
training set indices: [0 2 3 4 6 8]
test set indices: [1 5 7]



In [6]:
# creating train/test-split based on the indices from first K-fold chunk
X_train = X[train_indices]
y_train = y[train_indices]
X_test = X[test_indices]
y_test = y[test_indices]
print("X_train:")
print(X_train)
print("y_train:", y_train)
print()
print("X_test:")
print(X_test)
print("y_test:", y_test)
print()

# creating model for first K-fold chunk
model = LogisticRegression()
model.fit(X_train, y_train)
print("accuracy:", model.score(X_test, y_test))


X_train:
[[3 True 22.0 1 0 7.25]
 [3 False 26.0 0 0 7.925]
 [1 False 35.0 1 0 53.1]
 [3 True 35.0 0 0 8.05]
 [1 True 54.0 0 0 51.8625]
 [3 False 27.0 0 2 11.1333]]
y_train: [0 1 1 0 0 1]

X_test:
[[1 False 38.0 1 0 71.2833]
 [3 True 27.0 0 0 8.4583]
 [3 True 2.0 3 1 21.075]]
y_test: [1 0 0]

accuracy: 0.6666666666666666


In [7]:
# we’ve essentially done a single train/test split. 
#In order to do a k-fold cross validation, we need to use each of the other 2 splits to build a model and score the model.
# loop over all the folds
print("*All 3 chunks*")
print()
list_of_scores = [] # create empty list 'list_of_scores' 
for train_index, test_index in kf.split(X): # 'split()' method creates the splits, outputs a generator, pass features X. for loop using training and testing indices generated for each chunk 
    X_train, X_test = X[train_index], X[test_index] # train/test-split for each chunk
    y_train, y_test = y[train_index], y[test_index] # using training and testing indices 
    model = LogisticRegression() # instantiate model for each chunk
    model.fit(X_train, y_train) # fit each model 
    list_of_scores.append(model.score(X_test, y_test)) # add each model's accuracy score to 'list_of_scores' 
print("accuracy scores:", list_of_scores)
print("mean accuracy score:", np.mean(list_of_scores)) # find mean of all three model's scores. This is the cross-validated score we would report for the final model


*All 3 chunks*

accuracy scores: [0.7359550561797753, 0.8258426966292135, 0.8022598870056498, 0.847457627118644, 0.7627118644067796]
mean accuracy score: 0.7948454262680125


In [8]:
# cross_val_score
# no need to loop over all the folds manually! you can use 'cross_val_score' as a shortcut
scores = cross_val_score(model, X, y, cv=kf) # pass model, all features 'X', all targets 'y' 'cv=' means number of folds. Instead of passing an integer, here we pass 'cv=kf'. 'kf' is the 'KFold' object that was initiated earlier with '(n_splits=3, shuffle=True, random_state=42)' 
# 'cross_val_score' will automatically create a train/test-split for each chunk, instantiate a model for each chunk, fit each model, and output each model's accuracy score
print('accuracy scores:', scores)
print("mean accuracy score:", np.mean(scores))


accuracy scores: [0.85955056 0.74719101 0.8079096  0.81355932 0.78531073]
mean accuracy score: 0.8027042468101314


In [10]:
# now lets do k-fold cross validation using all 6 features and the entire dataset 
print('___ k-fold cross validation with all 6 features and entire dataset ___')
print()

model = LogisticRegression()
X = df[['Pclass', 'Male', 'Age', 'Siblings/Spouses', 'Parents/Children', 'Fare']].values
y = df['Survived'].values

# cross_val_score shortcut
scores = cross_val_score(model, X, y, cv=5) # 5 folds this time (no set random state)
print('accuracy scores:', scores)
print("mean accuracy score:", np.mean(scores))
print()

# loop over all folds
kf = KFold(n_splits=5, shuffle=True) # instantiate 'Kfold()' object, will split data into 5 chunks, 'shuffle' randomizes order, (no set random state)
list_of_scores = [] # create empty list 'list_of_scores' 
for train_index, test_index in kf.split(X): # 'split()' method creates the splits, outputs a generator, pass features X. for loop using training and testing indices generated for each chunk 
    X_train, X_test = X[train_index], X[test_index] # train/test-split for each chunk
    y_train, y_test = y[train_index], y[test_index] # using training and testing indices 
    model = LogisticRegression() # instantiate model for each chunk
    model.fit(X_train, y_train) # fit each model 
    list_of_scores.append(model.score(X_test, y_test)) # add each model's accuracy score to 'list_of_scores'
print("accuracy scores:", list_of_scores)
print("mean accuracy score:", np.mean(list_of_scores)) # find mean of all 5 model's scores. This is the cross-validated score we would report for the final model
# notice slight difference between results from 'cross_val_score' shortcut and results from manually looping over the folds. 
#That's because we did not set a matching random state. We also get slightly different results each time we run the code. However, the mean accuracy scores should be quite close and not fluctuate much each run. That's what k-fold cross validation is for!



___ k-fold cross validation with all 6 features and entire dataset ___

accuracy scores: [0.79775281 0.78651685 0.77966102 0.79096045 0.81355932]
mean accuracy score: 0.7936900907763602

accuracy scores: [0.797752808988764, 0.8033707865168539, 0.7740112994350282, 0.8022598870056498, 0.807909604519774]
mean accuracy score: 0.797060877293214


In [11]:
# loop over all folds again
# this time outputting predictions to get more metrics
kf = KFold(n_splits=5, shuffle=True) # instantiate 'Kfold()' object, will split data into 5 chunks, 'shuffle' randomizes order, (no set random state)
list_of_accuracy_scores = [] # create empty lists for each metric
list_of_precision_scores = []
list_of_recall_scores = []
list_of_f1_scores = []
for train_index, test_index in kf.split(X): # 'split()' method creates the splits, outputs a generator, pass features X. for loop using training and testing indices generated for each chunk 
    X_train, X_test = X[train_index], X[test_index] # train/test-split for each chunk
    y_train, y_test = y[train_index], y[test_index] # using training and testing indices 
    model = LogisticRegression() # instantiate model for each chunk
    model.fit(X_train, y_train) # fit each model 
    y_pred = model.predict(X_test) # get predictions for each model
    list_of_accuracy_scores.append(accuracy_score(y_test, y_pred)) # passing each model's targets and predictions to get metrics
    list_of_precision_scores.append(precision_score(y_test, y_pred)) # and adding each model's metrics to respective lists
    list_of_recall_scores.append(recall_score(y_test, y_pred))
    list_of_f1_scores.append(f1_score(y_test, y_pred))
print("accuracy scores:", list_of_accuracy_scores)
print("precision scores:", list_of_precision_scores)
print("recall scores:", list_of_recall_scores)
print("f1 scores:", list_of_f1_scores)
print()
print(f"mean accuracy: {np.mean(list_of_accuracy_scores):.4f}") # f-string formatting
print(f"mean precision: {np.mean(list_of_precision_scores):.4f}") # ':.4f' = float with 4 digits after decimal point
print(f"mean recall: {np.mean(list_of_recall_scores):.4f}")
print(f"mean f1 score: {np.mean(list_of_f1_scores):.4f}")

# These metrics results are more reliable than the results from a single train/test split, because they are the cross validated results of 5 different train/test splits! 

# code and comments by github.com/alandavidgrunberg


accuracy scores: [0.797752808988764, 0.8146067415730337, 0.7796610169491526, 0.8135593220338984, 0.7853107344632768]
precision scores: [0.7142857142857143, 0.7878787878787878, 0.746031746031746, 0.7313432835820896, 0.8305084745762712]
recall scores: [0.6666666666666666, 0.7323943661971831, 0.6714285714285714, 0.765625, 0.6363636363636364]
f1 scores: [0.689655172413793, 0.7591240875912407, 0.706766917293233, 0.748091603053435, 0.7205882352941178]

mean accuracy: 0.7982
mean precision: 0.7620
mean recall: 0.6945
mean f1 score: 0.7248
