In [2]:
import numpy as np
import pandas as pd

# 'read_csv()' takes a file in csv (comma separated values) format and converts it to a Pandas DataFrame
titanic_url = 'https://sololearn.com/uploads/files/titanic.csv'
df = pd.read_csv(titanic_url)
print(df.head())  # peek at the first 5 rows


   Survived  Pclass     Sex   Age  Siblings/Spouses  Parents/Children     Fare
0         0       3    male  22.0                 1                 0   7.2500
1         1       1  female  38.0                 1                 0  71.2833
2         1       3  female  26.0                 0                 0   7.9250
3         1       1  female  35.0                 1                 0  53.1000
4         0       3    male  35.0                 0                 0   8.0500


In [3]:
# prepping data
# Add a boolean 'Male' column: True where 'Sex' is 'male', False otherwise.
# Booleans are easier for Python to do computations on than the strings in the 'Sex' column.
df['Male'] = df['Sex'].eq('male')
print(df.head())  # the new column shows up as the last column of the DataFrame

   Survived  Pclass     Sex   Age  Siblings/Spouses  Parents/Children  \
0         0       3    male  22.0                 1                 0   
1         1       1  female  38.0                 1                 0   
2         1       3  female  26.0                 0                 0   
3         1       1  female  35.0                 1                 0   
4         0       3    male  35.0                 0                 0   

      Fare   Male  
0   7.2500   True  
1  71.2833  False  
2   7.9250  False  
3  53.1000  False  
4   8.0500   True  


In [4]:
# Columns used as model inputs ('Sex' is left out because it's non-numerical; 'Male' stands in for it)
feature_columns = ['Pclass', 'Male', 'Age', 'Siblings/Spouses', 'Parents/Children', 'Fare']
X = df[feature_columns].values  # 2d Numpy array of features
y = df['Survived'].values  # 1d Numpy array of target values

# first 5 datapoints: features, then a blank line, then target values
print(X[:5], y[:5], sep='\n\n')


[[3 True 22.0 1 0 7.25]
 [1 False 38.0 1 0 71.2833]
 [3 False 26.0 0 0 7.925]
 [1 False 35.0 1 0 53.1]
 [3 True 35.0 0 0 8.05]]

[0 1 1 1 0]


In [5]:
## building logistic regression model
from sklearn.linear_model import LogisticRegression  # import

# let's start with just two features first
starter_columns = ['Fare', 'Age']
X = df[starter_columns].values
y = df['Survived'].values

# instantiate and fit in one step ('fit()' hands back the fitted model itself)
model = LogisticRegression().fit(X, y)

# check line of best fit: one coefficient per feature (in column order), then the intercept
# equation:  0 = 0.01615949x + -0.01549065y + -0.51037152   (x = Fare, y = Age)
print(model.coef_, model.intercept_, end='\n\n')


[[ 0.01615949 -0.01549065]] [-0.51037152]



In [6]:
## logistic regression with all features
all_feature_columns = ['Pclass', 'Male', 'Age', 'Siblings/Spouses', 'Parents/Children', 'Fare']
X = df[all_feature_columns].values
y = df['Survived'].values
model = LogisticRegression().fit(X, y)

# predict
# 'predict()' expects a 2d input, so a single passenger is wrapped in an outer list
first_passenger = [[3, True, 22.0, 1, 0, 7.25]]
print(model.predict(first_passenger))  # first passenger predicted not survived

first_five_predictions = model.predict(X[:5])
print(first_five_predictions)  # first 5 predictions (0 = not survived, 1 = survived)
print(y[:5])  # first 5 targets (actual outcomes): all 5 predictions were correct!


[0]
[0 1 1 1 0]
[0 1 1 1 0]


In [7]:
## evaluating the model
y_pred = model.predict(X)  # array of predictions

# number of positions where the prediction matches the target value
correct_count = (y == y_pred).sum()
print(correct_count)

# accuracy score = number of correct matches / total number of passengers
print(correct_count / len(y))

# same output: '.score()' uses the model to make a prediction for X and reports what fraction of them match y
print(model.score(X, y))


714
0.8049605411499436
0.8049605411499436


In [8]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# confusion matrix
print(confusion_matrix(y, y_pred), end='\n\n')

# The four possible outcomes of a prediction:
#   True Positive  (TP): predicted positive, actually positive
#   False Positive (FP): predicted positive, actually negative
#   True Negative  (TN): predicted negative, actually negative
#   False Negative (FN): predicted negative, actually positive

# The order of the numbers follows scikit-learn's convention:
# rows are the actual classes, columns are the predicted classes.
#
#                    Predicted
#                    0      1
#                 +------+------+
#   Actual   0    |  TN  |  FP  |
#                 +------+------+
#            1    |  FN  |  TP  |
#                 +------+------+


[[475  70]
 [103 239]]



In [9]:
# accuracy: share of all predictions that are correct
# (TP + TN) / (TP + FP + TN + FN) = accuracy score
# (239 + 475) / (239 + 70 + 475 + 103) = 714/887 = 80.49%
# same result as model.score(X, y)
accuracy = accuracy_score(y, y_pred)
print("accuracy:", accuracy)

# precision: percentage of the model's positive predictions that are correct
# (also called positive predictive value)
# TP / (TP + FP) = precision
# 239 / (239 + 70) = 239/309 = 77.34%
precision = precision_score(y, y_pred)
print("precision:", precision)

# recall: percentage of actual positives that were predicted positive
# (also called sensitivity)
# TP / (TP + FN) = recall
# 239 / (239 + 103) = 239/342 = 69.88%
recall = recall_score(y, y_pred)
print("recall:", recall)

# There is often a trade-off between precision and recall.
# - Raising precision means making the model less sensitive, to lower the false positive rate.
#   But a less sensitive model could also miss more actual positives, leading to a lower recall.
# - Raising recall means making the model more sensitive, to lower the false negative rate.
#   But a more sensitive model could also incorrectly flag more false positives, leading to lower precision.

# f1 score: harmonic mean of precision and recall, for when we just want one value
# 2 x (precision x recall) / (precision + recall) = F1 score
# 2 x (77.34 x 69.88) / (77.34 + 69.88) = (2 x 5404.52) / 147.22 = 73.42%
f1 = f1_score(y, y_pred)
print("f1 score:", f1)


accuracy: 0.8049605411499436
precision: 0.7734627831715211
recall: 0.6988304093567251
f1 score: 0.7342549923195083


In [10]:
## logistic regression with data split into training and testing sets

# train test split
from sklearn.model_selection import train_test_split

# with no size arguments the split is 75% training / 25% testing
X_train, X_test, y_train, y_test = train_test_split(X, y)

# report the shape of each (features, targets) pair
shape_report = [
    ("whole dataset:", X, y),              # (all features, all targets)
    ("training set:", X_train, y_train),   # (training features, training targets)
    ("test set:", X_test, y_test),         # (testing features, testing targets)
]
for label, features, targets in shape_report:
    print(label, features.shape, targets.shape)
print()  # notice X_train/X_test stay 2d, y_train/y_test stay 1d


whole dataset: (887, 6) (887,)
training set: (665, 6) (665,)
test set: (222, 6) (222,)



In [11]:
# build model using training set
model = LogisticRegression().fit(X_train, y_train)

# evaluate model using test set
print(model.score(X_test, y_test), end="\n\n")

y_pred = model.predict(X_test)  # array of test predictions

# each metric compares the held-out targets with the test predictions
test_metrics = (
    ("accuracy:", accuracy_score),
    ("precision:", precision_score),
    ("recall:", recall_score),
    ("f1 score:", f1_score),
)
for label, metric in test_metrics:
    print(label, metric(y_test, y_pred))
print()
# accuracy, precision, recall and F1 score values are similar to the values we got
# when we used the entire dataset. This is a sign our model is not overfit.


0.8108108108108109

accuracy: 0.8108108108108109
precision: 0.8271604938271605
recall: 0.7052631578947368
f1 score: 0.7613636363636362



In [12]:
# random state
# Toy data gets its own names so the Titanic X / y / X_train / X_test / y_train / y_test
# arrays are not overwritten: any cell that uses them can still be re-run after this
# one without silently picking up the 4-row toy lists.
toy_X = [[1, 1], [2, 2], [3, 3], [4, 4]]
toy_y = [0, 0, 1, 1]

# train_test_split() randomly splits the data, so we will end up with different
# data points in each set every time we run the code
print('Diff split each time we run the code:')
for run in (1, 2):
    toy_X_train, toy_X_test, toy_y_train, toy_y_test = train_test_split(toy_X, toy_y)
    print(f'run {run}')
    print('X_train:', toy_X_train)
    print('X_test:', toy_X_test)
print()

# 'random_state=' "locks" the split as long as we use the same number each time
print('Same split each time we run the code:')
for run in (1, 2):
    toy_X_train, toy_X_test, toy_y_train, toy_y_test = train_test_split(
        toy_X, toy_y, random_state=27
    )
    print(f'run {run}')
    print('X_train:', toy_X_train)
    print('X_test:', toy_X_test)

# the random state is also called a seed

# code and comments by github.com/alandavidgrunberg



Diff split each time we run the code:
run 1
X_train: [[4, 4], [1, 1], [2, 2]]
X_test: [[3, 3]]
run 2
X_train: [[2, 2], [1, 1], [4, 4]]
X_test: [[3, 3]]

Same split each time we run the code:
run 1
X_train: [[3, 3], [1, 1], [4, 4]]
X_test: [[2, 2]]
run 2
X_train: [[3, 3], [1, 1], [4, 4]]
X_test: [[2, 2]]
