In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Load the Titanic passenger table and add a boolean 'Male' column derived from 'Sex'.
df = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv')
df['Male'] = df['Sex'].eq('male')

# Feature matrix (six columns) and target vector, both as NumPy arrays.
feature_columns = ['Pclass', 'Male', 'Age', 'Siblings/Spouses', 'Parents/Children', 'Fare']
X = df[feature_columns].values
y = df['Survived'].values

# Hold out a test set; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5)

## Area Under Curve (AUC)

# An ROC curve plots the true positive rate against the false positive rate of a single logistic regression model across different classification thresholds. This helps us choose the best threshold for our final model.

# AUC allows us to compare ROC Curves for different logistic regression models. This helps us choose the best combination of features for our final model.

# The AUC value lies between 0 and 1; the higher the better (0.5 corresponds to random guessing).

# Model 1: logistic regression trained on the training data with all 6 features.
# fit() returns the fitted estimator, so construction and fitting are chained.
model1 = LogisticRegression().fit(X_train, y_train)

# Class probabilities for the test data, one column per class:
# column 0 = probability of class 0 (died), column 1 = probability of class 1 (survived).
y_pred_proba1 = model1.predict_proba(X_test)

# Only the second column (probability of survival) is needed for the AUC.
y_survival_proba1 = y_pred_proba1[:, 1]

# AUC is computed from the actual targets and the predicted survival probabilities.
auc_score1 = roc_auc_score(y_test, y_survival_proba1)
print("model 1 AUC score:", auc_score1)


# Model 2: logistic regression trained on only the first 2 feature columns
# ('Pclass' and 'Male'). fit() returns the fitted estimator, so the calls are chained.
model2 = LogisticRegression().fit(X_train[:, :2], y_train)

# Class probabilities for the same 2 columns of the test data, one column per class:
# column 0 = probability of class 0 (died), column 1 = probability of class 1 (survived).
y_pred_proba2 = model2.predict_proba(X_test[:, :2])

# Only the second column (probability of survival) is needed for the AUC.
y_survival_proba2 = y_pred_proba2[:, 1]

# AUC is computed from the actual targets and the predicted survival probabilities.
auc_score2 = roc_auc_score(y_test, y_survival_proba2)
print("model 2 AUC score:", auc_score2)

# Model 1 has the higher AUC score, so it is the better model.

# code and comments by github.com/alandavidgrunberg


model 1 AUC score: 0.8572299651567944
model 2 AUC score: 0.8390679442508711
