In [58]:
from google.colab import drive
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, roc_auc_score, auc
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
import matplotlib.pyplot as plt

In [5]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset stored there can be read with ordinary file paths.
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
# Load the Titanic CSV from the mounted Drive folder into a DataFrame.
df = pd.read_csv('/content/drive/MyDrive/Colab/datasets/Titanic-Dataset.csv')

In [9]:
# Summarise column names, non-null counts and dtypes to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [24]:
# Pick the model inputs (Sex, Age, Pclass) and the target (Survived).
# Both selections use a list of labels, so each result is a DataFrame;
# Y is therefore a one-column frame rather than a Series.
X = df.loc[:, ['Sex', 'Age', 'Pclass']]
Y = df.loc[:, ['Survived']]

In [29]:
# Replace missing ages with the mean age. assign() builds a new frame, so X is
# rebound to an independent object and the source data is left untouched.
# NOTE(review): the mean is taken over every row, before the train/test split
# done later in the notebook, so test rows influence the imputed value.
X = X.assign(Age=X['Age'].fillna(X['Age'].mean()))

In [60]:
# Encode Sex as 1 (male) / 0 (female), then convert the features and the target
# to NumPy arrays for scikit-learn.
#
# The isinstance guards make this cell safe to re-run. After the first
# execution X and Y are already arrays, and indexing an array with 'Sex' would
# raise, so the conversion is skipped on any later execution. On a fresh
# kernel the result is identical to mapping the column and taking `.values`.
if isinstance(X, pd.DataFrame):
    X = X.assign(Sex=X['Sex'].map({'male': 1, 'female': 0})).values
if isinstance(Y, pd.DataFrame):
    Y = Y.values

In [41]:
# Two independent one-column frames holding Embarked with missing rows removed,
# one for each encoding approach tried next. dropna() already returns a new
# frame, and the second frame is an explicit copy of the first.
df_Embarked1 = df[['Embarked']].dropna()
df_Embarked2 = df_Embarked1.copy()

In [53]:
# One-hot encode Embarked with scikit-learn.
# drop='first' omits the first category's column; sparse_output=False makes
# transform() return a dense NumPy array instead of a sparse matrix.
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoder.fit(df_Embarked1)
df_Embarked1_encoded = encoder.transform(df_Embarked1)
print(df_Embarked1_encoded)

[[0. 1.]
 [0. 0.]
 [0. 1.]
 ...
 [0. 1.]
 [0. 0.]
 [1. 0.]]


In [57]:
# The same encoding done with pandas: drop_first=True omits the first dummy
# column and dtype=int produces 0/1 integers.
df_Embarked2_encoded = pd.get_dummies(df_Embarked2, dtype=int, drop_first=True)
print(df_Embarked2_encoded.to_numpy())

[[0 1]
 [0 0]
 [0 1]
 ...
 [0 1]
 [0 0]
 [1 0]]


In [103]:
# Hold out 20% of the rows for final testing, then estimate the accuracy of an
# L2-regularised logistic regression with shuffled 3-fold cross-validation on
# the remaining training portion. Fixed seeds keep both splits reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, shuffle=True, random_state=42
)
model = LogisticRegression(C=1, penalty='l2', max_iter=50)
kf = KFold(n_splits=3, shuffle=True, random_state=42)
cross_scores = cross_val_score(
    model, x_train, y_train.ravel(), scoring='accuracy', cv=kf
)
print(cross_scores.mean())

0.7963573615100049


In [93]:
# Mean 5-fold cross-validated ROC AUC, recall and F1 on the training split.
# Each metric is scored by its own cross_val_score call, then the three means
# are printed in one report.
cv_means = {
    metric: cross_val_score(
        model, x_train, y_train.ravel(), scoring=metric, cv=5
    ).mean()
    for metric in ('roc_auc', 'recall', 'f1')
}
print(
    f"\nroc_auc: {cv_means['roc_auc']:.2f}"
    f"\nrecall: {cv_means['recall']:.2f}"
    f"\nf1:{cv_means['f1']:.2f}"
)


roc_auc: 0.84
recall: 0.69
f1:0.71


In [77]:
# Fit the baseline classifier on the full training split. The target is
# flattened to 1-D because y_train is a column array.
model_train = LogisticRegression(max_iter=50, C=1, penalty='l2')
model_train.fit(x_train, y_train.ravel())

In [113]:
# Hold-out evaluation of the baseline model.
# Column 1 of predict_proba is thresholded at 0.5 to obtain hard 0/1 labels.
y_true = y_test.ravel()
y_prob = model_train.predict_proba(x_test)[:, 1]
y_pred = (y_prob >= 0.5).astype('int')

# Threshold-dependent metrics, one per line.
print(
    f'Accuracy score: {accuracy_score(y_true, y_pred):.2f}',
    f'Precision score: {precision_score(y_true, y_pred, zero_division=1.0):.2f}',
    f'Recall score: {recall_score(y_true, y_pred):.2f}',
    f'f1_score: {f1_score(y_true, y_pred):.2f}',
    sep='\n',
)

# ROC AUC computed two ways: integrating the ROC curve points, and directly
# from the labels and scores.
fpr, tpr, thresholds = roc_curve(y_true, y_prob)
print(f'AreaUnderCurve with fpr, tpr: {auc(fpr, tpr):.2f}')
# NOTE(review): the label below says y_pred, but the score is computed from
# the probabilities in y_prob.
print(f'AreaUnderCurve with y_true, y_pred: {roc_auc_score(y_true, y_prob):.2f}')

Accuracy score: 0.81
Precision score: 0.79
Recall score: 0.73
f1_score: 0.76
AreaUnderCurve with fpr, tpr: 0.87
AreaUnderCurve with y_true, y_pred: 0.87


In [95]:
# Inspect the dtype of the predicted probabilities.
y_prob.dtype

dtype('float64')

In [104]:
# Cast the test labels to float.
# NOTE(review): this rebinds y_test, so every cell executed afterwards sees
# float labels rather than the integer labels produced by the split.
y_test = y_test.astype('float')

In [111]:
# Inspect the dtype of the test labels.
y_test.dtype

dtype('float64')

In [81]:
# Recompute the class-1 probabilities for the test split and show their shape.
class_probabilities = model_train.predict_proba(x_test)
y_prob = class_probabilities[:, 1]
y_prob.shape

(179,)

In [84]:
# Shape of the test labels once flattened to 1-D, for comparison with
# y_prob.shape.
y_test.flatten().shape

(179,)

In [85]:
# Inspect the dtype of the predicted probabilities.
y_prob.dtype

dtype('float64')

In [86]:
# Inspect the dtype of the test labels.
y_test.dtype

dtype('int64')

In [114]:
# Standardise the features. The scaler learns its statistics from the training
# split only, and the test split is transformed with those same statistics.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [120]:
# Fit a second logistic regression on the standardised training features,
# this time with C=0.1. The target is flattened to 1-D because y_train is a
# column array.
model_on_scaled_dataset = LogisticRegression(max_iter=50, C=0.1, penalty='l2')
model_on_scaled_dataset.fit(x_train_scaled, y_train.ravel())

In [121]:
# Hold-out evaluation of the model trained on standardised features.
# Column 1 of predict_proba is thresholded at 0.5 to obtain hard 0/1 labels.
y_true = y_test.ravel()
y_prob = model_on_scaled_dataset.predict_proba(x_test_scaled)[:, 1]
y_pred = (y_prob >= 0.5).astype('int')

# Threshold-dependent metrics, one per line.
print(
    f'Accuracy score: {accuracy_score(y_true, y_pred):.2f}',
    f'Precision score: {precision_score(y_true, y_pred, zero_division=1.0):.2f}',
    f'Recall score: {recall_score(y_true, y_pred):.2f}',
    f'f1_score: {f1_score(y_true, y_pred):.2f}',
    sep='\n',
)

# ROC AUC computed two ways: integrating the ROC curve points, and directly
# from the labels and scores.
fpr, tpr, thresholds = roc_curve(y_true, y_prob)
print(f'AreaUnderCurve with fpr, tpr: {auc(fpr, tpr):.2f}')
# NOTE(review): the label below says y_pred, but the score is computed from
# the probabilities in y_prob.
print(f'AreaUnderCurve with y_true, y_pred: {roc_auc_score(y_true, y_prob):.2f}')

Accuracy score: 0.80
Precision score: 0.78
Recall score: 0.72
f1_score: 0.75
AreaUnderCurve with fpr, tpr: 0.87
AreaUnderCurve with y_true, y_pred: 0.87
