Essential Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

Loading the dataset

In [None]:
# Read the passenger records from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='titanic.csv')
# Preview the first two rows to confirm the columns were parsed.
df.iloc[:2]

Make a copy for further analysis

In [26]:
# Independent snapshot of the reporting columns. It is taken before any of the
# cleaning steps below, so Age and Fare here keep their raw values.
df_for_results = df.loc[:, ['Name', 'Age', 'Fare']].copy()

Find missing values and treat them properly.

In [None]:
# Count the missing values in every column.
columns_null_quantities = df.isnull().sum()
# Print explicitly: a notebook cell only auto-displays its final expression,
# so a bare `columns_null_quantities` in the middle of the cell shows nothing.
print(columns_null_quantities)
# Column dtypes and non-null counts, as a cross-check of the numbers above.
df.info()

Fill rows where Age is missing (with the median of Age column)

In [None]:
# Impute missing ages with the median of the Age column. The median is
# computed over the whole frame, i.e. before the train/test split.
df['Age'] = df['Age'].fillna(df['Age'].median())

Drop rows where Embarked is null

In [None]:
# Remove the rows that have no Embarked value. The frame is modified in place
# and the index is not reset, so the surviving rows keep their original labels
# (those labels are later used to align predictions with df_for_results).
df.dropna(subset=['Embarked'], inplace=True)

Substitute outliers in column Fare with the mean

In [46]:
# Treat a fare as an outlier when it lies more than three standard deviations
# from the column mean, and overwrite it with that mean. Both statistics are
# computed with the outliers still included.
# Note: re-running this cell recomputes mean/std on the already-modified column.
mean = df['Fare'].mean()
std = df['Fare'].std()
outliers = df['Fare'].sub(mean).abs().gt(std * 3)
df['Fare'] = df['Fare'].mask(outliers, mean)

In [None]:
# Inspect the two categorical columns that are about to be one-hot encoded.
df.loc[:, ['Sex', 'Embarked']]

Perform one-hot encoding on columns Sex and Embarked

In [None]:
# Replace Sex and Embarked with one indicator column per category.
# The source columns are consumed, so this cell cannot be re-run on its own
# output without reloading the data first.
df = pd.get_dummies(data=df, columns=['Sex', 'Embarked'])
df

Drop 3 irrelevant columns (Cabin, Name, Ticket) and separate the target column (Survived) from the features


In [62]:
# Target vector: the label the classifier is trained to predict.
y = df['Survived']
# Feature matrix: every remaining column except the free-text fields
# (Cabin, Name, Ticket) and the target itself.
# NOTE(review): any identifier column present in the CSV (e.g. a passenger id)
# is still part of X here - confirm whether that is intended.
X = df.drop(columns=['Cabin', 'Name', 'Ticket', 'Survived'])

Split data into training and test sets

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Keep a DataFrame copy of the test features: its index is needed later to
# match predictions to rows, and X_test itself is overwritten by the scaler output.
X_test_copy = X_test.copy()

Scaling features

In [67]:
# Learn the per-feature min/max from the training rows only, then apply the
# same transformation to both splits so the test set does not influence the fit.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

Build the model and evaluate its performance

In [68]:
# Fit a logistic-regression classifier with default hyper-parameters on the
# scaled training data; fit() returns the estimator itself.
model = LogisticRegression().fit(X_train, y_train)
# Last expression of the cell: shows the fitted estimator.
model

Making predictions using the test set

In [None]:
# Predict the class label for every row of the scaled test set.
y_pred = model.predict(X_test)

# Store the predictions in the results frame as a new column. X_test_copy.index
# supplies the original row labels of the test rows, so each prediction is
# written to its own passenger; rows outside the test set are not assigned.
df_for_results.loc[X_test_copy.index , 'Survived_Predictions'] = y_pred

Measuring the model's accuracy using 2 different metrics

In [None]:
# Fraction of test rows whose predicted label equals the true label.
print(f'Accuracy Score of the Model is -> {accuracy_score(y_test, y_pred)}')
# Counts of true vs. predicted labels (rows = true class, columns = predicted class).
print(f'Confusion Matrix of the Model is  ->\n{confusion_matrix(y_test, y_pred)}')

# scikit-learn layout for binary labels in sorted order (0, then 1):
# [[True-, False+], [False-, True+]]  i.e.  [[TN, FP], [FN, TP]]

In [None]:
# Show the results frame restricted to the test rows (all columns).
df_for_results.loc[X_test_copy.index, :]