In [1]:
#import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
#import our dataset
data = pd.read_csv("Titanic-Dataset.csv")
df = pd.DataFrame(data)

In [3]:
#select required columns
df = df[['Pclass', 'Sex', 'Age', 'Fare', 'Survived']]
df.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Survived
0,3,male,22.0,7.25,0
1,1,female,38.0,71.2833,1
2,3,female,26.0,7.925,1
3,1,female,35.0,53.1,1
4,3,male,35.0,8.05,0


In [4]:
#transform categorical feature into numerical feature
label_encoder = LabelEncoder()
df["Sex"] = label_encoder.fit_transform(df["Sex"])

In [5]:
#get insight into our data frame
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Sex       891 non-null    int32  
 2   Age       714 non-null    float64
 3   Fare      891 non-null    float64
 4   Survived  891 non-null    int64  
dtypes: float64(2), int32(1), int64(2)
memory usage: 31.4 KB


Unnamed: 0,Pclass,Sex,Age,Fare,Survived
0,3,1,22.0,7.25,0
1,1,0,38.0,71.2833,1
2,3,0,26.0,7.925,1
3,1,0,35.0,53.1,1
4,3,1,35.0,8.05,0


In [6]:
#fill the vacant boxes in the Age column with the median of the given ages
df["Age"].fillna(df["Age"].median())
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Sex       891 non-null    int32  
 2   Age       714 non-null    float64
 3   Fare      891 non-null    float64
 4   Survived  891 non-null    int64  
dtypes: float64(2), int32(1), int64(2)
memory usage: 31.4 KB


In [7]:
#Split the dataset into test and train
x = df.drop("Survived", axis = 1) #drop the Survived column from the data frame
y = df["Survived"] #assign the survived column to y
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state = 42) #split respectively

In [54]:
#train the logistic regression model
model = LogisticRegression(random_state = 42)
model.fit(x_train, y_train)

In [55]:
#predict from the X test
#the predictions will be used to evaluate the Y test
model_predictions = model.predict(x_test)

#check for the model accuracy
model_accuracy = accuracy_score(model_predictions, y_test)
model_conf_matrix = confusion_matrix(y_test, model_predictions)
print(f"Logistic Regression Accuracy: {model_accuracy:.2f}")

Logistic Regression Accuracy: 0.80


In [56]:
#evaluate the model
print("Confusion Matrix:\n", model_conf_matrix)
#[True Positive False Positive]
#[False Negative True Negative]

Confusion Matrix:
 [[90 15]
 [20 54]]
