In [None]:
# importing libraries
import pandas as pd
from tabulate import tabulate # type: ignore

#Labelling categorical data using LabelEncoder class (optional)
from sklearn.preprocessing import LabelEncoder
#for scaling x to a standard range of values
from sklearn.preprocessing import StandardScaler
# for splitting train-test dataset
from sklearn.model_selection import train_test_split
#for logistic regression and Linear regression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
#for performance metrices
from sklearn.metrics import accuracy_score as accuracy, precision_score as precision, recall_score as recall, f1_score as f1, confusion_matrix, classification_report


In [73]:
# load the dataset
df = pd.read_csv("Titanic-Dataset.csv")

In [74]:
# drop irrelevant columns
df.drop(columns=["PassengerId", "Name", "Ticket", 'Cabin']) 


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
888,0,3,female,,1,2,23.4500,S
889,1,1,male,26.0,0,0,30.0000,C


In [75]:
# handling missing values
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

In [76]:


#hot encoding for non ordinal categorical variables (sex or embarked(P, Q, S))
df = pd.get_dummies(df, columns=['Sex', 'Embarked'], drop_first=True)
df


Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_male,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.2500,,True,False,True
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,False,False,False
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.9250,,False,False,True
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1000,C123,False,False,True
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.0500,,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",27.0,0,0,211536,13.0000,,True,False,True
887,888,1,1,"Graham, Miss. Margaret Edith",19.0,0,0,112053,30.0000,B42,False,False,True
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",28.0,1,2,W./C. 6607,23.4500,,False,False,True
889,890,1,1,"Behr, Mr. Karl Howell",26.0,0,0,111369,30.0000,C148,True,False,False


In [77]:
# scaling the numeric columns (age, fare, sibsp, Parch)
# standardization 
stdScalar = StandardScaler()
df[['Age', 'Fare', 'Parch', 'SibSp']] = stdScalar.fit_transform(df[['Age', 'Fare', 'Parch', 'SibSp']])
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_male,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",-0.565736,0.432793,-0.473674,A/5 21171,-0.502445,,True,False,True
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0.663861,0.432793,-0.473674,PC 17599,0.786845,C85,False,False,False
2,3,1,3,"Heikkinen, Miss. Laina",-0.258337,-0.474545,-0.473674,STON/O2. 3101282,-0.488854,,False,False,True
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0.433312,0.432793,-0.473674,113803,0.420730,C123,False,False,True
4,5,0,3,"Allen, Mr. William Henry",0.433312,-0.474545,-0.473674,373450,-0.486337,,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",-0.181487,-0.474545,-0.473674,211536,-0.386671,,True,False,True
887,888,1,1,"Graham, Miss. Margaret Edith",-0.796286,-0.474545,-0.473674,112053,-0.044381,B42,False,False,True
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",-0.104637,0.432793,2.008933,W./C. 6607,-0.176263,,False,False,True
889,890,1,1,"Behr, Mr. Karl Howell",-0.258337,-0.474545,-0.473674,111369,-0.044381,C148,True,False,False


In [84]:
# now Target(y) = Survived or not? 
# remove Target(y) from features(x)
# Drop rows with missing target values (if any)

df = df.dropna(subset=['Survived'])
x = df.drop('Survived', axis=1)
y = df['Survived']



In [87]:
# split the data into 80% for training and 20% for test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
# apply logistic regression
