In [86]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Load Dataset

In [87]:
# Load dataset
titanic_df = pd.read_csv(r'C:\Users\USER\Desktop\ICT DATASCIENCE\titanic_dataset .csv')
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Pre processing

In [88]:
# Drop irrelevant columns
titanic_df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)


In [89]:
titanic_df.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [90]:
# Fill missing values
titanic_df['Age'].fillna(titanic_df['Age'].median(), inplace=True)
titanic_df['Embarked'].fillna(titanic_df['Embarked'].mode(), inplace=True)

In [91]:
titanic_df.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    2
dtype: int64

In [92]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [93]:
# Convert categorical variables into dummy/indicator variables
titanic_df = pd.get_dummies(titanic_df, columns=['Sex', 'Embarked'],drop_first=True)

In [94]:
# Separate features and target variable
X = titanic_df.drop('Survived', axis=1)
y = titanic_df['Survived']

In [95]:
# Standardize features
sc = StandardScaler()
X_sc = sc.fit_transform(X)
X_sc=pd.DataFrame(X_sc)
X_sc.describe()

Unnamed: 0,0,1,2,3,4,5,6,7
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,-8.772133e-17,2.27278e-16,4.3860660000000004e-17,5.3829000000000005e-17,3.9873330000000004e-18,-1.156327e-16,0.0,-4.984166e-17
std,1.000562,1.000562,1.000562,1.000562,1.000562,1.000562,1.000562,1.000562
min,-1.566107,-2.224156,-0.4745452,-0.4736736,-0.6484217,-1.355574,-0.307562,-1.61471
25%,-0.3693648,-0.5657365,-0.4745452,-0.4736736,-0.4891482,-1.355574,-0.307562,-1.61471
50%,0.8273772,-0.1046374,-0.4745452,-0.4736736,-0.3573909,0.7376951,-0.307562,0.6193064
75%,0.8273772,0.4333115,0.4327934,-0.4736736,-0.02424635,0.7376951,-0.307562,0.6193064
max,0.8273772,3.891554,6.784163,6.974147,9.667167,0.7376951,3.251373,0.6193064


In [96]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# KNN Model

In [107]:
# Create kNN model
knn_model = KNeighborsClassifier()

# KFold cross validation on KNN

In [108]:
kfold_validator=KFold(10)

In [109]:
kf_cv_result=cross_val_score(knn_model,X_sc,y,cv=kfold_validator)

In [110]:
kf_cv_result

array([0.78888889, 0.79775281, 0.7752809 , 0.80898876, 0.82022472,
       0.82022472, 0.83146067, 0.7752809 , 0.85393258, 0.78651685])

In [111]:
#Average score of the model
kf_cv_result.mean()

0.8058551810237204

#  stratified k-fold cross validation on KNN

In [112]:
strat_validator=StratifiedKFold(n_splits=10)

In [113]:
st_cv_result=cross_val_score(knn_model,X_sc,y,cv=strat_validator)

In [114]:
st_cv_result

array([0.78888889, 0.82022472, 0.73033708, 0.83146067, 0.84269663,
       0.82022472, 0.85393258, 0.78651685, 0.83146067, 0.78651685])

In [115]:
#Average score of the model
st_cv_result.mean()

0.8092259675405742

# SVM Model

In [116]:
# Create SVM model
sv_clf=SVC()

# KFold cross validation on SVM

In [117]:
kf_cv_result_svm=cross_val_score(sv_clf,X_sc,y,cv=kfold_validator)

In [118]:
kf_cv_result_svm

array([0.81111111, 0.86516854, 0.7752809 , 0.84269663, 0.79775281,
       0.79775281, 0.80898876, 0.75280899, 0.8988764 , 0.86516854])

In [119]:
kf_cv_result_svm.mean()

0.8215605493133584

# stratified k-fold cross validation on SVM

In [120]:
st_cv_result_svm=cross_val_score(sv_clf,X_sc,y,cv=strat_validator)

In [121]:
st_cv_result_svm

array([0.81111111, 0.84269663, 0.7752809 , 0.86516854, 0.82022472,
       0.78651685, 0.83146067, 0.76404494, 0.87640449, 0.87640449])

In [122]:
st_cv_result_svm.mean()

0.8249313358302123

# Average accuracy score of models

In [123]:
print("Average Accuracy Scores:")
print("KNN KFold average score:",kf_cv_result.mean())
print("kNN Stratified Kfold average score:",st_cv_result.mean())
print("SVM KFold average score:",kf_cv_result_svm.mean())
print("SVM Stratified Kfold average score:",st_cv_result_svm.mean())

Average Accuracy Scores:
KNN KFold average score: 0.8058551810237204
kNN Stratified Kfold average score: 0.8092259675405742
SVM KFold average score: 0.8215605493133584
SVM Stratified Kfold average score: 0.8249313358302123
