# Library

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, LeaveOneOut
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [2]:
import warnings
from sklearn.exceptions import DataConversionWarning

warnings.filterwarnings("ignore", category = DataConversionWarning)

# Read Dataset

In [3]:
dataset = pd.read_csv('titanic.csv')
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Train & Test Data

In [4]:
# features for train data
train_data = dataset[['Sex', 'Age', 'Pclass', 'Fare']]
train_data.head()

Unnamed: 0,Sex,Age,Pclass,Fare
0,male,22.0,3,7.25
1,female,38.0,1,71.2833
2,female,26.0,3,7.925
3,female,35.0,1,53.1
4,male,35.0,3,8.05


In [5]:
# label for test data
label = dataset[['Survived']]
label.head()

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0


# Handling Missing Values

In [6]:
avg_age_per_class = dataset.groupby('Survived')['Age'].mean()
print(avg_age_per_class)

Survived
0    30.626179
1    28.343690
Name: Age, dtype: float64


In [7]:
train_data['Age'].tail()

886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: Age, dtype: float64

In [8]:
for survived, mean_age in avg_age_per_class.items():
    train_data.loc[(dataset['Survived'] == survived) & (train_data['Age'].isnull()), 'Age'] = mean_age

train_data['Age'].tail()

886    27.000000
887    19.000000
888    30.626179
889    26.000000
890    32.000000
Name: Age, dtype: float64

# Encoding Categorical Feature

In [9]:
train_data.loc[:, 'Sex'] = train_data['Sex'].map({'male': 0, 'female': 1})
train_data.head()

Unnamed: 0,Sex,Age,Pclass,Fare
0,0,22.0,3,7.25
1,1,38.0,1,71.2833
2,1,26.0,3,7.925
3,1,35.0,1,53.1
4,0,35.0,3,8.05


# Normalization

In [10]:
num_features = ['Age', 'Fare']
scaler = MinMaxScaler()
train_data_norm = pd.DataFrame(scaler.fit_transform(train_data[num_features]), columns = num_features)
train_data_norm.head()

Unnamed: 0,Age,Fare
0,0.271174,0.014151
1,0.472229,0.139136
2,0.321438,0.015469
3,0.434531,0.103644
4,0.434531,0.015713


In [11]:
min_train = scaler.data_min_
max_train = scaler.data_max_

print(f'Min Value\t: {min_train}')
print(f'Max Value\t: {max_train}')

Min Value	: [0.42 0.  ]
Max Value	: [ 80.     512.3292]


In [12]:
train_data = pd.concat([train_data_norm, train_data.drop(columns = num_features)], axis = 1)
train_data.head()

Unnamed: 0,Age,Fare,Sex,Pclass
0,0.271174,0.014151,0,3
1,0.472229,0.139136,1,1
2,0.321438,0.015469,1,3
3,0.434531,0.103644,1,1
4,0.434531,0.015713,0,3


In [13]:
cat_features = ['Sex', 'Pclass']
test_data_norm = (train_data.drop(columns = cat_features) - min_train) / (max_train - min_train)
test_data = pd.concat([test_data_norm, train_data.drop(columns = num_features)], axis = 1)
test_data.head()

Unnamed: 0,Age,Fare,Sex,Pclass
0,-0.00187,2.8e-05,0,3
1,0.000656,0.000272,1,1
2,-0.001239,3e-05,1,3
3,0.000183,0.000202,1,1
4,0.000183,3.1e-05,0,3


# k-NN Classification & Validation Model

In [14]:
knn = KNeighborsClassifier(n_neighbors = 3)

# Hold-Out Method (70%-30%)
X_train, X_test, y_train, y_test = train_test_split(train_data, label, test_size = 0.3, random_state = 100)
knn.fit(X_train, y_train)
holdout_pred = knn.predict(X_test)
holdout_error_ratio = 1 - accuracy_score(y_test, holdout_pred)

In [15]:
# k-Fold (k: 10)
kf = KFold(n_splits = 10, shuffle = True, random_state = 100)
kf_error_sum = 0

for train_index, test_index in kf.split(train_data):
    X_train, X_test = train_data.iloc[train_index], train_data.iloc[test_index]
    y_train, y_test = label.iloc[train_index], label.iloc[test_index]
    knn.fit(X_train, y_train)
    kf_pred = knn.predict(X_test)
    kf_error_sum += 1 - accuracy_score(y_test, kf_pred)

kfold_error_ratio = kf_error_sum / 10

In [16]:
# Leave-One-Out (LOO)
loo = LeaveOneOut()
loo_error_sum = 0

for train_index, test_index in loo.split(train_data):
    X_train, X_test = train_data.iloc[train_index], train_data.iloc[test_index]
    y_train, y_test = label.iloc[train_index], label.iloc[test_index]
    knn.fit(X_train, y_train)
    loo_pred = knn.predict(X_test)
    loo_error_sum += (1 - accuracy_score(y_test, loo_pred))

loo_error_ratio = loo_error_sum / len(train_data)

In [17]:
print('Error Ratios'.center(33, '=')) 
print(f'Holdout (70%-30%)\t: {holdout_error_ratio:.5f}')
print(f'k-Fold (k: 10)\t\t: {kfold_error_ratio:.5f}')
print(f'Leave One Out (LOO)\t: {loo_error_ratio:.5f}')

Holdout (70%-30%)	: 0.13433
k-Fold (k: 10)		: 0.15941
Leave One Out (LOO)	: 0.15937
