In [1]:
import os
from typing import List, Dict, Any

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Read the passenger training data from data/train.csv and display it.
train_csv_path = os.path.join("data", "train.csv")
train_df = pd.read_csv(train_csv_path)
train_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [3]:
def substitute_sex(x: str) -> int:
    """Encode a ``Sex`` value as an integer.

    Returns 0 for ``"male"`` and 1 for any other value.
    """
    if x == "male":
        return 0
    return 1


def stone_the_adulters(x: int) -> str:
    """Denormalizing function: turn an integer sex code back into a label.

    Returns ``"male"`` for 0 and ``"female"`` for any other value.
    """
    if x == 0:
        return "male"
    return "female"

def substitute_embarked(x: str) -> int:
    """Encode an ``Embarked`` port code as an integer.

    ``"S"`` maps to 0 and ``"C"`` maps to 1; every other value falls through
    to 2.
    """
    if x == "S":
        return 0
    return 1 if x == "C" else 2

def disembarque(x: int) -> str:
    """Turn an integer port code back into its ``Embarked`` letter.

    0 gives ``"S"``, 1 gives ``"C"``, and any other value gives ``"Q"``.
    """
    return "S" if x == 0 else ("C" if x == 1 else "Q")

In [4]:
# Drop the columns that are not used as features, discard every row that
# still has a missing value, and integer-encode the two categorical columns.
# NOTE(review): this rebinds `train_df`, so the cell cannot be re-run on its
# own output (the dropped columns no longer exist); reload the CSV first.
train_df = (
    train_df
    .drop(columns=['PassengerId', 'Name', 'Ticket', 'Fare', 'Cabin'])
    .dropna()
    .assign(
        Sex=lambda frame: frame['Sex'].apply(substitute_sex),
        Embarked=lambda frame: frame['Embarked'].apply(substitute_embarked),
    )
)

# Standardize Age to zero mean and unit (sample) standard deviation.
# NOTE(review): mean/std are computed over all remaining rows, i.e. before
# the later train/test split, so held-out rows influence the scaling.
mean, std = train_df['Age'].mean(), train_df['Age'].std()
train_df = train_df.assign(Age=(train_df['Age'] - mean) / std)

In [5]:
# Constant bias/intercept feature: assigning a scalar broadcasts it to every
# row, so no per-row list needs to be built.
train_df['bias'] = 1

In [6]:
# Inspect the frame after preprocessing: encoded Sex/Embarked, standardized
# Age, and the constant bias column.
train_df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked,bias
0,0,3,0,-0.527298,1,0,0,1
1,1,1,1,0.576688,1,0,1,1
2,1,3,1,-0.251301,0,0,0,1
3,1,1,1,0.369691,1,0,0,1
4,0,3,0,0.369691,0,0,0,1
...,...,...,...,...,...,...,...,...
885,0,3,1,0.645688,0,5,2,1
886,0,2,0,-0.182302,0,0,0,1
887,1,1,1,-0.734295,0,0,0,1
889,1,1,0,-0.251301,0,0,1,1


In [7]:
# Separate the features from the `Survived` target and hold out 15% of the
# rows for evaluation. The fixed random_state makes the split reproducible.
# `train_test_split` is imported with the other dependencies in the first cell.
X = train_df.drop(columns=['Survived'])
y = train_df['Survived']

X_train, X_test, y_train, y_test = train_test_split(
    X.values,
    y.values,
    test_size=0.15,
    random_state=69,
)

In [8]:
# Shapes of the train/test feature matrices and label vectors from the split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((605, 7), (107, 7), (605,), (107,))

In [34]:
def MSELoss(y_true: np.ndarray, y_pred: np.ndarray) -> np.float64:
    """Mean squared error between targets and predictions.

    Both inputs are flattened and converted to float64 before subtracting.
    Without the conversion, boolean arrays raise on ``-`` and unsigned-integer
    arrays wrap around, silently producing a wrong loss.

    Args:
        y_true: Ground-truth values, any shape.
        y_pred: Predicted values; must have the same number of elements as
            ``y_true`` or be broadcastable against it after flattening.

    Returns:
        The mean of the squared element-wise differences, as ``np.float64``.

    Raises:
        ValueError: If either input is empty.
    """
    y_true = np.asarray(y_true, dtype=np.float64).ravel()
    y_pred = np.asarray(y_pred, dtype=np.float64).ravel()
    if y_true.size == 0 or y_pred.size == 0:
        raise ValueError("MSELoss requires non-empty inputs")
    # np.mean divides by the number of compared elements, which stays correct
    # even when one side is broadcast.
    return np.mean((y_pred - y_true) ** 2)

def MAELoss(y_true: np.ndarray, y_pred: np.ndarray) -> np.float64:
    """Mean absolute error between targets and predictions.

    Both inputs are flattened and converted to float64 before subtracting.
    Without the conversion, boolean arrays raise on ``-`` and unsigned-integer
    arrays wrap around (e.g. uint8 ``0 - 1`` becomes 255).

    Args:
        y_true: Ground-truth values, any shape.
        y_pred: Predicted values; must have the same number of elements as
            ``y_true`` or be broadcastable against it after flattening.

    Returns:
        The mean of the absolute element-wise differences, as ``np.float64``.

    Raises:
        ValueError: If either input is empty.
    """
    y_true = np.asarray(y_true, dtype=np.float64).ravel()
    y_pred = np.asarray(y_pred, dtype=np.float64).ravel()
    if y_true.size == 0 or y_pred.size == 0:
        raise ValueError("MAELoss requires non-empty inputs")
    # np.mean divides by the number of compared elements, which stays correct
    # even when one side is broadcast.
    return np.mean(np.abs(y_pred - y_true))

def accuracy_with_thresholding(y_true: np.ndarray, y_pred: np.ndarray, threshold = 0.5):
    """Accuracy of score predictions after binarizing them at ``threshold``.

    Scores strictly greater than ``threshold`` count as class 1, all others
    as class 0. Both arrays are flattened before comparison, and the number of
    matches is divided by the number of true labels.
    """
    labels = y_true.flatten()
    scores = y_pred.flatten()
    predicted = (scores > threshold).astype(np.int32)
    n_correct = np.sum(labels == predicted)
    return n_correct / len(labels)
    
def accuracy(y_true: np.ndarray, y_pred: np.ndarray):
    """Fraction of predictions that exactly equal the true labels.

    Both arrays are flattened before the element-wise comparison, and the
    number of matches is divided by the number of true labels.
    """
    labels = y_true.flatten()
    predicted = y_pred.flatten()
    n_correct = np.sum(labels == predicted)
    return n_correct / len(labels)
    
def sigmoid(z):
    """Numerically stable logistic function ``1 / (1 + exp(-z))``.

    The naive formula evaluates ``exp(-z)``, which overflows for large
    negative ``z``. Here the denominator is computed in log space:
    ``log(1 + exp(-z)) == logaddexp(0, -z)``, so the sigmoid is
    ``exp(-logaddexp(0, -z))`` and only exponentials of non-positive numbers
    are ever taken.

    Args:
        z: A scalar or NumPy array of real values.

    Returns:
        The element-wise sigmoid of ``z``, with the same shape as ``z``.
    """
    return np.exp(-np.logaddexp(0, -z))


# KNN: k-nearest-neighbours classifier on the encoded passenger features

In [45]:
# Share of test labels equal to 0, i.e. the accuracy obtained by always
# predicting class 0. Useful as a baseline for the classifier.
np.mean(y_test == 0)

np.float64(0.5607476635514018)

In [65]:

def distance(x_row, X_train):
    """Euclidean distance from one sample to every row of ``X_train``.

    Args:
        x_row: NumPy array holding the feature values of a single sample.
        X_train: 2-D array with one training sample per row.

    Returns:
        1-D array with one distance per row of ``X_train``.
    """
    # Reshape to a single row so the subtraction broadcasts across all rows.
    diffs = X_train - x_row.reshape(1, -1)
    return np.sqrt(np.sum(diffs ** 2, axis=1))


def knn(x_row, X_train, y_train, k):
    """Classify ``x_row`` by majority vote among its ``k`` nearest neighbours.

    Neighbours are ranked by Euclidean distance; equal distances are ordered
    by label value. If ``k`` exceeds the number of training samples, all of
    them vote. A tied vote is resolved in favour of the smallest label, so the
    result is deterministic.

    Args:
        x_row: Feature vector of the sample to classify.
        X_train: 2-D array with one training sample per row.
        y_train: Labels aligned with the rows of ``X_train``.
        k: Number of neighbours that vote; must be at least 1.

    Returns:
        The winning label, as an element of ``y_train``'s array type.

    Raises:
        ValueError: If ``k`` is smaller than 1 or the training set is empty.
    """
    if k < 1:
        # A negative k would otherwise slice from the end of the ranking.
        raise ValueError(f"k must be a positive integer, got {k!r}")

    labels = np.asarray(y_train)
    if labels.size == 0:
        raise ValueError("knn requires a non-empty training set")

    distances = distance(x_row, X_train)

    # lexsort treats the LAST key as primary: rank by distance, then by label.
    ranking = np.lexsort((labels, distances))
    nearest_labels = labels[ranking[:k]]

    # np.unique returns labels in ascending order and argmax picks the first
    # maximum, so ties go to the smallest label.
    candidates, votes = np.unique(nearest_labels, return_counts=True)
    return candidates[np.argmax(votes)]

In [68]:
# Classify every held-out row with k-NN (k neighbours vote) and report the
# fraction of test labels predicted correctly.
k = 25
y_pred = [int(knn(sample, X_train, y_train, k)) for sample in X_test]
accuracy(y_test, np.array(y_pred))

np.float64(0.8130841121495327)