In [1]:
from typing import List, Dict, Any
import pandas as pd
import numpy as np
import os

In [2]:
train_df = pd.read_csv(os.path.join('data', "train.csv"))
train_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [3]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

train_df = train_df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Fare', 'Cabin']).dropna()
train_df["Sex"] = LabelEncoder().fit_transform(train_df["Sex"])
train_df["Embarked"] = LabelEncoder().fit_transform(train_df["Embarked"])
train_df['Age'] = StandardScaler().fit_transform(train_df['Age'].values.reshape(-1, 1))
train_df['bias'] = np.ones(train_df.shape[0])

In [4]:
train_df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked,bias
0,0,3,1,-0.527669,1,0,2,1.0
1,1,1,0,0.577094,1,0,0,1.0
2,1,3,0,-0.251478,0,0,2,1.0
3,1,1,0,0.369951,1,0,2,1.0
4,0,3,1,0.369951,0,0,2,1.0
...,...,...,...,...,...,...,...,...
885,0,3,0,0.646142,0,5,1,1.0
886,0,2,1,-0.182430,0,0,2,1.0
887,1,1,0,-0.734812,0,0,2,1.0
889,1,1,1,-0.251478,0,0,0,1.0


In [5]:
from sklearn.model_selection import train_test_split

X = train_df.drop(columns=['Survived'])
y = train_df['Survived']

X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, test_size=0.15, random_state=69)

In [33]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((605, 7), (107, 7), (605,), (107,))

In [7]:
def MSELoss(y_true: np.ndarray, y_pred: np.ndarray) -> np.float32:
    y_true = y_true.flatten()
    y_pred = y_pred.flatten()
    return (1/len(y_true)) * np.sum((y_pred - y_true) ** 2)

def MAELoss(y_true: np.ndarray, y_pred: np.ndarray) -> np.float32:
    y_true = y_true.flatten()
    y_pred = y_pred.flatten()
    return (1/len(y_true)) * np.sum(np.abs((y_pred - y_true)))

def accuracy_with_thresholding(y_true: np.ndarray, y_pred: np.ndarray, threshold = 0.5):
    y_true = y_true.flatten()
    y_pred = y_pred.flatten()
    return np.sum(
        y_true == (y_pred > threshold).astype(np.int32)
    ) / len(y_true)
    
def sigmoid(z):
    return 1/(1 + np.exp(-z))


# K-Means

In [27]:
np.random.seed(69)
n_dims = X_train.shape[1]
centeroid_1 = np.random.rand(n_dims)
centeroid_0 = np.random.rand(n_dims)

centeroid_1, centeroid_0

(array([0.29624916, 0.80906772, 0.35025253, 0.78940926, 0.56134898,
        0.25358895, 0.10497708]),
 array([0.05846073, 0.67329238, 0.69782966, 0.73278321, 0.78787406,
        0.07637632, 0.3176806 ]))

In [28]:
df = pd.DataFrame()
df['cluster'] = 0


In [29]:
df['distance_from_centeroid_1'] = np.linalg.norm(X_train - centeroid_1, axis=1)
df['distance_from_centeroid_0'] = np.linalg.norm(X_train - centeroid_0, axis=1)
df

Unnamed: 0,cluster,distance_from_centeroid_1,distance_from_centeroid_0
0,,3.592599,3.947764
1,,2.315323,2.511048
2,,3.111002,3.394407
3,,2.887162,3.063696
4,,3.660660,3.832303
...,...,...,...
600,,3.677622,3.991419
601,,3.514396,3.839409
602,,2.478655,2.477514
603,,3.415140,3.751069


In [30]:
df['cluster'] = (df['distance_from_centeroid_0'] > df['distance_from_centeroid_1']).astype(np.int32)
df

Unnamed: 0,cluster,distance_from_centeroid_1,distance_from_centeroid_0
0,1,3.592599,3.947764
1,1,2.315323,2.511048
2,1,3.111002,3.394407
3,1,2.887162,3.063696
4,1,3.660660,3.832303
...,...,...,...
600,1,3.677622,3.991419
601,1,3.514396,3.839409
602,0,2.478655,2.477514
603,1,3.415140,3.751069


In [31]:
n_epochs = 100

for epoch in range(n_epochs):
    df['distance_from_centeroid_1'] = np.linalg.norm(X_train - centeroid_1, axis=1)
    df['distance_from_centeroid_0'] = np.linalg.norm(X_train - centeroid_0, axis=1)
    df['cluster'] = (df['distance_from_centeroid_0'] > df['distance_from_centeroid_1']).astype(np.int32)
    
    centeroid_1 = np.mean(X_train[df['cluster'] == 1], axis=0)
    centeroid_0 = np.mean(X_train[df['cluster'] == 0], axis=0)
    
    print(f"Epoch {epoch+1}/{n_epochs} - Loss: {MSELoss(y_train, sigmoid(np.dot(X_train, centeroid_1)))}")

Epoch 1/100 - Loss: 0.6004290994911503
Epoch 2/100 - Loss: 0.6003391501588581
Epoch 3/100 - Loss: 0.6003740402280581
Epoch 4/100 - Loss: 0.6003969612385782
Epoch 5/100 - Loss: 0.600395738825342
Epoch 6/100 - Loss: 0.6003929486496854
Epoch 7/100 - Loss: 0.6003783392558093
Epoch 8/100 - Loss: 0.6003600523338269
Epoch 9/100 - Loss: 0.6003526441651202
Epoch 10/100 - Loss: 0.6003164098397494
Epoch 11/100 - Loss: 0.600278131670149
Epoch 12/100 - Loss: 0.6002430980058954
Epoch 13/100 - Loss: 0.6002243447588397
Epoch 14/100 - Loss: 0.600223043090072
Epoch 15/100 - Loss: 0.6002111253097686
Epoch 16/100 - Loss: 0.6002111253097686
Epoch 17/100 - Loss: 0.6002111253097686
Epoch 18/100 - Loss: 0.6002111253097686
Epoch 19/100 - Loss: 0.6002111253097686
Epoch 20/100 - Loss: 0.6002111253097686
Epoch 21/100 - Loss: 0.6002111253097686
Epoch 22/100 - Loss: 0.6002111253097686
Epoch 23/100 - Loss: 0.6002111253097686
Epoch 24/100 - Loss: 0.6002111253097686
Epoch 25/100 - Loss: 0.6002111253097686
Epoch 26/100

In [34]:
eval_df = pd.DataFrame()

eval_df['distance_from_centeroid_1'] = np.linalg.norm(X_test - centeroid_1, axis=1)
eval_df['distance_from_centeroid_0'] = np.linalg.norm(X_test - centeroid_0, axis=1)
eval_df['cluster'] = (eval_df['distance_from_centeroid_0'] > eval_df['distance_from_centeroid_1']).astype(np.int32)

In [36]:
eval_df['y_true'] = y_test

In [37]:
eval_df

Unnamed: 0,distance_from_centeroid_1,distance_from_centeroid_0,cluster,y_true
0,2.591151,2.167993,0,1
1,1.304863,1.343987,1,0
2,1.467429,1.357261,0,1
3,2.134500,1.399580,0,1
4,0.881046,2.126977,1,0
...,...,...,...,...
102,1.046446,2.249655,1,0
103,1.361640,1.749750,1,0
104,3.002309,1.652109,0,1
105,1.596635,2.544203,1,0


In [41]:
eval_df['y_pred'] = eval_df['cluster'].apply(lambda x: 1 if x == 0 else 0)

In [43]:
(eval_df['y_true'] == eval_df['y_pred']).sum() / len(eval_df)

np.float64(0.6542056074766355)

# surprize for mohammed

In [48]:
eval_df = pd.DataFrame()

eval_df['distance_from_centeroid_1'] = np.linalg.norm(X_train - centeroid_1, axis=1)
eval_df['distance_from_centeroid_0'] = np.linalg.norm(X_train - centeroid_0, axis=1)
eval_df['cluster'] = (eval_df['distance_from_centeroid_0'] > eval_df['distance_from_centeroid_1']).astype(np.int32)

In [49]:
X_train_with_cluster = pd.DataFrame(X_train, columns=X.columns)
X_train_with_cluster['cluster'] = eval_df['cluster']

In [53]:
(X_train_with_cluster['cluster'].apply(lambda x: 1 if x == 0 else 0) == y_train).sum() / len(y_train)

np.float64(0.5818181818181818)

In [55]:
X_train_with_cluster = X_train_with_cluster.values

In [58]:
eval_df = pd.DataFrame()

eval_df['distance_from_centeroid_1'] = np.linalg.norm(X_test - centeroid_1, axis=1)
eval_df['distance_from_centeroid_0'] = np.linalg.norm(X_test - centeroid_0, axis=1)
eval_df['cluster'] = (eval_df['distance_from_centeroid_0'] > eval_df['distance_from_centeroid_1']).astype(np.int32)

In [59]:
X_test_with_cluster = pd.DataFrame(X_test, columns=X.columns)
X_test_with_cluster['cluster'] = eval_df['cluster']

In [60]:
X_test_with_cluster = X_test_with_cluster.values

In [56]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=25)


In [57]:
knn.fit(X_train_with_cluster, y_train)

In [61]:
knn.score(X_test_with_cluster, y_test)

0.8130841121495327

In [64]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()

lr.fit(X_train_with_cluster, y_train)
lr.score(X_test_with_cluster, y_test)

0.794392523364486

In [65]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()

lr.fit(X_train, y_train)
lr.score(X_test, y_test)

0.8037383177570093