In [97]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from typing import Any, Callable
import helpers

# Load the labeled penguin table and preview its first rows.
labeled_df = pd.read_csv('data/labeled_penguins.csv')
labeled_df.head(5)

# Split the labeled table via the project helper, passing "species" as the label.
(
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
    feature_names, label_map,
) = helpers.preprocess_data(
    df=labeled_df,
    label="species",
    train_size=0.6,
    val_size=0.2,
    seed=42,
)

# Load the unlabeled table, preview it, and expose it as a NumPy array.
unlabeled_df = pd.read_csv('data/unlabeled_penguins.csv')
unlabeled_df.head(5)
X_unlabeled = unlabeled_df.to_numpy()

# Per-feature statistics of the training split, reused below to normalize every
# split. np.mean / np.std (population std, ddof=0) replace the hand-rolled
# sqrt(E[x^2] - mean^2) form, which loses precision through cancellation and can
# yield NaN when rounding pushes the computed variance slightly below zero.
mean = np.mean(X_train, axis=0)
std = np.std(X_train, axis=0)
print(mean, std)

def normalize(X: np.ndarray, mean: np.ndarray, std: np.ndarray) -> np.ndarray:
    """Standardize ``X`` feature-wise as ``(X - mean) / std``.

    Args:
        X: Data to scale; must broadcast against ``mean`` and ``std``.
        mean: Per-feature offsets to subtract.
        std: Per-feature scales to divide by.

    Returns:
        The scaled data. Features whose ``std`` is exactly zero are only
        centered (divided by 1) instead of producing ``nan``/``inf``.
    """
    # A constant feature has zero spread; dividing by it would poison every
    # downstream distance with nan/inf, so fall back to a unit scale there.
    safe_std = np.where(np.asarray(std) == 0, 1.0, std)
    return (X - mean) / safe_std

# After scaling, the two features are reported under normalized names.
feature_names = ['Bill length (normalized)', 'Body mass (normalized)']

# Scale every split with the same mean/std so all of them share one scale.
X_train, X_val, X_test, X_unlabeled = (
    normalize(X_train, mean, std),
    normalize(X_val, mean, std),
    normalize(X_test, mean, std),
    normalize(X_unlabeled, mean, std),
)

def manhattan_dist(sample: np.ndarray, X: np.ndarray) -> np.ndarray:
    """Return the Manhattan (L1) distance from ``sample`` to each row of ``X``.

    Args:
        sample: A single point; its last axis holds the features.
        X: One point or a stack of points whose last axis holds the features.

    Returns:
        The sum of absolute feature differences, reduced over the last axis:
        one distance per row of ``X`` (a scalar when ``X`` is a single point).
    """
    # np.sum over the feature axis replaces the builtin sum over a transposed
    # array, which looped in Python and reversed the axes of >2-D inputs.
    return np.sum(np.abs(sample - X), axis=-1)
   
def euclidean_dist(sample: np.ndarray, X: np.ndarray) -> np.ndarray:
    """Return the Euclidean (L2) distance from ``sample`` to each row of ``X``.

    Args:
        sample: A single point; its last axis holds the features.
        X: One point or a stack of points whose last axis holds the features.

    Returns:
        The square root of the summed squared feature differences, reduced
        over the last axis: one distance per row of ``X`` (a scalar when ``X``
        is a single point).
    """
    # np.sum over the feature axis replaces the builtin sum over a transposed
    # array, which looped in Python and reversed the axes of >2-D inputs.
    return np.sqrt(np.sum((sample - X) ** 2, axis=-1))

# Spot-check both metrics: distances from X_val[0] to the first three rows of
# X_train, rounded to one decimal place.
print(f'Manhattan: {np.round(manhattan_dist(X_val[0], X_train[:3]), decimals=1)}')
print(f'Euclidean: {np.round(euclidean_dist(X_val[0], X_train[:3]), decimals=1)}')

def find_nearest_neighbors(
    sample: np.ndarray,
    X: np.ndarray,
    distance_fn: Callable = euclidean_dist,
    k: int = 1) -> np.ndarray:
    """Return the indices of the ``k`` points in ``X`` closest to ``sample``.

    Args:
        sample: The query point.
        X: Candidate points, one per row.
        distance_fn: Called as ``distance_fn(sample, X)``; must return one
            distance per row of ``X``.
        k: Number of neighbors to return.

    Returns:
        Indices into ``X`` ordered from nearest to farthest, truncated to the
        first ``k`` entries.
    """
    distances = distance_fn(sample, X)
    # A stable sort keeps equidistant points in their original order, so ties
    # are broken deterministically (lowest index first) rather than depending
    # on the sorting algorithm's internals.
    return np.argsort(distances, kind="stable")[:k]

# Disabled example: inspect and plot the nearest neighbours of one validation sample.
#sample = X_val[0]
#print(find_nearest_neighbors(sample, X_train,k=24))
#neighbor_indices = find_nearest_neighbors(sample, X_train,k=1)
#helpers.plot_nearest_neighbors(sample, X_train, y_train, neighbor_indices, label_map, feature_names)

def predict_single(
    sample: np.ndarray,
    X: np.ndarray,
    y: np.ndarray,
    distance_fn: Callable,
    k: int = 1):
    """Predict the label of ``sample`` by majority vote of its nearest neighbors.

    Args:
        sample: The query point.
        X: Reference points, one per row.
        y: Labels aligned with the rows of ``X``. They are tallied with
            ``np.bincount``, which requires non-negative integers.
        distance_fn: Distance function forwarded to ``find_nearest_neighbors``.
        k: Number of neighbors that vote.

    Returns:
        The most frequent label among the ``k`` nearest neighbors; on a tie
        the smallest label wins (first maximum found by ``np.argmax``).
    """
    # Search once and reuse the result; only the neighbors' labels are needed.
    neighbor_indices = find_nearest_neighbors(sample, X, distance_fn, k)
    votes = np.bincount(y[neighbor_indices])
    return np.argmax(votes)

# Classify X_val[0] against the training split using Euclidean distance and k=5.
print(predict_single(X_val[0], X_train, y_train, euclidean_dist, 5))

# With for-loop
def predict(
    samples: np.ndarray,
    X: np.ndarray = X_train,
    y: np.ndarray = y_train,
    distance_fn: Callable = euclidean_dist,
    k: int = 1):
    """Predict a label for every row of ``samples``.

    Each row is classified independently by ``predict_single`` with the given
    reference data, distance function and ``k``. The defaults for ``X``, ``y``
    and ``distance_fn`` are bound once, when this function is defined.

    Returns:
        A NumPy array holding one predicted label per row of ``samples``.
    """
    collected = []
    for row in samples:
        collected.append(predict_single(row, X, y, distance_fn, k))
    return np.array(collected)

# Classify the first six rows of X_val with Euclidean distance and k=5.
print(f'Predicted labels: {predict(X_val[:6], X_train, y_train, euclidean_dist, 5)}')


[  44.73442623 4184.22131148] [  5.30537572 725.93354833]
Manhattan: [4.2 2.7 2.2]
Euclidean: [3.1 2.  2. ]
1
Predicted labels: [1 1 2 1 2 0]
