# DATA PREPARATION

In [19]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# Seed NumPy's global random generator for this session.
np.random.seed(42)

# Read the CSV and report its dimensions.
df = pd.read_csv('diabetes.csv')
print(f"Dataset shape: {df.shape}")

# Every column except the last is treated as a feature; the last column is the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Hold out 30% of the rows, then cut that hold-out in half to obtain the
# validation and test splits (the recorded output shows roughly 70/15/15).
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Glue the target back on as the final column so each row is "features..., target".
Train_set, Val_set, Test_set = (
    np.column_stack(pair)
    for pair in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

# Report the size of each split in rows and as a share of the full dataset.
for split_name, split_rows in (
    ("Training", Train_set),
    ("Validation", Val_set),
    ("Test", Test_set),
):
    print(f"{split_name}: {len(split_rows)} samples ({len(split_rows)/len(df)*100:.1f}%)")

Dataset shape: (768, 9)
Training: 537 samples (69.9%)
Validation: 115 samples (15.0%)
Test: 116 samples (15.1%)


# KNN REGRESSION

In [20]:
def knn_regression_manual(train_set, validation_sample, k=5):
    """Predict a target as the mean target of the k nearest training rows.

    Every row of ``train_set``, and ``validation_sample`` itself, is laid out
    as feature values followed by the target in the last position. Distance
    is Euclidean and is computed over the feature values only.

    Parameters
    ----------
    train_set : array-like, 2-D
        Training rows (features followed by the target).
    validation_sample : array-like, 1-D
        Row to predict for; its last entry is ignored.
    k : int, default 5
        Number of nearest neighbours to average. When ``k`` exceeds the
        number of training rows, all rows are used.

    Returns
    -------
    numpy.float64
        Mean target value of the selected neighbours.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1, or ``train_set`` contains no rows.
    """
    # k == 0 would average an empty selection (NaN) and a negative k would
    # slice "all but the last |k|" neighbours, so reject both up front.
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")

    train_rows = np.asarray(train_set, dtype=float)
    if train_rows.ndim != 2 or train_rows.shape[0] == 0:
        raise ValueError("train_set must be a non-empty 2-D collection of rows")

    query_features = np.asarray(validation_sample, dtype=float)[:-1]

    # Euclidean distance from the query to every training row in one shot.
    deltas = train_rows[:, :-1] - query_features
    distances = np.sqrt(np.sum(deltas ** 2, axis=1))

    # A stable sort keeps equally distant rows in training-set order, which
    # matches the tie-breaking of a stable list sort on distance.
    nearest = np.argsort(distances, kind="stable")[:k]

    return np.mean(train_rows[nearest, -1])
def mean_squared_error(train_set, validation_set, k=5):
    """Return the mean squared error of the manual KNN regressor.

    For each row of ``validation_set`` a prediction is obtained from
    ``knn_regression_manual(train_set, row, k)`` and compared with the row's
    last entry, which is taken as the actual output.

    Parameters
    ----------
    train_set : sequence of rows
        Passed through unchanged to ``knn_regression_manual``.
    validation_set : sequence of rows
        Rows to score; the last entry of each row is the actual output.
    k : int, default 5
        Neighbour count forwarded to ``knn_regression_manual``.

    Returns
    -------
    The sum of squared prediction errors divided by ``len(validation_set)``.
    """
    squared_error_sum = sum(
        (knn_regression_manual(train_set, row, k) - row[-1]) ** 2
        for row in validation_set
    )
    return squared_error_sum / len(validation_set)

# KNN REGRESSION RESULTS

In [None]:
# Candidate neighbourhood sizes to compare on the validation split.
k_values = [1, 3, 5, 10, 15]
validation_mse_manual = {}

# Score every candidate k on the validation set.
# The MSE helper defined in this notebook is `mean_squared_error`; the
# previous name `calculate_mse_manual` is not defined anywhere and raised
# NameError on a fresh kernel.
for k in k_values:
    mse_manual = mean_squared_error(Train_set, Val_set, k)
    validation_mse_manual[k] = mse_manual

# Pick the k with the lowest validation MSE and evaluate it once on the test set.
best_k_manual = min(validation_mse_manual, key=validation_mse_manual.get)
test_mse_manual = mean_squared_error(Train_set, Test_set, best_k_manual)

# Report the validation table followed by the test error of the chosen k.
print("k | Validation MSE  |")
print("-" * 40)
for k in k_values:
    print(f" {k} | {validation_mse_manual[k]:.4f} |")
print("-" * 40)
print("\nTest Mean Squared Error : {:.4f} (K={})".format(test_mse_manual, best_k_manual))

k | Validation MSE  |
----------------------------------------
 1 | 0.3217 |
 3 | 0.2155 |
 5 | 0.2014 |
 10 | 0.1917 |
 15 | 0.1764 |
----------------------------------------

Test Mean Squared Error (Manual): 0.1989 (K=15)
