# DATA PREPARATION

In [5]:
import random
import numpy as np
random.seed(100)

# Read the CSV in two passes: columns 0-7 are the inputs, column 8 is the
# target. They are then re-joined so every row is "features followed by target".
data_path = 'diabetes.csv'
features = np.genfromtxt(data_path, delimiter=',', skip_header=1, usecols=range(0, 8))
target = np.genfromtxt(data_path, delimiter=',', skip_header=1, usecols=[8])
my_data = np.column_stack((features, target))

print("First 5 rows of the processed dataset:")
for row_index in range(5):
    print(my_data[row_index])

Train_set, Val_set, Test_set = [], [], []

# One uniform draw per row decides its split. random.random() returns a value
# in [0, 1), so the thresholds 0.7 and 0.85 give roughly 70% / 15% / 15%.
for row in my_data:
    draw = random.random()
    if draw > 0.85:
        destination = Test_set
    elif draw > 0.7:
        destination = Val_set
    else:
        destination = Train_set
    destination.append(row)

print(f"Training set size: {len(Train_set)}")
print(f"Validation set size: {len(Val_set)}")
print(f"Test set size: {len(Test_set)}")

First 5 rows of the processed dataset:
[  6.    148.     72.     35.      0.     33.6     0.627  50.      1.   ]
[ 1.    85.    66.    29.     0.    26.6    0.351 31.     0.   ]
[  8.    183.     64.      0.      0.     23.3     0.672  32.      1.   ]
[ 1.    89.    66.    23.    94.    28.1    0.167 21.     0.   ]
[  0.    137.     40.     35.    168.     43.1     2.288  33.      1.   ]
Training set size: 529
Validation set size: 125
Test set size: 114


# KNN REGRESSION 

In [6]:
def knn_regression(train_set, validation_sample, k=5):
    """Predict a target as the mean target of the k nearest training samples.

    Every sample is a 1-D sequence whose last element is the target and whose
    preceding elements are the features. Nearness is the Euclidean distance
    between feature vectors. When distances are equal, the sample that comes
    first in ``train_set`` is preferred.

    Args:
        train_set: Iterable of training samples (features followed by target).
        validation_sample: Sample to predict for; its last element is ignored.
        k: Number of neighbours to average. If it exceeds the number of
            training samples, all of them are used.

    Returns:
        The mean target value of the selected neighbours.

    Raises:
        ValueError: If ``k`` is smaller than 1 or ``train_set`` is empty.
    """
    if k < 1:
        # k == 0 would average an empty list (NaN); a negative k would slice
        # from the wrong end and silently drop only the farthest samples.
        raise ValueError(f"k must be a positive integer, got {k}")

    # np.asarray is a no-op for arrays and lets plain lists/tuples work too.
    vx = np.asarray(validation_sample)[:-1]

    # Only (distance, target) is needed once the distance is known.
    distances = []
    for train_sample in train_set:
        train_sample = np.asarray(train_sample)
        distance = np.sqrt(np.sum((vx - train_sample[:-1]) ** 2))
        distances.append((distance, train_sample[-1]))

    if not distances:
        raise ValueError("train_set must contain at least one sample")

    # list.sort is stable, so equal distances keep their train_set order.
    distances.sort(key=lambda pair: pair[0])
    outputs = [neighbour_target for _, neighbour_target in distances[:k]]
    predicted_output = np.mean(outputs)

    return predicted_output

def calculate_mse(train_set, val_set, k):
    """Return the mean squared error of KNN predictions over ``val_set``.

    Each sample in ``val_set`` is predicted with ``knn_regression`` using
    ``train_set`` and ``k``; the sample's last element is taken as the true
    target. The squared residuals are summed and divided by ``len(val_set)``.
    """
    total_squared_error = 0

    for row in val_set:
        residual = row[-1] - knn_regression(train_set, row, k)
        total_squared_error += residual ** 2

    return total_squared_error / len(val_set)

# KNN REGRESSION REPORT

In [7]:

# Candidate neighbourhood sizes; each one is scored on the validation split.
k_values = [1, 3, 5, 10, 15]
validation_mse = {}

for k in k_values:
    mse = validation_mse[k] = calculate_mse(Train_set, Val_set, k)
    print(f"Mean Squared Error for K={k}: {mse:.4f}")

# Model selection: the K with the lowest validation error wins. On a tie the
# K that was evaluated first is kept.
best_k = min(validation_mse.items(), key=lambda entry: entry[1])[0]
print(f"\nBest K value: {best_k} with MSE: {validation_mse[best_k]:.4f}")

# Summary table of the sweep, in evaluation order.
print("\nK\tMean Squared Error")
print("-" * 25)
for k in validation_mse:
    print(f"{k}\t{validation_mse[k]:.4f}")

# The test split is evaluated once, with the selected K only.
test_mse = calculate_mse(Train_set, Test_set, best_k)
print(f"\nTest Mean Squared Error (K={best_k}): {test_mse:.4f}")

Mean Squared Error for K=1: 0.3120
Mean Squared Error for K=3: 0.1760
Mean Squared Error for K=5: 0.1510
Mean Squared Error for K=10: 0.1366
Mean Squared Error for K=15: 0.1369

Best K value: 10 with MSE: 0.1366

K	Mean Squared Error
-------------------------
1	0.3120
3	0.1760
5	0.1510
10	0.1366
15	0.1369

Test Mean Squared Error (K=10): 0.2013
