In [40]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from distance import distance

In [2]:
# Load the regression dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('data_regression.csv')

In [4]:
# Sanity check: (number of rows, number of columns) of the loaded frame.
df.shape

(1000, 3)

In [5]:
# Preview the first rows to see the column names and value ranges.
df.head()

Unnamed: 0,Feature 1,Feature 2,Target
0,3.745401,1.851329,15.462274
1,9.507143,5.419009,39.853406
2,7.319939,8.729458,41.231244
3,5.986585,7.322249,33.103345
4,1.560186,8.065611,21.355682


In [8]:
# Feature matrix: every column except the last one, as a NumPy array.
X = df.iloc[:, :-1].values
# Target vector: the last column, as a NumPy array (y[i] belongs to X[i]).
y = df.iloc[:, -1].values

In [12]:
# Use the first sample as the query point for the step-by-step neighbour search.
x_input = X[0]

In [14]:
# Target value stored for the first sample (the row that x_input was taken from).
y[0]

15.46227350966508

In [43]:
k = 3

# Collect (distance, target) pairs in a list rather than in a dict keyed by
# distance: with a dict, two points lying at exactly the same distance from
# x_input overwrite each other and one neighbour is silently lost.
distance_target_pairs = [
  (distance(x, x_input), y[idx])
  for idx, x in enumerate(X)
  if not np.array_equal(x, x_input)  # skip the query point and any exact duplicates of it
]

# Sort on the distance only; sorted() is stable, so tied distances keep the row order of X.
sorted_distance_target = sorted(distance_target_pairs, key=lambda pair: pair[0])
sorted_distance_target[:k]

[(0.16321924908892282, 43.85514446370052),
 (0.2522602592550542, 42.99275836297927),
 (0.296472602483191, 42.7179722811335)]

In [53]:
from typing import List, Tuple
import numpy as np

def mean(sorted_distance_target: List[Tuple[float, float]]) -> float:
  """
  Compute the arithmetic mean of the target values in a list of (distance, target) tuples.

  Only the second element of each tuple is used. The distances, and therefore the order
  of the list, have no influence on the result.

  Arguments:
    sorted_distance_target (List[Tuple[float, float]]): A non-empty list where each element is a tuple.
      The first element of each tuple is a distance (ignored here).
      The second element of each tuple is a target value.

  Returns:
    (float): The mean of the target values.

  Raises:
    ValueError: If the list is empty, since the mean of no values is undefined.
  """
  count = len(sorted_distance_target)

  if count == 0:
    raise ValueError("Cannot compute the mean of an empty list of (distance, target) tuples.")

  # Accumulate under a distinct name so the built-in sum() is not shadowed.
  total = sum(pair[1] for pair in sorted_distance_target)

  return total / count

In [47]:
# Compare the number of candidate neighbours with the number of samples in X.
len(sorted_distance_target), len(X)

(999, 1000)

In [48]:
# k-NN estimate for x_input: the average target of its k nearest neighbours.
mean(sorted_distance_target[:k])

43.188625035937754

In [54]:
import numpy as np

def get_prediction(x_input: np.ndarray, X: np.ndarray, y: np.ndarray, k: int) -> float:
  """
  Predict the target value for a given point x_input using the k-Nearest Neighbors algorithm.

  Every row of X that is not element-wise equal to x_input is a candidate neighbour.
  The candidates are ranked by distance(x_input, row) and the targets of the k closest
  ones are averaged. Rows identical to x_input are skipped, so a point that is itself
  part of X never votes for its own prediction.

  Arguments:
    x_input (np.ndarray): The point for which a target value has to be predicted.
    X (np.ndarray): The training data points.
    y (np.ndarray): The target values corresponding to the training data points (y[i] belongs to X[i]).
    k (int): The number of nearest neighbors to consider for the prediction.
      If fewer than k candidates exist, all of them are used.

  Returns:
    (float): The target value predicted for the given input point.
  """
  # Check the type first so that a non-numeric k fails this assertion with its
  # message instead of raising a TypeError from the comparison.
  assert isinstance(k, int) and k > 0, "'k' must be a positive integer."

  # Keep the candidates in a list of (distance, target) pairs. A dict keyed by
  # distance would merge points that are equally far from x_input and lose neighbours.
  distance_target = [
    (distance(x_input, x), y[idx])
    for idx, x in enumerate(X)
    if not np.array_equal(x, x_input)
  ]

  # Rank by distance only; sorted() is stable, so ties keep the row order of X.
  nearest_neighbours = sorted(distance_target, key=lambda pair: pair[0])[:k]

  return mean(nearest_neighbours)

In [55]:
import numpy as np

def predict(X: np.ndarray, y: np.ndarray, k: int) -> np.ndarray:
  """
  Predict a target value for every row of X using the k-Nearest Neighbors algorithm.

  X serves both as the set of query points and as the pool of candidate neighbours:
  each row is passed to get_prediction() together with the full X and y.

  Arguments:
    X (np.ndarray): The data points; one prediction is produced per row.
    y (np.ndarray): The target values corresponding to the rows of X.
    k (int): The number of nearest neighbors to be considered for making the predictions.

  Returns:
    (np.ndarray): A float array of length len(X) holding the predicted target values in row order.
  """
  # dtype=float keeps the result a float array even when X has no rows.
  return np.array([get_prediction(x_input, X, y, k) for x_input in X], dtype=float)

In [52]:
# Model evaluation
y_pred = predict(X, y, 3)

print(f'R2 score = {r2_score(y, y_pred)}')
print(f'Mean squared error = {mean_squared_error(y, y_pred)}')

R2 score = 0.9945183333500371
Mean squared error = 0.6277523689696386
