# CS676 Homework #4
Aayushi Verma

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

Consider the following data set for questions Q 1 ∼ Q 20 where x column is the hours that a student studied, y column is the hours that a student played video games, and g is the GPA that the student received.

In [2]:
df = pd.DataFrame(
    {
        'x':[5,6,4,3,4,3,1],
        'y':[2,5,2,3,7,6,8],
        'g':[4,3,3,3,2,2,1]
    }
)

## Defining Functions

In [3]:
def l1(given_point,x,y):
    # Function to determine Manhattan distance, L1.
    d_l1 = []
    d = len(x) # assumes len(x) = len(y)
    for i in range(d):
        d_sum = (x[i] - y[i]) + (given_point[0] - given_point[1])
        d_l1.append(d_sum)
    return d_l1

In [4]:
def l2(given_point,x,y):
    # Function to determine Euclidean distance, L2.
    d_l2 = []
    d = len(x) # assumes len(x) = len(y)
    for i in range(d):
        d_sum = np.sqrt(((x[i] - y[i]) ** 2) + ((given_point[0] - given_point[1]) ** 2))
        d_l2.append(d_sum)
    return d_l2

In [5]:
def l3(given_point, x, y, p):
    # Function to determine Minkowski distance, L3.
    d_l3 = []
    d = len(x) # assumes len(x) = len(y)
    for i in range(d):
        d_sum = (((x[i] - y[i]) ** p) + ((given_point[0] - given_point[1]) ** p)) ** (1 / p)
        d_l3.append(d_sum)
    return d_l3

In [6]:
def nearest_neighbor(df, given_point, distance='l2', p=None):
    df1 = df.copy()
    
    if distance == 'l1':
        distance_metric = l1(given_point, df1['x'], df1['y'])
    elif distance == 'l3':
        distance_metric = l3(given_point, df1['x'], df1['y'], p)
    else:
        distance_metric = l2(given_point, df1['x'], df1['y'])

    df1[distance] = distance_metric
    df1 = df1.sort_values(distance).reset_index().drop(['index'], axis=1)
    return (df1)

In [7]:
def k_nearest_neighbor(df, given_point, k, distance='l2', p=None):
    df2 = nearest_neighbor(df, given_point, distance, p)
    top_k_values = df2.loc[:k - 1]['g']
    expected_gpa = (np.sum(top_k_values)) / k
    return (expected_gpa)

In [8]:
def geom_k_nearest_neighbor(df, given_point, k, distance='l2', p=None):
    df2 = nearest_neighbor(df, given_point, distance, p)
    top_k_values = df2.loc[:k - 1]['g']
    product = 1
    for i in top_k_values:
        product *= i
    expected_gpa = product ** (1 / k)
    return (expected_gpa)

In [9]:
def weight(d, p):
    weighted = 1 / (d ** p)
    return (weighted)

In [10]:
def weighted_k_nearest_neighbor(df, given_point, k, p, distance='l2'):
    df2 = nearest_neighbor(df, given_point, distance, p)
    top_k_values = df2.loc[:k - 1]['g']
    
    numerator = 0
    for i in top_k_values:
        numerator += weight(i, p) * i
    
    denominator = 0
    for i in top_k_values:
        denominator += weight(i, p)

    expected_gpa = numerator / denominator
    return (expected_gpa)

### 1. According to the nearest neighbor algorithm where Euclidean L2 distance is used, what is the expected GPA if x = 2 and y = 4?

In [11]:
given_point = [2,4]
df1 = nearest_neighbor(df, given_point)
print("Expected GPA: ", np.round(df1.loc[0]['g'], 3))

Expected GPA:  3.0


### 2. According to the (k = 3)-NN algorithm where Euclidean L2 distance is used, what is the expected GPA if x = 2 and y = 4?

In [12]:
given_point = [2,4]
expected_gpa_q2 = k_nearest_neighbor(df, given_point, k=3)
print("Expected GPA: ", np.round(expected_gpa_q2, 3))


Expected GPA:  3.0


### 3. According to the (k = 5)-NN algorithm where Euclidean L2 distance is used, what is the expected GPA if x = 2 and y = 4?

In [13]:
given_point = [2,4]
expected_gpa_q3 = k_nearest_neighbor(df, given_point, k=5)
print("Expected GPA: ", np.round(expected_gpa_q3, 3))

Expected GPA:  3.0


### 4. According to the geometric average (k = 3)-NN algorithm where Euclidean L2 distance is used, what is the expected GPA if x = 2 and y = 4?

In [14]:
given_point = [2,4]
expected_gpa_q4 = geom_k_nearest_neighbor(df, given_point, k=3)
print("Expected GPA: ", np.round(expected_gpa_q4, 3))

Expected GPA:  3.0


### 5. According to the arithmetic-geometric average (k = 3)-NN algorithm where Euclidean L2 distance is used, what is the expected GPA if x = 2 and y = 4?

In [15]:
given_point = [2,4]
expected_gpa_q5 = geom_k_nearest_neighbor(df, given_point, k=3)
print("Expected GPA: ", np.round(expected_gpa_q5, 3))

Expected GPA:  3.0


### 6. According to the (p = 2) distance weighted (k = 5)-NN algorithm where Euclidean L2 distance is used, what is the expected GPA if x = 2 and y = 4?

In [16]:
p = 2
k = 5
given_point = [2,4]
expected_gpa_q6 = weighted_k_nearest_neighbor(df, given_point, k, p)
print("Expected GPA: ", np.round(expected_gpa_q6, 3))

Expected GPA:  2.71


### 7. According to the (p = 3) distance weighted (k = 5)-NN algorithm where Euclidean L2 distance is used, what is the expected GPA if x = 2 and y = 4?

In [17]:
p = 3
k = 5
given_point = [2,4]
expected_gpa_q7 = weighted_k_nearest_neighbor(df, given_point, k, p)
print("Expected GPA: ", np.round(expected_gpa_q7, 3))

Expected GPA:  2.566


### 8. According to the (k = 3)-NN algorithm where Manhattan L1 distance is used, what is the expected GPA if x = 6 and y = 2?

In [18]:
p = 2
k = 3
given_point = [2,4]
expected_gpa_q8 = weighted_k_nearest_neighbor(df, given_point, k, p)
print("Expected GPA: ", np.round(expected_gpa_q8, 3))

Expected GPA:  3.0


### 9. According to the (k = 5)-NN algorithm where Manhattan L1 distance is used, what is the expected GPA if x = 6 and y = 2?

In [19]:
k = 5
given_point = [6,2]
expected_gpa_q9 = k_nearest_neighbor(df, given_point, k, p=None, distance='l1')
print("Expected GPA: ", np.round(expected_gpa_q9, 3))

Expected GPA:  2.2


### 10. According to the (p = 2) distance weighted (k = 5)-NN algorithm where Manhattan L1 distance is used, what is the expected GPA if x = 6 and y = 2?

In [20]:
p = 2
k = 5
given_point = [6,2]
expected_gpa_q10 = weighted_k_nearest_neighbor(df, given_point, k, p, distance='l1')
print("Expected GPA: ", np.round(expected_gpa_q10, 3))

Expected GPA:  1.548


### 11. What does the (p = 3) distance weighted (k = 5)-NN algorithm classifies a query instance (6, 2) if Manhattan L1 distance is used.

In [21]:
p = 3
k = 5
given_point = [6,2]
expected_gpa_q11 = weighted_k_nearest_neighbor(df, given_point, k, p, distance='l1')
print("Expected GPA: ", np.round(expected_gpa_q11, 3))

Expected GPA:  1.301


### 12. According to the (k = 3)-NN algorithm where Minkowski (p = 3) L3 distance is used, what is the expected GPA if x = 6 and y = 2?

In [22]:
p = 3
k = 3
given_point = [6,2]
expected_gpa_q12 = weighted_k_nearest_neighbor(df, given_point, k, p, distance='l3')
print("Expected GPA: ", np.round(expected_gpa_q12, 3))

Expected GPA:  2.129


  d_sum = (((x[i] - y[i]) ** p) + ((given_point[0] - given_point[1]) ** p)) ** (1 / p)


### 13. Normalize the dataset using the simple min-max normalization.

In [23]:
scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(df.copy())

In [24]:
df_normalized = pd.DataFrame(
    {
        'x':normalized_data[:,0],
        'y':normalized_data[:,1],
        'g':normalized_data[:,2],
    }
)

### 14. According to the (k = 3)-NN algorithm where Euclidean L2 distance is used on the normalized dataset in Q 13, what is the expected GPA if x = 6 and y = 2?

In [25]:
k = 3
given_point = [6,2]
expected_gpa_q14 = k_nearest_neighbor(df_normalized, given_point, k, distance='l2')
print("Expected GPA: ", np.round(expected_gpa_q14, 3))

Expected GPA:  0.444


### 15. According to the (k = 5)-NN algorithm where Euclidean L2 distance is used on the normalized dataset in Q 13, what is the expected GPA if x = 6 and y = 2?

In [26]:
k = 5
given_point = [6,2]
expected_gpa_q15 = k_nearest_neighbor(df_normalized, given_point, k, distance='l2')
print("Expected GPA: ", np.round(expected_gpa_q15, 3))

Expected GPA:  0.533


### 16. According to the (p = 2) distance weighted (k = 5)-NN algorithm where Euclidean L2 distance is used on the normalized dataset in Q 13, what is the expected GPA if x = 6 and y = 2?

In [27]:
p = 2
k = 5
given_point = [6,2]
expected_gpa_q16 = weighted_k_nearest_neighbor(df_normalized, given_point, k, p, distance='l2')
print("Expected GPA: ", np.round(expected_gpa_q16, 3))

Expected GPA:  0.424


### 17. According to the (k = 5)-NN algorithm where Manhattan L1 distance is used on the normalized dataset in Q 13, what is the expected GPA if x = 6 and y = 2?

In [28]:
p = 2
k = 5
given_point = [6,2]
expected_gpa_q17 = weighted_k_nearest_neighbor(df_normalized, given_point, k, p, distance='l1')
print("Expected GPA: ", np.round(expected_gpa_q17, 3))

ZeroDivisionError: float division by zero

### 18. Using the leave one out method, find the residual, R2 of the nearest neighbor algorithm.

### 19. Using the leave one out method, find the residual, R2 of the (k = 3)-NN algorithm.

### 20. Using the leave one out method, find the adjusted R2, R¯2 of the (p = 2) distance weighted (k = 3)-NN algorithm.