In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score 
%matplotlib inline



In [3]:
# Load the advertising dataset; the path is relative to the notebook's working directory.
filename = "Advertising.csv"
df = pd.read_csv(filename)

# Single predictor (TV column) and target (Sales column) as 1-D NumPy arrays.
x = df["TV"].to_numpy()
y = df["Sales"].to_numpy()

# Hold out 40% of the rows for testing; the fixed random_state keeps the split
# identical on every Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.6, random_state=66
)

# Preview the frame. A bare expression is only rendered when it is the last
# statement of the cell, so it lives here rather than mid-cell.
df.head()

In [14]:
# Print the raw values of every column except the last one, one array per column.
leading_columns = df.iloc[:, :-1]
for column_name, series in leading_columns.items():
    print(series.values)

[230.1  44.5  17.2 151.5 180.8   8.7  57.5 120.2   8.6 199.8  66.1 214.7
  23.8  97.5 204.1 195.4  67.8 281.4  69.2 147.3 218.4 237.4  13.2 228.3
  62.3 262.9 142.9 240.1 248.8  70.6 292.9 112.9  97.2 265.6  95.7 290.7
 266.9  74.7  43.1 228.  202.5 177.  293.6 206.9  25.1 175.1  89.7 239.9
 227.2  66.9 199.8 100.4 216.4 182.6 262.7 198.9   7.3 136.2 210.8 210.7
  53.5 261.3 239.3 102.7 131.1  69.   31.5 139.3 237.4 216.8 199.1 109.8
  26.8 129.4 213.4  16.9  27.5 120.5   5.4 116.   76.4 239.8  75.3  68.4
 213.5 193.2  76.3 110.7  88.3 109.8 134.3  28.6 217.7 250.9 107.4 163.3
 197.6 184.9 289.7 135.2 222.4 296.4 280.2 187.9 238.2 137.9  25.   90.4
  13.1 255.4 225.8 241.7 175.7 209.6  78.2  75.1 139.2  76.4 125.7  19.4
 141.3  18.8 224.  123.1 229.5  87.2   7.8  80.2 220.3  59.6   0.7 265.2
   8.4 219.8  36.9  48.3  25.6 273.7  43.  184.9  73.4 193.7 220.5 104.6
  96.2 140.3 240.1 243.2  38.   44.7 280.7 121.  197.6 171.3 187.8   4.1
  93.9 149.8  11.7 131.7 172.5  85.7 188.4 163.5 11

In [None]:
# Largest neighbour count to try.
MAX_K = 70

# The estimator is fitted on column vectors; reshape once here instead of on
# every loop iteration (the reshapes do not depend on k).
X_train_2d = x_train.reshape(-1, 1)
X_test_2d = x_test.reshape(-1, 1)
y_train_2d = y_train.reshape(-1, 1)

# Candidate K values 1..MAX_K. Capped at the number of training samples because
# a k-NN model cannot be queried for more neighbours than it was fitted on.
Knns = np.arange(1, min(MAX_K, len(x_train)) + 1)

# Maps each K to its test-set metrics: {"mse": ..., "r2e": ...}.
mses = {}
for k in Knns:
    knnmodel = KNeighborsRegressor(n_neighbors=int(k))
    knnmodel.fit(X_train_2d, y_train_2d)
    y_pred = knnmodel.predict(X_test_2d)
    mse = mean_squared_error(y_test, y_pred)
    r2e = r2_score(y_test, y_pred)
    mses[k] = {"mse": mse, "r2e": r2e}


In [None]:
# Tabulate the sweep results: one row per K, one column per metric ("mse", "r2e").
mse_df = pd.DataFrame.from_dict(mses, orient='index')

# One panel per metric, stacked vertically; with subplots=True pandas returns
# an array of Axes, one per column.
metric_axes = mse_df.plot(
    figsize=(10, 8),
    subplots=True,
    grid=True,
)
# Label the x-axis on the bottom panel so the figure stands on its own.
metric_axes[-1].set_xlabel("Value of K (Neighbors)")

# Render the figure explicitly so the cell does not end in an Axes-array repr.
plt.show()

In [None]:
# Unpack the per-K results into parallel lists. Dict iteration order is stable,
# so index i refers to the same K in all three lists.
k_values = list(mses)
mse_values = [scores['mse'] for scores in mses.values()]
r2_values = [scores['r2e'] for scores in mses.values()]

# Best K per metric: lowest MSE and highest R².
best_k_mse = k_values[np.argmin(mse_values)]
best_k_r2 = k_values[np.argmax(r2_values)]

# Two stacked panels. sharex=True links the x-axes, so zooming one zooms the other.
fig, axes = plt.subplots(2, 1, figsize=(10, 8), sharex=True)

# (axis, series, line colour, y-label, best K, legend label) for each panel.
panels = [
    (axes[0], mse_values, 'b', "Mean Squared Error (MSE)",
     best_k_mse, f'Best K (MSE) = {best_k_mse}'),
    (axes[1], r2_values, 'r', "R² Score",
     best_k_r2, f'Best K (R²) = {best_k_r2}'),
]
for ax, values, color, ylabel, best_k, best_label in panels:
    ax.plot(k_values, values, marker='o', linestyle='-', color=color)
    # Dashed vertical line marking the best K for this metric.
    ax.axvline(x=best_k, color='gray', linestyle='--', label=best_label)
    ax.set_ylabel(ylabel)
    ax.grid(True)
    ax.legend()

# Title on the top panel only, x-label on the bottom panel only.
axes[0].set_title("Model Performance vs. K")
axes[1].set_xlabel("Value of K (Neighbors)")

# Adjust subplot spacing, then render.
plt.tight_layout()
plt.show()