In [51]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

In [52]:
# Load the cars dataset from the CSV file in the notebook's working directory.
cars = pd.read_csv(filepath_or_buffer="cars.csv")

In [53]:
# Predictors: the three columns used as model inputs.
x = cars.loc[:, ["Horsepower", "Weight", "Displacement"]]
# Target variable: the column the model is asked to predict.
y = cars.loc[:, "MPG"]

In [57]:
def cross_validation(x_train, y_train, k, n_neighbors=4, random_state=None):
    """Estimate the MSE of a k-nearest-neighbours regressor with k-fold cross-validation.

    The rows are shuffled once, split into ``k`` contiguous folds of
    ``len(x_train) // k`` rows (the last fold also takes the remainder), and
    each fold is used exactly once as the validation set while the model is
    fitted on all the other rows.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Predictor columns; every column present is passed to the model.
    y_train : pandas.Series
        Target values, row-aligned with ``x_train`` by position.
    k : int
        Number of folds; must satisfy ``2 <= k <= len(x_train)``.
    n_neighbors : int, default 4
        Number of neighbours given to ``KNeighborsRegressor``.
    random_state : int, numpy.random.Generator or None, default None
        Seed (or generator) for the shuffle. ``None`` gives a different
        shuffle on every call; pass an int for a reproducible score.

    Returns
    -------
    numpy.float64
        Mean of the per-fold mean squared errors.

    Raises
    ------
    ValueError
        If ``x_train`` and ``y_train`` differ in length, or ``k`` is outside
        ``[2, len(x_train)]``.
    """
    n_rows = len(x_train)
    if len(y_train) != n_rows:
        raise ValueError(
            f"x_train and y_train must have the same length, got {n_rows} and {len(y_train)}"
        )
    if not 2 <= k <= n_rows:
        raise ValueError(
            f"k must be between 2 and the number of rows ({n_rows}), got {k}"
        )

    # Shuffle row *positions* rather than index labels, so the x/y pairing
    # survives any index (non-default, unsorted, or with duplicate labels).
    order = np.random.default_rng(random_state).permutation(n_rows)

    fold_size = n_rows // k
    fold_scores = []

    for fold in range(k):
        start = fold * fold_size
        # The last fold absorbs the n_rows % k leftover rows. The same `stop`
        # bounds both the validation slice and the training split, so no row
        # is ever in both sets.
        stop = n_rows if fold == k - 1 else start + fold_size

        validation_pos = order[start:stop]
        train_pos = np.concatenate([order[:start], order[stop:]])

        validation_x = x_train.iloc[validation_pos]
        validation_y = y_train.iloc[validation_pos]
        train_x = x_train.iloc[train_pos]
        train_y = y_train.iloc[train_pos]

        knn = KNeighborsRegressor(n_neighbors=n_neighbors)
        knn.fit(train_x, train_y)
        pred = knn.predict(validation_x)

        # Fold score: mean squared error on the held-out rows.
        error = validation_y.to_numpy() - pred
        fold_scores.append(np.mean(error ** 2))

    return np.mean(fold_scores)

In [58]:
# 10-fold cross-validated MSE of the KNN regressor on the full dataset.
cross_validation(x, y, k=10)

26.826779076086957