### Group Details:
|Roll Number|Name|
| ----------- | ----------- |
|E20004|Akshay Amrit|
|E20032|Shivam Babbar|

In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [2]:
# Folder that holds the course CSV files. The default is the original author's
# location; set the KNN_DATA_DIR environment variable to point the notebook at a
# different folder without editing this cell.
DATA_DIR = os.environ.get(
    'KNN_DATA_DIR',
    r'C:\Users\aksha\OneDrive\Desktop\Data Science Material\ML + Python Notes\Data',
)

# iris feeds the classification section, cars feeds the regression section.
iris = pd.read_csv(os.path.join(DATA_DIR, 'iris.csv'))
cars = pd.read_csv(os.path.join(DATA_DIR, 'cars.csv'))

In [3]:
# Iris: the four flower measurements are the predictors and the species label
# is the class to predict.
X = iris[[
    'Sepal.Length',
    'Sepal.Width',
    'Petal.Length',
    'Petal.Width',
]]
Y = iris['Species']

In [4]:
# Hold out 30% of the iris rows for testing, stratified on the species labels,
# with a fixed seed so the split is the same on every run.
(X_train_iris, X_test_iris,
 y_train_iris, y_test_iris) = train_test_split(
    X,
    Y,
    test_size=0.3,
    stratify=Y,
    random_state=100,
)

In [5]:
# Cars: five numeric vehicle attributes are the predictors and MPG is the value
# to predict. Note that this rebinds X, which held the iris features above.
X = cars[[
    'Cylinders',
    'Displacement',
    'Horsepower',
    'Weight',
    'Acceleration',
]]
y = cars['MPG']

In [6]:
# Hold out 30% of the cars rows for testing, with a fixed seed so the split is
# the same on every run.
(X_train_cars, X_test_cars,
 y_train_cars, y_test_cars) = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=100,
)

## K-NN Classification with Scaling

In [7]:
def dist(vect1, vect2):
    """Return the Euclidean distance between two numeric vectors.

    Both arguments are converted with ``np.array`` first, so lists, NumPy
    arrays and pandas Series are all accepted. Intended for 1-D vectors of
    the same length.

    Parameters
    ----------
    vect1, vect2 : array-like
        The two points to compare.

    Returns
    -------
    The square root of the inner product of ``vect1 - vect2`` with itself.
    """
    delta = np.array(vect1) - np.array(vect2)
    # For 1-D input the transpose is a no-op and matmul is the dot product,
    # i.e. the sum of squared coordinate differences.
    squared_length = np.matmul(np.transpose(delta), delta)
    return np.sqrt(squared_length)

In [8]:
def nearest_neighour_classification(X_train, y_train, test_row, k):
    """Classify one observation by majority vote among its k nearest training rows.

    Parameters
    ----------
    X_train : DataFrame
        Training features; rows are read by position with ``.iloc``.
    y_train : Series
        Training labels, positionally aligned with ``X_train``.
    test_row : array-like
        Feature vector of the observation to classify.
    k : int
        Number of neighbours that vote.

    Returns
    -------
    The label with the most votes. When several labels share the top count,
    the one that was seen first while walking from the closest neighbour
    outwards wins.

    Raises
    ------
    IndexError
        If ``k`` exceeds the number of training rows, or if no vote was cast
        (``k`` <= 0).
    """
    # (distance, row position) pairs in ascending order; rows at exactly the
    # same distance are ordered by their position.
    ranked = sorted(
        (dist(test_row, X_train.iloc[row_pos, ]), row_pos)
        for row_pos in range(X_train.shape[0])
    )

    # Tally the labels of the k closest rows.
    votes = {}
    for rank in range(k):
        label = y_train.iloc[ranked[rank][1], ]
        votes[label] = votes.get(label, 0) + 1

    # The sort is stable, so equal counts keep their insertion order.
    by_votes = sorted(votes, key=votes.get, reverse=True)
    return by_votes[0]

In [9]:
def min_max_scaling(train, test, column_list=None):
    """Min-max scale the chosen columns using statistics from the training set.

    Each selected column is mapped with ``(value - min) / (max - min)``, where
    min and max are taken from ``train`` only. Training values therefore land
    in [0, 1]; test values use the same training min/max and may fall outside
    that range.

    The inputs are left untouched: scaling is applied to copies, so calling
    this function repeatedly on the same frames always gives the same result
    and does not trigger pandas' SettingWithCopyWarning on frames that were
    sliced from a larger one.

    Parameters
    ----------
    train, test : DataFrame
        Frames to scale. ``test`` must contain every column being scaled.
    column_list : list of column labels, optional
        Columns to scale. ``None`` or an empty list means every column of
        ``train``.

    Returns
    -------
    (DataFrame, DataFrame)
        Scaled copies of ``train`` and ``test``, in that order.
    """
    # Work on copies so the caller's data keeps its original values.
    train = train.copy()
    test = test.copy()

    if column_list is None or len(column_list) == 0:
        column_list = train.columns.tolist()

    for each in column_list:
        min_train = np.min(train[each])
        max_train = np.max(train[each])
        value_range = max_train - min_train
        # A constant training column has zero range; dividing by it would turn
        # the whole column into NaN. Use a unit range instead, which maps the
        # training values to 0 and leaves test values as offsets from them.
        if value_range == 0:
            value_range = 1
        train[each] = (train[each] - min_train) / value_range
        test[each] = (test[each] - min_train) / value_range
    return train, test

In [10]:
def k_NN_Classification(X_train, X_test, y_train, k = 5):
    """Predict a class label for every row of ``X_test`` with k-nearest neighbours.

    Features are min-max scaled (using the training set's min and max) before
    distances are computed. The scaling is done on copies, so the frames passed
    in by the caller keep their original values.

    Parameters
    ----------
    X_train : DataFrame
        Training features.
    X_test : DataFrame or a single observation
        Rows to classify. A single observation may be given as a flat vector
        (Series, NumPy array or list) with one value per training column; it
        is turned into a one-row DataFrame with the training column names.
    y_train : Series
        Training labels, positionally aligned with ``X_train``.
    k : int, default 5
        Number of neighbours that vote.

    Returns
    -------
    list
        One predicted label per test row, in row order.
    """
    # A lone observation arrives as a flat vector; np.shape / np.asarray accept
    # Series, arrays and lists alike.
    if np.shape(X_test) == (X_train.shape[1],):
        X_test = pd.DataFrame(
            np.asarray(X_test).reshape((-1, X_train.shape[1])),
            columns=X_train.columns,
        )

    # Scale private copies: the scaler assigns into the frames it is given, and
    # the caller's data must not be overwritten by a prediction call.
    X_train, X_test = min_max_scaling(X_train.copy(), X_test.copy())

    # One prediction per test row.
    return [
        nearest_neighour_classification(X_train, y_train, X_test.iloc[i, ], k)
        for i in range(X_test.shape[0])
    ]

In [11]:
# Classify every held-out iris row (k keeps its default of 5), then preview the
# predictions next to the true species.
pred = k_NN_Classification(
    X_train_iris,
    X_test_iris,
    y_train_iris,
)
dict_for_df = {
    'prediction': pred,
    'actual': y_test_iris,
}
pd.DataFrame(dict_for_df).head()

Unnamed: 0,prediction,actual
34,setosa,setosa
89,versicolor,versicolor
121,virginica,virginica
94,versicolor,versicolor
104,virginica,virginica


In [12]:
# Confusion matrix of actual (rows) versus predicted (columns) species, with
# both axes in the order given by `labels`.
c = confusion_matrix(
    y_test_iris,
    pred,
    labels=['setosa', 'versicolor', 'virginica'],
)
c

array([[15,  0,  0],
       [ 0, 15,  0],
       [ 0,  3, 12]], dtype=int64)

In [13]:
# Per-class precision, recall and F1 for the iris predictions. The report is a
# plain-text table, so it is printed rather than displayed.
print(
    classification_report(y_test_iris, pred)
)

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        15
  versicolor       0.83      1.00      0.91        15
   virginica       1.00      0.80      0.89        15

    accuracy                           0.93        45
   macro avg       0.94      0.93      0.93        45
weighted avg       0.94      0.93      0.93        45



## K-NN Regression with Scaling

In [14]:
def nearest_neighour_regression(X_train, y_train, test_row, k):
    """Predict a numeric target as the mean over the k nearest training rows.

    Parameters
    ----------
    X_train : DataFrame
        Training features; rows are read by position with ``.iloc``.
    y_train : Series
        Training targets, positionally aligned with ``X_train``.
    test_row : array-like
        Feature vector of the observation to predict.
    k : int
        Number of neighbours to average.

    Returns
    -------
    ``np.mean`` of the targets of the k closest training rows.

    Raises
    ------
    IndexError
        If ``k`` exceeds the number of training rows.
    """
    # (distance, row position) pairs in ascending order; rows at exactly the
    # same distance are ordered by their position.
    ranked = sorted(
        (dist(test_row, X_train.iloc[row_pos, ]), row_pos)
        for row_pos in range(X_train.shape[0])
    )

    # Targets of the k closest rows, nearest first.
    neighbour_targets = [
        y_train.iloc[ranked[rank][1], ]
        for rank in range(k)
    ]
    return np.mean(neighbour_targets)

In [15]:
def k_NN_Regression(X_train, X_test, y_train, k = 5):
    """Predict a numeric target for every row of ``X_test`` with k-nearest neighbours.

    Features are min-max scaled (using the training set's min and max) before
    distances are computed. The scaling is done on copies, so the frames passed
    in by the caller keep their original values.

    Parameters
    ----------
    X_train : DataFrame
        Training features.
    X_test : DataFrame or a single observation
        Rows to predict. A single observation may be given as a flat vector
        (Series, NumPy array or list) with one value per training column; it
        is turned into a one-row DataFrame with the training column names.
    y_train : Series
        Training targets, positionally aligned with ``X_train``.
    k : int, default 5
        Number of neighbours to average.

    Returns
    -------
    list
        One predicted value per test row, in row order.
    """
    # A lone observation arrives as a flat vector; np.shape / np.asarray accept
    # Series, arrays and lists alike.
    if np.shape(X_test) == (X_train.shape[1],):
        X_test = pd.DataFrame(
            np.asarray(X_test).reshape((-1, X_train.shape[1])),
            columns=X_train.columns,
        )

    # Scale private copies: the scaler assigns into the frames it is given, and
    # the caller's data must not be overwritten by a prediction call.
    X_train, X_test = min_max_scaling(X_train.copy(), X_test.copy())

    # One prediction per test row.
    return [
        nearest_neighour_regression(X_train, y_train, X_test.iloc[i, ], k)
        for i in range(X_test.shape[0])
    ]

In [16]:
# Predict MPG for every held-out car (k keeps its default of 5), then preview
# the predictions next to the true values. `pred` now holds the regression
# output and no longer the iris labels.
pred = k_NN_Regression(
    X_train_cars,
    X_test_cars,
    y_train_cars,
)
dict_for_df = {
    'prediction': pred,
    'actual': y_test_cars,
}
pd.DataFrame(dict_for_df).head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Unnamed: 0,prediction,actual
233,18.38,20.5
309,36.08,34.5
377,23.6,34.0
320,26.1,28.0
161,18.06,15.0


In [17]:
# Score the MPG predictions: R squared, and RMSE as the square root of the
# mean squared error.
r_square = r2_score(y_test_cars, pred)
RMSE = mean_squared_error(y_test_cars, pred) ** 0.5
print(
    'RMSE:', RMSE,
    '\nR Square:', r_square,
)

RMSE: 3.8024651193149106 
R Square: 0.7339017188512167
