In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#!pip install matplotlib

In [3]:
#Read train data
train_data = pd.read_csv("dataset/train.csv")

In [4]:
train_data.head()

Unnamed: 0,Weight,Colour,Label
0,303,3,B
1,370,1,A
2,298,3,B
3,277,3,B
4,377,4,A


#### About our Train Data

1. we have `Weight`, `Colour` are the features and `Label` as label
2. using Weight and Color of fruit we have to predict what type of fruit in test data

In [5]:
train_data['Label'].unique()

array(['B', 'A'], dtype=object)

Here we have only two unique fruits available in our label data.

In [6]:
def euclidean_disance(a, b):
    """
    Returns a float value that computes euclidean distance between two points.
    
    This function is useful to calculate the Euclidean Distance between two points.
    
    
    Parameters
    ----------
    a,b : tuple/array required
        contains two numbers(integer/float) index-0 contains,
        X-Coordinate, index-1 containe Y-Coordinate.
    """
    #a=(x1, y1), b=(x2, y2)
    if not (len(a) == 2 and len(b) == 2):
        raise ValueError('length of a and b expected to 2 but got for a:{} and for b:{}'.format(len(a), len(b))) 
    return np.sqrt( (b[0] - a[0])**2 + (b[1] - a[1])**2  )
    
    

In [7]:
train_data.shape

(17, 3)

## Read Test Data

In [8]:
#Read test data
test_data = pd.read_csv("dataset/test.csv")

In [9]:
test_data.head()

Unnamed: 0,Weight,Colour
0,318.1,3
1,293.8,3
2,277.3,2
3,324.2,4
4,327.0,2


In [10]:
test_data.shape

(30, 2)

Main logic begins here,
Here we have to compute distance with test and train data means, follow the below points
    - take one point from test and compute the distance with each and every point in train data
    - save the distance and label in an array
    - repeat it with every other point in test data
    - after that sort them according to the distance and apply k
    - now find the predicted label using majority voting system.

In [11]:
# https://stackoverflow.com/questions/47843707/count-frequency-of-item-in-a-list-of-tuples
from collections import Counter

def majority_votes(a):
    """
    Returns a string which has more major count in the given list.
    
    Parameters
    ----------
    a: array required
       contains array of tuples with first element is distance and second element as label
    """
    counts = Counter(x[1] for x in a)
    return 'B' if counts['B']>counts['A'] else 'A'

In [12]:
test_dict = test_data.to_dict('records')
#     print(row['Weight'], row['Colour'])
train_dict = train_data.to_dict('records')

# Taking K as default value 5
k = 5 # hyper-parameter

predicted_labels = []

for test_row in test_dict:
    test_point = (test_row['Weight'], test_row['Colour'])
    measured_distances = []
    
    for train_row in train_dict:
        train_point = (train_row['Weight'], train_row['Colour'])
        distance = euclidean_disance(test_point, train_point)
        measured_distances.append((distance, train_row['Label']))
        # Sorted_distances
        measured_distances.sort() # as my first element is a distance(float) in tuple default sort works here
        measured_distances = measured_distances[:k]
#     print(measured_distances)
    predicted_labels.append(majority_votes(measured_distances))
        


# Checking predicted values

Here we have another file, which has list of all predicted values. 
so we will check our knn accuracy

In [13]:
defined_labels = pd.read_csv('dataset/test-labels.csv')
count = 0
defined_labels_dict = defined_labels.to_dict('records')

for row in defined_labels_dict:
    if row['Label'] in predicted_labels:
        count += 1
accuracy = count/len(test_dict)
print('Accuracy: {} %'.format( accuracy * 100.0))

Accuracy: 100.0 %
