In [10]:
# Import necessary packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [5]:
# Read Dataset
df = pd.read_csv("datasets/Pokemon.csv")
df = df.drop(["#", "Name"], axis = 1)
df = df.fillna(" ")

# For every Non-numeric column, convert it into a numeric column using sklearn's LabelEncoder
type1_encoder = LabelEncoder()
type1_encoder.fit(df['Type 1'])
df['Type 1'] = type1_encoder.transform(df['Type 1'])

type2_encoder = LabelEncoder()
type2_encoder.fit(df['Type 2'])
df['Type 2'] = type2_encoder.transform(df['Type 2'])

generation_encoder = LabelEncoder()
generation_encoder.fit(df['Generation'])
df['Generation'] = generation_encoder.transform(df['Generation'])

legendary_encoder = LabelEncoder()
legendary_encoder.fit(df['Legendary'])
df['Legendary'] = legendary_encoder.transform(df['Legendary'])

# Look the preprocessed dataset
df.head()

Unnamed: 0,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,9,14,318,45,49,49,65,65,45,0,0
1,9,14,405,60,62,63,80,80,60,0,0
2,9,14,525,80,82,83,100,100,80,0,0
3,9,14,625,80,100,123,122,120,80,0,0
4,6,0,309,39,52,43,60,50,65,0,0


In [6]:
# Extract Input(X) and Output(y) data
X = df.values[:, :-1]
y = df.values[:, -1]
# Split Data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35)

In [7]:
# Define a function to calculate distance between datapoints.
def distance(a, b):
    dis = 0
    for i in range(len(a)):
        dis += (b[i] - a[i])**2
    return np.sqrt(dis)

In [8]:
y_pred = [] # Array to store predicted data
k = 3 # set the value of k here
for test_point in X_test: #Iterate through every test point
    neighbors = []
    for (Xt, yt) in zip(X_train, y_train): # find distance between testpoint with every training data point
        neighbors.append([distance(Xt, test_point), yt])
    neighbors = sorted(neighbors, key = lambda x : x[0]) # sort the neighbors in ascending order based on distance
    neighbors_count = [0, 0] # set an array to count num occurrences of different classes.
    for i in range(k):
        neighbors_count[neighbors[i][1]] += 1 # for first k occurrences in sorted neighbors, count individual classes
    y_pred.append(np.argmax(neighbors_count)) # The class with highest count(votes) is predicted as output
    
    

In [9]:
# Print the accuracy of our model
print(f'KNN Accuracy : {accuracy_score(y_test, y_pred):.2f}')

KNN Accuracy : 0.94


In [16]:
from sklearn.neighbors import KNeighborsClassifier
knn_classifier = KNeighborsClassifier(n_neighbors=3)
knn_classifier.fit(X_train, y_train)
y_pred = knn_classifier.predict(X_test)
print(f'sklearn\'s KNN Accuracy : {accuracy_score(y_test, y_pred):.2f}')

sklearn's KNN Accuracy : 0.94


In [15]:
print(classification_report(y_test, y_pred, target_names = ['Not Legendary', 'Legendary']))

               precision    recall  f1-score   support

Not Legendary       0.98      0.96      0.97       256
    Legendary       0.62      0.75      0.68        24

     accuracy                           0.94       280
    macro avg       0.80      0.85      0.82       280
 weighted avg       0.95      0.94      0.94       280



In [12]:
confusion_matrix(y_test, y_pred)

array([[245,  11],
       [  6,  18]])