KNN Classifier

In [2]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
# Location of the CSV dataset; the leading "./" makes it relative to the
# directory the notebook kernel is running in.
file_loc = "./gender_classification/Transformed Data Set - Sheet1.csv"

Get labeled data

In [4]:
# Load the dataset into a DataFrame.
df = pd.read_csv(file_loc)

# The four feature columns, pulled out in a fixed order:
# first, second, third and fourth feature respectively.
color, music, beverage, soda = (
    df[column]
    for column in (
        'Favorite Color',
        'Favorite Music Genre',
        'Favorite Beverage',
        'Favorite Soft Drink',
    )
)

# Label or target variable
gender = df['Gender']

Convert labeled data to encoded data

In [5]:
# Each column gets its own LabelEncoder so that its fitted
# category -> integer mapping (`classes_`) is still available after this cell.
# Refitting a single shared encoder on every column would leave only the
# mapping of the last column behind, making it impossible to encode new
# samples consistently with the training data.
color_encoder = preprocessing.LabelEncoder()
music_encoder = preprocessing.LabelEncoder()
beverage_encoder = preprocessing.LabelEncoder()
soda_encoder = preprocessing.LabelEncoder()

color_encoded = color_encoder.fit_transform(color)
print('Color Encoded', color_encoded)

music_encoded = music_encoder.fit_transform(music)
beverage_encoded = beverage_encoder.fit_transform(beverage)
soda_encoded = soda_encoder.fit_transform(soda)

# `le` ends this cell fitted on the target column, exactly as before.
le = preprocessing.LabelEncoder()
label = le.fit_transform(gender)

Color Encoded [0 1 2 2 0 2 0 2 2 1 0 2 2 2 0 2 0 0 0 0 0 2 0 2 0 2 0 0 0 0 2 1 0 2 1 2 1
 0 0 2 0 1 2 0 0 0 0 0 0 2 0 0 0 2 0 0 2 2 2 0 0 0 0 1 0 0]


Create feature set

In [6]:
# Combine the encoded columns row-wise: one 4-tuple of integer codes per sample,
# ordered (color, music, beverage, soda).
encoded_columns = (color_encoded, music_encoded, beverage_encoded, soda_encoded)
features = [row for row in zip(*encoded_columns)]
print('Features', features)

Features [(0, 6, 3, 0), (1, 2, 3, 1), (2, 6, 5, 1), (2, 1, 4, 2), (0, 6, 3, 1), (2, 3, 1, 2), (0, 4, 0, 1), (2, 4, 4, 2), (2, 6, 2, 0), (1, 4, 5, 1), (0, 4, 2, 0), (2, 4, 2, 0), (2, 4, 5, 0), (2, 0, 5, 1), (0, 6, 0, 1), (2, 3, 5, 1), (0, 4, 5, 0), (0, 6, 2, 1), (0, 6, 2, 1), (0, 4, 1, 0), (0, 4, 0, 2), (2, 3, 4, 2), (0, 6, 3, 1), (2, 4, 2, 1), (0, 1, 4, 0), (2, 5, 4, 1), (0, 4, 0, 3), (0, 4, 1, 3), (0, 4, 1, 1), (0, 0, 1, 2), (2, 6, 2, 1), (1, 6, 0, 1), (0, 5, 0, 1), (2, 5, 5, 3), (1, 2, 0, 0), (2, 0, 2, 1), (1, 6, 1, 1), (0, 4, 2, 2), (0, 4, 4, 2), (2, 6, 3, 0), (0, 6, 3, 1), (1, 4, 1, 0), (2, 5, 1, 1), (0, 6, 5, 0), (0, 1, 0, 3), (0, 2, 0, 1), (0, 2, 5, 1), (0, 5, 4, 0), (0, 6, 1, 3), (2, 2, 0, 1), (0, 5, 1, 1), (0, 6, 1, 1), (0, 2, 1, 3), (2, 6, 0, 2), (0, 0, 1, 2), (0, 0, 2, 2), (2, 1, 2, 2), (2, 0, 3, 2), (2, 3, 3, 1), (0, 4, 4, 3), (0, 0, 4, 1), (0, 6, 3, 1), (0, 2, 0, 1), (1, 2, 1, 2), (0, 6, 5, 1), (0, 0, 0, 1)]


Split train and test sets & train the classifier

In [14]:
# Splitting train : test to 70 : 30 ratio.
# A fixed random_state makes the shuffle, and therefore every metric computed
# from this split, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    features, label, test_size=0.3, random_state=42
)

# Applying k = 4, default Minkowski distance metric.
# NOTE(review): if the target has only two classes, an even k can produce tied
# votes among the neighbours; consider an odd k.
model = KNeighborsClassifier(n_neighbors=4)
# Training the classifier
model.fit(X_train, y_train)

Test the classifier

In [15]:
# Testing the classifier
y_pred = model.predict(X_test)
print('Predicted', y_pred)
print('Actual data', y_test)

Predicted [0 0 0 1 0 0 1 1 1 1 1 0 0 1 0 1 1 0 0 1]
Actual data [0 0 0 1 0 1 0 1 0 1 1 0 0 1 1 1 1 1 1 1]


Evaluate the classifier

In [16]:
# Fraction of test rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy', accuracy)

Accuracy 0.7


In [30]:
# A single hand-entered sample. Every value must be a category that occurs in
# the corresponding training column, otherwise encoding raises ValueError.
user_color = ['Cool']
user_music = ['Pop']
user_beverage = ['Doesn\'t drink']
user_soda = ['7UP/Sprite']
user_gender = ['F']


def _encode_like(training_column, values):
    """Encode `values` with the mapping a LabelEncoder learns from `training_column`.

    Calling fit_transform directly on a one-element list would always yield 0,
    because the encoder would only ever see that single category. Fitting on
    the full training column instead reproduces the integer codes the model
    was trained on (LabelEncoder assigns codes by sorted category order).

    Raises:
        ValueError: if `values` contains a category absent from `training_column`.
    """
    return preprocessing.LabelEncoder().fit(training_column).transform(values)


user_color_encoded = _encode_like(color, user_color)
user_music_encoded = _encode_like(music, user_music)
user_beverage_encoded = _encode_like(beverage, user_beverage)
user_soda_encoded = _encode_like(soda, user_soda)
user_label = _encode_like(gender, user_gender)

# Kept under its own name so the training `features` list is not overwritten.
user_features = list(zip(user_color_encoded, user_music_encoded, user_beverage_encoded, user_soda_encoded))

user_pred = model.predict(user_features)
print('User-inputted Accuracy (0 if wrong; 1 if correct):', accuracy_score(user_label, user_pred))

User-inputted Accuracy (0 if wrong; 1 if correct): 0.0
