In [81]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter


In [68]:
# Read the red-wine quality table from the working directory and preview
# its first ten rows.
data_set = pd.read_csv(filepath_or_buffer='winequality-red.csv')
data_set.head(n=10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5
6,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,5
7,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0,7
8,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5,7
9,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,5


In [69]:
# Binarise the target in place: scores 3..5 (inclusive) become 0, every other
# score becomes 1.
# The guard makes the cell safe to re-run: once the column only holds 0/1,
# applying the rule again would turn every label into 1 (0 and 1 are outside
# 3..5).  NOTE(review): assumes raw quality scores are always greater than 1.
if data_set['quality'].max() > 1:
    data_set['quality'] = (~data_set['quality'].between(3, 5)).astype(int)

In [70]:
# Tally how many rows carry each value of the binarised target.
quality_counts = data_set['quality'].value_counts()
quality_counts


1    855
0    744
Name: quality, dtype: int64

In [71]:
class KNN:
    """Brute-force k-nearest-neighbours classifier (Euclidean distance).

    Training just memorises the data.  Each prediction measures the distance
    from the query to every stored sample and returns the most frequent label
    among the ``k`` closest ones.

    Parameters
    ----------
    k : int, default 3
        Number of neighbours that vote.  If ``k`` exceeds the number of
        training samples, all training samples vote.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """

    def __init__(self, k=3):
        # A non-positive k would either slice off the *farthest* points
        # (negative k) or leave no voters at all (k == 0).
        if k < 1:
            raise ValueError(f"k must be at least 1, got {k!r}")
        self.k = k

    def fit(self, X, y):
        """Store the training samples and their labels.

        Parameters
        ----------
        X : array-like
            Numeric training samples, one per row (ndarray, DataFrame or
            nested list).
        y : array-like
            One label per training sample (ndarray, Series or list).

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not contain the same number of samples.
        """
        # Convert once to plain arrays so that later indexing is positional:
        # a DataFrame would iterate over column names and a Series would look
        # labels up by index label rather than by position.
        X_train = np.asarray(X, dtype=float)
        y_train = np.asarray(y)
        if len(X_train) != len(y_train):
            raise ValueError(
                f"X and y must have the same number of samples, "
                f"got {len(X_train)} and {len(y_train)}"
            )
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X):
        """Predict a label for every sample in ``X``.

        Parameters
        ----------
        X : array-like
            Numeric query samples, one per row.

        Returns
        -------
        numpy.ndarray
            Predicted labels, in the same order as the rows of ``X``.
        """
        queries = np.asarray(X, dtype=float)
        return np.array([self._predict(x) for x in queries])

    def _predict(self, x):
        """Return the majority label among the ``k`` training samples nearest to ``x``."""
        diff = self.X_train - x
        # Reduce over every axis except the sample axis, so one distance is
        # produced per training sample whatever the feature layout is.
        feature_axes = tuple(range(1, diff.ndim))
        distances = np.sqrt(np.sum(diff ** 2, axis=feature_axes))
        # Stable sort: equidistant samples keep their training order, which
        # makes the neighbour selection deterministic.
        k_indices = np.argsort(distances, kind="stable")[:self.k]
        votes = Counter(self.y_train[k_indices])
        # On a tied vote, most_common keeps first-seen order, i.e. the label
        # of the closest neighbour among the tied labels wins.
        return votes.most_common(1)[0][0]

In [72]:
# Draw a reproducible 90% sample for modelling; the rows that were not drawn
# form the hold-out set.  Both frames get a fresh 0..n-1 index.
data = data_set.sample(frac=0.9, random_state=789)
data_unseen = data_set.drop(data.index).reset_index(drop=True)
data = data.reset_index(drop=True)
print(f'Data for Modeling: {data.shape}')
print(f'Unseen Data For Predictions {data_unseen.shape}')

Data for Modeling: (1439, 12)
Unseen Data For Predictions (160, 12)


In [73]:
# Features are every column except 'quality'; 'quality' itself is the target.
# The same split is applied to the modelling frame and to the hold-out frame.
X, Y = data.drop(columns='quality'), data['quality']
X_unseen, Y_unseen = data_unseen.drop(columns='quality'), data_unseen['quality']

In [77]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the modelling features only, then apply that same fitted
# transformation to both the modelling and the hold-out features.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)
X_unseen = scaler.transform(X_unseen)

In [78]:
# Convert the target to a plain NumPy array so it is indexed by position.
# np.asarray (unlike Series.to_numpy) also accepts an ndarray, so re-running
# this cell no longer fails with AttributeError.
Y = np.asarray(Y)

In [83]:
# Fit a 5-neighbour classifier on the modelling rows, then predict those same
# rows (i.e. `pred` holds training-set predictions).
clf = KNN(k=5)
clf.fit(X=X, y=Y)

pred = clf.predict(X=X)

In [84]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the predictions made on the modelling rows.
train_report = classification_report(y_true=Y, y_pred=pred)
print(train_report)

              precision    recall  f1-score   support

           0       0.82      0.76      0.79       659
           1       0.81      0.86      0.83       780

    accuracy                           0.81      1439
   macro avg       0.81      0.81      0.81      1439
weighted avg       0.81      0.81      0.81      1439



In [85]:
# Predict labels for the hold-out rows with the already-fitted classifier.
preds = clf.predict(X=X_unseen)

In [86]:
# Per-class precision / recall / F1 of the predictions made on the hold-out rows.
unseen_report = classification_report(y_true=Y_unseen, y_pred=preds)
print(unseen_report)

              precision    recall  f1-score   support

           0       0.78      0.72      0.75        85
           1       0.71      0.77      0.74        75

    accuracy                           0.74       160
   macro avg       0.74      0.75      0.74       160
weighted avg       0.75      0.74      0.74       160

