In [129]:
import pandas as pd
import numpy as np
import scipy.stats as sp
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from ucimlrepo import fetch_ucirepo

In [2]:
# fetch dataset 
wine = fetch_ucirepo(id=109) 
  
# data (as pandas dataframes) 
X = wine.data.features 
y = wine.data.targets 
  
# metadata 
print(wine.metadata)
  
# variable information 
print(wine.variables) 

{'uci_id': 109, 'name': 'Wine', 'repository_url': 'https://archive.ics.uci.edu/dataset/109/wine', 'data_url': 'https://archive.ics.uci.edu/static/public/109/data.csv', 'abstract': 'Using chemical analysis to determine the origin of wines', 'area': 'Physical Science', 'tasks': ['Classification'], 'characteristics': ['Tabular'], 'num_instances': 178, 'num_features': 13, 'feature_types': ['Integer', 'Real'], 'demographics': [], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1992, 'last_updated': 'Mon Aug 28 2023', 'dataset_doi': '10.24432/C5PC7J', 'creators': ['Stefan Aeberhard', 'M. Forina'], 'intro_paper': {'title': 'Comparative analysis of statistical pattern recognition methods in high dimensional settings', 'authors': 'S. Aeberhard, D. Coomans, O. Vel', 'published_in': 'Pattern Recognition', 'year': 1994, 'url': 'https://www.semanticscholar.org/paper/83dc3e4030d7b9fbdbb4bde03ce12ab70ca10528', 'doi': '

In [3]:
# y = label
# x = features

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=123)

print(f"Labels of training dataset: \n{y_train}") # Labeltrain
print(f"Features of training dataset: \n{X_train}") # train

print(f"Labels of testing dataset: \n{y_test}") # Labeltest
print(f"Features of testing dataset: \n{X_test}") # test

Labels of training dataset: 
     class
142      3
8        1
72       2
114      2
148      3
..     ...
17       1
98       2
66       2
126      2
109      2

[142 rows x 1 columns]
Features of training dataset: 
     Alcohol  Malicacid   Ash  Alcalinity_of_ash  Magnesium  Total_phenols  \
142    13.52       3.17  2.72               23.5         97           1.55   
8      14.83       1.64  2.17               14.0         97           2.80   
72     13.49       1.66  2.24               24.0         87           1.88   
114    12.08       1.39  2.50               22.5         84           2.56   
148    13.32       3.24  2.38               21.5         92           1.93   
..       ...        ...   ...                ...        ...            ...   
17     13.83       1.57  2.62               20.0        115           2.95   
98     12.37       1.07  2.10               18.5         88           3.52   
66     13.11       1.01  1.70               15.0         78           2.98   
126 

In [4]:
def normalize(data):
    n,m = data.shape
    avg = np.mean(data, axis=0) # find column means
    for i in range(0, m-1):
        temp = data[:, i] - avg[i] # subtract mean
        standard_dev = np.std(data[:, i])
        data[:, i] = temp / standard_dev # divide by standard dev
    return data

In [19]:
def myKNN(train, label_train, test, k):
    n1, m1 = train.shape
    n2, m2 = test.shape
    d = euclidean_distances(test, train) # n2 by n1 each row i contains distances to training set from test sample i
    y_test = np.zeros(n2)
    for i in range(0, n2):
        ind = np.argsort(d[i,:])
        k_top_labels = label_train[ind[0:k]]
        l = sp.mode(k_top_labels)
        y_test[i] = np.asscalar(l.mode)
    return y_test

In [6]:
n1, m1 = X_train.shape # train.shape
n2, m2 = X_test.shape # test.shape

print(n1, m1, n2, m2)

142 13 36 13


In [7]:
data = np.concatenate((X_train, X_test), axis=0)

In [8]:
print(data)

[[1.352e+01 3.170e+00 2.720e+00 ... 8.900e-01 2.060e+00 5.200e+02]
 [1.483e+01 1.640e+00 2.170e+00 ... 1.080e+00 2.850e+00 1.045e+03]
 [1.349e+01 1.660e+00 2.240e+00 ... 9.800e-01 2.780e+00 4.720e+02]
 ...
 [1.146e+01 3.740e+00 1.820e+00 ... 7.500e-01 2.810e+00 5.620e+02]
 [1.282e+01 3.370e+00 2.300e+00 ... 7.200e-01 1.750e+00 6.850e+02]
 [1.145e+01 2.400e+00 2.420e+00 ... 8.000e-01 3.390e+00 6.250e+02]]


In [9]:
data = normalize(data)

In [10]:
print(data)

[[ 6.41574416e-01  7.48338394e-01  1.29210141e+00 ... -2.95923532e-01
  -7.79224070e-01  5.20000000e+02]
 [ 2.25977152e+00 -6.25086219e-01 -7.18336096e-01 ...  5.37670824e-01
   3.36605754e-01  1.04500000e+03]
 [ 6.04516467e-01 -6.07132956e-01 -4.62462232e-01 ...  9.89369528e-02
   2.37734757e-01  4.72000000e+02]
 ...
 [-1.90307141e+00  1.26000639e+00 -1.99770541e+00 ... -9.10150952e-01
   2.80108041e-01  5.62000000e+02]
 [-2.23111060e-01  9.27871023e-01 -2.43141777e-01 ... -1.04177111e+00
  -1.21708134e+00  6.85000000e+02]
 [-1.91542406e+00  5.71377721e-02  1.95499132e-01 ... -6.90784016e-01
   1.09932487e+00  6.25000000e+02]]


In [14]:
Train = data[0:n1,:]
Test = data[n1:n1+n2,:]

In [20]:
K = 14

y_test = myKNN(train=Train, label_train=y_train, test=Test, k=K)

KeyError: "None of [Index([48, 46, 78, 54, 92, 79, 84, 32, 15, 39, 56, 132, 101, 19], dtype='int64')] are in the [columns]"

In [13]:
# # for homework
# k_choices = np.array([5, 10, 15, 20])

# for i in range(0, len(k_choices)):
#     k = k_choices[i]
#     kf = KFold(n_splits=10)
#     kf.get_n_splits(X_train)

In [21]:
# # for train_index, test_index in kf.split(X_train):
#     y_test = myKNN(X_train[train_index,:], y_train[train_index], X_train[test_index,:], k) # return value is the prediction
#     n = test_index.shape
#     Acc = ((y_test == y_train[test_index],).astype('uint8')).sum() / n[0]

In [22]:
import numpy as np
import matplotlib as plt
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.model_selection import KFold
import scipy.stats as sp

In [23]:
def normalize(data):
    n, m= data.shape
    avg = np.mean(data, axis=0)
    for i in range(0, m):
        temp = data[:,i] - avg[i]
        s = np.std(data[:,i])
        data[:,i] = temp / s
    return data

In [24]:
def myKNN(train, label_train, test, k):
    n1, m1 = train.shape
    n2, m2 = test.shape
    distance = euclidean_distances(train, test) # n1 by n2
    distance = distance.transpose()
    y_test = np.zeros(n2)
    
    for i in range (0, n2):
        ind = np.argsort(distance[i,:])
        k_top_labels = label_train[ind[0:k]]
        l = sp.mode(k_top_labels)
        y_test[i] = l.mode.item()

    return y_test

In [29]:
train = np.genfromtxt("./data/wine_train.txt", delimiter=",", dtype=float)
label_train = train[:, 11]
train = train[:,0:11]
n1, m1 = train.shape
print(train.shape)

(1469, 11)


In [31]:
test = np.genfromtxt("./data/wine_test.txt", delimiter=",", dtype=float)
label_test = test[:, 11]
test = test[:,0:11]
n2, m2 = test.shape
print(test.shape)

(3429, 11)


In [32]:
data = np.concatenate((train, test), axis = 0)
data = normalize(data)

In [33]:
train = data[0:n1, :]
test = data[n1:n1+n2, : ]

In [42]:
k = 14
y_test = myKNN(train=train, label_train=label_train, test=test, k=k) # prediction of test

# compare prediction with truth(label_test)
acc = ((y_test == label_test).astype('uint8')).sum() / n2
print(acc)

0.5430154564012831


In [124]:
df = pd.DataFrame(columns=["k", "accuracy"])

k_results = []
accuracy_results = [] 
# Homework
k_choices = np.arange(10, 100, step=10)
for k in k_choices:
    kf = KFold(n_splits=10)
    # kf.get_n_splits(train)

    for train_index, test_index in kf.split(train):
        y_test = myKNN(train=train[train_index], label_train=label_train[train_index], test=train[test_index,:], k=k) # return values is prediction
        n = test_index.shape

        acc = ((y_test == label_train[test_index]).astype('uint8')).sum() / n[0]
        print(f"k = {k}")
        print(f"Predicted class labels: {[y for y in y_test]}")
        print(f"Actual class labels: {[label for label in label_test[test_index]]}")
        print(f"accuracy = {acc:.2e}")
        print("=======================")

        k_results.append(k)
        accuracy_results.append(acc)

df["k"] = k_results
df["accuracy"] = accuracy_results
df = df.groupby("k").mean().reset_index()
mae = df.sort_values(by="accuracy", ascending=False).iloc[0]["accuracy"]
print(f"Mean Absolute Error: {mae}")

k = 10
Predicted class labels: [5.0, 5.0, 5.0, 5.0, 6.0, 5.0, 6.0, 5.0, 5.0, 6.0, 5.0, 6.0, 5.0, 5.0, 5.0, 6.0, 6.0, 6.0, 6.0, 5.0, 5.0, 5.0, 5.0, 5.0, 6.0, 6.0, 5.0, 6.0, 6.0, 5.0, 5.0, 5.0, 6.0, 5.0, 6.0, 7.0, 6.0, 5.0, 5.0, 5.0, 4.0, 5.0, 6.0, 6.0, 6.0, 6.0, 5.0, 6.0, 7.0, 6.0, 7.0, 5.0, 7.0, 7.0, 5.0, 6.0, 5.0, 6.0, 4.0, 5.0, 5.0, 5.0, 5.0, 6.0, 5.0, 6.0, 6.0, 6.0, 5.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 5.0, 5.0, 5.0, 5.0, 7.0, 6.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 6.0, 6.0, 5.0, 5.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 5.0, 5.0, 5.0, 7.0, 7.0, 5.0, 6.0, 5.0, 6.0, 5.0, 6.0, 5.0, 6.0, 5.0, 6.0, 6.0, 5.0, 5.0, 6.0, 6.0, 6.0, 5.0, 7.0, 6.0, 5.0, 6.0, 5.0, 6.0, 6.0, 6.0, 6.0, 5.0, 5.0, 5.0, 6.0, 6.0, 6.0, 6.0, 5.0, 5.0, 5.0, 6.0, 6.0, 5.0, 5.0, 6.0]
Actual class labels: [6.0, 6.0, 6.0, 6.0, 7.0, 6.0, 8.0, 6.0, 5.0, 8.0, 7.0, 6.0, 6.0, 7.0, 6.0, 6.0, 5.0, 5.0, 6.0, 5.0, 6.0, 6.0, 7.0, 4.0, 5.0, 5.0, 6.0, 7.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 5.0, 6.0, 6.0, 5.0, 7.0, 5

In [127]:
train = np.genfromtxt("./data/yeast_train.txt", delimiter=",", dtype=float)
y_train = train[:, 8]
X_train = train[:,0:8]
n1, m1 = train.shape

test = np.genfromtxt("./data/yeast_test.txt", delimiter=",", dtype=float)
y_test = test[:, 8]
X_test = test[:,0:8]
n2, m2 = test.shape
print(test.shape)

# Scale the features using StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

(445, 9)


In [130]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

ValueError: Input y contains NaN.