In [None]:
import gdown, pandas as pd, numpy as np

# Google Drive file id of the CSV that the rest of the notebook reads.
file_id = "1ZdhRqYv-JizWV6DxO6C4R_k1kxPhmlF2"
# Direct-download URL built from that id.
url = f"https://drive.google.com/uc?id={file_id}"

# Save the file as data.csv in the working directory; quiet=False keeps the progress output visible.
gdown.download(url, "data.csv", quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1ZdhRqYv-JizWV6DxO6C4R_k1kxPhmlF2
To: /content/data.csv
100%|██████████| 14.6k/14.6k [00:00<00:00, 22.1MB/s]


'data.csv'

In [None]:
# Load the downloaded CSV into a DataFrame and preview its first rows.
data = pd.read_csv("data.csv")
data.head()

Unnamed: 0,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,class
0,3,12669,9656,7561,214,2674,1338,2
1,3,7057,9810,9568,1762,3293,1776,2
2,3,6353,8808,7684,2405,3516,7844,2
3,3,13265,1196,4221,6404,507,1788,1
4,3,22615,5410,7198,3915,1777,5185,1


In [None]:
# Basic structure: row/column counts, column names and dtypes.
print(data.shape)
print(data.columns)
print("\n----- data types -----")
print(data.dtypes)

# Class balance of the target column.
print(data['class'].value_counts())

# Missing values per column.
print("\n--- Null values ---")
print(data.isna().sum())

# Count of fully duplicated rows.
print(f"Number of duplicates: {data.duplicated().sum()}")

(440, 8)
Index(['Region', 'Fresh', 'Milk', 'Grocery', 'Frozen', 'Detergents_Paper',
       'Delicassen', 'class'],
      dtype='object')

----- data types -----
Region              int64
Fresh               int64
Milk                int64
Grocery             int64
Frozen              int64
Detergents_Paper    int64
Delicassen          int64
class               int64
dtype: object
class
2    180
3    173
1     87
Name: count, dtype: int64

--- Null values ---
Region              0
Fresh               0
Milk                0
Grocery             0
Frozen              0
Detergents_Paper    0
Delicassen          0
class               0
dtype: int64
Number of duplicates: 0


In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Name of the label column; every other column is used as a feature.
Target = 'class'

# Work on a copy so the frame loaded from disk stays untouched.
temp_data = data.copy()
y = temp_data[Target]
X = temp_data.drop(columns=[Target])

# Randomly under-sample so the classes end up balanced (seeded for reproducibility).
rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(X, y)

# Class counts before and after resampling.
print("Before:", y.value_counts())
print("After:", pd.Series(y_res).value_counts())

Before: class
2    180
3    173
1     87
Name: count, dtype: int64
After: class
1    87
2    87
3    87
Name: count, dtype: int64


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column and wrap the result back into a DataFrame.
# Building the frame from the scaled array gives it a fresh 0..n-1 index.
# NOTE(review): the scaler is fit on all resampled rows before any train/test
# split, so test rows influence the scaling statistics — confirm this is intended.
ss = StandardScaler()
X_res = pd.DataFrame(ss.fit_transform(X_res), columns=X.columns)
X_res.head()


Unnamed: 0,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,0.598932,-0.001334,-0.596388,-0.291892,0.868699,-0.401659,0.308843
1,0.598932,0.823045,0.131003,0.013372,0.225695,-0.132883,2.586432
2,0.598932,0.376117,-0.022795,-0.252824,-0.510311,-0.030874,0.534121
3,0.598932,0.087805,-0.664225,-0.41248,-0.660406,-0.487795,-0.542655
4,0.598932,0.736375,-0.57533,-0.428784,-0.716982,-0.314677,-0.618418


In [None]:
# Pick one random sample (features + label) from the resampled, scaled data.

combined = X_res.copy()
# Attach the labels by POSITION, not by index. X_res carries a fresh 0..n-1 index
# after scaling, while y_res can still carry the row labels of the original frame;
# a plain `combined[Target] = y_res` would align on index and yield NaN labels.
combined[Target] = np.asarray(y_res)
assert combined[Target].notna().all(), "labels are misaligned with the feature rows"

# Pick one random row (seeded so the same row is chosen on every run)
rand_row = combined.sample(n=1, random_state=42)

print(rand_row)

# Label of the sampled row, as a length-1 array
rand_class = rand_row[Target].values

# Features of the sampled row as a flat float array (for distance calculation)
query_point = rand_row.drop(Target, axis=1).values.flatten().astype(float)

print(rand_class, query_point)

       Region     Fresh      Milk   Grocery    Frozen  Detergents_Paper  \
116  0.598932  0.326214 -0.305017  0.529563 -0.559912          0.171025   

     Delicassen  class  
116   -0.723012    NaN  
[nan] [ 0.59893218  0.32621352 -0.30501736  0.5295631  -0.55991226  0.1710255
 -0.72301151]


In [None]:
# KNN scratch implementation
from collections import Counter

def KNN_scratch(X_res, y_res, query_point, k=3):
    """Classify ``query_point`` by majority vote among its ``k`` nearest rows.

    Distances are Euclidean. When several classes share the highest vote
    count, the class whose voting neighbours have the smallest *mean*
    distance to the query wins.

    Parameters
    ----------
    X_res : DataFrame or 2-D array-like, shape (n_samples, n_features)
        Training features; converted to float.
    y_res : Series or 1-D array-like, length n_samples
        Training labels; converted to int. Matched to ``X_res`` by position,
        never by pandas index.
    query_point : 1-D array-like, length n_features
        The point to classify.
    k : int, default 3
        Number of neighbours that vote. Values larger than ``n_samples``
        use every training row.

    Returns
    -------
    predicted_class : int-like
        The winning label.
    nearest_neighbors : list of dict
        The ``k`` closest rows in ascending distance order, each as
        ``{"dist": <distance>, "class": <label>}``.

    Raises
    ------
    ValueError
        If the training set is empty or not 2-D, the labels contain NaN or
        do not match the number of rows, the query has the wrong number of
        features, or ``k`` is smaller than 1.
    """
    features = np.asarray(X_res, dtype=float)
    raw_labels = np.asarray(y_res).ravel()
    query = np.asarray(query_point, dtype=float).ravel()

    if features.ndim != 2:
        raise ValueError("X_res must be 2-D (n_samples, n_features)")
    if features.shape[0] == 0:
        raise ValueError("X_res must contain at least one row")
    if raw_labels.shape[0] != features.shape[0]:
        # zip() would silently truncate to the shorter input otherwise.
        raise ValueError("X_res and y_res must have the same number of rows")
    if raw_labels.dtype.kind == "f" and np.isnan(raw_labels).any():
        # Casting NaN to int with numpy yields garbage instead of an error.
        raise ValueError("y_res contains NaN labels")
    if query.shape[0] != features.shape[1]:
        raise ValueError("query_point must have one value per feature column")
    if k < 1:
        # A negative k would slice off the tail instead of failing.
        raise ValueError("k must be at least 1")

    labels = raw_labels.astype(int)

    dist_list = [
        {"dist": np.linalg.norm(row - query), "class": class_val}
        for row, class_val in zip(features, labels)
    ]
    # list.sort is stable, so equidistant rows keep their training order.
    dist_list.sort(key=lambda item: item["dist"])
    nearest_neighbors = dist_list[:k]

    # Count class frequencies among the voters
    class_counts = Counter(neighbor["class"] for neighbor in nearest_neighbors)

    max_count = max(class_counts.values())
    tied_classes = [cls for cls, count in class_counts.items() if count == max_count]

    if len(tied_classes) == 1:
        return tied_classes[0], nearest_neighbors

    # Break the tie by choosing the class with the smallest average distance
    tie_distances = {}
    for neighbor in nearest_neighbors:
        if neighbor["class"] in tied_classes:
            tie_distances.setdefault(neighbor["class"], []).append(neighbor["dist"])

    predicted_class = min(tie_distances,
                          key=lambda cls: np.mean(tie_distances[cls]))

    return predicted_class, nearest_neighbors

# Sanity check: query_point was drawn from X_res itself, so its closest
# neighbour is expected to be that same row at distance 0.
print(KNN_scratch(X_res, y_res, query_point))

(np.int64(2), [{'dist': np.float64(0.0), 'class': np.int64(2)}, {'dist': np.float64(0.8072102826772087), 'class': np.int64(2)}, {'dist': np.float64(1.0186849477241737), 'class': np.int64(2)}])


In [None]:
# Import train_test_split in this cell: it is first used here, and without the
# import a fresh-kernel "Run All" stops on this line with a NameError.
from sklearn.model_selection import train_test_split

# Hold out 20% of the resampled data for evaluation (seeded for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)



def get_accuracy(X_train, y_train, X_test, y_test, k=3):
    """Return the fraction of test rows that KNN_scratch labels correctly.

    Each row of ``X_test`` is classified against the training data with
    ``KNN_scratch`` and compared to the label at the same position in
    ``y_test``. ``X_test`` and ``y_test`` are read positionally via ``.iloc``.

    Parameters
    ----------
    X_train, y_train : training features and labels passed on to KNN_scratch.
    X_test, y_test : test features and labels supporting ``.iloc``.
    k : int, default 3
        Number of neighbours passed on to KNN_scratch.

    Returns
    -------
    float
        Correct predictions divided by the number of test rows.
    """
    n_samples = len(X_test)
    hits = 0

    for position in range(n_samples):
        features = X_test.iloc[position].values.astype(float)
        expected = y_test.iloc[position]

        guess, _neighbors = KNN_scratch(X_train, y_train, features, k)
        if guess == expected:
            hits += 1

    return hits / n_samples

# Evaluate the from-scratch kNN (k=3) on the held-out split and report the
# accuracy both as a fraction and as a percentage.
accuracy = get_accuracy(X_train, y_train, X_test, y_test, k=3)
print(f"Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

Accuracy: 0.8302 (83.02%)


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


# Same 80/20 split (same seed) as the one used to evaluate the scratch version.
X_train, X_test, y_train, y_test = train_test_split(
    X_res, y_res, test_size=0.2, random_state=42
)

# Fit scikit-learn's kNN with k=3 as a reference implementation.
clf = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Per-class precision / recall / F1 on the held-out rows.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.68      0.88      0.77        17
           2       1.00      0.80      0.89        20
           3       0.87      0.81      0.84        16

    accuracy                           0.83        53
   macro avg       0.85      0.83      0.83        53
weighted avg       0.86      0.83      0.84        53

