In [1]:
import gdown
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# ==============================
# Step 1: Download & Load Dataset
# ==============================
# Download from Google Drive
gdown.download(id="1ZdhRqYv-JizWV6DxO6C4R_k1kxPhmlF2", output="groceries.csv", quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1ZdhRqYv-JizWV6DxO6C4R_k1kxPhmlF2
To: /content/groceries.csv
100%|██████████| 14.6k/14.6k [00:00<00:00, 13.5MB/s]


'groceries.csv'

In [3]:
# Load dataset
df = pd.read_csv("groceries.csv")
print("First 5 rows:\n", df.head())
print("\nDataset Info:\n")
print(df.info())

First 5 rows:
    Region  Fresh  Milk  Grocery  Frozen  Detergents_Paper  Delicassen  class
0       3  12669  9656     7561     214              2674        1338      2
1       3   7057  9810     9568    1762              3293        1776      2
2       3   6353  8808     7684    2405              3516        7844      2
3       3  13265  1196     4221    6404               507        1788      1
4       3  22615  5410     7198    3915              1777        5185      1

Dataset Info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440 entries, 0 to 439
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Region            440 non-null    int64
 1   Fresh             440 non-null    int64
 2   Milk              440 non-null    int64
 3   Grocery           440 non-null    int64
 4   Frozen            440 non-null    int64
 5   Detergents_Paper  440 non-null    int64
 6   Delicassen        440 non-null    int6

In [4]:
# Check number of null values per column
print("Null values in each column:")
print(df.isnull().sum())

# Check number of duplicate rows
print("\nNumber of duplicate rows:", df.duplicated().sum())

# If you want to see the duplicate rows themselves
duplicate_rows = df[df.duplicated()]
print("\nDuplicate rows:")
print(duplicate_rows)

Null values in each column:
Region              0
Fresh               0
Milk                0
Grocery             0
Frozen              0
Detergents_Paper    0
Delicassen          0
class               0
dtype: int64

Number of duplicate rows: 0

Duplicate rows:
Empty DataFrame
Columns: [Region, Fresh, Milk, Grocery, Frozen, Detergents_Paper, Delicassen, class]
Index: []


In [5]:
print("Unique values in 'class' column:", df['class'].unique())
print("\nNumber of values in each class:")
print(df['class'].value_counts())

Unique values in 'class' column: [2 1 3]

Number of values in each class:
class
2    180
3    173
1     87
Name: count, dtype: int64


In [6]:
# Identify the numerical columns excluding the 'class' column
numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
numerical_cols.remove('class')

# Identify outliers for each numerical column
outliers = {}
for col in numerical_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    col_outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
    outliers[col] = col_outliers

    print(f"Number of outliers in '{col}': {len(col_outliers)}")
    if not col_outliers.empty:
        print(f"Outliers in '{col}':")
        display(col_outliers)
    print("-" * 30)

Number of outliers in 'Region': 0
------------------------------
Number of outliers in 'Fresh': 20
Outliers in 'Fresh':


Unnamed: 0,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,class
29,3,43088,2100,2609,1200,1107,823,2
39,3,56159,555,902,10002,212,2916,2
47,3,44466,54259,55571,7782,24171,6465,2
52,3,40721,3916,5876,532,2587,1278,2
87,3,43265,5025,8117,6312,1579,14351,2
103,3,56082,3504,8906,18028,1480,2498,2
125,3,76237,3473,7102,16538,778,918,2
129,3,42312,926,1510,1718,410,1819,2
176,3,45640,6958,6536,7368,1532,230,2
181,3,112151,29627,18148,16745,4948,8550,2


------------------------------
Number of outliers in 'Milk': 28
Outliers in 'Milk':


Unnamed: 0,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,class
23,3,26373,36423,22019,5154,4337,16523,2
28,3,4113,20484,25957,1158,8604,5206,2
38,3,4591,15729,16709,33,6956,433,2
45,3,5181,22044,21531,1740,7353,4985,2
47,3,44466,54259,55571,7782,24171,6465,2
49,3,4967,21412,28921,1798,13583,1163,2
56,3,4098,29892,26866,2616,17740,1340,2
61,3,35942,38369,59598,3254,26701,2017,2
65,3,85,20959,45828,36,24231,1423,2
85,3,16117,46197,92780,1026,40827,2944,2


------------------------------
Number of outliers in 'Grocery': 24
Outliers in 'Grocery':


Unnamed: 0,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,class
28,3,4113,20484,25957,1158,8604,5206,2
43,3,630,11095,23998,787,9529,72,2
47,3,44466,54259,55571,7782,24171,6465,2
49,3,4967,21412,28921,1798,13583,1163,2
56,3,4098,29892,26866,2616,17740,1340,2
61,3,35942,38369,59598,3254,26701,2017,2
65,3,85,20959,45828,36,24231,1423,2
77,3,12205,12697,28540,869,12034,1009,2
85,3,16117,46197,92780,1026,40827,2944,2
86,3,22925,73498,32114,987,20070,903,2


------------------------------
Number of outliers in 'Frozen': 43
Outliers in 'Frozen':


Unnamed: 0,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,class
22,3,31276,1917,4469,9408,2381,4334,2
39,3,56159,555,902,10002,212,2916,2
40,3,24025,4332,4757,9510,1145,5864,1
47,3,44466,54259,55571,7782,24171,6465,2
70,3,16705,2037,3202,10643,116,1365,1
72,3,4420,5139,2661,8872,1321,181,3
73,3,19899,5332,8713,8132,764,648,1
88,3,7864,542,4042,9735,165,46,3
91,3,12754,2762,2530,8693,627,1117,1
93,3,11314,3090,2062,35009,71,2698,1


------------------------------
Number of outliers in 'Detergents_Paper': 30
Outliers in 'Detergents_Paper':


Unnamed: 0,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,class
43,3,630,11095,23998,787,9529,72,2
47,3,44466,54259,55571,7782,24171,6465,2
49,3,4967,21412,28921,1798,13583,1163,2
56,3,4098,29892,26866,2616,17740,1340,2
61,3,35942,38369,59598,3254,26701,2017,2
65,3,85,20959,45828,36,24231,1423,2
77,3,12205,12697,28540,869,12034,1009,2
85,3,16117,46197,92780,1026,40827,2944,2
86,3,22925,73498,32114,987,20070,903,2
92,3,9198,27472,32034,3232,18906,5130,2


------------------------------
Number of outliers in 'Delicassen': 27
Outliers in 'Delicassen':


Unnamed: 0,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,class
2,3,6353,8808,7684,2405,3516,7844,2
4,3,22615,5410,7198,3915,1777,5185,1
17,3,5876,6157,2933,839,370,4478,3
22,3,31276,1917,4469,9408,2381,4334,2
23,3,26373,36423,22019,5154,4337,16523,2
24,3,22647,9776,13792,2915,4482,5778,2
28,3,4113,20484,25957,1158,8604,5206,2
36,3,29955,4362,5428,1729,862,4626,2
40,3,24025,4332,4757,9510,1145,5864,1
45,3,5181,22044,21531,1740,7353,4985,2


------------------------------


In [10]:
# Identify the numerical columns excluding the 'class' column
numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
numerical_cols.remove('class')

# Identify and drop outliers for each numerical column using 2 * IQR
df_cleaned = df.copy()
outlier_indices = set()

for col in numerical_cols:
    Q1 = df_cleaned[col].quantile(0.25)
    Q3 = df_cleaned[col].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 2 * IQR  # Using 2 * IQR
    upper_bound = Q3 + 2 * IQR  # Using 2 * IQR

    col_outlier_indices = df_cleaned[(df_cleaned[col] < lower_bound) | (df_cleaned[col] > upper_bound)].index
    outlier_indices.update(col_outlier_indices)

print(f"Number of rows identified as outliers (using 2 * IQR): {len(outlier_indices)}")
print(f"Indices of outliers (using 2 * IQR): {outlier_indices}")

# Drop the outliers
df_cleaned = df_cleaned.drop(outlier_indices)

print("\nShape of the original DataFrame:", df.shape)
print("Shape of the cleaned DataFrame:", df_cleaned.shape)

print("\nFirst 5 rows of the cleaned DataFrame:\n", df_cleaned.head())
print("\nDataset Info of the cleaned DataFrame:\n")
print(df_cleaned.info())

Number of rows identified as outliers (using 2 * IQR): 76
Indices of outliers (using 2 * IQR): {384, 258, 259, 2, 4, 265, 145, 401, 277, 22, 23, 406, 24, 282, 283, 28, 284, 413, 155, 409, 163, 39, 40, 425, 171, 427, 45, 47, 176, 49, 431, 435, 304, 181, 309, 183, 56, 437, 310, 312, 61, 319, 65, 196, 325, 70, 71, 201, 202, 77, 333, 334, 209, 338, 211, 339, 85, 86, 343, 216, 88, 87, 218, 92, 93, 349, 351, 358, 103, 109, 239, 240, 381, 372, 251, 125}

Shape of the original DataFrame: (440, 8)
Shape of the cleaned DataFrame: (364, 8)

First 5 rows of the cleaned DataFrame:
    Region  Fresh  Milk  Grocery  Frozen  Detergents_Paper  Delicassen  class
0       3  12669  9656     7561     214              2674        1338      2
1       3   7057  9810     9568    1762              3293        1776      2
3       3  13265  1196     4221    6404               507        1788      1
5       3   9413  8259     5126     666              1795        1451      3
6       3  12126  3199     6975     480

In [11]:
# Identify the numerical columns excluding the 'class' column
numerical_cols = df_cleaned.select_dtypes(include=np.number).columns.tolist()
numerical_cols.remove('class')

# Identify outliers for each numerical column
outliers = {}
for col in numerical_cols:
    Q1 = df_cleaned[col].quantile(0.25)
    Q3 = df_cleaned[col].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    col_outliers = df_cleaned[(df_cleaned[col] < lower_bound) | (df_cleaned[col] > upper_bound)]
    outliers[col] = col_outliers

    print(f"Number of outliers in '{col}': {len(col_outliers)}")
    if not col_outliers.empty:
        print(f"Outliers in '{col}':")
        display(col_outliers)
    print("-" * 30)

Number of outliers in 'Region': 0
------------------------------
Number of outliers in 'Fresh': 12
Outliers in 'Fresh':


Unnamed: 0,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,class
29,3,43088,2100,2609,1200,1107,823,2
52,3,40721,3916,5876,532,2587,1278,2
124,3,36050,1642,2961,4787,500,1621,2
129,3,42312,926,1510,1718,410,1819,2
142,3,37036,7152,8253,2995,20,3,2
273,3,36817,3045,1493,4802,210,1824,2
285,3,40254,640,3600,1042,436,18,2
289,3,42786,286,471,1388,32,22,1
370,3,39679,3944,4955,1364,523,2235,2
377,3,38793,3154,2648,1034,96,1242,2


------------------------------
Number of outliers in 'Milk': 5
Outliers in 'Milk':


Unnamed: 0,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,class
38,3,4591,15729,16709,33,6956,433,2
46,3,3103,14069,21955,1668,6792,1452,2
254,1,10379,17972,4748,4686,1547,3265,2
315,2,1479,14982,11924,662,3891,3508,2
331,2,11223,14881,26839,1234,9606,1102,2


------------------------------
Number of outliers in 'Grocery': 14
Outliers in 'Grocery':


Unnamed: 0,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,class
43,3,630,11095,23998,787,9529,72,2
46,3,3103,14069,21955,1668,6792,1452,2
165,3,15615,12653,19858,4425,7108,2379,2
173,3,514,7677,19805,937,9836,716,2
193,3,180,3485,20292,959,5618,666,2
200,1,3067,13240,23127,3941,9959,731,2
205,1,1107,11711,23596,955,9265,710,2
266,1,572,9763,22182,2221,4882,2563,2
268,1,11908,8053,19847,1069,6374,698,2
301,2,5283,13316,20399,1809,8752,172,2


------------------------------
Number of outliers in 'Frozen': 26
Outliers in 'Frozen':


Unnamed: 0,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,class
3,3,13265,1196,4221,6404,507,1788,1
33,3,29729,4786,7326,6130,361,1083,2
72,3,4420,5139,2661,8872,1321,181,3
73,3,19899,5332,8713,8132,764,648,1
76,3,717,3587,6532,7530,529,894,3
91,3,12754,2762,2530,8693,627,1117,1
112,3,19046,2770,2469,8853,483,2708,1
126,3,19219,1840,1658,8195,349,483,1
130,3,7149,2428,699,6316,395,911,3
143,3,10405,1596,1096,8425,399,318,3


------------------------------
Number of outliers in 'Detergents_Paper': 10
Outliers in 'Detergents_Paper':


Unnamed: 0,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,class
43,3,630,11095,23998,787,9529,72,2
107,3,8797,10646,14886,2471,8969,1438,2
173,3,514,7677,19805,937,9836,716,2
200,1,3067,13240,23127,3941,9959,731,2
205,1,1107,11711,23596,955,9265,710,2
245,1,3062,6154,13916,230,8933,2784,2
301,2,5283,13316,20399,1809,8752,172,2
303,2,2599,3688,13829,492,10069,59,2
331,2,11223,14881,26839,1234,9606,1102,2
353,3,117,6264,21203,228,8682,1111,2


------------------------------
Number of outliers in 'Delicassen': 8
Outliers in 'Delicassen':


Unnamed: 0,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,class
17,3,5876,6157,2933,839,370,4478,3
36,3,29955,4362,5428,1729,862,4626,2
138,3,13537,4257,5034,155,249,3271,1
166,3,4822,6721,9170,993,4973,3637,2
254,1,10379,17972,4748,4686,1547,3265,2
267,1,20893,1222,2576,3975,737,3628,1
315,2,1479,14982,11924,662,3891,3508,2
411,3,2126,3289,3281,1535,235,4365,3


------------------------------


In [12]:
from imblearn.over_sampling import SMOTE
from collections import Counter

# Separate features (X) and target (y)
X = df_cleaned.drop('class', axis=1)
y = df_cleaned['class']

# Apply SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

print("Original dataset shape:", Counter(y))
print("Resampled dataset shape:", Counter(y_resampled))

Original dataset shape: Counter({3: 164, 2: 123, 1: 77})
Resampled dataset shape: Counter({2: 164, 1: 164, 3: 164})


In [14]:
print("Shape of the cleaned dataset (df_cleaned):", df_cleaned.shape)
print("Shape of the balanced features dataset (X_resampled):", X_resampled.shape)
print("Shape of the balanced target dataset (y_resampled):", y_resampled.shape)

# Calculate the expected shape after SMOTE
original_class_counts = Counter(y)
majority_class_count = max(original_class_counts.values())
num_classes = len(original_class_counts)
expected_samples_after_smote = majority_class_count * num_classes

print(f"\nOriginal class counts: {original_class_counts}")
print(f"Majority class count: {majority_class_count}")
print(f"Number of classes: {num_classes}")
print(f"Expected number of samples after SMOTE: {expected_samples_after_smote}")

if X_resampled.shape[0] == expected_samples_after_smote and y_resampled.shape[0] == expected_samples_after_smote:
    print("\nThe shape of the balanced dataset matches the expected shape.")
else:
    print("\nThe shape of the balanced dataset does not match the expected shape.")

Shape of the cleaned dataset (df_cleaned): (364, 8)
Shape of the balanced features dataset (X_resampled): (492, 7)
Shape of the balanced target dataset (y_resampled): (492,)

Original class counts: Counter({3: 164, 2: 123, 1: 77})
Majority class count: 164
Number of classes: 3
Expected number of samples after SMOTE: 492

The shape of the balanced dataset matches the expected shape.


In [15]:
# Split the balanced dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (393, 7)
Shape of X_test: (99, 7)
Shape of y_train: (393,)
Shape of y_test: (99,)


In [16]:
import numpy as np
from collections import Counter
from sklearn.metrics import accuracy_score

def euclidean_distance(point1, point2):
    return np.sqrt(np.sum((point1 - point2)**2))

class KNNClassifier:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        predictions = [self._predict(x) for x in X_test]
        return np.array(predictions)

    def _predict(self, x):
        # Calculate distances from the test instance to all training instances
        distances = [euclidean_distance(x, x_train) for x_train in self.X_train]

        # Get the indices of the k nearest neighbors
        k_indices = np.argsort(distances)[:self.k]

        # Get the labels of the k nearest neighbors
        k_nearest_labels = [self.y_train[i] for i in k_indices]

        # Find the most common class label among the k nearest neighbors
        most_common = Counter(k_nearest_labels).most_common(1)
        return most_common[0][0]

# Convert pandas DataFrames to NumPy arrays for the scratch implementation
X_train_np = X_train.values
X_test_np = X_test.values
y_train_np = y_train.values
y_test_np = y_test.values

# Instantiate and train the KNN classifier
knn_scratch = KNNClassifier(k=5) # You can choose a different value for k
knn_scratch.fit(X_train_np, y_train_np)

# Make predictions on the test set
predictions_scratch = knn_scratch.predict(X_test_np)

# Evaluate the accuracy
accuracy_scratch = accuracy_score(y_test_np, predictions_scratch)
print(f"Accuracy of the scratch KNN implementation: {accuracy_scratch}")

Accuracy of the scratch KNN implementation: 0.9696969696969697


In [18]:
from sklearn.metrics import classification_report

# Generate classification report
report_scratch = classification_report(y_test_np, predictions_scratch)

print("Classification Report for the scratch KNN implementation:")
print(report_scratch)

Classification Report for the scratch KNN implementation:
              precision    recall  f1-score   support

           1       1.00      0.91      0.95        33
           2       0.97      1.00      0.99        33
           3       0.94      1.00      0.97        33

    accuracy                           0.97        99
   macro avg       0.97      0.97      0.97        99
weighted avg       0.97      0.97      0.97        99



In [19]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Instantiate the KNN classifier (choose k=5 for example)
knn = KNeighborsClassifier(n_neighbors=5)

# Fit the model on training data
knn.fit(X_train, y_train)

# Make predictions on the test set
predictions_sklearn = knn.predict(X_test)

# Evaluate accuracy
accuracy_sklearn = accuracy_score(y_test, predictions_sklearn)
print(f"Accuracy of the sklearn KNN implementation: {accuracy_sklearn}")

Accuracy of the sklearn KNN implementation: 0.9696969696969697
