<a href="https://colab.research.google.com/github/alimomennasab/CS4210/blob/main/4210FinalProjectKNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## CS4210-02 Machine Learning
## Fall 2024 Semester Project
K-nearest neighbors model that classifies fish/shrimp from an image
### Members:
- Ali Momennasab
- Denise Thuong
- Eli Tolentino
- Armin Erika Polanco
- Sarah Choe
- Damian Varela


## Imports

In [1]:
import os
import pickle
from pathlib import Path

import kagglehub
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import PIL
import tensorflow as tf
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, cohen_kappa_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder

## Data Preprocessing

In [None]:
# Fetch the latest version of the dataset from Kaggle.
# `path` is whatever kagglehub.dataset_download returns; the cleaning cell
# wraps it in a Path and searches it for PNG files.
path = kagglehub.dataset_download(
    "crowww/a-large-scale-fish-dataset",
)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/crowww/a-large-scale-fish-dataset?dataset_version_number=2...


  5%|▍         | 153M/3.24G [00:01<00:29, 113MB/s]

In [None]:
# Cleaning dataset
# Build the image index: one row per PNG found under the download path,
# labelled with the name of the folder that directly contains the file.
dataset_root = Path(path)
png_paths = list(dataset_root.glob('**/*.png'))

img_df = pd.DataFrame({
    'FilePaths': [str(png_path) for png_path in png_paths],
    'Labels': [png_path.parent.name for png_path in png_paths],
})

# Folders whose name ends in 'GT' hold ground truth binary masks, which are
# not needed for classification.
img_df = img_df[~img_df['Labels'].str.endswith('GT')]

# keep only pngs
img_df = img_df[img_df['FilePaths'].str.endswith('.png')].reset_index(drop=True)

# Shuffle the rows. The seed is fixed so that the row order -- and therefore
# the train/test split made later with random_state=1 -- is the same on every
# Restart & Run All.
img_df = img_df.sample(frac=1, random_state=1).reset_index(drop=True)

# Display the first 9 images of the shuffled frame (fewer if the frame is smaller)
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(13, 7),
                         subplot_kw={'xticks': [], 'yticks': []})

preview = img_df.head(9)
for ax, filepath, label in zip(axes.flat, preview['FilePaths'], preview['Labels']):
    ax.imshow(plt.imread(filepath))
    ax.set_title(label)

plt.tight_layout()
plt.show()

In [None]:
# Data analysis: (number of rows, number of columns) of the image index
img_df.shape

In [None]:
# Data analysis: column labels of the image index
img_df.columns

In [None]:
# Delete original, non-augmented photos that have different sizing/resolution
# Only images that decode to exactly this array shape are kept.
EXPECTED_IMAGE_SHAPE = (445, 590, 3)


def _has_expected_shape(filepath):
    """Return True if the image at `filepath` decodes to EXPECTED_IMAGE_SHAPE.

    Files that cannot be read count as mismatches, so they are dropped as well.
    """
    try:
        return plt.imread(filepath).shape == EXPECTED_IMAGE_SHAPE
    except Exception:
        # `Exception` instead of a bare `except` so that KeyboardInterrupt and
        # SystemExit still stop the cell instead of silently dropping a row.
        return False


# Build the whole mask first and filter once, rather than dropping row by row.
# An explicit boolean array keeps the selection well-defined for an empty frame.
keep_mask = np.array(
    [_has_expected_shape(filepath) for filepath in img_df['FilePaths']],
    dtype=bool,
)
img_df = img_df.loc[keep_mask]  # surviving rows keep their index labels

img_df.shape

In [None]:
# Prepare features (X) and labels (y)
image_paths = img_df['FilePaths'].tolist()
if not image_paths:
    raise ValueError("img_df is empty: there are no images to build features from")

# Load and flatten images straight into one preallocated matrix.
# Collecting the flattened images in a Python list and converting it with
# np.array afterwards would hold two full copies of the pixel data at once.
X = None
for row, image_path in enumerate(image_paths):
    pixels = plt.imread(image_path).ravel()  # flatten the image into 1D for KNN
    if X is None:
        # Size and dtype are taken from the first image; a later image with a
        # different number of pixels raises ValueError on assignment.
        X = np.empty((len(image_paths), pixels.size), dtype=pixels.dtype)
    X[row] = pixels

y = np.array(img_df['Labels'].tolist())

# Convert labels to numerical format
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split into training and testing sets: 8000 samples train, the remainder test
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=8000, random_state=1)


## Training

In [None]:
# Fit a k-nearest-neighbours classifier (k = 5) on the training split.
# fit() returns the estimator itself, so construction and fitting are chained.
model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
model

In [None]:
# Persist the fitted classifier.
# KNeighborsClassifier has no Keras-style .save() method, so the estimator is
# serialized with pickle instead; the file gets a .pkl extension to match.
# NOTE: only unpickle this file from a trusted source -- pickle.load can
# execute arbitrary code.
with open('fish_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

## Testing

In [None]:
# Predict on the test split
predictions = model.predict(X_test)

# Encoded class ids in the same order as label_encoder.classes_. Passing them
# explicitly keeps the matrix rows and report rows aligned with the class names
# even if some class does not occur in the test split.
class_ids = np.arange(len(label_encoder.classes_))

# evaluation metrics: confusion matrix, accuracy, classification report, kappa
print("Confusion Matrix:")
print(confusion_matrix(y_test, predictions, labels=class_ids))
print("Accuracy:", accuracy_score(y_test, predictions))
# The report is a multi-line table; printing it below its heading (as done for
# the confusion matrix) keeps the column header aligned with the rows.
print("Classification Report:")
print(classification_report(y_test, predictions, labels=class_ids, target_names=label_encoder.classes_))
print("Kappa:", cohen_kappa_score(y_test, predictions))