## KNN Model
### Anna Hauk

In [31]:
#! pip install opencv-python
#! pip install scikit-image
#! pip install tensorflow
#! pip install keras
#! pip install Pillow



In [32]:
# import required packages
import argparse
import tensorflow as tf
import os
import numpy as np
import pandas as pd
from keras.preprocessing import image
import cv2 as cv
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import GridSearchCV, train_test_split
from skimage.io import imread
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing import image

In [33]:
filename_train = "/Users/annahauk/Desktop/bttai-nybg-2024/BTTAIxNYBG-train.csv"
df_train = pd.read_csv(filename_train, header = 0)

filename_test = "/Users/annahauk/Desktop/bttai-nybg-2024/BTTAIxNYBG-test.csv"
df_test = pd.read_csv(filename_test, header = 0)

filename_val = "/Users/annahauk/Desktop/bttai-nybg-2024/BTTAIxNYBG-validation.csv"
df_val = pd.read_csv(filename_val, header = 0)

df_train.head()

Unnamed: 0,uniqueID,classLabel,classID,source,imageFile
0,2,occluded-specimens,8,L,a1a8b48e8cb142b3.jpg
1,3,microscope-slides,6,L,79599db2ac9092b6.jpg
2,4,illustrations-color,2,BHL,c449696f2f0d0d92.jpg
3,5,illustrations-color,2,P,80a8f4a393b4e08c.jpg
4,6,animal-specimens,0,AK,041a1c6e73313638.jpg


# Processing Data

In [34]:
# Load dataset & Define image directory
df = pd.read_csv('/Users/annahauk/Desktop/bttai-nybg-2024/BTTAIxNYBG-train.csv')

# Load test and validation datasets
test_df = pd.read_csv('/Users/annahauk/Desktop/bttai-nybg-2024/BTTAIxNYBG-test.csv')
validate_df = pd.read_csv('/Users/annahauk/Desktop/bttai-nybg-2024/BTTAIxNYBG-validation.csv')

image_directory = "/Users/annahauk/Desktop/bttai-nybg-2024/BTTAIxNYBG-train/BTTAIxNYBG-train"

In [35]:
# Preprocessing function to load and process images
def load_and_preprocess_image(filename, target_size=(224, 224)):  # target size sets the image size to 224x224
    img_path = os.path.join(image_directory, filename)  # Create image path
    img = image.load_img(img_path, target_size=target_size)  # Load image and resize
    img_array = image.img_to_array(img)  # Convert image to numpy array of shape (224, 224, 3)
    # this is done to add a dimension to the image, so that it can be passed to the model
    img_array = np.expand_dims(img_array, axis=0)  # Model expects a batch of images
    # expand_dims is used to add a dimension to the image, so that it can be passed to the model

    return img_array / 255.0  # Normalize to [0, 1]
    # divide by 255 because the model expects the input to be in the range [0, 1]

In [36]:
# Apply preprocessing to all images
df['imageData'] = df['imageFile'].apply(load_and_preprocess_image)
# this will apply the function to each row of the dataframe

In [37]:
# "/Users/annahauk/Desktop/bttai-nybg-2024/BTTAIxNYBG-test/BTTAIxNYBG-test"

In [38]:
# apply preprocessing to test and validation datasets
#test_df['imageData'] = test_df['imageFile'].apply(load_and_preprocess_image)

In [39]:
#image_directory = "/Users/annahauk/Desktop/bttai-nybg-2024/BTTAIxNYBG-validation/BTTAIxNYBG-validation"

In [40]:
#validate_df['imageData'] = validate_df['imageFile'].apply(load_and_preprocess_image)

In [41]:
# Split dataset into training and validation sets
#train_df, validate_df = train_test_split(df, test_size=0.2, random_state=42)

### Note: This is a common step in ML training, but in this challenge, since the validation set is provided separately, 
### there is no need to call this function to distinguish between validation and train set.
# train_df, validate_df = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
# format validation and train datasets so the ImageData column can be passed to a Knn model
train_images = np.vstack(train_df['imageData'].values)
#train_images = np.vstack(train_df['imageData'].values)

In [43]:
# Data augmentation configuration for training
train_datagen = ImageDataGenerator(
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

# Note: No augmentation for validation data, only rescaling
validation_datagen = ImageDataGenerator(rescale=1./255)

# Convert dataframe to a format suitable for the model training
def df_to_dataset(dataframe, datagen, batch_size=32):
    return datagen.flow_from_dataframe(
        dataframe=dataframe,
        directory=image_directory,
        x_col='imageFile',
        y_col='classLabel',
        target_size=(224, 224),
        batch_size=batch_size,
        class_mode='categorical'  # Change this if not a multiclass classification
    )

# Create datasets for training and validation
train_dataset = df_to_dataset(train_df, train_datagen)
validation_dataset = df_to_dataset(validate_df, validation_datagen)
#test_dataset = df_to_dataset(test_df, validation_datagen)

# This setup is now ready for training with model.fit using the train_dataset and validation_dataset

Found 0 validated image filenames belonging to 0 classes.
Found 0 validated image filenames belonging to 0 classes.




In [44]:
# Get one batch of data
images, labels = next(train_dataset)

# Print out the images and labels
print("images:", images)
print("labels:", labels)

images: []
labels: []


In [45]:
# Load the KNN image classifier model
from sklearn.neighbors import KNeighborsClassifier

# Load the image data
X_train = np.array(train_df['imageData'].tolist())
X_test = np.array(df_test['imageData'].tolist())
X_val = np.array(df_val['imageData'].tolist())

# Load the labels
y_train = df_train['classLabel']
y_test = df_test['classLabel']
y_val = df_val['classLabel']

# Flatten the image data
n_samples, nx, ny, nz = X_train.shape
X_train = X_train.reshape((n_samples, nx * ny * nz))

n_samples, nx, ny, nz = X_test.shape
X_test = X_test.reshape((n_samples, nx * ny * nz))

n_samples, nx, ny, nz = X_val.shape
X_val = X_val.reshape((n_samples, nx * ny * nz))

# Create the KNN model
knn = KNeighborsClassifier(n_neighbors=3)

# Fit the model
knn.fit(X_train, y_train)

# Predict the test set
y_pred = knn.predict(X_test)

# Print the accuracy
print("Accuracy:", knn.score(X_test, y_test))

KeyboardInterrupt: 

In [None]:
train_df.dtypes

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

X = train_df['imageData']
y = train_df['classLabel']

# Flatten the images
#X = [x.flatten() for x in X]

# Split the data into training and testing sets
#X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Create and train the KNN model
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

# Evaluate the model
print("Train score:", knn.score(X_train, y_train))
#print("Test score:", knn.score(X_test, y_test))

In [None]:
# Ignore this cause this is what I did to process the images that worked but we have a similar func above

from keras.preprocessing import image
import numpy as np
import os

def load_images_from_directory(directory):
    images = []
    for filename in os.listdir(directory):
        if filename.endswith(".jpg") or filename.endswith(".png"):  # Add or modify if you have other image types
            img_path = os.path.join(directory, filename)
            img = image.load_img(img_path, target_size=(224, 224)) # resize image to 224x224
            img = image.img_to_array(img) # convert image to numpy array
            images.append(img) # add image to the list as an array
    return np.array(images) # convert list of arrays to a single array dimensions: (n_images, 224, 224, 3)

train = load_images_from_directory("/Users/annahauk/Desktop/bttai-nybg-2024/BTTAIxNYBG-train/BTTAIxNYBG-train")