In [1]:
from google.colab import drive
drive.mount("/content/drive/")

Mounted at /content/drive/


# Import essential modules

In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from PIL import Image
from multiprocessing import Pool
from tqdm import tqdm

# Important functions

In [3]:
# Function to convert image to feature vector
def image_to_feature_vector(image_path, size=(32, 32)):
    image = Image.open(image_path)
    image = image.resize(size)
    return np.array(image).flatten()

In [4]:
# Parallelized function to load images and extract features
def process_image(row):
    image_path = row['Paths']
    features = image_to_feature_vector(image_path)
    return features

# Create the training and testing set

In [5]:
# Training set
## Get paths of the Training directory
Training = "/content/drive/MyDrive/SkinAI/train"
imgpaths = []
labels =[]
## Convert directories to list
tr_dir = os.listdir(Training)

## Get paths and labels of classes and images in Training directory
for i in tr_dir:
    classpath = os.path.join(Training, i)
    imglist = os.listdir(classpath)
    for img in imglist:
        imgpath = os.path.join(classpath, img)
        imgpaths.append(imgpath)
        labels.append(i)

## Create the data frame
Paths = pd.Series(imgpaths, name = 'Paths')
Labels = pd.Series(labels, name = 'Labels')
Tr_data = pd.concat([Paths, Labels], axis = 1)

## Extract 1000 random samples for the Training set
Tr_data = Tr_data.sample(n=300, random_state=42)

In [6]:
# Use multiprocessing to parallelize feature extraction for training set
with Pool() as pool:
  train_features = pool.map(process_image, Tr_data.to_dict('records'))
X_train = np.array(train_features)
y_train = Tr_data['Labels']

In [7]:
# Testing set
Testing = "/content/drive/MyDrive/SkinAI/test"
imgpaths = []
labels =[]
ts_dir = os.listdir(Testing)

## Get paths and labels of classes and images in Testing directory
for i in ts_dir:
    classpath = os.path.join(Testing, i)
    imglist = os.listdir(classpath)
    for img in imglist:
        imgpath = os.path.join(classpath, img)
        imgpaths.append(imgpath)
        labels.append(i)

## Create the data frame
Paths = pd.Series(imgpaths, name = 'Paths')
Labels = pd.Series(labels, name = 'Labels')
Ts_data = pd.concat([Paths, Labels], axis = 1)

## Extract 300 samples for the Testing set
Ts_data = Ts_data.sample(n=300, random_state=42)

In [8]:
# Use multiprocessing to parallelize feature extraction for testing set
with Pool() as pool:
    test_features = pool.map(process_image, Ts_data.to_dict('records'))
X_test = np.array(test_features)
y_test = Ts_data['Labels']

In [9]:
# Initialize KNN classifier
k = 5
knn_classifier = KNeighborsClassifier(n_neighbors=k)

# Train the classifier
knn_classifier.fit(X_train, y_train)

In [10]:
# Predictions on the test set
y_pred = knn_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.49666666666666665

Classification Report:
                                              precision    recall  f1-score   support

                     Acne and Rosacea Photos       0.68      0.62      0.65       179
Light Diseases and Disorders of Pigmentation       0.12      0.06      0.08        68
         Melanoma Skin Cancer Nevi and Moles       0.33      0.64      0.43        53

                                    accuracy                           0.50       300
                                   macro avg       0.38      0.44      0.39       300
                                weighted avg       0.49      0.50      0.48       300

