In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import cv2
import glob as gb
import tqdm

In [2]:
# set seed
np.random.seed(0)

In [3]:
# Define paths for training, testing, and prediction data

data_dir = './melanoma_cancer_dataset'
train_data = data_dir + '/train'
test_data = data_dir + '/test'

In [4]:

# Function to load images and labels
def load_data(data_path, max_images=None):
    images = []
    labels = []
    class_ = {'benign': 0, 'malignant': 1}
    for folder in os.listdir(data_path):
        data = gb.glob(pathname=data_path + "/" + folder + '/*.jpg')
        # Initialize a counter to keep track of how many images have been loaded
        counter = 0
        for img_path in data:
            img = cv2.imread(img_path)
            img_resized = cv2.resize(img, (100, 100))
            images.append(img_resized)
            labels.append(class_[folder])
            counter += 1
            # If the counter reaches the max_images limit, break from the loop
            if max_images is not None and counter >= max_images:
                break
    return np.array(images), np.array(labels)


In [5]:
# Load training and test data
# using only small subset of data for now to speed up the process
X_train, y_train = load_data(train_data, 300)
X_test, y_test = load_data(test_data)

print(X_train.shape, y_train.shape)

(600, 100, 100, 3) (600,)


In [6]:
# split the train data into train and validation, shuffle the data before splitting
validation_split = 0.2

# Shuffle the data
shuffled_indices = np.random.permutation(len(X_train))
X_train = X_train[shuffled_indices]
y_train = y_train[shuffled_indices]

# Split the data into training and validation
split_index = int(len(X_train) * (1 - validation_split))
X_train, X_val = X_train[:split_index], X_train[split_index:]
y_train, y_val = y_train[:split_index], y_train[split_index:]

In [8]:
# Flatten and normalize the data
scaler = StandardScaler()
X_train_flat = scaler.fit_transform(X_train.reshape(X_train.shape[0], -1))
X_val_flat = scaler.transform(X_val.reshape(X_val.shape[0], -1))

X_test_flat = scaler.transform(X_test.reshape(X_test.shape[0], -1))

In [9]:
# Train SVM model
linear_svm = SVC(kernel='linear')
linear_svm.fit(X_train_flat, y_train)


In [24]:
# Predict on validation data
y_pred = linear_svm.predict(X_val_flat)


In [25]:
# Evaluate the model
accuracy = accuracy_score(y_val, y_pred)
print('Validation Accuracy:', accuracy)
print(classification_report(y_val, y_pred))


Validation Accuracy: 0.8416666666666667
              precision    recall  f1-score   support

           0       0.74      0.96      0.84        51
           1       0.96      0.75      0.85        69

    accuracy                           0.84       120
   macro avg       0.85      0.86      0.84       120
weighted avg       0.87      0.84      0.84       120



In [26]:
# try RBF kernel
rbf_svm = SVC(kernel='rbf')
rbf_svm.fit(X_train_flat, y_train)


In [27]:
# Predict on validation data
rbf_y_pred = rbf_svm.predict(X_val_flat)

In [34]:
# Evaluate the model
accuracy = accuracy_score(y_val, rbf_y_pred)
print('Validation Accuracy:', accuracy)
print(classification_report(y_val, rbf_y_pred))

Validation Accuracy: 0.8916666666666667
              precision    recall  f1-score   support

           0       0.83      0.94      0.88        51
           1       0.95      0.86      0.90        69

    accuracy                           0.89       120
   macro avg       0.89      0.90      0.89       120
weighted avg       0.90      0.89      0.89       120



Looks like RBF performs better overall. Now try to do a hyper-parameter search to find the best C and gamma value

In [38]:
from sklearn.model_selection import GridSearchCV

C_range = np.logspace(-2, 10, 10)

# Adding 'scale', 'auto'
gamma_range =  np.append(np.logspace(-9, 3, 8), ['scale', 'auto'])

# Define the parameter grid
param_grid = {
    'C': C_range.tolist(),
    'gamma': gamma_range.tolist()
}

print(param_grid)

# Initialize the SVM model
svm_model = SVC(kernel='rbf')

# Initialize the Grid Search model
grid_search = GridSearchCV(svm_model, param_grid, cv=5, scoring='accuracy', verbose=3, n_jobs=-1)

# Fit the model
grid_search.fit(X_train_flat, y_train)

# Best parameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

{'C': [0.01, 0.21544346900318834, 4.6415888336127775, 100.0, 2154.4346900318824, 46415.888336127726, 1000000.0, 21544346.90031878, 464158883.3612773, 10000000000.0], 'gamma': ['1e-09', '5.1794746792312124e-08', '2.6826957952797274e-06', '0.0001389495494373136', '0.007196856730011514', '0.3727593720314938', '19.306977288832456', '1000.0', 'scale', 'auto']}
Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV 5/5] END .................C=0.01, gamma=1e-09;, score=nan total time=   0.4s
[CV 3/5] END .................C=0.01, gamma=1e-09;, score=nan total time=   0.5s
[CV 4/5] END C=0.01, gamma=5.1794746792312124e-08;, score=nan total time=   0.0s
[CV 5/5] END C=0.01, gamma=5.1794746792312124e-08;, score=nan total time=   0.0s
[CV 1/5] END C=0.01, gamma=2.6826957952797274e-06;, score=nan total time=   0.0s
[CV 2/5] END C=0.01, gamma=2.6826957952797274e-06;, score=nan total time=   0.0s
[CV 3/5] END C=0.01, gamma=2.6826957952797274e-06;, score=nan total time=   0.0s
[CV 4/5] END

400 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
35 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/Caskroom/miniconda/base/envs/cs4262/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/Caskroom/miniconda/base/envs/cs4262/lib/python3.10/site-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/usr/local/Caskroom/miniconda/base/envs/cs4262/lib/python3.10/site-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/Caskroom/miniconda/base/envs/cs4262/l

Best parameters: {'C': 100.0, 'gamma': 'auto'}
Best score: 0.8895833333333334


In [11]:
# load the full data
X_train_full, y_train_full = load_data(train_data)

# Flatten and normalize the data
X_train_full_flat = scaler.fit_transform(X_train_full.reshape(X_train_full.shape[0], -1))

In [12]:
C = grid_search.best_params_['C']
# C = 100 # the result from previous grid search
gamma = grid_search.best_params_['gamma']
# gamma = "auto" # the result from previous grid search

# train the model with the best parameters
best_svm = SVC(kernel='rbf', C=C, gamma=gamma, verbose=1)
best_svm.fit(X_train_full_flat, y_train_full)


[LibSVM]...................*........*
optimization finished, #iter = 27688
obj = -20282.301132, rho = 1.669463
nSV = 3138, nBSV = 18
Total nSV = 3138


In [13]:
# evaluate the model on the test data
X_test_flat = scaler.transform(X_test.reshape(X_test.shape[0], -1))
y_pred = best_svm.predict(X_test_flat)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print('Test Accuracy:', accuracy)
print(classification_report(y_test, y_pred))

Test Accuracy: 0.91
              precision    recall  f1-score   support

           0       0.90      0.93      0.91       500
           1       0.92      0.89      0.91       500

    accuracy                           0.91      1000
   macro avg       0.91      0.91      0.91      1000
weighted avg       0.91      0.91      0.91      1000

