# IMA205 Challenge

TODO POSSIBLE IMPROVEMENTS:
*   Stratify train/test in segmentation 
*   Balance the classes
*   Tune hyper-parameters MFSNet
*   Try to fix problem with non-consistent NN size in MFSNet train/test
*   Uploaded trained MFSNet Snapshot to github!
*   More features.
*   Fix folder structure to say 'pred' instead of 'test' in segmentation folder

### Imports

In [67]:
# Standard libraries
import os
import shutil
import sys

# Third-party general utilities
from joblib import Parallel, delayed, parallel_backend
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

# Data manipulation and analysis
import numpy as np
import pandas as pd

# Machine Learning
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC

# Image processing
from skimage import measure
from skimage.io import imread
from skimage.transform import resize
from skimage.feature import graycomatrix, graycoprops
from scipy.signal import convolve2d

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
# Print kernel and libraries versions
print(f'Using Python version: {sys.version}')
print(f'Using Numpy version: {np.__version__}')
print(f'Using Pandas version: {pd.__version__}')
print(f'Using Matplotlib version: {plt.matplotlib.__version__}')
print(f'Using Scikit-learn version: {sklearn.__version__}')
print(f'Using Seaborn version: {sns.__version__}')
print(f'Using Joblib version: {sklearn.__version__}')

Using Python version: 3.12.2 | packaged by Anaconda, Inc. | (main, Feb 27 2024, 17:35:02) [GCC 11.2.0]
Using Numpy version: 1.26.4
Using Pandas version: 2.2.2
Using Matplotlib version: 3.8.4
Using Scikit-learn version: 1.4.2
Using Seaborn version: 0.13.2
Using Joblib version: 1.4.2


### Function definitions

#### Auxiliary functions

In [9]:
def copy_images(source_paths, target_dir):
    """
    Copies images from the source paths to the target directory.
    """
    for src_path in source_paths:
        # Extract the filename from the source path
        filename = os.path.basename(src_path)
        # Define the target path
        target_path = os.path.join(target_dir, filename)
        # Copy the file
        shutil.copy(src_path, target_path)

def dice_coefficient(true_mask, pred_mask):
    true_mask = np.asarray(true_mask).astype(bool)
    pred_mask = np.asarray(pred_mask).astype(bool)
    intersection = np.logical_and(true_mask, pred_mask)
    return 2. * intersection.sum() / (true_mask.sum() + pred_mask.sum())

def iou_coefficient(true_mask, pred_mask):
    true_mask = np.asarray(true_mask).astype(bool)
    pred_mask = np.asarray(pred_mask).astype(bool)
    intersection = np.logical_and(true_mask, pred_mask)
    union = np.logical_or(true_mask, pred_mask)
    return intersection.sum() / union.sum()

def process_image(filename, input_dir, output_dir_img, output_dir_mask, threshold):
    seg_path = os.path.join(input_dir, filename)
    seg = imread(seg_path, as_gray=True)

    subject_no = filename.split(".")[0]
    img_path = os.path.join(output_dir_img, f"{subject_no}.jpg")
    img = imread(img_path)

    seg = resize(seg, img.shape, preserve_range=True)
    seg = (seg > threshold).astype(float)

    seg_filename = subject_no + '_seg.png'
    output_path = os.path.join(output_dir_mask, seg_filename)

    if not os.path.exists(output_path):  # Check if the file already exists
        plt.imsave(output_path, seg, cmap='gray')
    else:
        print(f"File {seg_filename} already exists, skipping.")

#### Assymetry feature calculation

In [10]:
def area_seg(mask, mask_props=None):
    if mask_props is not None:
        return mask_props.area
    else:
        return measure.regionprops(mask.astype(int))[0].area

def perimeter_seg(mask, mask_props=None):
    if mask_props is not None:
        return mask_props.perimeter
    else:
        return measure.regionprops(mask.astype(int))[0].perimeter

def circularity_seg(mask, mask_props=None):
    if mask_props is not None:
        return 4*np.pi*mask_props.area / mask_props.perimeter**2
    else:
        mask_props = measure.regionprops(mask.astype(int))[0]
        area = mask_props.area
        perimeter = mask_props.perimeter
        return 4*np.pi*area / perimeter**2

def bulkiness_seg(mask, mask_props=None):
    if mask_props is not None:
        area = mask_props.area
        elipse_maj_len = mask_props.major_axis_length
        elipse_min_len = mask_props.minor_axis_length
        elipse_area = np.pi * elipse_maj_len * elipse_min_len
        return elipse_area / area
    else:
        mask_props = measure.regionprops(mask.astype(int))[0]
        elipse_maj_len = mask_props.major_axis_length
        elipse_min_len = mask_props.minor_axis_length
        elipse_area = np.pi * elipse_maj_len * elipse_min_len
        return elipse_area / mask_props.area

def solidity_seg(mask, mask_props=None):
    if mask_props is not None:
        return mask_props.solidity
    else:
        return measure.regionprops(mask.astype(int))[0].solidity

def eccentricity_seg(mask, mask_props=None):
    if mask_props is not None:
        return mask_props.eccentricity
    else:
        return measure.regionprops(mask.astype(int))[0].eccentricity

def calc_assymetry_features(mask):
    mask_props = measure.regionprops(mask.astype(int))[0]
    area = area_seg(mask, mask_props)
    perimeter = perimeter_seg(mask, mask_props)
    circ = circularity_seg(mask, mask_props)
    bulk = bulkiness_seg(mask, mask_props)
    solid = solidity_seg(mask, mask_props)
    ecc = eccentricity_seg(mask, mask_props)

    assymetry_features = {
        'AREA' : area,
        'PERIMETER' : perimeter,
        'CIRCULARITY' : circ,
        'BULKINESS' : bulk,
        'SOLIDITY' : solid,
        'ECCENTRICITY' : ecc
    }

    return assymetry_features

#### Border features calculation

In [11]:
def calc_border_features(image, mask):
    # Find the contour of the mask
    border = max(measure.find_contours(mask), key=len)
    # Calculate the mean and std of the gradient in each RGB channel along the contour
    grad_r = np.gradient(image[:,:,0], axis=0)
    grad_g = np.gradient(image[:,:,1], axis=0)
    grad_b = np.gradient(image[:,:,2], axis=0)

    border_grad_r = grad_r[border[:,0].astype(int), border[:,1].astype(int)]
    border_grad_g = grad_g[border[:,0].astype(int), border[:,1].astype(int)]
    border_grad_b = grad_b[border[:,0].astype(int), border[:,1].astype(int)]

    bgrad_r_mean = np.mean(border_grad_r)
    bgrad_r_std = np.std(border_grad_r)
    bgrad_g_mean = np.mean(border_grad_g)
    bgrad_g_std = np.std(border_grad_g)
    bgrad_b_mean = np.mean(border_grad_b)
    bgrad_b_std = np.std(border_grad_b)

    border_features = {
        'BGRAD_R_MEAN' : bgrad_r_mean,
        'BGRAD_R_STD' : bgrad_r_std,
        'BGRAD_G_MEAN' : bgrad_g_mean,
        'BGRAD_G_STD' : bgrad_g_std,
        'BGRAD_B_MEAN' : bgrad_b_mean,
        'BGRAD_B_STD' : bgrad_b_std
    }

    return border_features

#### Colour features calculation

In [12]:
def calc_colour_features(image, mask):
    # Crop the image to the mask
    image_cropped = image * mask[:,:,np.newaxis]
    # Calculate the mean for each RGB channel
    mean_r = np.mean(image_cropped[:,:,0])
    mean_g = np.mean(image_cropped[:,:,1])
    mean_b = np.mean(image_cropped[:,:,2])
    # Calculate the standard deviation for each RGB channel
    std_r = np.std(image_cropped[:,:,0])
    std_g = np.std(image_cropped[:,:,1])
    std_b = np.std(image_cropped[:,:,2])

    color_features = {
        'MEAN_R' : mean_r,
        'MEAN_G' : mean_g,
        'MEAN_B' : mean_b,
        'STD_R' : std_r,
        'STD_G' : std_g,
        'STD_B' : std_b
    }

    return color_features

#### Dermatoscopic features calculation

In [40]:
def calc_glcm_features(image_cropped):
    # Convert the image to greyscale
    image_gray = np.mean(image_cropped, axis=2)
    # Convert image to unsigned integer if necessary
    if image_gray.dtype == np.float64:
        image_gray = (image_gray * 255).astype(np.uint8)
    # Define the distances and angles for the GLCM
    distances = [1, 2, 3]
    angles = [0, np.pi/4, np.pi/2, 3*np.pi/4]
    # Compute the GLCM
    glcm = graycomatrix(image_gray, distances=distances, angles=angles, symmetric=True, normed=True)
    # Compute the GLCM properties
    glcm_props = ['contrast', 'dissimilarity', 'homogeneity', 'energy', 'correlation', 'ASM']
    glcm_props_name = ['GLCM_CONTRAST', 'GLCM_DISS', 'GLCM_HOMO', 'GLCM_ENERGY', 'GLCM_CORR', 'GLCM_ASM']
    glcm_features = {}
    for i, prop in enumerate(glcm_props):
        name = glcm_props_name[i]
        glcm_features[name] = graycoprops(glcm, prop).flatten()

    return glcm_features

def calc_weber(image_cropped):
    # Convert the image to greyscale
    image_gray = np.mean(image_cropped, axis=2)
    # Apply the Weber descriptor
    image_gray = image_gray.astype(np.float64)
    image_gray[image_gray==0] = np.finfo(float).eps
    neighbours_filter = np.array([
        [1,1,1],
        [1,0,1],
        [1,1,1]
    ])
    convolved = convolve2d(image_gray,neighbours_filter, mode='same')
    weber_descriptor = convolved-8*image_gray
    weber_descriptor = weber_descriptor/image_gray
    weber_descriptor = np.arctan(weber_descriptor)

    return weber_descriptor

def calc_texture_features(image, mask):
    # Crop the image to the mask
    image_cropped = image * mask[:,:,np.newaxis]
    # Calculate the Weber descriptor
    weber_descriptor = calc_weber(image_cropped)
    # Calculate the GLCM features
    glcm_features = calc_glcm_features(image_cropped)

    texture_features = {}

    # Flatten GLCM features
    for key, array in glcm_features.items():
        for index, value in enumerate(array):
            flat_key = f'{key}_{index+1}'
            texture_features[flat_key] = value

    # Calculate the mean and std of the Weber descriptor
    weber_mean = np.mean(weber_descriptor)
    weber_std = np.std(weber_descriptor)

    texture_features['WEBER_MEAN'] = weber_mean
    texture_features['WEBER_STD'] = weber_std

    return texture_features

### Hyper-parameters

In [15]:
TRAIN_DIR = './Train'
TEST_DIR = './Test'
TRAIN_IMG_DIR = './Train/TrainImages'
TEST_IMG_DIR = './Test/TestImages'
SEG_DIR = './Segmentation_MFSNet'

THRESH_SEG = 127

### Segmentation

Now we need to segment all the images in our dataset.

In [9]:
# Check which subjects have segmentation images

# Names of the files in the train data
train_filenames = os.listdir(TRAIN_IMG_DIR)
# Names of the files in the test data
test_filenames = os.listdir(TEST_IMG_DIR)

# Dictionary to store whether the subject has a segmentation image or not
train_subject_has_seg= {}
for filename in train_filenames:
    # Checking if the file is a segmentation image or not
    has_seg = "_seg" in filename
    # Extracting the subject number, considering whether it has '_seg' in it or not
    if has_seg:
        subject_no = 'ISIC_' + filename.split("_")[1]
    else:
        subject_no = 'ISIC_' + filename.split(".")[0].split("_")[1]
    # Updating the dictionary 
    train_subject_has_seg[subject_no] = train_subject_has_seg.get(subject_no, False) or has_seg

test_subject_has_seg = {}
for filename in test_filenames:
    # Checking if the file is a segmentation image or not
    has_seg = "_seg" in filename
    # Extracting the subject number, considering whether it has '_seg' in it or not
    if has_seg:
        subject_no = 'ISIC_' + filename.split("_")[1]
    else:
        subject_no = 'ISIC_' + filename.split(".")[0].split("_")[1]
    # Updating the dictionary
    test_subject_has_seg[subject_no] = test_subject_has_seg.get(subject_no, False) or has_seg

# Print the first elements of the dictionaries
print(f"Train subjects with segmenation: {list(train_subject_has_seg.items())[:5]}...")
print(f"Test subjects with segmenation: {list(test_subject_has_seg.items())[:5]}...")

# Print percentage of subjects with segmentation
print(f"Percentage of train subjects with segmentation: {sum(train_subject_has_seg.values())/len(train_subject_has_seg)*100:.2f}%")
print(f"Percentage of test subjects with segmentation: {sum(test_subject_has_seg.values())/len(test_subject_has_seg)*100:.2f}%")

Train subjects with segmenation: [('ISIC_0000000', True), ('ISIC_0000001', True), ('ISIC_0000002', False), ('ISIC_0000003', True), ('ISIC_0000004', True)]...
Test subjects with segmenation: [('ISIC_0000006', True), ('ISIC_0000011', True), ('ISIC_0000014', True), ('ISIC_0000018', True), ('ISIC_0000022', True)]...
Percentage of train subjects with segmentation: 10.24%
Percentage of test subjects with segmentation: 10.23%


In [10]:
# Create the directories if they don't exist
os.makedirs(os.path.join(TRAIN_DIR, 'TrainMasks'), exist_ok=True)
os.makedirs(os.path.join(TRAIN_DIR, 'TrainImagesNoSeg'), exist_ok=True)
os.makedirs(os.path.join(TEST_DIR, 'TestMasks'), exist_ok=True)
os.makedirs(os.path.join(TEST_DIR, 'TestImagesNoSeg'), exist_ok=True)

# In Train/TrainImages, copy every segmenation image to Train/TrainMasks
for filename in train_filenames:
    if "_seg" in filename:
        shutil.copy(os.path.join(TRAIN_IMG_DIR, filename), os.path.join(TRAIN_DIR, 'TrainMasks', filename))

# In Test/TestImages, copy every segmenation image to Test/TestMasks
for filename in test_filenames:
    if "_seg" in filename:
        shutil.copy(os.path.join(TEST_IMG_DIR, filename), os.path.join(TEST_DIR, 'TestMasks', filename))

# In Train/TrainImages, copy every image which doesn't have a segmentation image to Train/TrainImagesNoSeg
for filename in train_filenames:
    if "_seg" not in filename:
        subject_no = 'ISIC_' + filename.split(".")[0].split("_")[1]
        if not train_subject_has_seg[subject_no]:
            shutil.copy(os.path.join(TRAIN_IMG_DIR, filename), os.path.join(TRAIN_DIR, 'TrainImagesNoSeg', filename))

# In Test/TestImages, copy every image which doesn't have a segmentation image to Test/TestImagesNoSeg
for filename in test_filenames:
    if "_seg" not in filename:
        subject_no = 'ISIC_' + filename.split(".")[0].split("_")[1]
        if not test_subject_has_seg[subject_no]:
            shutil.copy(os.path.join(TEST_IMG_DIR, filename), os.path.join(TEST_DIR, 'TestImagesNoSeg', filename))

In [11]:
# Create the directories
# Create the train directory
os.makedirs(os.path.join(SEG_DIR, 'data', 'train', 'images'), exist_ok=True)
os.makedirs(os.path.join(SEG_DIR, 'data', 'train', 'masks'), exist_ok=True)
# Create the test directory
os.makedirs(os.path.join(SEG_DIR, 'data', 'test', 'images'), exist_ok=True)
os.makedirs(os.path.join(SEG_DIR, 'data', 'test', 'masks'), exist_ok=True)

# Copy the images to the MFSNet directory

X_seg = []
y_seg = []

for directory in [TRAIN_IMG_DIR, TEST_IMG_DIR]:
    # List all files in the directory
    files = os.listdir(directory)
    # Sort files for consistent order
    files.sort()
    # Iterate over the files
    for file in files:
        # Check if the segmentation file exists
        base_name, ext = os.path.splitext(file)
        seg_file = f"{base_name}_seg.png"
        if seg_file in files:
            # Add the image and its segmentation to the lists
            X_seg.append(os.path.join(directory, file))
            y_seg.append(os.path.join(directory, seg_file))

print(f"Number of images with segmentation: {len(X_seg)}")

Number of images with segmentation: 2593


In [17]:
# Copy every file in X_seg to the MFSNet directory
for file in X_seg:
    shutil.copy(file, os.path.join(SEG_DIR, 'data', 'train', 'images', os.path.basename(file)))

# Copy every file in y_seg to the MFSNet directory
# Remove the '_seg' part of the filename
for file in y_seg:
    file_savename = os.path.basename(file).replace("_seg", "")
    shutil.copy(file, os.path.join(SEG_DIR, 'data', 'train', 'masks', file_savename))

In [24]:
os.makedirs(os.path.join(SEG_DIR, 'data', 'test', 'images', 'train'), exist_ok=True)
os.makedirs(os.path.join(SEG_DIR, 'data', 'test', 'images', 'test'), exist_ok=True)
os.makedirs(os.path.join(SEG_DIR, 'data', 'test', 'masks', 'train'), exist_ok=True)
os.makedirs(os.path.join(SEG_DIR, 'data', 'test', 'masks', 'test'), exist_ok=True)

In [23]:
# Copy every file in Train/TrainImagesNoSeg to the MFSNet directory
for file in os.listdir(os.path.join(TRAIN_DIR, 'TrainImagesNoSeg')):
    shutil.copy(os.path.join(TRAIN_DIR, 'TrainImagesNoSeg', file), 
                os.path.join(SEG_DIR, 'data', 'test', 'images', 'train', file))

# Copy every file in Test/TestImagesNoSeg to the MFSNet directory
for file in os.listdir(os.path.join(TEST_DIR, 'TestImagesNoSeg')):
    shutil.copy(os.path.join(TEST_DIR, 'TestImagesNoSeg', file), 
                os.path.join(SEG_DIR, 'data', 'test', 'images', 'test', file))

**Steps:**

Copy the files to the server via SSH:

scp -r .\Segmentation_MFSNet\ login@gpuX.enst.fr:/home/infres/login/path

Perform the inpainting:

python inpaint.py --root "data/train/images/" --destination "data/train/images/"

Train:

python train.py --train_path "data/train"

Hyper-parameters:

```
--epoch: Number of epochs of training. Default = 100
--lr: Learning Rate. Default = 1e-4
--batchsize: Batch Size. Default = 20
--trainsize: Size of Training images (to be resized). Default = 352
--clip: Gradient Clipping Margin. Default = 0.5
--decay_rate: Learning rate decay. Default = 0.05
--decay_epoch: Number of epochs after which Learning Rate needs to decay. Default = 25
```

Predictions:

python test.py --data_path "data/test/images/train/" --save_path "data/test/masks/train/"

python test.py --data_path "data/test/images/test/" --save_path "data/test/masks/test/"

Copy the files back using SCP.

After we sent the images to the GPU and got a prediction using MSFNet:


In [32]:
# Directory paths for training
train_seg_dir = os.path.join(SEG_DIR, 'data', 'test', 'masks', 'train')
train_dir_img = os.path.join(TRAIN_DIR, 'TrainImagesNoSeg')
train_dir_mask = os.path.join(TRAIN_DIR, 'TrainMasks')

# Directory paths for testing
test_seg_dir = os.path.join(SEG_DIR, 'data', 'test', 'masks', 'test')
test_dir_img = os.path.join(TEST_DIR, 'TestImagesNoSeg')
test_dir_mask = os.path.join(TEST_DIR, 'TestMasks')

# Using ThreadPoolExecutor to parallelize image processing for training
with ThreadPoolExecutor() as executor:
    list(executor.map(lambda x: process_image(x, train_seg_dir, train_dir_img, train_dir_mask, THRESH_SEG), os.listdir(train_seg_dir)))

# Using ThreadPoolExecutor to parallelize image processing for testing
with ThreadPoolExecutor() as executor:
    list(executor.map(lambda x: process_image(x, test_seg_dir, test_dir_img, test_dir_mask, THRESH_SEG), os.listdir(test_seg_dir)))

File ISIC_0000452_seg.png already exists, skipping.
File ISIC_0000468_seg.png already exists, skipping.
File ISIC_0000475_seg.png already exists, skipping.
File ISIC_0000114_seg.png already exists, skipping.
File ISIC_0000002_seg.png already exists, skipping.
File ISIC_0000478_seg.png already exists, skipping.
File ISIC_0000516_seg.png already exists, skipping.
File ISIC_0000010_seg.png already exists, skipping.
File ISIC_0000333_seg.png already exists, skipping.
File ISIC_0000369_seg.png already exists, skipping.
File ISIC_0000285_seg.png already exists, skipping.
File ISIC_0000289_seg.png already exists, skipping.
File ISIC_0000467_seg.png already exists, skipping.
File ISIC_0000302_seg.png already exists, skipping.
File ISIC_0000347_seg.png already exists, skipping.
File ISIC_0000174_seg.png already exists, skipping.
File ISIC_0000445_seg.png already exists, skipping.
File ISIC_0000237_seg.png already exists, skipping.
File ISIC_0006326_seg.png already exists, skipping.
File ISIC_00

In [34]:
# mask_folder_seg = os.path.join(SEG_DIR, 'data', 'test', 'masks')
# prediction_folder_seg = os.path.join(SEG_DIR, 'data', 'test', 'predictions')

# dice_scores_seg = []
# iou_scores_seg = []

# for mask_filename in os.listdir(mask_folder_seg):
#     if mask_filename.endswith('_seg.png'):
#         ID = mask_filename.split('_seg.png')[0]
#         prediction_filename = ID + '.png'
#         prediction_path = os.path.join(prediction_folder_seg, prediction_filename)
        
#         mask = imread(os.path.join(mask_folder_seg, mask_filename))
#         prediction = imread(prediction_path)
        
#         prediction_resized = resize(prediction, mask.shape, preserve_range=True)
#         prediction_binary = (prediction_resized > THRESH_SEG).astype(int)
        
#         dice_score = dice_coefficient(mask, prediction_binary)
#         dice_scores_seg.append(dice_score)
        
#         iou_score = iou_coefficient(mask, prediction_binary)
#         iou_scores_seg.append(iou_score)

# mean_dice_score = np.mean(dice_scores_seg)
# mean_iou_score = np.mean(iou_scores_seg)

# print(f'Mean Dice score: {mean_dice_score:.4f}')
# print(f'Mean IoU score: {mean_iou_score:.4f}')
# print(f'Minimum Dice score: {np.min(dice_scores_seg):.4f}')
# print(f'Minimum IoU score: {np.min(iou_scores_seg):.4f}')

In [None]:
# # Choose an example
# ex_idx = 0

# # Show an example of mask and prediction
# ex_mask_filename = os.listdir(mask_folder_seg)[ex_idx]
# ex_ID = ex_mask_filename.split('_seg.png')[0]
# ex_prediction_filename = ex_ID + '.png'
# ex_mask = imread(os.path.join(mask_folder_seg, ex_mask_filename))
# ex_prediction = imread(os.path.join(prediction_folder_seg, ex_prediction_filename))

# # Resize the prediction to the mask size
# ex_prediction = resize(ex_prediction, ex_mask.shape, preserve_range=True)

# # Threshold the resized prediction
# ex_prediction = (ex_prediction > THRESH_SEG).astype(int)

# # Plot the images
# fig, ax = plt.subplots(1, 2, figsize=(15, 5))
# ax[0].imshow(ex_mask, cmap='gray')
# ax[0].set_title('Mask')
# ax[1].imshow(ex_prediction, cmap='gray')
# ax[1].set_title('Prediction')
# plt.show()

### Feature extraction

ABCD represent the asymmetry, border structure, variegated color, and dermatoscopical structures.


In [41]:
def calc_features(ID, img_dir, mask_dir):
    # Load the image and the mask
    img = imread(os.path.join(img_dir, f"{ID}.jpg"))
    mask = imread(os.path.join(mask_dir, f"{ID}_seg.png"), as_gray=True)
    # Calculate the features
    assymetry_features = calc_assymetry_features(mask)
    border_features = calc_border_features(img, mask)
    color_features = calc_colour_features(img, mask)
    texture_features = calc_texture_features(img, mask)
    # Combine the features
    features = {"ID":ID, **assymetry_features, **border_features, **color_features, **texture_features}
    return features

def calc_features_parallel(IDs, img_dir, mask_dir):
    features = Parallel(n_jobs=-1)(delayed(calc_features)(ID, img_dir, mask_dir) for ID in tqdm(IDs))
    return features

# Test the function
ID = 'ISIC_0000000'
img_dir = os.path.join(TRAIN_DIR, 'TrainImages')
mask_dir = os.path.join(TRAIN_DIR, 'TrainMasks')
features = calc_features(ID, img_dir, mask_dir)
features

{'ID': 'ISIC_0000000',
 'AREA': 92002.0,
 'PERIMETER': 1345.4011537017761,
 'CIRCULARITY': 0.6387097428649782,
 'BULKINESS': 4.079485210445974,
 'SOLIDITY': 0.9680243263433675,
 'ECCENTRICITY': 0.7569104191424366,
 'BGRAD_R_MEAN': 0.15205047318611986,
 'BGRAD_R_STD': 1.987318761208477,
 'BGRAD_G_MEAN': 0.09179810725552051,
 'BGRAD_G_STD': 3.044298946030759,
 'BGRAD_B_MEAN': 0.20630914826498423,
 'BGRAD_B_STD': 4.456097577535267,
 'MEAN_R': 79.8345819063927,
 'MEAN_G': 83.52720360404436,
 'MEAN_B': 80.9101078359426,
 'STD_R': 88.54884937945499,
 'STD_G': 93.00120924178132,
 'STD_B': 91.17605941530314,
 'GLCM_CONTRAST_1': 1905.6018280227993,
 'GLCM_CONTRAST_2': 2843.102994931108,
 'GLCM_CONTRAST_3': 2170.164567504525,
 'GLCM_CONTRAST_4': 2779.4636256587205,
 'GLCM_CONTRAST_5': 2935.477621563422,
 'GLCM_CONTRAST_6': 2843.102994931108,
 'GLCM_CONTRAST_7': 3210.271718528031,
 'GLCM_CONTRAST_8': 2779.4636256587205,
 'GLCM_CONTRAST_9': 3558.351829068622,
 'GLCM_CONTRAST_10': 3856.315823037595

In [43]:
# Load the train data into a pandas dataframe
# Data is in csv format in metadataTrain.csv
train_metadata_orig = pd.read_csv(os.path.join(TRAIN_DIR, 'metadataTrain.csv'))
train_metadata_orig

Unnamed: 0,ID,CLASS,SEX,AGE,POSITION
0,ISIC_0028766,2,male,30.0,
1,ISIC_0071222,8,male,85.0,lower extremity
2,ISIC_0069434,3,male,85.0,head/neck
3,ISIC_0062098,1,male,55.0,head/neck
4,ISIC_0057224,8,female,45.0,lower extremity
...,...,...,...,...,...
18993,ISIC_0028726,2,male,40.0,posterior torso
18994,ISIC_0033160,2,male,30.0,anterior torso
18995,ISIC_0030034,1,female,50.0,posterior torso
18996,ISIC_0027095,2,male,50.0,lower extremity


In [45]:
# Check for missing values
missing_values = train_metadata_orig.isnull().sum()
missing_values

ID             0
CLASS          0
SEX          284
AGE          324
POSITION    1970
dtype: int64

In [46]:
# Remove the rows with missing values
# Create a new dataframe without the rows with missing values
# train_metadata = train_metadata_orig.dropna()
# print(f"Number of rows dropped: {len(train_metadata_orig) - len(train_metadata)}")

# We'll fill only the missing "AGE" values with the mean
mean_age = train_metadata_orig['AGE'].mean()
train_metadata = train_metadata_orig.fillna({'AGE': mean_age})
train_metadata = train_metadata.fillna({'POSITION': 'unknown', 'SEX': 'unknown'})
train_metadata

Unnamed: 0,ID,CLASS,SEX,AGE,POSITION
0,ISIC_0028766,2,male,30.0,unknown
1,ISIC_0071222,8,male,85.0,lower extremity
2,ISIC_0069434,3,male,85.0,head/neck
3,ISIC_0062098,1,male,55.0,head/neck
4,ISIC_0057224,8,female,45.0,lower extremity
...,...,...,...,...,...
18993,ISIC_0028726,2,male,40.0,posterior torso
18994,ISIC_0033160,2,male,30.0,anterior torso
18995,ISIC_0030034,1,female,50.0,posterior torso
18996,ISIC_0027095,2,male,50.0,lower extremity


In [47]:
# Check for missing values
missing_values = train_metadata.isnull().sum()
missing_values

ID          0
CLASS       0
SEX         0
AGE         0
POSITION    0
dtype: int64

In [48]:
# Extract the CLASS column and drop it from the dataframe
y_train = train_metadata['CLASS'].to_numpy().astype(int)
train_metadata = train_metadata.drop(columns=['CLASS'])

In [49]:
sex_index = train_metadata.columns.get_loc("SEX")
position_index = train_metadata.columns.get_loc("POSITION")
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(sparse_output=False), 
                                      [sex_index, position_index])], 
                                      remainder='passthrough')

# Fit and transform the data
train_metadata_encoded = ct.fit_transform(train_metadata)
column_names = ct.get_feature_names_out()
train_metadata_encoded = pd.DataFrame(train_metadata_encoded, columns=column_names)
train_metadata_encoded = train_metadata_encoded.rename(columns={'remainder__ID': 'ID'})
train_metadata_encoded

Unnamed: 0,encoder__SEX_female,encoder__SEX_male,encoder__SEX_unknown,encoder__POSITION_anterior torso,encoder__POSITION_head/neck,encoder__POSITION_lateral torso,encoder__POSITION_lower extremity,encoder__POSITION_oral/genital,encoder__POSITION_palms/soles,encoder__POSITION_posterior torso,encoder__POSITION_unknown,encoder__POSITION_upper extremity,ID,remainder__AGE
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,ISIC_0028766,30.0
1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,ISIC_0071222,85.0
2,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ISIC_0069434,85.0
3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ISIC_0062098,55.0
4,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,ISIC_0057224,45.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18993,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,ISIC_0028726,40.0
18994,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ISIC_0033160,30.0
18995,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,ISIC_0030034,50.0
18996,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,ISIC_0027095,50.0


In [50]:
# Extract the IDs of the images
train_IDs = train_metadata['ID'].values

# Calculate the features for the train data
train_features = calc_features_parallel(train_IDs, os.path.join(TRAIN_DIR, 'TrainImages'), os.path.join(TRAIN_DIR, 'TrainMasks'))

100%|██████████| 18998/18998 [02:21<00:00, 134.08it/s]


In [51]:
# Create a train dataframe with the features
train_df = pd.DataFrame(train_features)
train_df

Unnamed: 0,ID,AREA,PERIMETER,CIRCULARITY,BULKINESS,SOLIDITY,ECCENTRICITY,BGRAD_R_MEAN,BGRAD_R_STD,BGRAD_G_MEAN,...,GLCM_ASM_5,GLCM_ASM_6,GLCM_ASM_7,GLCM_ASM_8,GLCM_ASM_9,GLCM_ASM_10,GLCM_ASM_11,GLCM_ASM_12,WEBER_MEAN,WEBER_STD
0,ISIC_0028766,3209.0,228.024387,0.775563,4.114408,0.956198,0.829791,0.050562,4.462733,-0.217228,...,0.901920,0.903516,0.903466,0.903573,0.899269,0.899806,0.901588,0.899921,-0.014390,0.223347
1,ISIC_0071222,23956.0,595.612265,0.848588,4.031006,0.982689,0.570555,-0.012960,1.075668,0.137108,...,0.822466,0.823469,0.822994,0.823538,0.820784,0.821064,0.821542,0.821231,-0.008255,0.150911
2,ISIC_0069434,34095.0,829.595021,0.622541,4.493432,0.890558,0.749933,-0.104945,1.630495,0.242180,...,0.753612,0.753665,0.752047,0.753812,0.751888,0.750296,0.749618,0.750610,-0.007734,0.164769
3,ISIC_0062098,16115.0,490.031529,0.843319,4.031188,0.976903,0.451376,0.124361,5.328305,0.082624,...,0.878383,0.878878,0.878261,0.878900,0.877159,0.876898,0.876944,0.876927,-0.008086,0.153020
4,ISIC_0057224,663.0,155.468037,0.344700,6.664345,0.631429,0.953729,-0.005291,1.708204,0.216931,...,0.994149,0.994355,0.994325,0.994478,0.993748,0.993735,0.994000,0.993996,-0.009373,0.121689
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18993,ISIC_0028726,15924.0,509.688384,0.770287,4.017914,0.974183,0.812196,0.542693,2.404289,0.312808,...,0.576115,0.575438,0.572168,0.576618,0.572252,0.566952,0.566344,0.569197,-0.013199,0.255843
18994,ISIC_0033160,8593.0,372.391919,0.778671,4.020315,0.977477,0.821906,0.370203,3.190628,0.497743,...,0.755774,0.755030,0.752591,0.756224,0.752774,0.748417,0.748201,0.750814,-0.013948,0.236554
18995,ISIC_0030034,26542.0,683.251875,0.714466,4.159736,0.957711,0.796720,-0.034829,1.919183,-0.006494,...,0.358663,0.360303,0.355496,0.358725,0.353796,0.352216,0.349218,0.349062,-0.010869,0.281630
18996,ISIC_0027095,9847.0,409.747258,0.737024,4.040837,0.968240,0.851763,0.043434,2.658820,0.393939,...,0.720315,0.722568,0.722406,0.723738,0.715647,0.715337,0.718733,0.717834,-0.013474,0.241862


In [52]:
# Add the metadata to the train dataframe
train_df_merged = pd.merge(train_df, train_metadata_encoded, on='ID')
train_df_merged

Unnamed: 0,ID,AREA,PERIMETER,CIRCULARITY,BULKINESS,SOLIDITY,ECCENTRICITY,BGRAD_R_MEAN,BGRAD_R_STD,BGRAD_G_MEAN,...,encoder__POSITION_anterior torso,encoder__POSITION_head/neck,encoder__POSITION_lateral torso,encoder__POSITION_lower extremity,encoder__POSITION_oral/genital,encoder__POSITION_palms/soles,encoder__POSITION_posterior torso,encoder__POSITION_unknown,encoder__POSITION_upper extremity,remainder__AGE
0,ISIC_0028766,3209.0,228.024387,0.775563,4.114408,0.956198,0.829791,0.050562,4.462733,-0.217228,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,30.0
1,ISIC_0071222,23956.0,595.612265,0.848588,4.031006,0.982689,0.570555,-0.012960,1.075668,0.137108,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,85.0
2,ISIC_0069434,34095.0,829.595021,0.622541,4.493432,0.890558,0.749933,-0.104945,1.630495,0.242180,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,85.0
3,ISIC_0062098,16115.0,490.031529,0.843319,4.031188,0.976903,0.451376,0.124361,5.328305,0.082624,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,55.0
4,ISIC_0057224,663.0,155.468037,0.344700,6.664345,0.631429,0.953729,-0.005291,1.708204,0.216931,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,45.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18993,ISIC_0028726,15924.0,509.688384,0.770287,4.017914,0.974183,0.812196,0.542693,2.404289,0.312808,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,40.0
18994,ISIC_0033160,8593.0,372.391919,0.778671,4.020315,0.977477,0.821906,0.370203,3.190628,0.497743,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0
18995,ISIC_0030034,26542.0,683.251875,0.714466,4.159736,0.957711,0.796720,-0.034829,1.919183,-0.006494,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,50.0
18996,ISIC_0027095,9847.0,409.747258,0.737024,4.040837,0.968240,0.851763,0.043434,2.658820,0.393939,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,50.0


In [53]:
# Drop the columnns corresponding to one-hot encoded unknown values
train_df_merged = train_df_merged.drop(columns=['encoder__POSITION_unknown', 'encoder__SEX_unknown'])

In [54]:
# Scale the features
scaler = StandardScaler()
train_df_scaled = scaler.fit_transform(train_df_merged.drop(columns=['ID']))
train_df_scaled = pd.DataFrame(train_df_scaled, columns=train_df_merged.columns[1:])
train_df_scaled['ID'] = train_df_merged['ID']
train_df_scaled

Unnamed: 0,AREA,PERIMETER,CIRCULARITY,BULKINESS,SOLIDITY,ECCENTRICITY,BGRAD_R_MEAN,BGRAD_R_STD,BGRAD_G_MEAN,BGRAD_G_STD,...,encoder__POSITION_anterior torso,encoder__POSITION_head/neck,encoder__POSITION_lateral torso,encoder__POSITION_lower extremity,encoder__POSITION_oral/genital,encoder__POSITION_palms/soles,encoder__POSITION_posterior torso,encoder__POSITION_upper extremity,remainder__AGE,ID
0,-0.885894,-1.206715,0.455835,-0.045869,0.308294,1.251235,0.096181,0.859086,-0.383293,0.748748,...,-0.613407,-0.470805,-0.047071,-0.494928,-0.049266,-0.12494,-0.350826,-0.360685,-1.337433,ISIC_0028766
1,-0.490389,-0.512278,0.887260,-0.052733,0.545368,-0.384853,-0.018345,-0.830774,0.222550,-0.736776,...,-0.613407,-0.470805,-0.047071,2.020494,-0.049266,-0.12494,-0.350826,-0.360685,1.728551,ISIC_0071222
2,-0.297107,-0.070243,-0.448198,-0.014678,-0.279127,0.747236,-0.184185,-0.553962,0.402201,-0.518487,...,-0.613407,2.124020,-0.047071,-0.494928,-0.049266,-0.12494,-0.350826,-0.360685,1.728551,ISIC_0069434
3,-0.639864,-0.711738,0.856133,-0.052718,0.493590,-1.137009,0.229236,1.290933,0.129393,1.156363,...,-0.613407,2.124020,-0.047071,-0.494928,-0.049266,-0.12494,-0.350826,-0.360685,0.056196,ISIC_0062098
4,-0.934429,-1.343787,-2.089654,0.163974,-2.598097,2.033431,-0.004517,-0.515192,0.359032,-0.618724,...,-0.613407,-0.470805,-0.047071,2.020494,-0.049266,-0.12494,-0.350826,-0.360685,-0.501256,ISIC_0057224
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18993,-0.643505,-0.674603,0.424666,-0.053810,0.469247,1.140192,0.983456,-0.167904,0.522961,-0.196323,...,-0.613407,-0.470805,-0.047071,-0.494928,-0.049266,-0.12494,2.850417,-0.360685,-0.779982,ISIC_0028726
18994,-0.783257,-0.933980,0.474200,-0.053613,0.498722,1.201474,0.672470,0.224413,0.839162,1.110293,...,1.630240,-0.470805,-0.047071,-0.494928,-0.049266,-0.12494,-0.350826,-0.360685,-1.337433,ISIC_0033160
18995,-0.441091,-0.346711,0.094885,-0.042139,0.321834,1.042518,-0.057772,-0.409931,-0.022979,-0.329588,...,-0.613407,-0.470805,-0.047071,-0.494928,-0.049266,-0.12494,2.850417,-0.360685,-0.222530,ISIC_0030034
18996,-0.759352,-0.863409,0.228153,-0.051924,0.416059,1.389906,0.083331,-0.040914,0.661680,0.132585,...,-0.613407,-0.470805,-0.047071,2.020494,-0.049266,-0.12494,-0.350826,-0.360685,-0.222530,ISIC_0027095


In [55]:
# Now we need to predict the class of the test data
# Load the test data into a pandas dataframe
# Data is in csv format in metadataTest.csv
test_metadata_orig = pd.read_csv(os.path.join(TEST_DIR, 'metadataTest.csv'))
test_metadata_orig

Unnamed: 0,ID,SEX,AGE,POSITION
0,ISIC_0055289,female,50.0,lower extremity
1,ISIC_0062682,male,65.0,lower extremity
2,ISIC_0057706,female,75.0,
3,ISIC_0031702,male,70.0,head/neck
4,ISIC_0069272,male,60.0,lower extremity
...,...,...,...,...
6328,ISIC_0028560,male,85.0,upper extremity
6329,ISIC_0068188,female,75.0,upper extremity
6330,ISIC_0072611,female,50.0,palms/soles
6331,ISIC_0032918,male,50.0,head/neck


In [56]:
# Check for missing values
missing_values = test_metadata_orig.isnull().sum()
missing_values

ID            0
SEX         100
AGE         113
POSITION    661
dtype: int64

In [57]:
# We'll fill only the missing "AGE" values with the mean
mean_age = test_metadata_orig['AGE'].mean()
test_metadata = test_metadata_orig.fillna({'AGE': mean_age})
test_metadata = test_metadata.fillna({'POSITION': 'unknown', 'SEX': 'unknown'})
test_metadata

Unnamed: 0,ID,SEX,AGE,POSITION
0,ISIC_0055289,female,50.0,lower extremity
1,ISIC_0062682,male,65.0,lower extremity
2,ISIC_0057706,female,75.0,unknown
3,ISIC_0031702,male,70.0,head/neck
4,ISIC_0069272,male,60.0,lower extremity
...,...,...,...,...
6328,ISIC_0028560,male,85.0,upper extremity
6329,ISIC_0068188,female,75.0,upper extremity
6330,ISIC_0072611,female,50.0,palms/soles
6331,ISIC_0032918,male,50.0,head/neck


In [58]:
# Check for missing values
missing_values = test_metadata.isnull().sum()
missing_values

ID          0
SEX         0
AGE         0
POSITION    0
dtype: int64

In [59]:
# Encode the metadata
test_metadata_encoded = ct.transform(test_metadata)
test_metadata_encoded = pd.DataFrame(test_metadata_encoded, columns=column_names)
test_metadata_encoded = test_metadata_encoded.rename(columns={'remainder__ID': 'ID'})
test_metadata_encoded

Unnamed: 0,encoder__SEX_female,encoder__SEX_male,encoder__SEX_unknown,encoder__POSITION_anterior torso,encoder__POSITION_head/neck,encoder__POSITION_lateral torso,encoder__POSITION_lower extremity,encoder__POSITION_oral/genital,encoder__POSITION_palms/soles,encoder__POSITION_posterior torso,encoder__POSITION_unknown,encoder__POSITION_upper extremity,ID,remainder__AGE
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,ISIC_0055289,50.0
1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,ISIC_0062682,65.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,ISIC_0057706,75.0
3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ISIC_0031702,70.0
4,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,ISIC_0069272,60.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6328,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,ISIC_0028560,85.0
6329,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,ISIC_0068188,75.0
6330,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,ISIC_0072611,50.0
6331,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ISIC_0032918,50.0


In [60]:
# Drop the columnns corresponding to one-hot encoded unknown values
test_metadata_encoded = test_metadata_encoded.drop(columns=['encoder__POSITION_unknown', 'encoder__SEX_unknown'])

# Extract the IDs of the images
test_IDs = test_metadata['ID'].values

# Calculate the features for the test data
test_features = calc_features_parallel(test_IDs, os.path.join(TEST_DIR, 'TestImages'), os.path.join(TEST_DIR, 'TestMasks'))

# Create a test dataframe with the features
test_df = pd.DataFrame(test_features)
test_df

100%|██████████| 6333/6333 [00:45<00:00, 138.93it/s]


Unnamed: 0,ID,AREA,PERIMETER,CIRCULARITY,BULKINESS,SOLIDITY,ECCENTRICITY,BGRAD_R_MEAN,BGRAD_R_STD,BGRAD_G_MEAN,...,GLCM_ASM_5,GLCM_ASM_6,GLCM_ASM_7,GLCM_ASM_8,GLCM_ASM_9,GLCM_ASM_10,GLCM_ASM_11,GLCM_ASM_12,WEBER_MEAN,WEBER_STD
0,ISIC_0055289,89781.0,1208.915872,0.771973,4.062491,0.964288,0.667154,0.038730,1.219206,0.051640,...,0.427327,0.427959,0.427332,0.428700,0.424625,0.423137,0.424595,0.424590,-0.005883,0.192161
1,ISIC_0062682,88958.0,1345.307791,0.617663,4.117654,0.931556,0.562224,0.004281,1.071730,0.051682,...,0.430459,0.432140,0.430979,0.431830,0.427373,0.427572,0.428165,0.426978,-0.006599,0.189791
2,ISIC_0057706,917.0,175.509668,0.374091,31.832511,0.138666,0.900818,-0.688000,3.518900,-0.300000,...,0.992096,0.992371,0.992218,0.992432,0.991637,0.991728,0.991821,0.991851,-0.009182,0.125070
3,ISIC_0031702,7147.0,335.806133,0.796446,4.044377,0.944995,0.459206,-0.132678,5.318212,0.131450,...,0.792513,0.794026,0.792741,0.794213,0.789115,0.788676,0.789351,0.788996,-0.015345,0.222575
4,ISIC_0069272,136659.0,1708.354473,0.588426,4.224325,0.917205,0.649337,-0.053299,4.588735,-0.177982,...,0.223466,0.225022,0.225232,0.225305,0.220612,0.220810,0.223245,0.221403,-0.002487,0.246462
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6328,ISIC_0028560,2133.0,344.977705,0.225226,10.176045,0.439975,0.645575,-0.666667,4.274910,-0.766082,...,0.931794,0.933277,0.930865,0.932758,0.928769,0.928669,0.927318,0.927770,-0.014548,0.226711
6329,ISIC_0068188,84292.0,1135.224530,0.821926,4.023535,0.973191,0.209750,-0.020996,2.582260,0.013167,...,0.455188,0.456279,0.454970,0.456321,0.452504,0.452037,0.452235,0.452021,-0.005830,0.193765
6330,ISIC_0072611,19967.0,533.002092,0.883212,4.025584,0.988808,0.465030,-0.164050,1.488895,-0.153061,...,0.850781,0.851291,0.850597,0.851262,0.849388,0.849099,0.849148,0.849028,-0.007873,0.152705
6331,ISIC_0032918,13616.0,455.244733,0.825600,4.098481,0.973475,0.445324,-0.127451,2.537469,0.031194,...,0.628436,0.630229,0.628126,0.630300,0.623992,0.623162,0.623440,0.623281,-0.013757,0.245622


In [61]:
# Add the metadata to the test dataframe
test_df_merged = pd.merge(test_df, test_metadata_encoded, on='ID')
test_df_merged

Unnamed: 0,ID,AREA,PERIMETER,CIRCULARITY,BULKINESS,SOLIDITY,ECCENTRICITY,BGRAD_R_MEAN,BGRAD_R_STD,BGRAD_G_MEAN,...,encoder__SEX_male,encoder__POSITION_anterior torso,encoder__POSITION_head/neck,encoder__POSITION_lateral torso,encoder__POSITION_lower extremity,encoder__POSITION_oral/genital,encoder__POSITION_palms/soles,encoder__POSITION_posterior torso,encoder__POSITION_upper extremity,remainder__AGE
0,ISIC_0055289,89781.0,1208.915872,0.771973,4.062491,0.964288,0.667154,0.038730,1.219206,0.051640,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,50.0
1,ISIC_0062682,88958.0,1345.307791,0.617663,4.117654,0.931556,0.562224,0.004281,1.071730,0.051682,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,65.0
2,ISIC_0057706,917.0,175.509668,0.374091,31.832511,0.138666,0.900818,-0.688000,3.518900,-0.300000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,75.0
3,ISIC_0031702,7147.0,335.806133,0.796446,4.044377,0.944995,0.459206,-0.132678,5.318212,0.131450,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,70.0
4,ISIC_0069272,136659.0,1708.354473,0.588426,4.224325,0.917205,0.649337,-0.053299,4.588735,-0.177982,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,60.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6328,ISIC_0028560,2133.0,344.977705,0.225226,10.176045,0.439975,0.645575,-0.666667,4.274910,-0.766082,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,85.0
6329,ISIC_0068188,84292.0,1135.224530,0.821926,4.023535,0.973191,0.209750,-0.020996,2.582260,0.013167,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,75.0
6330,ISIC_0072611,19967.0,533.002092,0.883212,4.025584,0.988808,0.465030,-0.164050,1.488895,-0.153061,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,50.0
6331,ISIC_0032918,13616.0,455.244733,0.825600,4.098481,0.973475,0.445324,-0.127451,2.537469,0.031194,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0


In [62]:
# Scale the test features
test_df_scaled = scaler.transform(test_df_merged.drop(columns=['ID']))
test_df_scaled = pd.DataFrame(test_df_scaled, columns=test_df_merged.columns[1:])
test_df_scaled['ID'] = test_df_merged['ID']
test_df_scaled

Unnamed: 0,AREA,PERIMETER,CIRCULARITY,BULKINESS,SOLIDITY,ECCENTRICITY,BGRAD_R_MEAN,BGRAD_R_STD,BGRAD_G_MEAN,BGRAD_G_STD,...,encoder__POSITION_anterior torso,encoder__POSITION_head/neck,encoder__POSITION_lateral torso,encoder__POSITION_lower extremity,encoder__POSITION_oral/genital,encoder__POSITION_palms/soles,encoder__POSITION_posterior torso,encoder__POSITION_upper extremity,remainder__AGE,ID
0,0.764449,0.646360,0.434627,-0.050142,0.380693,0.224805,0.074849,-0.759161,0.076417,-0.703171,...,-0.613407,-0.470805,-0.047071,2.020494,-0.049266,-0.124940,-0.350826,-0.360685,-0.222530,ISIC_0055289
1,0.748760,0.904028,-0.477019,-0.045602,0.087771,-0.437428,0.012741,-0.832738,0.076489,-0.920527,...,-0.613407,-0.470805,-0.047071,2.020494,-0.049266,-0.124940,-0.350826,-0.360685,0.613648,ISIC_0062682
2,-0.929587,-1.305925,-1.916015,2.235145,-7.007872,1.699498,-1.235389,0.388193,-0.524815,0.509227,...,-0.613407,-0.470805,-0.047071,-0.494928,-0.049266,-0.124940,-0.350826,-0.360685,1.171099,ISIC_0057706
3,-0.810823,-1.003097,0.579211,-0.051632,0.208042,-1.087597,-0.234187,1.285898,0.212876,1.318474,...,-0.613407,2.124020,-0.047071,-0.494928,-0.049266,-0.124940,-0.350826,-0.360685,0.892374,ISIC_0031702
4,1.658095,1.589887,-0.649746,-0.036824,-0.040655,0.112357,-0.091073,0.921950,-0.316190,0.703948,...,-0.613407,-0.470805,-0.047071,2.020494,-0.049266,-0.124940,-0.350826,-0.360685,0.334922,ISIC_0069272
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6328,-0.906406,-0.985770,-2.795492,0.452963,-4.311430,0.088616,-1.196927,0.765378,-1.321720,1.222273,...,-0.613407,-0.470805,-0.047071,-0.494928,-0.049266,-0.124940,-0.350826,2.772505,1.728551,ISIC_0028560
6329,0.659811,0.507144,0.729743,-0.053348,0.460370,-2.661956,-0.032833,-0.079112,0.010637,-0.207390,...,-0.613407,-0.470805,-0.047071,-0.494928,-0.049266,-0.124940,-0.350826,2.772505,1.171099,ISIC_0068188
6330,-0.566432,-0.630559,1.091813,-0.053179,0.600125,-1.050837,-0.290748,-0.624608,-0.273580,-0.522701,...,-0.613407,-0.470805,-0.047071,-0.494928,-0.049266,8.003852,-0.350826,-0.360685,-0.222530,ISIC_0072611
6331,-0.687503,-0.777456,0.751447,-0.047180,0.462912,-1.175205,-0.224762,-0.101459,0.041460,-0.162780,...,-0.613407,2.124020,-0.047071,-0.494928,-0.049266,-0.124940,-0.350826,-0.360685,-0.222530,ISIC_0032918


In [63]:
X_train = train_df_scaled.drop(columns=['ID']).to_numpy()
X_test = test_df_scaled.drop(columns=['ID']).to_numpy()

# We'll use the weights parameter to account for the class imbalance
class_weights_kaggle = {
    1: 0.7005531,
    2: 0.24592265,
    3: 0.95261733,
    4: 3.64804147,
    5: 1.20674543,
    6: 13.19375,
    7: 12.56547619,
    8: 5.04219745
}

In [66]:
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    'gamma': [1, 0.1, 0.01, 0.001],  # Kernel coefficient
}

# We will use joblib to parallelize the grid search
with parallel_backend('loky', n_jobs=-1):
    grid_search = GridSearchCV(SVC(class_weight=class_weights_kaggle, kernel='rbf'), param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")
print(f"Best score: {grid_search.best_score_:.4f}")


Best parameters: {'C': 100, 'gamma': 0.01}
Best score: 0.6512
(6333, 2)


In [None]:
# Fit the model with the best parameters
svm_best = SVC(class_weight=class_weights_kaggle, kernel='rbf', **best_params)
svm_best.fit(X_train, y_train)

# Predict the class of the test data
y_pred = svm_best.predict(X_test)

# Output the predictions with their ID to a csv file
output = pd.DataFrame({'ID': test_df_scaled['ID'], 'CLASS': y_pred})
output.to_csv('predictions_SVM.csv', index=False)
print(output.shape)