In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.resnet50 import preprocess_input, ResNet50
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Masking
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.metrics import Precision, Recall


### 1. Load the data

In [2]:
# The metadata CSV is read from ./data, relative to the notebook's working directory.
filename = 'metadata_filtered.csv'
metadata_path = os.path.join("data", filename)

df = pd.read_csv(metadata_path)

FileNotFoundError: [Errno 2] No such file or directory: 'data\\metadata_filtered.csv'

In [None]:
# Encode the diagnosis as a binary target: benign -> 0, malignant -> 1.
target_by_label = {'benign': 0, 'malignant': 1}
df['target'] = df['benign_malignant'].map(target_by_label)
df.head(3)

Unnamed: 0.1,Unnamed: 0,image_name,patient_id,lesion_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target
0,2,ISIC_0052212,IP_2842074,IL_9087444,female,50.0,lower extremity,nevus,benign,0
1,3,ISIC_0068279,IP_6890425,IL_4255399,female,45.0,head/neck,unknown,benign,0
2,6,ISIC_0074542,IP_4698288,IL_5017890,male,25.0,lower extremity,unknown,benign,0


In [None]:
# ImageNet-pretrained ResNet50 without its classification head; pooling='avg'
# makes the network emit one pooled feature vector per image.
base_model = ResNet50(
    weights='imagenet',
    include_top=False,
    pooling='avg',
)
model_feature_extractor = Model(
    inputs=base_model.input,
    outputs=base_model.output,
)

In [None]:
def load_and_preprocess_image(image_path):
    """
    Load one training image and return its ResNet50 feature vector.

    Args:
        image_path: Image name without extension; the file is read from
            data/train/<image_path>.jpg.

    Returns:
        The output of model_feature_extractor for this image, with the
        singleton batch dimension squeezed away.
    """
    # Adjust the path according to your dataset structure
    full_path = os.path.join('data', 'train', image_path + '.jpg')

    # Resize to the 224x224 input used for ResNet50 and add a batch axis of size 1
    img = load_img(full_path, target_size=(224, 224))
    img_array = img_to_array(img)
    img_array_expanded = np.expand_dims(img_array, axis=0)
    preprocessed_img = preprocess_input(img_array_expanded)

    # verbose=0: this function runs once per image, so the default per-call
    # progress bar would flood the notebook output.
    features = model_feature_extractor.predict(preprocessed_img, verbose=0)

    return np.squeeze(features)

### Find patient records with the same body location present in multiple appointments

In [None]:
# Distinct patient ids, in order of first appearance in the metadata.
unique_patients = pd.unique(df['patient_id'])

In [None]:
# Build one image sequence per (patient, body area): one image per distinct age, in temporal order.
preprocessing_train_arr = []
preprocessing_label_arr = []

# Groups with a single age are skipped below; longer ones are truncated to the
# first 2 ages so that every sequence has the same length.
rows_to_keep = 2

for patient_id in unique_patients:
    patient_df = df[df['patient_id'] == patient_id]

    for anatom_site in patient_df['anatom_site_general_challenge'].unique():
        site_mask = patient_df['anatom_site_general_challenge'] == anatom_site

        # Ascending age keeps temporal order; descending target puts a melanoma
        # row first within each age so it is the one picked by .first().
        anatom_site_df = patient_df[site_mask].sort_values(
            by=['age_approx', 'target'], ascending=[True, False]
        )
        first_images_by_age = anatom_site_df.groupby('age_approx').first().reset_index()

        # A sequence needs at least two distinct ages for this body area.
        if len(first_images_by_age) <= 1:
            continue

        first_two_rows = first_images_by_age.head(rows_to_keep)

        # Only image names are stored here; features are extracted in a later cell.
        preprocessing_train_arr.append(first_two_rows['image_name'].tolist())
        preprocessing_label_arr.append(first_two_rows['target'].tolist())


In [None]:
# Report how many sequences were built and how many contain at least one melanoma label.
total_sequences = len(preprocessing_label_arr)
melanoma_sequences = sum(1 in labels for labels in preprocessing_label_arr)

print(f"Number of training sequences: {total_sequences}")
print(f"Number of training sequences with melanoma: {melanoma_sequences}")

Number of training sequences: 1801
Number of training sequences with melanoma: 185


In [None]:
def balance(train_arr, label_arr, melanoma_fraction=0.5, seed=0):
    """
    Undersample non-melanoma sequences to reach a fixed class ratio.

    A sequence counts as melanoma when its last label is 1, and as
    non-melanoma when none of its labels is 1. Sequences whose only 1 is at
    an earlier time step belong to neither group and are dropped.

    Args:
        train_arr: One entry per sequence (e.g. a list of image names).
        label_arr: Per-sequence label lists, aligned with train_arr.
        melanoma_fraction: Target share of melanoma sequences in the result,
            in (0, 1]. The default 0.5 keeps one non-melanoma sequence per
            melanoma sequence; 0.6 gives a 60/40 split.
        seed: Value passed to np.random.seed before sampling and shuffling.
            Note that this reseeds NumPy's global random state.

    Returns:
        (balanced_train_arr, balanced_label_arr) as NumPy arrays, both
        shuffled with the same permutation so rows stay aligned.

    Raises:
        ValueError: If melanoma_fraction is outside (0, 1], or if there are
            fewer non-melanoma sequences than the requested ratio needs.
    """
    if not 0 < melanoma_fraction <= 1:
        raise ValueError(f"melanoma_fraction must be in (0, 1], got {melanoma_fraction}")

    train_arr = np.array(train_arr)
    label_arr = np.array(label_arr)

    melanoma_indices = [index for index, labels in enumerate(label_arr) if labels[-1] == 1]
    non_melanoma_indices = [index for index, labels in enumerate(label_arr) if 1 not in labels]

    # Number of negatives needed so that positives make up melanoma_fraction of the result.
    # round() guards against float error such as 0.999... truncating to 0.
    non_melanoma_count = int(round(len(melanoma_indices) * (1 - melanoma_fraction) / melanoma_fraction))
    if non_melanoma_count > len(non_melanoma_indices):
        raise ValueError(
            f"Need {non_melanoma_count} non-melanoma sequences for the requested ratio "
            f"but only {len(non_melanoma_indices)} are available"
        )

    np.random.seed(seed)  # Seed for reproducibility
    selected_non_melanoma_indices = np.random.choice(non_melanoma_indices, non_melanoma_count, replace=False)

    # astype(int): concatenating empty index lists yields floats, which cannot be used for indexing.
    selected_indices = np.concatenate((melanoma_indices, selected_non_melanoma_indices)).astype(int)
    np.random.shuffle(selected_indices)

    balanced_train_arr = train_arr[selected_indices]
    balanced_label_arr = label_arr[selected_indices]

    return balanced_train_arr, balanced_label_arr


In [None]:
# Replace the full sequence lists with their balanced subset.
# NOTE: this overwrites its own inputs, so re-running the cell balances the
# already-balanced arrays instead of the original sequences.
preprocessing_train_arr, preprocessing_label_arr = balance(preprocessing_train_arr, preprocessing_label_arr)

In [None]:
# Turn every image name in every balanced sequence into its ResNet50 feature vector.
train_arr = []
for image_sequence in preprocessing_train_arr:
    sequence_features = [load_and_preprocess_image(image_name) for image_name in image_sequence]
    train_arr.append(sequence_features)



In [None]:

# Pad/truncate every sequence to 2 time steps (zeros are appended at the end)
# so the features form one rectangular float32 array.
X = pad_sequences(
    train_arr,
    maxlen=2,
    dtype='float32',
    padding='post',
)

# The label of a sequence is the label of its last (most recent) image.
y = np.array([labels[-1] for labels in preprocessing_label_arr])


In [None]:
# Count the positive sequences, i.e. those whose final label is 1.
number_of_sequences_with_melanoma = (y == 1).sum()
print(f"Number of training sequences with melanoma: {number_of_sequences_with_melanoma}")

Number of training sequences with melanoma: 86


In [None]:
# Hold out 30% of the sequences, then halve that hold-out into validation and test.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Test set size: {len(X_test)}")

Training set size: 120
Validation set size: 26
Test set size: 26


In [None]:
# Sequence classifier: an LSTM over the per-image feature vectors, followed by a
# single sigmoid unit for the binary melanoma prediction.
model = Sequential()
# Time steps whose features are all exactly 0 (the padding value) are masked out.
model.add(Masking(mask_value=0., input_shape=X.shape[1:]))
model.add(LSTM(64, return_sequences=False))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy', Precision(), Recall()],
)
model.summary()

Model: "sequential_16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking_16 (Masking)        (None, 2, 2048)           0         
                                                                 
 lstm_16 (LSTM)              (None, 64)                540928    
                                                                 
 dense_16 (Dense)            (None, 1)                 65        
                                                                 
Total params: 540993 (2.06 MB)
Trainable params: 540993 (2.06 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Monitor training on the validation split; the test split must stay unseen
# until the final evaluation, otherwise its metrics are no longer an unbiased estimate.
model.fit(X_train, y_train, epochs=10, validation_data=(X_val, y_val))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x314efd950>

In [None]:
# Print the architecture again after training; layers and parameter counts are unchanged by fit().
model.summary()


Model: "sequential_16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking_16 (Masking)        (None, 2, 2048)           0         
                                                                 
 lstm_16 (LSTM)              (None, 64)                540928    
                                                                 
 dense_16 (Dense)            (None, 1)                 65        
                                                                 
Total params: 540993 (2.06 MB)
Trainable params: 540993 (2.06 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Metric order follows model.compile: the loss first, then accuracy, precision, recall.
metric_names = ("Loss", "Accuracy", "Precision", "Recall")

# Evaluate each held-out split and label every metric with the split it belongs to
# (validation precision/recall were previously printed under a "Test" label).
for split_name, split_X, split_y in (("Validation", X_val, y_val), ("Test", X_test, y_test)):
    results = model.evaluate(split_X, split_y)
    for metric_name, metric_value in zip(metric_names, results):
        print(f"{split_name} {metric_name}: {metric_value}")


Validation Loss: 0.8382492661476135
Validation Accuracy: 0.5
Test Precision: 0.3076923191547394
Test Recall: 0.5
Test Loss: 0.6363160014152527
Test Accuracy: 0.7307692170143127
Test Precision: 0.7333333492279053
Test Recall: 0.7857142686843872


In [None]:
# Continue training the same model for 50 more epochs (the model is not rebuilt,
# so this adds to the earlier 10 epochs). Monitoring uses the validation split so
# the test split stays unseen until the final evaluation.
model.fit(X_train, y_train, epochs=50, validation_data=(X_val, y_val))

Epoch 1/50


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x318df48d0>

In [None]:
# Metric order follows model.compile: the loss first, then accuracy, precision, recall.
metric_names = ("Loss", "Accuracy", "Precision", "Recall")

# Evaluate each held-out split and label every metric with the split it belongs to
# (validation precision/recall were previously printed under a "Test" label).
for split_name, split_X, split_y in (("Validation", X_val, y_val), ("Test", X_test, y_test)):
    results = model.evaluate(split_X, split_y)
    for metric_name, metric_value in zip(metric_names, results):
        print(f"{split_name} {metric_name}: {metric_value}")

Validation Loss: 1.7832016944885254
Validation Accuracy: 0.5
Test Precision: 0.3333333432674408
Test Recall: 0.625
Test Loss: 1.3825035095214844
Test Accuracy: 0.6153846383094788
Test Precision: 0.625
Test Recall: 0.7142857313156128


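Training for longer led to overfitting: after the additional 50 epochs the validation loss rose from 0.84 to 1.78 and the test loss from 0.64 to 1.38, while test accuracy fell from 0.73 to 0.62. Stop training early to avoid this.
Precision and recall are better indicators of success in melanoma detection than accuracy alone.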