<a href="https://www.kaggle.com/code/annamalkova88/skin-cancer-myvers?scriptVersionId=152104724" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

<div class="list-group" id="list-tab" role="tablist">
<h2 class="list-group-item list-group-item-action active" data-toggle="list" style='background:orange; border:0; color:white' role="tab" aria-controls="home"><center>Quick navigation</center></h2>

* [0. Installation of libraries](#0)
* [1. Basic Data Overview](#1)
* [2. Transformation of data](#2)
* [3. Convolutional networks](#3)

<a id="0"></a>
<h2 style='background:orange; border:0; color:white'><center>0. Installation of libraries</center></h2>

In [None]:
import numpy as np
import os
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import imageio
import plotly.express as px
from plotly.subplots import make_subplots
from skimage import io 
from PIL import Image

from sklearn.preprocessing import LabelBinarizer, StandardScaler,LabelEncoder
from sklearn import preprocessing 
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.layers import Input, Dense, GlobalAveragePooling2D,Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization,Concatenate
from tensorflow.keras.applications import EfficientNetB7
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam, Adamax
import tensorflow.keras.models as M
import tensorflow.keras.layers as L
import tensorflow.keras.backend as K
from tensorflow.keras.models import Sequential

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

<a id="1"></a>
<h2 style='background:orange; border:0; color:white'><center>1. Basic Data Overview</center></h2>

In [None]:
# Load the per-image metadata table of the ISIC 2019 training set.
train_df = pd.read_csv('/kaggle/input/isic-2019/ISIC_2019_Training_Metadata.csv')
# DataFrame.info() prints its report and returns None, so it is called on its
# own rather than passed to display(), which would render a stray "None".
train_df.info()
display(train_df.head(),
        'Missing values',
        train_df.isna().sum())

In [None]:
# Load the ground-truth table (one indicator column per diagnosis).
train_y = pd.read_csv('/kaggle/input/isic-2019/ISIC_2019_Training_GroundTruth.csv')
# DataFrame.info() prints its report and returns None, so it is called on its
# own rather than passed to display(), which would render a stray "None".
train_y.info()
display(train_y.head(),
        'Missing values',
        train_y.isna().sum())

In [None]:
# Join metadata and ground truth on the image identifier. The outer join keeps
# images that appear in only one of the two tables (their missing side is NaN).
train = train_df.merge(train_y, on='image', how='outer')
# DataFrame.info() prints its report and returns None, so it is called on its
# own rather than passed to display(), which would render a stray "None".
train.info()
display(train.head(),
        'Missing values',
        train.isna().sum())

Melanoma

Melanocytic nevus

Basal cell carcinoma

Actinic keratosis

Benign keratosis (solar lentigo / seborrheic keratosis / lichen planus-like keratosis)

Dermatofibroma

Vascular lesion

Squamous cell carcinoma

Unknown

In [None]:
cols = ['MEL', 'NV', 'BCC', 'AK', 'BKL', 'DF', 'VASC', 'SCC', 'UNK']

# Collapse the one-hot diagnosis columns into a single 'cancer' label.
# This is a vectorised replacement for a per-row, per-column .loc loop, which
# is very slow on a table of this size and only works with a 0..n-1 index.
is_positive = train[cols].eq(1)
# Columns are scanned right-to-left so that, should a row ever carry more than
# one flag, the right-most flag wins exactly as it did in the loop. Rows with
# no flag at all keep the empty-string label.
train['cancer'] = (
    is_positive[cols[::-1]]
    .idxmax(axis=1)
    .where(is_positive.any(axis=1), '')
)

train.head()

In [None]:
# Share of every diagnosis label in the merged training table.
fig = px.pie(
    data_frame=train,
    names='cancer',
    title='Distribution of histological types',
)
fig.show()


It's better to exclude samples of the rarest types:

AK, SCC, VASC, DF

In [None]:
# Share of every anatomic site in the merged training table. The title used to
# be a copy of the diagnosis chart's title and did not describe this plot.
fig = px.pie(train, names='anatom_site_general', title='Distribution of anatomic sites')
fig.show()

It's better to exclude the rarest anatomic sites:

palms/soles

oral/genital

lateral torso

In [None]:
cols = ['MEL', 'NV', 'BCC', 'AK', 'BKL', 'DF', 'VASC', 'SCC', 'UNK']
fig = make_subplots(rows=3, cols=3, subplot_titles=cols)

# One age histogram per diagnosis, laid out row by row on the 3x3 grid.
for position, el in enumerate(cols):
    grid_row, grid_col = divmod(position, 3)
    df_filtered = train[train['cancer'] == el]
    hist = px.histogram(df_filtered, x='age_approx', nbins=20, title=el)
    # Only the trace is reused; the plotly-express figure itself is discarded.
    fig.add_trace(hist['data'][0], row=grid_row + 1, col=grid_col + 1)

fig.update_layout(
    height=600,
    width=800,
    showlegend=False,
    title_text='Age Histograms for Each Cancer Type',
)
fig.show()


In [None]:
import plotly.graph_objects as go

# One bar chart per diagnosis showing how many lesions were found at each
# anatomic site.
fig = make_subplots(rows=3, cols=3, subplot_titles=cols)
for i, el in enumerate(cols, 1):
    df_filtered = train[train['cancer'] == el]
    # Aggregate first: one bar per site instead of one bar segment per row,
    # which keeps the figure small and the hover values meaningful.
    site_counts = df_filtered['anatom_site_general'].value_counts()
    color = px.colors.qualitative.Set1[i - 1]

    bar_trace = go.Bar(x=site_counts.index, y=site_counts.values, marker_color=color)
    fig.add_trace(bar_trace, row=(i - 1) // 3 + 1, col=(i - 1) % 3 + 1)

# The tick rotation has to be set on the combined figure; setting it on a
# per-diagnosis helper figure is lost because only its trace is copied over.
fig.update_xaxes(tickangle=45)
fig.update_layout(height=1000, width=800, showlegend=False, title_text='Anatomic location for Each Cancer Type')
fig.show()


In [None]:
import plotly.graph_objects as go
fig = make_subplots(
    rows=3, cols=3,
    vertical_spacing=0.09,
    specs=[[{"type": "pie"}, {"type": "pie"},{"type": "pie"}],
           [{"type": "pie"}, {"type": "pie"},{"type": "pie"}],
           [{"type": "pie"}, {"type": "pie"},{"type": "pie"}]
          ],
    subplot_titles=cols
)

# Styling is keyed by the actual category value so that label, colour and pull
# offset always belong to the count they are drawn with.
# NOTE(review): assumes the 'sex' column holds 'female' / 'male' — any other
# value (including the 'None' placeholder for missing data) uses the fallback.
sex_colors = {'female': 'pink', 'male': 'lightblue'}
sex_pull = {'female': 0, 'male': 0.08}

for i, el in enumerate(cols, 1):
    df_filtered = train[train['cancer'] == el]

    # Calculate row and col for each subplot
    row = (i - 1) // 3 + 1
    col = (i - 1) % 3 + 1

    # value_counts() orders by frequency, so the labels must be taken from its
    # index rather than from a fixed list. Missing values are counted as 'None'
    # instead of being silently dropped.
    sex_counts = df_filtered['sex'].fillna('None').value_counts()
    categories = [str(value) for value in sex_counts.index]

    fig.add_trace(
        go.Pie(
            values=sex_counts.values,
            labels=[f'<b>{category.capitalize()}</b>' for category in categories],
            hole=0.3,
            pull=[sex_pull.get(category.lower(), 0.3) for category in categories],
            marker_colors=[sex_colors.get(category.lower(), 'lightgreen') for category in categories],
            textposition='inside'
        ),
        row=row, col=col
    )

fig.update_layout(
    height=500,
    showlegend=True,
    title_text="<b>Sex distribution for cancer types</b>",
)

fig.show()


In [None]:
# There is no data for the unknown class, so its indicator column is removed.
train_known = train.drop(columns=['UNK'])
train_known.info()

In [None]:
import os

directory = '/kaggle/input/isic-2019/ISIC_2019_Training_Input/ISIC_2019_Training_Input'

# Map every JPEG file name found below `directory` to its full path. If the
# same file name occurs in several folders, the one visited last is kept.
file_paths = {
    filename: os.path.join(root, filename)
    for root, _, files in os.walk(directory)
    for filename in files
    if filename.endswith('.jpg')
}


In [None]:
# Turn the image identifier into a file name. The guard keeps the cell
# re-runnable: executing it a second time no longer appends '.jpg' again.
train_known['image'] = train_known['image'].map(
    lambda name: name if name.endswith('.jpg') else name + '.jpg'
)

In [None]:
# Attach the on-disk location of every image by looking its file name up in
# file_paths; names that are not in the dictionary become NaN.
train_known['path'] = train_known['image'].map(file_paths)

train_known.head()

In [None]:
import matplotlib.image as mpimg

labels = train_known['cancer'].unique()

# Pick one example image per diagnosis label (fixed seed, so always the same one).
image_paths = [
    train_known[train_known['cancer'] == label]
    .sample(n=1, random_state=42)['path']
    .values[0]
    for label in labels
]
fig, axes = plt.subplots(1, len(image_paths), figsize=(20, 5))

# Show the examples side by side, each titled with its label.
for i in range(len(image_paths)):
    panel = axes[i]
    panel.imshow(mpimg.imread(image_paths[i]))
    panel.set_title(labels[i])
    panel.axis('off')

plt.show()

In [None]:
# Collect one example image path per anatomic site, most frequent site first.
import pandas as pd

anatom_site_counts = train_known['anatom_site_general'].value_counts()

# Order the site names by how often they occur (descending).
labels = sorted(anatom_site_counts.index, key=lambda site: anatom_site_counts[site], reverse=True)

image_paths = []
for label in labels:
    df_label = train_known[train_known['anatom_site_general'] == label]
    if df_label.empty:
        continue
    # Fixed seed: the same example is drawn on every run.
    sampled_row = df_label.sample(n=1, random_state=42)
    image_paths.append(sampled_row['path'].values[0])

image_paths

In [None]:
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.image as mpimg
import plotly.express as px

# Two-row layout: example images on top, site frequencies underneath.
fig = plt.figure(figsize=(20, 10))
gs = gridspec.GridSpec(2, len(image_paths), height_ratios=[2, 1])

# Upper row: one example image per anatomic site.
for i in range(len(image_paths)):
    ax = fig.add_subplot(gs[0, i])
    ax.imshow(mpimg.imread(image_paths[i]))
    ax.set_title(labels[i])
    ax.axis('off')

# Lower row: bar chart spanning the full width.
ax = fig.add_subplot(gs[1, :])
ax.bar(anatom_site_counts.index, anatom_site_counts.values)
ax.set_title('Bar Chart')
ax.set_xlabel('Anatom Site General')
ax.set_ylabel('Count')

plt.show()


<a id="2"></a>
<h2 style='background:orange; border:0; color:white'><center>2. Transformation of data</center></h2>

In [None]:
# Keep only rows in which every column is filled. reset_index() returns a new
# frame, so its result has to be kept — calling it without assignment is a
# no-op. drop=True discards the old row numbers instead of adding a column.
train_exp = train_known.dropna(axis=0).reset_index(drop=True)
#Create new feature - lesion (the part of lesion_id before the first '_')
train_exp['lesion'] = train_exp['lesion_id'].apply(lambda x: str(x).split('_')[0] if isinstance(x, str) else '')
train_exp = train_exp.drop('lesion_id', axis=1)
train_exp.head()

In [None]:
# Remove the rarest histological types.
hist = ['AK','SCC','VASC', 'DF']
is_common_type = ~train_exp['cancer'].isin(hist)
train_exp1 = train_exp.loc[is_common_type]

# Remove the rarest anatomic sites.
sides=['palms/soles','oral/genital','lateral torso']
is_common_site = ~train_exp1['anatom_site_general'].isin(sides)
train_exp2 = train_exp1.loc[is_common_site]

# Remaining class and site frequencies after both filters.
display(
    train_exp2['cancer'].value_counts(),
    train_exp2['anatom_site_general'].value_counts(),
)

In [None]:
# Work on an independent copy so that later changes to train_coded do not
# touch train_exp2.
train_coded = train_exp2.copy()

In [None]:
# Encode the categorical features as numbers.
# 'lesion': each distinct value is replaced by an integer code.
coder = LabelEncoder()
train_coded['lesion'] = coder.fit(train_coded['lesion']).transform(train_coded['lesion'])

# 'sex': binarised with LabelBinarizer.
transformer = LabelBinarizer()
train_coded['sex'] = transformer.fit(train_coded['sex']).transform(train_coded['sex'])

train_coded.head()

In [None]:
#Scaling numeric features
# Standardise the numeric columns (zero mean, unit variance). The column list
# gets its own name so that `cols` — the list of diagnosis codes used by the
# plotting cells — is not overwritten, which would break those cells on re-run.
scaler = StandardScaler()
numeric_cols = ['age_approx', 'lesion']
train_coded[numeric_cols] = scaler.fit_transform(train_coded[numeric_cols])

In [None]:
# Preview the first two rows of the encoded and scaled table.
train_coded.head(2)

<a id="2-images"></a>
<h2 style='background:orange; border:0; color:white'><center>3. Transformation of images</center></h2>

In [None]:
import cv2

# Write a half-size copy of every image in train_coded to the working folder.
output_folder = '/kaggle/working/train_small/'
os.makedirs(output_folder, exist_ok=True)

# Renumber the rows 0..n-1. drop=True avoids inserting the old index as a new
# column, so the cell can be re-run without piling up index columns.
train_coded = train_coded.reset_index(drop=True)

for input_path, image_name in zip(train_coded['path'], train_coded['image']):
    image = cv2.imread(input_path)
    # cv2.imread signals an unreadable or missing file by returning None.
    if image is None:
        raise FileNotFoundError(f'Could not read image: {input_path}')
    height, width = image.shape[:2]
    new_width = int(width/2)  # halve the width
    new_height = int(height * (new_width / width))  # scale the height by the same factor to keep the aspect ratio
    resized_img = cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_AREA)
    output_path = os.path.join(output_folder, image_name)
    cv2.imwrite(output_path, resized_img)

In [None]:
# Point a copy of the table at the resized images instead of the originals.
train_small = train_coded.copy()

# File name -> path for everything found in the resized-image folder.
file_paths = {
    filename: os.path.join(root, filename)
    for root, _, files in os.walk('/kaggle/working/train_small/')
    for filename in files
}

train_small['path'] = train_small['image'].map(file_paths)
train_small.head()

In [None]:
# Draw a 1000-row subset for quicker experiments. The seed makes the subset —
# and therefore every result below — reproducible, in line with the
# random_state=42 used by the other sampling and splitting steps.
train_1000 = train_small.sample(n=1000, random_state=42)

In [None]:
# Remove columns that are not used as model inputs. Assigning the result (and
# ignoring already-missing columns) keeps the cell re-runnable; an in-place
# drop raises KeyError the second time it is executed.
cols_to_drop = ['image', 'cancer']
train_1000 = train_1000.drop(columns=cols_to_drop, errors='ignore')

<a id="3"></a>
<h2 style='background:orange; border:0; color:white'><center>3. Convolutional network model</center></h2>

In [None]:
# Split the sampled table into its three roles:
#   X_tabular      - numeric/encoded metadata plus the diagnosis indicators
#   X_image_paths  - location of each (resized) image
#   y_labels       - anatomic site, the prediction target
X_tabular = train_1000.loc[:, ['sex', 'age_approx', 'lesion',
                               'MEL', 'NV', 'BCC', 'AK', 'BKL', 'DF', 'VASC', 'SCC']]
X_image_paths = train_1000.loc[:, 'path']
y_labels = train_1000.loc[:, 'anatom_site_general']

In [None]:
# Number of samples per target class (anatomic site) in the sampled subset.
y_labels.value_counts()

In [None]:
# Step 1: Load and Preprocess Images
# Every image is read from disk, resized to image_size, converted to an array
# and passed through the EfficientNet preprocessing function.
image_size = (100, 100)  # Set the desired size for EfficientNet

X_images = np.array([
    tf.keras.applications.efficientnet.preprocess_input(
        tf.keras.preprocessing.image.img_to_array(
            tf.keras.preprocessing.image.load_img(path, target_size=image_size)
        )
    )
    for path in X_image_paths
])

In [None]:
# Step 2: Prepare Labels
# Site names -> integer codes -> one-hot rows.
label_encoder = LabelEncoder()
y_labels_encoded = tf.keras.utils.to_categorical(
    label_encoder.fit_transform(y_labels)
)

In [None]:
# Step 3: Split Data
# One call splits tabular data, images and labels with the same row
# assignment: 80 % training, 20 % validation, fixed seed.
split_parts = train_test_split(
    X_tabular,
    X_images,
    y_labels_encoded,
    random_state=42,
    test_size=0.2,
)
(X_train_tabular, X_val_tabular,
 X_train_images, X_val_images,
 y_train, y_val) = split_parts

display(
    X_train_tabular.shape,
    X_val_tabular.shape,
    X_train_images.shape,
    X_val_images.shape,
)

In [None]:
# Imported from tensorflow.keras like every other Keras name in this notebook.
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Training images: rescaled to [0, 1] and randomly mirrored.
image_datagen = ImageDataGenerator(
    rescale=1./255,
    horizontal_flip=True
)
# Validation images: same rescaling but no augmentation, so the validation
# metrics are measured on unmodified images and are comparable across epochs.
val_image_datagen = ImageDataGenerator(rescale=1./255)

# Create generator for image data
# The training batches are reshuffled every epoch (reproducibly, via the seed);
# the validation order is irrelevant and stays fixed.
train_image_generator = image_datagen.flow(X_train_images,
                                           y_train,
                                           batch_size=32,
                                           seed=42,
                                           shuffle=True)
val_image_generator = val_image_datagen.flow(X_val_images,
                                             y_val,
                                             batch_size=32,
                                             seed=42,
                                             shuffle=False)

In [None]:
#Visualization of augmented pictures
def plotImages(images_arr):
    """Show up to 16 images on a 4x4 grid without axes.

    images_arr: iterable of images accepted by ``Axes.imshow``; images beyond
    the 16th are ignored, unused grid cells keep their default axes.
    """
    fig, grid = plt.subplots(4, 4, figsize=(10,10))
    panels = grid.flatten()
    for picture, panel in zip(images_arr, panels):
        panel.imshow(picture)
        panel.axis('off')
    plt.tight_layout()
    plt.show()

In [None]:
# Pull one batch from the training generator; the labels are not needed here.
sample_training_images, _ = next(train_image_generator)
# Plot up to 16 images of that batch
plotImages(sample_training_images[:16])

In [None]:
#Plot for accuracy and val_loss
def plot_accur(history, epochs=20):
    """Plot training/validation accuracy and loss curves side by side.

    history: object with a ``history`` dict holding the per-epoch lists
        'accuracy', 'val_accuracy', 'loss' and 'val_loss' (as returned by
        ``model.fit``).
    epochs: kept for backward compatibility. The x-axis is derived from the
        number of recorded epochs, so a value that differs from the actual
        training length no longer makes the plot fail.
    """
    acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    # One x position per recorded epoch; using the `epochs` argument here
    # raised a shape mismatch whenever it differed from the history length.
    epochs_range = range(len(acc))
    plt.figure(figsize=(12, 4))
    plt.subplot(1, 2, 1)
    plt.plot(epochs_range, acc, label='Training Accuracy')
    plt.plot(epochs_range, val_acc, label='Validation Accuracy')
    plt.legend(loc='lower right')
    plt.title('Training and Validation Accuracy')

    plt.subplot(1, 2, 2)
    plt.plot(epochs_range, loss, label='Training Loss')
    plt.plot(epochs_range, val_loss, label='Validation Loss')
    plt.legend(loc='upper right')
    plt.title('Training and Validation Loss')
    plt.show()

In [None]:
# Reset Keras' global state BEFORE building the model, so that leftovers from
# earlier models are released and the new model is not affected by the reset.
K.clear_session()

# Small baseline CNN: three convolution blocks followed by a dense classifier.
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape = (100,100,3)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    Flatten(),
    Dense(64, activation='relu'),
    # One output unit per class seen by the label encoder, instead of a
    # hard-coded count that has to be kept in sync with the data by hand.
    Dense(len(label_encoder.classes_), activation='softmax')
])
# `learning_rate` is the supported argument name; `lr` is a deprecated alias
# that newer Keras versions reject.
model.compile(optimizer=Adamax(learning_rate=0.0001),  #Adam(learning_rate=x)
              loss='categorical_crossentropy',
              metrics=['accuracy'])
history = model.fit(
    train_image_generator,
    epochs=20,
    validation_data=val_image_generator,
)
history

In [None]:
# Reset Keras' global state BEFORE building the model, not after it.
K.clear_session()

# EfficientNetB7 backbone without its classification head. weights=None means
# the network starts from random weights — no pre-trained weights are loaded.
base_model = EfficientNetB7(weights=None, include_top=False, input_shape=(100, 100, 3))
# One output unit per class seen by the label encoder.
num_classes = len(label_encoder.classes_)

# Add your own classification layers on top of the base model
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.7)(x)

# Output layer
predictions = Dense(num_classes, activation='softmax')(x)

# Create the final model
model = Model(inputs=base_model.input, outputs=predictions)

# Keras EfficientNet models contain their own input rescaling and expect raw
# pixel values in [0, 255]. The shared generators divide by 255, which would
# rescale the images twice, so this model gets generators without `rescale`.
# Training batches are mirrored at random and reshuffled each epoch; the
# validation images are passed through unchanged.
effnet_train_generator = ImageDataGenerator(horizontal_flip=True).flow(
    X_train_images, y_train, batch_size=32, seed=42, shuffle=True)
effnet_val_generator = ImageDataGenerator().flow(
    X_val_images, y_val, batch_size=32, shuffle=False)

# Compile the model
# `learning_rate` is the supported argument name; `lr` is a deprecated alias
# that newer Keras versions reject.
model.compile(optimizer=Adam(learning_rate=0.01),
              loss='categorical_crossentropy',
              metrics=['accuracy'])
history = model.fit(
    effnet_train_generator,
    epochs=20,
    validation_data=effnet_val_generator,
)
history

In [None]:
# The model above was trained for 20 epochs; asking for 50 made the x-range
# longer than the recorded metrics and the plot call failed.
plot_accur(history, epochs=20)

<a id="4"></a>
<h2 style='background:orange; border:0; color:white'><center>4. Multimodal convolutional network model</center></h2>

In [None]:
BUFFER_SIZE = 100

# Training batch size: the largest divisor of the training-set size that does
# not exceed 80, so every batch is full.
length_t = len(y_train)
train_candidates = [int(length_t/n) for n in range(1,length_t+1)
                    if length_t % n ==0 and length_t/n<=80]
train_candidates.sort(reverse=True)
BATCH_SIZE_tr = train_candidates[0]

# Validation batch size: same rule applied to the validation-set size.
length_v = len(y_val)
val_candidates = [int(length_v/n) for n in range(1,length_v+1)
                  if length_v % n ==0 and length_v/n<=80]
val_candidates.sort(reverse=True)
BATCH_SIZE_val = val_candidates[0]

print(BATCH_SIZE_tr,
     BATCH_SIZE_val)

In [None]:
def eff_model(batch_tr=64,batch_val=32,optimizer=None, epochs=10):
    """Build and train the two-branch (image + tabular) classifier.

    Reads the notebook globals X_train_images, X_train_tabular, y_train,
    X_val_images, X_val_tabular, y_val and label_encoder.

    batch_tr:  batch size of the training dataset.
    batch_val: batch size of the validation dataset.
    optimizer: Keras optimizer; when None a fresh Adamax(learning_rate=0.0001)
        is created for this call.
    epochs:    number of training epochs.

    Returns the History object produced by ``model.fit``.
    """
    # An optimizer instance in the signature would be created once, at
    # definition time, and then shared by every model built through this
    # function; a fresh instance per call avoids carrying state across models.
    if optimizer is None:
        optimizer = Adamax(learning_rate=0.0001)

    BUFFER_SIZE = 100
    batch_size=batch_tr
    train_dataset = tf.data.Dataset.from_tensor_slices((
        {'image_input': X_train_images, 'tabular_input': X_train_tabular},
        y_train)).shuffle(BUFFER_SIZE).batch(batch_size)

    # Validation data is only evaluated, so shuffling it serves no purpose.
    batch_size=batch_val
    val_dataset = tf.data.Dataset.from_tensor_slices((
        {'image_input': X_val_images, 'tabular_input': X_val_tabular},
        y_val)).batch(batch_size)

    # Image Branch
    input_shape = X_train_images[0].shape
    image_inputs = Input(shape=input_shape, name='image_input')
    x_image = EfficientNetB7(input_shape=input_shape, include_top=False, weights=None)(image_inputs)
    x_image = GlobalAveragePooling2D()(x_image)

    # Tabular Branch
    tabular_inputs = Input(shape=(X_train_tabular.shape[1],), name='tabular_input')
    x_tabular = Dense(128, activation='relu')(tabular_inputs)

    # Concatenate branches
    merged = Concatenate()([x_image, x_tabular])

    # Output Layer
    outputs = Dense(len(label_encoder.classes_), activation='softmax')(merged)

    # Create the model
    model = Model(inputs=[image_inputs, tabular_inputs], outputs=outputs)

    # Compile the model
    # NOTE(review): run_eagerly=True disables graph compilation and slows
    # training; it looks like a debugging leftover — confirm it is still needed.
    model.compile(optimizer=optimizer,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'],
                 run_eagerly=True)

    # No steps_per_epoch / validation_steps: a batched dataset already knows
    # its number of batches. The previous values divided that batch count by
    # the batch size once more, so each epoch saw almost no data (or zero
    # steps for small datasets).
    history = model.fit(train_dataset,
                        epochs=epochs,
                        validation_data=val_dataset)

    return history


In [None]:
# Print the compute devices TensorFlow reports for this machine.
# NOTE(review): device_lib is imported from tensorflow.python, which is not
# part of the documented public API; tf.config.list_physical_devices() is the
# documented way to query devices — consider switching.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

In [None]:
# Build the input pipelines and the two-branch (image + tabular) model as
# notebook globals; the training cells below reuse train_dataset, val_dataset
# and model.
BUFFER_SIZE = 100
BATCH_SIZE=10
# Each element is ({'image_input': ..., 'tabular_input': ...}, label); the
# dictionary keys must match the names of the model's Input layers.
train_dataset = tf.data.Dataset.from_tensor_slices((
        {'image_input': X_train_images, 'tabular_input': X_train_tabular},
        y_train)).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)

val_dataset = tf.data.Dataset.from_tensor_slices((
        {'image_input': X_val_images, 'tabular_input': X_val_tabular},
        y_val)).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
    
# Image Branch: randomly initialised EfficientNetB7 (weights=None) followed by
# global average pooling.
input_shape = X_train_images[0].shape
image_inputs = Input(shape=input_shape, name='image_input')
x_image = EfficientNetB7(input_shape=input_shape, include_top=False, weights=None)(image_inputs)
x_image = GlobalAveragePooling2D()(x_image)

# Tabular Branch: a single dense layer over the tabular columns.
tabular_inputs = Input(shape=(X_train_tabular.shape[1],), name='tabular_input')
x_tabular = Dense(128, activation='relu')(tabular_inputs)

# Concatenate branches
merged = Concatenate()([x_image, x_tabular])

# Output Layer: one softmax unit per class known to label_encoder.
outputs = Dense(len(label_encoder.classes_), activation='softmax')(merged)

# Create the model
model = Model(inputs=[image_inputs, tabular_inputs], outputs=outputs)

# Compile the model
model.compile(optimizer=Adamax(learning_rate=0.001),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])


In [None]:
# Train the two-branch model for 30 epochs and keep the per-epoch metrics.
history = model.fit(train_dataset,
                        epochs=30,
                        validation_data=val_dataset)

In [None]:
# Accuracy/loss curves; epochs matches the 30 epochs of the preceding fit call.
plot_accur(history, epochs=30)

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, GlobalAveragePooling2D, Concatenate, Dropout, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adamax
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras import regularizers

# Regularised variant of the two-branch model: batch normalisation and dropout
# on the image branch, weight/activity/bias penalties on the tabular branch and
# an extra dense block after the merge.

# Image Branch
input_shape = X_train_images[0].shape
image_inputs = Input(shape=input_shape, name='image_input')
x_image = EfficientNetB7(input_shape=input_shape, include_top=False, weights=None)(image_inputs)
x_image = GlobalAveragePooling2D()(x_image)
x_image = BatchNormalization()(x_image)
x_image = Dropout(0.4)(x_image)

# Tabular Branch
# L2 penalty on the kernel, L1 penalties on the layer output and the bias.
tabular_inputs = Input(shape=(X_train_tabular.shape[1],), name='tabular_input')
x_tabular = Dense(256, kernel_regularizer=regularizers.l2(0.016), activity_regularizer=regularizers.l1(0.006),
                  bias_regularizer=regularizers.l1(0.006), activation='relu')(tabular_inputs)
x_tabular = BatchNormalization()(x_tabular)

# Concatenate branches
merged = Concatenate()([x_image, x_tabular])

# Additional Dense layer with Batch Normalization and Dropout
x = Dense(128, activation='relu')(merged)
x = BatchNormalization()(x)
x = Dropout(0.4)(x)

# Output Layer: one softmax unit per class known to label_encoder.
outputs = Dense(len(label_encoder.classes_), activation='softmax')(x)

# Create the model
model = Model(inputs=[image_inputs, tabular_inputs], outputs=outputs)

# Learning Rate Schedule
def lr_schedule(epoch):
    """Exponentially decayed learning rate: 0.001 at epoch 0, multiplied by 0.9 per epoch."""
    return 0.001 * 0.9**epoch

# Callback that sets the optimizer's learning rate from lr_schedule at the
# start of every epoch.
lr_scheduler = LearningRateScheduler(lr_schedule)

# Compile the model
# The learning rate given here is only the initial value; the scheduler
# callback replaces it each epoch.
model.compile(optimizer=Adamax(learning_rate=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train for 20 epochs with the learning-rate scheduler attached
history = model.fit(
    train_dataset,
    epochs=20,
    validation_data=val_dataset,
    callbacks=[lr_scheduler]
)

In [None]:
# Accuracy/loss curves; epochs matches the 20 epochs of the preceding fit call.
plot_accur(history, epochs=20)

In [None]:
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras import regularizers

def lr_schedule(epoch, lr):
    """Step-wise learning-rate schedule.

    Returns 0.001 for epochs 0-9, 0.0005 for epochs 10-19 and 0.0001 from
    epoch 20 onwards. The current rate ``lr`` is accepted but not used.
    """
    if epoch < 10:
        return 0.001
    if epoch < 20:
        return 0.0005
    return 0.0001

# Variant of the two-branch model with batch normalisation and dropout on both
# branches and on the merged features, trained with a step-wise learning rate.

# Image Branch
input_shape = X_train_images[0].shape
image_inputs = Input(shape=input_shape, name='image_input')
x_image = EfficientNetB7(input_shape=input_shape, include_top=False, weights=None)(image_inputs)
x_image = GlobalAveragePooling2D()(x_image)
x_image = BatchNormalization()(x_image)
x_image = Dropout(0.5)(x_image)  # Adjust dropout rate as needed

# Tabular Branch
tabular_inputs = Input(shape=(X_train_tabular.shape[1],), name='tabular_input')
x_tabular = Dense(128, activation='relu')(tabular_inputs)
x_tabular = BatchNormalization()(x_tabular)
x_tabular = Dropout(0.2)(x_tabular)  # Adjust dropout rate as needed

# Concatenate branches
merged = Concatenate()([x_image, x_tabular])
merged = BatchNormalization()(merged)
merged = Dropout(0.3)(merged)  # Adjust dropout rate as needed

# Output Layer: one softmax unit per class known to label_encoder.
outputs = Dense(len(label_encoder.classes_), activation='softmax')(merged)

# Create the model
model = Model(inputs=[image_inputs, tabular_inputs], outputs=outputs)

# Compile the model; the learning rate given here is only the starting value,
# the scheduler callback passed to fit() sets the rate for every epoch.
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Define learning rate scheduler
lr_scheduler = LearningRateScheduler(lr_schedule)

# Train the model
# NOTE(review): with epochs=20 the epoch index passed to lr_schedule stays
# below 20, so its final 0.0001 step is never reached — confirm the intended
# number of epochs.
history = model.fit(train_dataset,
                    epochs=20,
                    validation_data=val_dataset,
                    callbacks=[lr_scheduler])


In [None]:
# Accuracy/loss curves; epochs matches the 20 epochs of the preceding fit call.
plot_accur(history, epochs=20)