<a href="https://colab.research.google.com/github/arjunthillairajah/CSDS395/blob/main/CSDS_395.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install tensorflow pandas numpy




In [3]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


Data processing

In [4]:
import pandas as pd

metadata_path = "/content/drive/Shared drives/CSDS395_Senior_Project/archive/HAM10000_metadata.csv"
df = pd.read_csv(metadata_path)

df = df.loc[:, ~df.columns.duplicated()]
df["filename"] = df["image_id"] + ".jpg"

valid_labels = ['bkl', 'mel', 'vasc', 'nv', 'akiec']
df = df[df["dx"].isin(valid_labels)]
df["label"] = df["dx"]

df = df[["filename", "label"]]
df = df.dropna()
df.to_csv("/content/drive/Shared drives/CSDS395_Senior_Project/archive/processed_metadata.csv", index=False)


Training model

In [20]:
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB3
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import pandas as pd
import os

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

data_dir_1 = "/content/drive/Shared drives/CSDS395_Senior_Project/archive/HAM10000_images_part_1"
data_dir_2 = "/content/drive/Shared drives/CSDS395_Senior_Project/archive/HAM10000_images_part_2"
metadata_path = "/content/drive/Shared drives/CSDS395_Senior_Project/archive/processed_metadata.csv"

df = pd.read_csv(metadata_path)
df["filename"] = df["filename"].astype(str)

batch_size = 16
img_size = (300, 300)
epochs = 10
num_classes = 5

data_gen = ImageDataGenerator(rescale=1./255, validation_split=0.2)

classes = ['bkl', 'mel', 'vasc', 'nv', 'akiec']

train_data = data_gen.flow_from_dataframe(
    dataframe=df,
    directory=data_dir_1,
    x_col="filename",
    y_col="label",
    target_size=img_size,
    batch_size=batch_size,
    class_mode='categorical',
    classes=classes,
    subset='training'
)

val_data = data_gen.flow_from_dataframe(
    dataframe=df,
    directory=data_dir_2,
    x_col="filename",
    y_col="label",
    target_size=img_size,
    batch_size=batch_size,
    class_mode='categorical',
    classes=classes,
    subset='validation'
)

base_model = EfficientNetB3(weights='imagenet', include_top=False, input_shape=(300, 300, 3))
x = GlobalAveragePooling2D()(base_model.output)
x = Dropout(0.5)(x)
x = Dense(num_classes, activation='softmax')(x)
model = Model(inputs=base_model.input, outputs=x)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(train_data, validation_data=val_data, epochs=epochs)

model.save("/content/drive/Shared drives/CSDS395_Senior_Project/saved_model_5class.h5")


Mounted at /content/drive
Found 3743 validated image filenames belonging to 5 classes.




Found 941 validated image filenames belonging to 5 classes.


  self._warn_if_super_not_called()


Epoch 1/10
[1m234/234[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2733s[0m 11s/step - accuracy: 0.8792 - loss: 0.3731 - val_accuracy: 0.0000e+00 - val_loss: 8.1523
Epoch 2/10
[1m234/234[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2414s[0m 10s/step - accuracy: 0.9422 - loss: 0.1808 - val_accuracy: 0.0000e+00 - val_loss: 3.9033
Epoch 3/10
[1m234/234[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2437s[0m 10s/step - accuracy: 0.9585 - loss: 0.1379 - val_accuracy: 0.0000e+00 - val_loss: 4.7132
Epoch 4/10
[1m234/234[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2448s[0m 10s/step - accuracy: 0.9604 - loss: 0.1265 - val_accuracy: 0.0000e+00 - val_loss: 9.4843
Epoch 5/10
[1m234/234[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2450s[0m 10s/step - accuracy: 0.9648 - loss: 0.1201 - val_accuracy: 0.0000e+00 - val_loss: 1415.7062
Epoch 6/10
[1m234/234[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2425s[0m 10s/step - accuracy: 0.9716 - loss: 0.0967 - val_accuracy: 0.0000e+00 - 



In [5]:
label_counts = df["label"].value_counts()
print(label_counts)


label
nv       6705
mel      1113
bkl      1099
akiec     327
vasc      142
Name: count, dtype: int64
