## Import Libraries 

In [24]:
import pandas as pd
import numpy as np
from tensorflow import keras
from keras import layers
import matplotlib.pyplot as plt 

## Clean + Preprocess Data

In [25]:
PATH = "../kaggle/data/"
# PATH = "/kaggle/input/isic-2024-challenge/"


In [26]:
df_0 = pd.read_csv(PATH + "train-metadata.csv")
cols_to_drop = ["isic_id", "patient_id", "mel_thick_mm", "iddx_full", "iddx_1", "iddx_2", "iddx_3", "iddx_4", "iddx_5", "attribution", "copyright_license", "tbp_lv_location", "tbp_lv_location_simple", "tbp_lv_dnn_lesion_confidence", "anatom_site_general", "image_type", "tbp_tile_type"]
df_0.drop(columns=cols_to_drop, inplace=True)
df_0["sex"] = df_0["sex"].replace({"male": 1, "female": 0})

# replace all NaNs with np.nan
df_0.replace("", np.nan, inplace=True)

# Drop any column with a string vallue 
for col in df_0.columns:
    if df_0[col].dtype == "object":
        df_0.drop(columns=[col], inplace=True)

  df_0 = pd.read_csv(PATH + "train-metadata.csv")


In [27]:
syn_mal_df = pd.read_csv("mal_samples.csv")
# Malignant Examples 
mal_df = df_0[df_0["target"] == 1]
mal_df = pd.concat([mal_df, syn_mal_df], ignore_index=True)

In [28]:
# How many entries in each table 
print("Malignant samples: ", mal_df.shape[0])
print("Benign samples: ", df_0.shape[0])

Malignant samples:  10393
Benign samples:  401059


In [29]:
df_1 = df_0.sample(n=25000)
df_1 = pd.concat([df_1, mal_df], ignore_index=True)

# clean up for training, split into train and test
train_frac = 0.8
train_size = int(train_frac * len(df_1))

train = df_1[:train_size]
x_train = df_1.drop(columns=["target"])
y_train = df_1["target"]

test = df_1[train_size:]
x_test = df_1.drop(columns=["target"])
y_test = df_1["target"]

## Define and Train Model

In [30]:
# Define the NN 
inputs = keras.Input(shape=(len(x_train.keys()),)) # One input for each feature in the dataset 

# Add two hidden layers with 64 units each and ReLU activation
x = layers.Dense(64, activation="relu")(inputs)
x = layers.Dense(32, activation="relu")(x)

# Output layer with sigmoid activation 
outputs = layers.Dense(1, activation="sigmoid")(x)

model = keras.Model(inputs=inputs, outputs=outputs, name="isic-v0")


In [31]:
model.compile(
    loss=keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer=keras.optimizers.RMSprop(),
    metrics=[keras.metrics.BinaryAccuracy()],
)

history = model.fit(x_train, y_train, batch_size=128, epochs=5, validation_split=0.2)

test_scores = model.evaluate(x_test, y_test, verbose=2)
print("Test loss:", test_scores[0])
print("Test accuracy:", test_scores[1])

Epoch 1/20


  output, from_logits = _get_logits(


Epoch 2/20

  output, from_logits = _get_logits(


Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
1107/1107 - 1s - loss: 0.7191 - binary_accuracy: 0.7058 - 795ms/epoch - 718us/step
Test loss: 0.7190524935722351
Test accuracy: 0.7057610154151917


In [32]:
test_df = pd.read_csv(PATH + "test-metadata.csv")
isic_ids = test_df["isic_id"]

try: 
    test_df.drop(columns=cols_to_drop, inplace=True)
except KeyError: 
    pass

test_df.replace("", np.nan, inplace=True)
test_df["sex"] = test_df["sex"].replace({"male": 1, "female": 0})
test_df.dropna(inplace=True)

# Drop any column with a string vallue 
for col in test_df.columns:
    if test_df[col].dtype == "object":
        test_df.drop(columns=[col], inplace=True)

In [33]:
print(len(test_df.keys()), len(x_train.keys()))

35 35


In [34]:
preds = model.predict(test_df)

df_sub = pd.DataFrame({"isic_id": isic_ids, "target": preds[:,0]})
df_sub.to_csv("kaggle/working/submission.csv", index=False)




In [35]:
df_sub

Unnamed: 0,isic_id,target
0,ISIC_0015657,0.117048
1,ISIC_0015729,0.117048
2,ISIC_0015740,0.117048
