<a href="https://www.kaggle.com/code/amarininsfran/notebook5fc3fc7d4b?scriptVersionId=154455688" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.losses import SparseCategoricalCrossentropy
import tensorflow as tf
from tensorflow.keras.regularizers import l2
import warnings
import cv2
import seaborn as sns
import glob
from skimage import io
import PIL.Image
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")

In [None]:
# Root of the competition data; every other location is derived from it so
# the paths cannot drift apart.
main_fol = "/kaggle/input/UBC-OCEAN/"

# Metadata tables (each CSV is read exactly once).
train = pd.read_csv(os.path.join(main_fol, "train.csv"))
test = pd.read_csv(os.path.join(main_fol, "test.csv"))

# Lists of the PNG files currently present on disk.
train_img = glob.glob(os.path.join(main_fol, "train_images", "*.png"))
train_img_thumb = glob.glob(os.path.join(main_fol, "train_thumbnails", "*.png"))

# Directory prefixes; the trailing "" keeps the trailing slash so plain
# string concatenation with a file name keeps working.
train_images_dir = os.path.join(main_fol, 'train_images', '')
thumbnail_images_dir = os.path.join(main_fol, 'train_thumbnails', '')
test_images_dir = os.path.join(main_fol, 'test_images', '')
# Previously pointed at the *train* thumbnails folder.
test_thumbnail_dir = os.path.join(main_fol, 'test_thumbnails', '')


In [None]:
# Setting MAX_IMAGE_PIXELS to None switches off Pillow's image-size limit
# check (presumably needed because the slide images here are very large —
# confirm against the dataset).
PIL.Image.MAX_IMAGE_PIXELS = None



In [None]:
# Split the training table by the `is_tma` flag. Explicit copies are taken so
# the new column is written to independent frames rather than to slices of
# `train` (which triggers pandas' chained-assignment warning).
train_tma = train[train["is_tma"].eq(True)].copy()
train_no_tma = train[train["is_tma"].eq(False)].copy()

# File name for each row: TMA rows use "<image_id>.png", all other rows use
# "<image_id>_thumbnail.png".
train_tma['img_id_ext'] = train_tma['image_id'].astype(str) + ".png"
train_no_tma['img_id_ext'] = train_no_tma['image_id'].astype(str) + "_thumbnail.png"

test['img_id_ext'] = test['image_id'].astype(str) + "_thumbnail.png"

# Recombine both parts and restore the original row order.
train_df = pd.concat([train_tma, train_no_tma]).sort_index(ascending=True)
print(train_df.shape)
   

### **Explorar el dataset**

In [None]:
# Bar chart with the number of training rows per label.
fig, ax = plt.subplots(figsize=(10, 8))
sns.countplot(data=train_df, x="label", ax=ax)
ax.set_title("Número de casos", fontsize=12)
plt.show()

In [None]:
# Preview the TMA images of every label, one figure (a single row of
# subplots) per label.
MAX_TMA_PER_ROW = 5  # number of columns in the 1 x N subplot grid

for label in ['HGSC', 'CC', 'EC', 'LGSC', 'MC']:
    df_tmp = train_df[train_df['label'] == label]
    # Cap the list at the grid width: plt.subplot(1, 5, k) raises for k > 5.
    image_id_list = list(df_tmp[df_tmp['is_tma']]['image_id'])[:MAX_TMA_PER_ROW]
    if not image_id_list:
        # Nothing to draw for this label; avoid leaving an empty figure.
        continue
    plt.figure(figsize=(20.0, 6.0))

    for i, image_id in enumerate(image_id_list):
        plt.subplot(1, MAX_TMA_PER_ROW, i + 1)
        plt.title(f'image_id:{image_id} (TMA)', fontsize=14)
        if i == 0:
            # Only the left-most panel carries the label name.
            plt.ylabel(label, fontsize=14)
        # Read with scikit-image, draw with matplotlib (skimage.io.imshow is
        # deprecated in recent scikit-image releases).
        plt.imshow(io.imread(os.path.join(train_images_dir, f'{image_id}.png')))
        plt.tick_params(labelbottom=False, labelleft=False, labelright=False, labeltop=False,
                        bottom=False, left=False, right=False, top=False)
    plt.show()


In [None]:
def _resolve_image_path(row):
    """Return the file path for one row of the training table.

    Rows flagged ``is_tma`` are read from ``train_images_dir``; every other
    row is read from ``thumbnail_images_dir``. The file name comes from the
    row's ``img_id_ext`` value.
    """
    base_dir = train_images_dir if row['is_tma'] else thumbnail_images_dir
    return os.path.join(base_dir, row['img_id_ext'])


# Stratified 80/20 split so both parts keep the label proportions.
train_data, valid_data = train_test_split(
    train_df, test_size=0.2, stratify=train_df["label"], random_state=42
)
# Take explicit copies before adding columns so the assignments below never
# target a slice of train_df.
train_data = train_data.copy()
valid_data = valid_data.copy()
for split_df in (train_data, valid_data):
    split_df['label'] = split_df['label'].astype(str)
    split_df['full_path'] = split_df.apply(_resolve_image_path, axis=1)

test.sort_index(ascending=True, inplace=True)
# Column-wise map instead of a row-wise apply: it also behaves correctly when
# the test table is empty.
test['full_path'] = test['image_id'].map(
    lambda image_id: os.path.join(test_images_dir, f"{image_id}.png")
)

IMAGE_SIZE = (1000, 1000)  # target size handed to the image generators
BATCH_SIZE = 8
EPOCHS = 15


In [None]:
# Training images are rescaled to [0, 1] and randomly augmented
# (rotation, shifts, shear, zoom, horizontal flip).
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=10,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)
# Validation and test images are only rescaled.
val_datagen = ImageDataGenerator(
    rescale=1./255,
)

# Fix the class order once and hand it to both labelled generators so their
# label -> index mappings are guaranteed to be identical, even if one split
# happens to miss a class.
class_names = sorted(train_data['label'].unique())

# `subset='training'` was dropped: train_datagen defines no validation_split,
# so the argument selected every row anyway.
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_data,
    x_col='full_path',
    y_col='label',
    classes=class_names,
    batch_size=BATCH_SIZE,
    seed=42,
    shuffle=True,
    class_mode='categorical',
    validate_filenames=True,
    target_size=IMAGE_SIZE
)

valid_generator = val_datagen.flow_from_dataframe(
    dataframe=valid_data,
    x_col='full_path',
    y_col='label',
    classes=class_names,
    batch_size=BATCH_SIZE,
    seed=42,
    shuffle=False,
    class_mode='categorical',
    target_size=IMAGE_SIZE
)

test_generator = val_datagen.flow_from_dataframe(
    dataframe=test,
    x_col='full_path',
    y_col=None,  # the test set has no labels
    batch_size=BATCH_SIZE,
    seed=42,
    shuffle=False,  # keep the test rows in their original order
    class_mode=None,  # yield images only
    target_size=IMAGE_SIZE
)

In [None]:
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint

num_classes = train_df['label'].nunique()

# Halve the learning rate when validation accuracy stops improving for one
# epoch, and keep the weights of the best epoch on disk.
lr_reduce = ReduceLROnPlateau(monitor="val_accuracy", factor=0.5, min_delta=0.0001, patience=1, verbose=1)
filepath = "weights.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor="val_accuracy", verbose=1, save_best_only=True, mode="max")

# The backbone input shape must match the size the generators produce;
# it was previously hard-coded to 224x224 while the generators emit
# IMAGE_SIZE images.
base_model = tf.keras.applications.ResNet50(
    weights='imagenet',
    include_top=False,
    input_shape=tuple(IMAGE_SIZE) + (3,)
)
model = tf.keras.Sequential([
    base_model,
    tf.keras.layers.GlobalAveragePooling2D(),
    tf.keras.layers.Dropout(0.7),  # dropout before the classifier head
    tf.keras.layers.Dense(num_classes, kernel_regularizer=l2(0.01), activation='softmax')  # L2-regularised head
])
# Compile the model
model.compile(optimizer=Adam(learning_rate=3e-4), loss='categorical_crossentropy', metrics=['accuracy'])
# Train the model
history = model.fit(
    train_generator,
    epochs=EPOCHS,  # adjust as needed
    validation_data=valid_generator,
    callbacks=[lr_reduce, checkpoint]
)
model.save('ovarian_cancer_classifier.h5')
# Evaluate the final-epoch weights on the validation split and report them.
validation_loss, validation_accuracy = model.evaluate(valid_generator)
print(f"validation loss: {validation_loss:.4f} - validation accuracy: {validation_accuracy:.4f}")



In [None]:
# Predict on the training split through a dedicated generator.
# train_generator shuffles and augments its batches, so its output order does
# not follow the row order of train_data; this one keeps the rows in order
# and only rescales the images.
train_eval_generator = val_datagen.flow_from_dataframe(
    dataframe=train_data,
    x_col='full_path',
    y_col=None,
    batch_size=BATCH_SIZE,
    shuffle=False,
    class_mode=None,
    target_size=IMAGE_SIZE
)
train_predictions = model.predict(train_eval_generator)

# Turn the per-class scores into class indices.
train_predictions_label = np.argmax(train_predictions, axis=1).tolist()

# Table pairing each training image with its predicted class index.
submission_df = pd.DataFrame({
    'image_id': train_data['image_id'],
    'label': train_predictions_label
})
submission_df

In [182]:

test_prediction = model.predict(test_generator)

# Invert the generator's label -> index mapping to recover label names.
label_map = train_generator.class_indices
index_map = {index: name for name, index in label_map.items()}

# Turn the per-class scores into label names. A dedicated variable is used so
# the train-set predictions are not overwritten.
test_predictions_label = [index_map[i] for i in np.argmax(test_prediction, axis=1)]

# Build the submission table and write it to disk.
submission_df = pd.DataFrame({
    'image_id': test['image_id'],
    'label': test_predictions_label
})
submission_df.to_csv('submission.csv', index=False)

