In [3]:
from cld_data_processing import download_data

In [4]:
import os
import shutil
import zipfile
from pathlib import Path
from typing import Callable

import keras
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, callbacks
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import image_dataset_from_directory

In [5]:
# Input pipeline: create_dataset resizes images to IMAGE_SIZE x IMAGE_SIZE and
# batches them in groups of BATCH_SIZE.
IMAGE_SIZE = 255
BATCH_SIZE = 196

# Training settings read by run_experiment_lr at call time.
EPOCHS = 50
ES = True          # add the EarlyStopping callback
ES_PATIENCE = 10   # patience passed to EarlyStopping
LR = False         # add the ReduceLROnPlateau callback; defined here so a fresh kernel always has it

In [6]:
def create_ds_tree(x, y, path, name):
    """Move images into a class-per-folder tree at ``path / name``.

    Every file name in ``x`` is moved from ``path / "train_images"`` to
    ``path / name / <label> / <file name>``, where the label is the matching
    entry of ``y``.

    The function is idempotent per file: an image that is already at its
    destination and no longer present in the source folder is skipped, so an
    interrupted run can simply be repeated to finish the split.

    Args:
        x: Iterable of image file names.
        y: Labels aligned with ``x``; must provide ``unique()`` (e.g. a pandas Series).
        path: ``pathlib.Path`` of the directory that contains ``train_images``.
        name: Name of the split directory to create under ``path``.

    Returns:
        The ``Path`` of the split directory (``path / name``).

    Raises:
        FileNotFoundError: if an image is neither in the source folder nor at
            its destination.
    """
    ds_path = path / name
    source_path = path / "train_images"

    # Always (re)create the folder skeleton; exist_ok makes this a no-op on re-runs.
    ds_path.mkdir(parents=True, exist_ok=True)
    for lab in y.unique():
        (ds_path / str(lab)).mkdir(exist_ok=True)

    for img, lab in zip(x, y):
        src = source_path / img
        dest = ds_path / str(lab) / img
        if dest.exists() and not src.exists():
            # Already moved by an earlier (possibly interrupted) run.
            continue
        shutil.move(src, dest)

    return ds_path

In [7]:
def create_dataset(path: Path):
    """Load the images below ``path`` as a batched Keras dataset.

    Labels are inferred from the directory structure and requested in
    'categorical' mode. Images are resized to IMAGE_SIZE x IMAGE_SIZE and
    grouped into batches of BATCH_SIZE; both constants are read from the
    notebook namespace when the function is called.
    """
    loader_options = {
        "labels": 'inferred',
        "label_mode": 'categorical',
        "image_size": [IMAGE_SIZE, IMAGE_SIZE],
        "batch_size": BATCH_SIZE,
    }
    return image_dataset_from_directory(path, **loader_options)
    

In [8]:
def data_preprocessing(path: Path):
    """Download, unpack and split the image data, then build the datasets.

    The archive returned by ``download_data()`` is extracted into
    ``path / "data"``. ``train.csv`` from the archive (columns ``image_id`` and
    ``label``) is split 80/20 into train+valid and test, and the first part is
    split again 75/25, which yields a 60/20/20 train/valid/test partition. Both
    splits use ``random_state=12``. The images are moved into one folder tree per
    split with ``create_ds_tree`` and loaded with ``create_dataset``.

    Args:
        path: Base directory (str or Path); all data is placed under ``path / "data"``.

    Returns:
        Tuple ``(ds_train, ds_valid, ds_test)`` as produced by ``create_dataset``.
    """
    file = download_data() # pass data_path

    data_path = Path(path) / "data"
    data_path.mkdir(parents=True, exist_ok=True)

    with zipfile.ZipFile(file, "r") as z:
        z.extractall(data_path)

    df_images = pd.read_csv(data_path / "train.csv")

    # Hold out 20% of all images for testing.
    X_train, X_test, y_train, y_test = train_test_split(
        df_images.image_id, df_images.label, test_size=0.2, random_state=12
    )

    # 25% of the remaining 80% -> validation is 20% of the full data set.
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_train, y_train, test_size=0.25, random_state=12
    )

    train_path = create_ds_tree(X_train, y_train, data_path, "train")
    valid_path = create_ds_tree(X_valid, y_valid, data_path, "valid")
    test_path = create_ds_tree(X_test, y_test, data_path, "test")

    ds_train = create_dataset(train_path)
    ds_valid = create_dataset(valid_path)
    ds_test = create_dataset(test_path)

    # clean-up
    # os.rmdir raises when the folder still holds files (images not listed in
    # train.csv, or a re-run where the split folders already existed), which
    # would throw away the datasets built above. Remove the tree best-effort.
    shutil.rmtree(data_path / "train_images", ignore_errors=True)
    os.remove(file)

    return ds_train, ds_valid, ds_test
    

In [9]:
# Use the current working directory as the base; data_preprocessing places
# everything under <cwd>/data.
cwd = os.getcwd()
# Downloads, unpacks and splits the images, then returns the three datasets.
ds_train, ds_valid, ds_test = data_preprocessing(cwd)

Downloading...
From (original): https://drive.google.com/uc?id=1TJBf1HZxAMpowZ92BcgS5N_NPHE7LPOT
From (redirected): https://drive.google.com/uc?id=1TJBf1HZxAMpowZ92BcgS5N_NPHE7LPOT&confirm=t&uuid=fbcf03f1-bfd0-4bb3-9598-6c0789ba23dc
To: /kaggle/working/data.zip
100%|██████████| 656M/656M [00:05<00:00, 126MB/s]  


Found 12837 files belonging to 5 classes.
Found 4280 files belonging to 5 classes.
Found 4280 files belonging to 5 classes.


In [10]:
# Presumably a progress marker showing that the preprocessing cell has finished.
print("done")

done


In [11]:
def use_pre_trained_model_convnext_tiny(num_classes: int = 5) -> Model:
    """Build a classifier on top of a frozen, ImageNet-pretrained ConvNeXt-Tiny.

    The backbone is created without its classification top and marked
    non-trainable, so only the head is fitted. Its feature maps are averaged by
    GlobalAveragePooling2D and fed through two Dense(1024, relu) + Dropout(0.5)
    stages into a softmax output layer.

    The input size is (IMAGE_SIZE, IMAGE_SIZE, 3), with IMAGE_SIZE read from the
    notebook namespace at call time.

    Args:
        num_classes: Number of units in the softmax output layer (default 5).

    Returns:
        An uncompiled Keras model.
    """
    base_model = tf.keras.applications.ConvNeXtTiny(weights='imagenet', include_top=False)
    base_model.trainable = False

    # Everything is built from the tf.keras namespace (as `layers` and
    # `callbacks` are) to avoid mixing objects from two different Keras packages.
    return tf.keras.Sequential([
        tf.keras.Input(shape=(IMAGE_SIZE, IMAGE_SIZE, 3)),

        base_model,
        # Output is already (batch, channels), so no Flatten layer is needed.
        layers.GlobalAveragePooling2D(),

        # Classifier Head
        layers.Dense(1024, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(1024, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(units=num_classes, activation="softmax"),
    ])

In [12]:
def run_experiment_lr(exp_id: str, sub_exp_id: int, model_fn: Callable[[], Model], ds_train_, ds_valid_):
    """Build, compile and fit a model, returning it with its per-epoch history.

    EPOCHS, ES, ES_PATIENCE and LR are read from the notebook namespace at call
    time and must be defined before this function is called.

    Args:
        exp_id: Experiment identifier stored in the ``exp_id`` history column.
        sub_exp_id: Run number stored in the ``sub_exp_id`` history column.
        model_fn: Zero-argument callable that returns an uncompiled Keras model.
        ds_train_: Training dataset passed to ``model.fit``.
        ds_valid_: Validation dataset passed to ``model.fit``.

    Returns:
        Tuple ``(model, df_hist)``: the fitted model and a DataFrame with one row
        per epoch holding the Keras history metrics plus the columns ``index``,
        ``exp_id``, ``sub_exp_id`` and ``epoch``.
    """
    model = model_fn()

    model.compile(
        # NOTE(review): epsilon=0.005 is far above Adam's documented default (1e-7);
        # confirm this was not meant to be learning_rate=0.005.
        optimizer=tf.keras.optimizers.Adam(epsilon=0.005),
        loss="categorical_crossentropy",
        metrics=['accuracy']
    )

    # Only build the callbacks that are switched on.
    cbs = []
    if ES:
        print("Using EarlyStopping")
        cbs.append(
            callbacks.EarlyStopping(
                min_delta=0.0001,
                patience=ES_PATIENCE,
                restore_best_weights=True,
            )
        )
    if LR:
        print("Using ReduceLROnPlateau")
        cbs.append(
            callbacks.ReduceLROnPlateau(
                monitor='val_loss',
                factor=0.3,
                patience=3,
                min_delta=0.001,
                mode='min',
                verbose=1,
            )
        )

    history = model.fit(
        ds_train_,
        validation_data=ds_valid_,
        epochs=EPOCHS,
        verbose=1,
        callbacks=cbs
    )

    df_hist = pd.DataFrame(history.history)
    # reset_index() keeps the old index as an "index" column, so the epoch
    # number appears both there and in "epoch". The duplicate is kept so the
    # column layout of previously written result files does not change.
    df_hist = df_hist.reset_index()
    df_hist["exp_id"] = exp_id
    df_hist["sub_exp_id"] = sub_exp_id
    df_hist["epoch"] = df_hist.index

    return model, df_hist

In [13]:
def add_results(df_all_results, df_new_results):
    """Append the rows of ``df_new_results`` to ``df_all_results``.

    When the accumulated table is empty, the new results are returned as they
    are; otherwise both frames are concatenated, each keeping its own index.
    """
    no_previous_results = df_all_results.empty
    return df_new_results if no_previous_results else pd.concat([df_all_results, df_new_results])

In [14]:
# Accumulator for the histories of all experiments. It must be an instance:
# binding the bare class only worked because `pd.DataFrame.empty` (a property
# object on the class) happens to be truthy inside add_results.
df_results = pd.DataFrame()

In [15]:
# Run settings read from the notebook namespace at call time. ds_train and
# ds_valid were already built, so changing IMAGE_SIZE or BATCH_SIZE here does
# not resize or re-batch them.
ES, LR = True, False
IMAGE_SIZE, BATCH_SIZE = 255, 196

# Fit the ConvNeXt-Tiny model and add its history to the results table.
m, df_hist = run_experiment_lr(
    exp_id="use_pre_trained_model_convnext_tiny",
    sub_exp_id=1,
    model_fn=use_pre_trained_model_convnext_tiny,
    ds_train_=ds_train,
    ds_valid_=ds_valid,
)
df_results = add_results(df_results, df_hist)

df_results.to_csv("results.csv")

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/convnext/convnext_tiny_notop.h5
[1m111650432/111650432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step
Using EarlyStopping
Epoch 1/50


I0000 00:00:1710711107.322863     122 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
W0000 00:00:1710711107.360929     122 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
W0000 00:00:1710711107.361360     122 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
W0000 00:00:1710711107.361752     122 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
W0000 00:00:1710711107.362101     122 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
W0000 00:00:1710711107.362485     122 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
W0000 00:00:1710711107.362869     122 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
W0000 00:00:1710711107.363268     122 graph_launch.cc:671] Fallback to op-by-op mode because m

[1m65/66[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 516ms/step - accuracy: 0.5545 - loss: 1.2718

W0000 00:00:1710711151.143604     119 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
W0000 00:00:1710711151.143950     119 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
W0000 00:00:1710711151.144663     119 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 670ms/step - accuracy: 0.5553 - loss: 1.2696

W0000 00:00:1710711156.673453     122 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
W0000 00:00:1710711156.673755     122 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
W0000 00:00:1710711156.674179     122 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
W0000 00:00:1710711156.674519     122 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
W0000 00:00:1710711156.674829     122 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
W0000 00:00:1710711156.675147     122 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
W0000 00:00:1710711156.675453     122 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
W0000 00:00:1710711156.675748     122 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 1s/step - accuracy: 0.5560 - loss: 1.2674 - val_accuracy: 0.6801 - val_loss: 0.8630
Epoch 2/50
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 705ms/step - accuracy: 0.6534 - loss: 0.9328 - val_accuracy: 0.7044 - val_loss: 0.7867
Epoch 3/50
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 703ms/step - accuracy: 0.6788 - loss: 0.8636 - val_accuracy: 0.7147 - val_loss: 0.7552
Epoch 4/50
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 704ms/step - accuracy: 0.6906 - loss: 0.8281 - val_accuracy: 0.7173 - val_loss: 0.7362
Epoch 5/50
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 705ms/step - accuracy: 0.7055 - loss: 0.7885 - val_accuracy: 0.7292 - val_loss: 0.7175
Epoch 6/50
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 702ms/step - accuracy: 0.7128 - loss: 0.7697 - val_accuracy: 0.7348 - val_loss: 0.7082
Epoch 7/50
[1m66/66[0m [32m━━━━━

In [18]:
# Score the fitted model on the test split. With the loss and metric set in
# run_experiment_lr, the returned list is [loss, accuracy].
m.evaluate(ds_test)

[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 516ms/step - accuracy: 0.7841 - loss: 0.6000


[0.5987370610237122, 0.7820093631744385]

In [16]:
#!rm -rf /kaggle/working/data