In [9]:
# Imports
import keras
import tensorflow as tf
import numpy as np
from pathlib import Path
from utils import plot_history
from keras.preprocessing import text_dataset_from_directory

# Locations of the train / test folders on disk.
ds_dir = Path('data/C50/')
train_dir, test_dir = ds_dir / 'train', ds_dir / 'test'

seed = 1000
batch_size = 16

# Options shared by the two loaders that split `train_dir`. Both calls receive
# the same seed, shuffle flag and validation fraction; only `subset` differs.
split_options = dict(
    label_mode='int',
    seed=seed,
    shuffle=True,
    validation_split=0.2,
)

# No batch_size is passed to these two calls, so the loader's own default
# applies to them (unlike test_ds below, which is given batch_size explicitly).
train_ds = text_dataset_from_directory(train_dir, subset='training', **split_options)
val_ds = text_dataset_from_directory(train_dir, subset='validation', **split_options)

# The test folder is loaded whole (no validation split), with an explicit batch size.
test_ds = text_dataset_from_directory(
    test_dir,
    label_mode='int',
    seed=seed,
    shuffle=True,
    batch_size=batch_size,
)

# Class names as reported by the training dataset object.
class_names = train_ds.class_names


Found 2500 files belonging to 50 classes.
Using 2000 files for training.
Found 2500 files belonging to 50 classes.
Using 500 files for validation.
Found 2500 files belonging to 50 classes.


In [10]:
# Prepare and Configure the datasets
from utils import get_text, prepare_batched
from transformers import DistilBertTokenizerFast

AUTOTUNE = tf.data.AUTOTUNE

# NOTE: this rebinds `batch_size` (set to 16 in the loading cell); every later
# use of the name sees 2.
batch_size = 2

# Pretrained tokenizer handed to prepare_batched for each split.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Run every split through the same helper, in train -> val -> test order, and
# rebind the three dataset names to the results.
train_ds, val_ds, test_ds = [
    prepare_batched(split, tokenizer, batch_size=batch_size)
    for split in (train_ds, val_ds, test_ds)
]

In [None]:
# Fine-tuning the model
from transformers import TFAutoModelForSequenceClassification

# Reset global Keras state before building a fresh model in this cell.
keras.backend.clear_session()

# Size the classification head from the classes the loader reported rather
# than a hard-coded count, so the head always matches the integer labels.
model = TFAutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(class_names),
)

model.compile(
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias that
    # newer Keras releases reject.
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    # The loss is configured for raw logits and integer class ids.
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
)

history = model.fit(train_ds, validation_data=val_ds, epochs=20)

# Persist before plotting so a plotting error cannot cost the trained model.
# Keras cannot serialise a subclassed model (which the transformers TF classes
# are) to a single HDF5 file through `model.save(...)`, so only the weights are
# written. To restore: rebuild the model with `from_pretrained(...)` as above,
# then call `model.load_weights("DistilBERT_finetuned.h5")`.
model.save_weights("DistilBERT_finetuned.h5")

plot_history(history, 'sparse_categorical_accuracy')
