In [None]:
%pip install gdown transformers datasets tensorflow scikit-learn tf-keras

## Dataset loading options

From Google Drive

In [None]:
# Google Drive id of the dataset archive and the local file name it is saved under.
file_id = "0Bz8a_Dbh9QhbZVhsUnRWRDhETzA"
output_name = 'amazon_review_full_csv.tar.gz'
# {file_id} and {output_name} are filled in from the Python variables above.
!gdown --fuzzy https://drive.google.com/uc?id={file_id} -O {output_name}

From local storage

In [None]:
# from google.colab import files

# uploaded = files.upload()
# # Assumes the CSV is the first uploaded file
# print(f"Select train dataset .csv from your local storage:. . .")
# train_data_filename = list(uploaded.keys())[0]

In [None]:
# uploaded = files.upload()
# # Assumes the CSV is the first uploaded file
# print(f"Select test dataset .csv from your local storage:. . .")
# test_data_filename = list(uploaded.keys())[0]

In [None]:
import tarfile

# Unpack the downloaded archive into ./Dataset.
# The archive comes from the network, so use the "data" extraction filter when
# this Python version provides it: it refuses absolute paths, ".." traversal
# and special files instead of writing them outside the target directory.
with tarfile.open(output_name, "r:gz") as tar:
    if hasattr(tarfile, "data_filter"):
        tar.extractall("Dataset", filter="data")
    else:
        # Interpreters without extraction filters: fall back to the plain call.
        tar.extractall("Dataset")

In [None]:
!ls -R Dataset

## Import Libraries

In [None]:
import pandas as pd
import json
import tensorflow as tf
import tf_keras as keras 
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, TFAutoModel

In [None]:
# Report the accelerator TensorFlow can see; use the label 'CPU' when no GPU is visible.
gpus = tf.config.list_physical_devices('GPU')
if not gpus:
    gpu_name = 'CPU'
else:
    first_gpu = gpus[0]
    details = tf.config.experimental.get_device_details(first_gpu)
    # Fall back to the device's own name if the details carry no 'device_name'.
    gpu_name = details.get('device_name', first_gpu.name)

print(gpu_name)

## Model settings

In [None]:
# Checkpoint name passed to the tokenizer and encoder `from_pretrained` calls.
MODEL_NAME    = "roberta-base"
BATCH_SIZE    = 32
EPOCHS        = 5
LEARNING_RATE = 2e-5
# Binary sentiment task: 0 = negative, 1 = positive.
NUM_LABELS    = 2
# Seed shared by sampling, splitting and shuffling.
RANDOM_STATE = 42
# Root output folder on the mounted Google Drive.
BASE_DIR_STORAGE = '/content/drive/MyDrive/HLT_models'

In [None]:
train_data_filename = "./Dataset/amazon_review_full_csv/train.csv"
test_data_filename = "./Dataset/amazon_review_full_csv/test.csv"

## Train dataset loading

In [None]:
print(f"Loading dataset from: {train_data_filename}")

# The file has no header row; the three columns are rating, review title and review body.
train_csv_options = dict(
    header=None,
    names=['label', 'title', 'text'],
    quotechar='"',
    doublequote=True,
    escapechar='\\',
    engine='python',
    encoding='utf-8',
    on_bad_lines='skip',  # malformed rows are dropped without raising
)
df_train = pd.read_csv(train_data_filename, **train_csv_options)

In [None]:
df_train.shape[0]

In [None]:
df_train.describe()

In [None]:
df_train.head()

In [None]:
df_train.isna().sum()

Drop rows that contain at least one null value

In [None]:
# Keep only the rows where label, title and text are all present.
df_train = df_train.dropna()
len(df_train)

In [None]:
df_train.isna().sum()

In [None]:
# Same layout as the train file: no header row, columns are rating, title and body.
test_csv_options = dict(
    header=None,
    names=['label', 'title', 'text'],
    quotechar='"',
    doublequote=True,
    escapechar='\\',
    engine='python',
    encoding='utf-8',
    on_bad_lines='skip',  # malformed rows are dropped without raising
)
df_test = pd.read_csv(test_data_filename, **test_csv_options)

In [None]:
df_test.describe()

In [None]:
df_test.shape[0]

In [None]:
df_test.isna().sum()

In [None]:
# Keep only the rows where label, title and text are all present.
df_test = df_test.dropna()

In [None]:
df_test.isna().sum()

In [None]:
df_test.shape[0]

Drop rows with rating '3' from both train and test sets

In [None]:
# Remove the 3-star reviews so that only ratings 1-2 and 4-5 remain for the binary mapping.
df_train = df_train.loc[df_train['label'].ne(3)]
df_test = df_test.loc[df_test['label'].ne(3)]

In [None]:
df_train.shape[0]

In [None]:
df_test.shape[0]

## Ratings binary mapping

In [None]:
def map_rating(row):
    """Map a review row to one of two sentiment classes.

    Reads ``row['label']`` and returns 0 (negative) when the rating is <= 2,
    otherwise 1 (positive).
    """
    return 0 if row['label'] <= 2 else 1

In [None]:
df_train_mapped = df_train.copy(deep=True)

# Vectorised equivalent of applying map_rating to every row: ratings <= 2
# become 0 (negative), everything else becomes 1 (positive). A row-wise
# DataFrame.apply(axis=1) makes one Python call per review and is far slower
# on a dataset of this size.
df_train_mapped['sentiment'] = (~df_train_mapped['label'].le(2)).astype(int)


 ### Downsample Balanced Train Set

In [None]:
from sklearn.model_selection import train_test_split

### Experiment with a subset of 200k samples

In [None]:
import os

SAMPLE_SIZE = 200_000

# Append the per-run sub-folder only once: without this guard every re-run of
# the cell would nest another "<model>_<size>" directory inside the previous one.
_run_subdir = f"{MODEL_NAME}_{SAMPLE_SIZE}"
if os.path.basename(os.path.normpath(BASE_DIR_STORAGE)) != _run_subdir:
    BASE_DIR_STORAGE = os.path.join(BASE_DIR_STORAGE, _run_subdir)

In [None]:
# Stratified down-sampling: keep SAMPLE_SIZE rows while preserving the share of
# each star rating; the rows on the other side of the split are discarded.
df_train_mapped = train_test_split(
    df_train_mapped,
    train_size=SAMPLE_SIZE,
    stratify=df_train_mapped['label'],
    random_state=RANDOM_STATE,
)[0]
print(f"Train size: {len(df_train_mapped)}")

Let's check whether classes are balanced

In [None]:
print(df_train_mapped['label'].value_counts())

In [None]:
# The star rating is no longer needed now that 'sentiment' has been derived from it.
df_train_mapped = df_train_mapped.drop(columns=["label"])
display(df_train_mapped.head())

In [None]:
# Model input = title and body joined by a single space.
title_part = df_train_mapped['title'].fillna('')
text_part = df_train_mapped['text'].fillna('')
df_train_mapped['review'] = title_part + ' ' + text_part

train_samples = df_train_mapped['review']
train_labels = df_train_mapped['sentiment'].to_numpy()

## NLTK libraries

In [None]:
import nltk
# Download the NLTK resource packages.
# NOTE(review): no later cell in this notebook appears to use nltk — confirm these downloads are still needed.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('popular')

In [None]:
# Normalise the text: newlines become spaces, then everything is lower-cased.
train_samples = (
    train_samples
    .str.replace('\n', ' ', regex=False)
    .str.lower()
)

## Setting Tokenizer

This approach uses the same tokenizer that was used during the pretraining of the model `MODEL_NAME`. This allows us to preserve the context and language semantics the model was trained with.

`AutoTokenizer` is able to infer automatically the model used.

For example, we could use `RobertaTokenizer` directly, but only if we are sure that we will use a RoBERTa model.

## Analyze tokens distribution to choose the best trade-off for MAX_LEN.
The idea is to use the 95th percentile to reduce padding and truncate only outliers.

In [None]:
import numpy as np

# How many reviews to tokenize for the length analysis:
# all of them below 10k rows, 5k below 100k rows, 10k otherwise.
N = len(train_samples)
if N < 10_000:
    n_samples = N
elif N < 100_000:
    n_samples = 5000
else:
    n_samples = 10_000

# A random subset of this size is enough for a stable estimate of the token-length distribution.
sample_texts = train_samples.sample(n=n_samples, random_state=RANDOM_STATE).tolist()

# Tokenize for analysis only (no padding).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
token_lens = [len(tokenizer.tokenize(review_text)) for review_text in sample_texts]


In [None]:
import matplotlib.pyplot as plt

# Histogram of the per-review token counts measured on the sample.
fig, ax = plt.subplots()
ax.hist(token_lens, bins=50)
ax.set_title("Distribution length per token")
ax.set_xlabel("Token per review")
ax.set_ylabel("Frequence")
plt.show()

In [None]:
# Summary statistics of the sampled token counts.
mean_token_len = np.mean(token_lens)
p95_token_len = np.percentile(token_lens, 95)
longest_token_len = np.max(token_lens)

print("Token length stats:")
print(f"Mean: {mean_token_len:.1f}")
print(f"95th percentile: {p95_token_len:.0f}")
print(f"Max: {longest_token_len}")

The 95th percentile of the tokenized length is about 200, meaning that 95% of the reviews are shorter than this threshold.

To balance memory efficiency and minimize information loss, we set `MAX_LEN = 205`:
- This truncates only the top 5% longest reviews (outliers).
- It reduces unnecessary padding for the remaining 95% of the data.
- It ensures consistent input size for the model without significant loss of content.

In [None]:
MAX_LEN = 205

In [None]:
train_samples.shape

### One-off tokenization

In [None]:
# Tokenize every review once; sequences are truncated to MAX_LEN and left
# unpadded (padding is applied later, per batch).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

texts = train_samples.tolist()

tokenize_options = {
    "max_length": MAX_LEN,
    "truncation": True,
    "padding": 'do_not_pad',
    "return_attention_mask": True,
}
encodings = tokenizer(texts, **tokenize_options)


In [None]:
from google.colab import drive
import os
drive.mount("/content/drive")

save_dir = "/content/drive/MyDrive/Tokenization_cache"
os.makedirs(save_dir, exist_ok=True)

# Sequences are unpadded and therefore of different lengths, hence object arrays.
# Build them once and reuse them for both copies of the cache.
cache_name = f"train_enc_roberta_{SAMPLE_SIZE}.npz"
cache_arrays = dict(
    ids=np.array(encodings["input_ids"], dtype=object),
    attn=np.array(encodings["attention_mask"], dtype=object),
    label=train_labels,
)

# Persistent copy on Google Drive, under the same file name as the session copy
# so it can be dropped in place of it in a later session.
np.savez_compressed(os.path.join(save_dir, cache_name), **cache_arrays)

# Session-local copy, read back by the loading cell.
np.savez_compressed(cache_name, **cache_arrays)

print("Tokenization completed and saved in", save_dir)

### Load saved indexes

In [None]:
# allow_pickle=True is needed because ids/attn were stored as object arrays.
# Only load cache files produced by this notebook: unpickling untrusted data can execute arbitrary code.
d = np.load(f"train_enc_roberta_{SAMPLE_SIZE}.npz", allow_pickle=True)

In [None]:
def generator():
    """Yield one ``(features, label)`` pair per cached example.

    ``features`` is a dict with ``input_ids`` and ``attention_mask`` as int32
    arrays; ``label`` is an int32 scalar. The data is read from the loaded
    archive ``d``.
    """
    examples = zip(d["ids"], d["attn"], d["label"])
    for token_ids, mask, label in examples:
        features = {
            "input_ids": np.array(token_ids, dtype=np.int32),
            "attention_mask": np.array(mask, dtype=np.int32),
        }
        yield features, np.int32(label)

In [None]:
# Structure of what generator() yields: two variable-length int32 vectors
# keyed by name, plus a scalar int32 label.
feature_spec = {
    "input_ids":      tf.TensorSpec(shape=(None,), dtype=tf.int32),
    "attention_mask": tf.TensorSpec(shape=(None,), dtype=tf.int32),
}
label_spec = tf.TensorSpec(shape=(), dtype=tf.int32)

raw_ds = tf.data.Dataset.from_generator(
    generator,
    output_signature=(feature_spec, label_spec),
)

In [None]:
N = len(d["label"])

### Custom encoder to enable dynamic padding. It inserts padding till the max length of the current batch.

The code below is only needed for on-the-fly encoding.

In [None]:
# from transformers import AutoTokenizer
# tokenizer = AutoTokenizer.from_pretrained('roberta-base')

# # pure Python function: returns ONLY tensors, no dict
# def _py_encode(text, label):
#     enc = tokenizer(
#         text.numpy().decode(),
#         truncation=True,
#         max_length=MAX_LEN,
#         return_attention_mask=True,
#     )
#     return (                # flat tuple of three tensors
#         tf.constant(enc["input_ids"],      dtype=tf.int32),
#         tf.constant(enc["attention_mask"], dtype=tf.int32),
#         tf.cast(label, tf.int32),
#     )

# # tf wrapper → remaps the tuple into a dict for Keras
# def tf_encode(text, label):
#     input_ids, attn_mask, lab = tf.py_function(
#         _py_encode,
#         inp=[text, label],
#         Tout=(tf.int32, tf.int32, tf.int32)   # declares to TensorFlow the dtype of each tensor the Python function will return.
#     )
#     # Set the dynamic shapes, otherwise they are <unknown>
#     input_ids.set_shape([None])
#     attn_mask.set_shape([None])
#     lab.set_shape([])

#     return {'input_ids': input_ids,
#             'attention_mask': attn_mask}, lab



In [None]:
# tensor_dataset = (
#     raw_ds
#     .map(tf_encode, num_parallel_calls=tf.data.AUTOTUNE)
#     .padded_batch(
#         BATCH_SIZE,
#         padded_shapes=({'input_ids':[None], 'attention_mask':[None]}, []),
#         padding_values=({'input_ids': tokenizer.pad_token_id,
#                          'attention_mask': 0}, 0),
#     )
#     .prefetch(tf.data.AUTOTUNE)                       
# )

In [None]:
# Shuffle the whole cached set, then batch with per-batch (dynamic) padding.
pad_shapes = ({"input_ids": [None], "attention_mask": [None]}, [])
pad_values = ({"input_ids": tokenizer.pad_token_id, "attention_mask": 0}, 0)

tensor_dataset = (
    raw_ds
    .shuffle(N, seed=RANDOM_STATE)
    .padded_batch(BATCH_SIZE, padded_shapes=pad_shapes, padding_values=pad_values)
    .prefetch(tf.data.AUTOTUNE)
)

Split dataset

In [None]:
# 30% internal test, 20% validation, and the remainder (about 50%) for training.
test_size, val_size = int(0.3 * N), int(0.2 * N)
train_size = N - (test_size + val_size)

In [None]:
print("Train size:", train_size)
print("Val   size:", val_size)
print("Test  size:", test_size)

In [None]:
# One fixed permutation of the data: reshuffle_each_iteration=False keeps the
# order the same on every pass, which the take/skip slices below rely on.
shuffled = raw_ds.shuffle(
    buffer_size=N,
    seed=RANDOM_STATE,
    reshuffle_each_iteration=False,
)

# Layout of the permutation: [ test | validation | train ]
test_raw = shuffled.take(test_size)
remainder = shuffled.skip(test_size)
val_raw, train_raw = remainder.take(val_size), remainder.skip(val_size)

The following function wraps two key steps into one reusable pipeline stage:
1. padded_batch(...)
	- Groups elements into batches of size BATCH_SIZE
	- Dynamically pads each batch’s input_ids and attention_mask to the length of the longest sequence in that batch
2.	.prefetch(tf.data.AUTOTUNE)
	- Overlaps data preparation and model execution to keep the GPU fed

In short: *prepare(ds)* turns a raw dataset of variable‐length examples into an efficient, dynamically‐padded, batched dataset ready for training.

Dynamic padding means that, instead of padding every sequence up to a fixed global MAX_LEN, you pad each batch only up to the length of its longest example.

In [None]:
def prepare(ds, *, shuffle_buffer=None, do_repeat=False):
    """Turn a dataset of variable-length examples into padded, prefetched batches.

    Args:
        ds: dataset yielding ``({"input_ids", "attention_mask"}, label)`` elements.
        shuffle_buffer: buffer size used for shuffling; a falsy value skips shuffling.
        do_repeat: when True, the batched dataset is repeated indefinitely.

    Returns:
        The dataset after optional shuffle, ``padded_batch`` with ``BATCH_SIZE``
        (dynamic padding), optional repeat, and prefetch.
    """
    # Training only: shuffle the individual examples before batching.
    if shuffle_buffer:
        ds = ds.shuffle(buffer_size=shuffle_buffer, seed=RANDOM_STATE)

    # Dynamic padding: input_ids are padded with the tokenizer's pad id,
    # attention_mask with 0.
    pad_shapes = ({"input_ids": [None], "attention_mask": [None]}, [])
    pad_values = ({"input_ids": tokenizer.pad_token_id, "attention_mask": 0}, 0)
    batched = ds.padded_batch(
        BATCH_SIZE,
        padded_shapes=pad_shapes,
        padding_values=pad_values,
    )

    # Without repeat() the batches are emitted only once, so a fit() driven by
    # steps_per_epoch over several epochs would run out of data.
    if do_repeat:
        batched = batched.repeat()

    # Prefetch to overlap input preparation with model execution.
    return batched.prefetch(tf.data.AUTOTUNE)

In [None]:
# Training: shuffled and repeated indefinitely.
train_dataset = prepare(train_raw, shuffle_buffer=train_size, do_repeat=True)

# Validation and internal test: a single pass, no shuffling, no repetition.
val_dataset = prepare(val_raw, shuffle_buffer=None, do_repeat=False)
test_dataset = prepare(test_raw, shuffle_buffer=None, do_repeat=False)

### Checking for Overlap Between Train, Validation, and Test Splits

In [None]:
def sample_hashes(ds, n=1000):
    """Return the set of hashes of ``input_ids`` for the first ``n`` examples of ``ds``.

    Used as a spot check on a sample of 1000 examples per split: an empty
    intersection between two splits' hash sets means the inspected examples
    do not overlap.
    """
    return {
        hash(features["input_ids"].numpy().tobytes())
        for features, _label in ds.take(n)
    }

# Hash a sample of each split, then report how many hashes every pair shares.
h_tr, h_va, h_te = (sample_hashes(split) for split in (train_raw, val_raw, test_raw))

overlap_report = (
    ("Intersections between Train and Val  :", h_tr & h_va),
    ("Intersection between Train and Test :", h_tr & h_te),
    ("Intersections between Val and Test :", h_va & h_te),
)
for message, shared in overlap_report:
    print(message, len(shared))

In [None]:
import math

# Batches needed to cover each split once, counting a final partial batch
# (integer ceiling division).
steps_per_epoch = -(-train_size // BATCH_SIZE)
validation_steps = -(-val_size // BATCH_SIZE)

## Choose whether retrain encoder or not

In [None]:
ENCODER_TRAINABLE = True

In [None]:
# `keras` is the tf_keras alias imported at the top of the notebook. Going
# through the alias sets the policy on the same Keras implementation that builds
# and trains the model. `from keras import mixed_precision` would instead import
# the installed `keras` package, which may be a different implementation
# (e.g. Keras 3), so the policy would not reach the tf_keras layers.
mixed_precision = keras.mixed_precision
mixed_precision.set_global_policy('mixed_float16')

In [None]:
from transformers import TFRobertaModel

# Load the pretrained encoder and honour the ENCODER_TRAINABLE switch, so that
# setting it to False actually freezes the encoder weights.
encoder = TFRobertaModel.from_pretrained(MODEL_NAME)
encoder.trainable = ENCODER_TRAINABLE


In [None]:
# Symbolic int32 inputs with a free sequence axis, so each batch can carry its own padded length.
input_ids, attention_mask = (
    keras.Input(shape=(None,), dtype="int32", name=input_name)
    for input_name in ("input_ids", "attention_mask")
)

Link to pretrained encoders info: https://huggingface.co/transformers/v2.4.0/pretrained_models.html

In [None]:
# Executed for each batch: RoBERTa converts input_ids into embedding vectors of
# shape (batch_size, seq_len, hidden_size), where hidden_size = 768 for roberta-base.
# `training=True` must not be hard-coded here: a flag passed explicitly while
# building the functional graph is baked into the model, which keeps the
# encoder's dropout active during validation, evaluate() and predict().
# Leaving it out lets Keras supply the training flag of the current call.
encoder_outputs = encoder({'input_ids': input_ids, 'attention_mask': attention_mask})
pooled_output = encoder_outputs.pooler_output

# last hidden state is a tensor (batch_size, seq_len, hidden_size) containing the contextual representation of each token.
# cls is used to represent the entire sentence
# hidden_states = encoder_outputs.last_hidden_state
# cls_token = hidden_states[:, 0, :]

### Building Classification Head

In [None]:
from tf_keras.layers import LayerNormalization

# Classification head on top of the encoder's pooled output.
x = keras.layers.Dense(256, activation='relu', name='dense_relu')(pooled_output)
x = keras.layers.Dropout(0.2)(x)
x = keras.layers.Dense(64, activation='relu')(x)

# Single-unit output producing a raw logit (no activation).
# The layer is forced to float32 so the model output stays numerically stable
# when the global policy is mixed_float16; under a float32 policy this is a no-op.
logits = keras.layers.Dense(1, name='logits', dtype='float32')(x)


### Entire model

In [None]:
model = keras.Model(inputs=[input_ids, attention_mask], outputs=logits)
model.summary()

Check which layers are trainable (the encoder's flag should reflect the `ENCODER_TRAINABLE` setting)

In [None]:
# List every top-level layer of the model together with its trainable flag.
for layer in model.layers:
    print(f"{layer.name:25s}  trainable={layer.trainable}")

In [None]:
# AdamW from the Keras optimizers module, with gradient-norm clipping at 1.0.
base_opt = keras.optimizers.AdamW(
    learning_rate = LEARNING_RATE,
    weight_decay  = 1e-2,
    epsilon       = 1e-8,
    clipnorm      = 1.0,
)

# Loss scaling for float16 training.
optimizer = keras.mixed_precision.LossScaleOptimizer(base_opt)

# The model outputs raw logits, so the loss applies the sigmoid itself.
loss = keras.losses.BinaryCrossentropy(from_logits=True)

# BinaryAccuracy thresholds the raw model output. Because that output is a
# logit, the decision boundary p = 0.5 corresponds to logit = 0.0; the default
# threshold of 0.5 would count predictions with 0 < logit <= 0.5 as negative and
# disagree with the sigmoid >= 0.5 rule used on the internal test set.
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[keras.metrics.BinaryAccuracy(name="accuracy", threshold=0.0)],
)

In [None]:
import time
start_time = time.time()

In [None]:
from tf_keras.callbacks import EarlyStopping

# Stop when val_loss has not improved for 2 epochs and roll back to the best weights seen.
early_stop_options = {
    "monitor": 'val_loss',
    "patience": 2,
    "restore_best_weights": True,
    "verbose": 1,
}
early_stop = EarlyStopping(**early_stop_options)

In [None]:
# The step counts passed here define how many batches make up one training
# epoch and one validation pass.
fit_options = dict(
    validation_data=val_dataset,
    epochs=EPOCHS,
    callbacks=[early_stop],
    validation_steps=validation_steps,
    steps_per_epoch=steps_per_epoch,
)
history = model.fit(train_dataset, **fit_options)

In [None]:
# Wall-clock time since the timer cell was run.
end_time = time.time()
elapsed = end_time - start_time
elapsed_minutes = elapsed / 60

print(f"Total training time: {elapsed:.1f} s ({elapsed_minutes:.2f} min)")

### Training History plot

In [None]:
import matplotlib.pyplot as plt

# Pull the per-epoch curves out of the training history.
train_loss = history.history['loss']
val_loss   = history.history['val_loss']
epochs     = range(1, len(train_loss) + 1)

train_acc = history.history['accuracy']
val_acc   = history.history['val_accuracy']

# One figure per metric: (train curve, validation curve, metric name).
history_curves = (
    (train_loss, val_loss, 'Loss'),
    (train_acc, val_acc, 'Accuracy'),
)
for train_curve, val_curve, metric_name in history_curves:
    plt.figure(figsize=(8, 5))
    plt.plot(epochs, train_curve, marker='o', label=f'Train {metric_name}')
    plt.plot(epochs, val_curve,   marker='o', label=f'Validation {metric_name}')
    plt.title(f'Training vs Validation {metric_name}')
    plt.xlabel('Epoch')
    plt.ylabel(metric_name)
    plt.xticks(epochs)  # show every epoch on the x axis
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.show()

## Model evaluation with Internal test set

In [None]:
# Ground-truth labels, gathered batch by batch in the order the dataset yields them.
y_true = np.concatenate([batch_labels for _features, batch_labels in test_dataset], axis=0)

# Logits -> probabilities -> hard 0/1 predictions at the 0.5 probability threshold.
pred_logits = model.predict(test_dataset)
pred_probs = tf.sigmoid(pred_logits).numpy().ravel()
pred_labels = (pred_probs >= 0.5).astype(int)

In [None]:
from sklearn.metrics import accuracy_score

# Share of internal-test examples whose thresholded prediction matches the true label.
test_acc = accuracy_score(y_true, pred_labels)
print(f"Test accuracy: {test_acc:.4f}")

### Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# Confusion matrix of true vs. predicted labels on the internal test set.
conf_matrix = confusion_matrix(y_true, pred_labels)

# Draw the matrix on an explicit axes so the title can be set on it.
fig, ax = plt.subplots()
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix)
disp.plot(ax=ax)
ax.set_title("Confusion Matrix on Internal Test Set")
plt.show()

### Save model

In [None]:
import time, json, os
from google.colab import drive

drive.mount('/content/drive')

os.makedirs(BASE_DIR_STORAGE, exist_ok=True)

# The model file and the tokenizer files go into the same run directory.
file = os.path.join(BASE_DIR_STORAGE, f'roberta_{SAMPLE_SIZE}.keras')
model.save(file)
tokenizer.save_pretrained(BASE_DIR_STORAGE)

# Run description stored next to the model: configuration, timings, training
# curves and the accuracy on the internal test set.
metadata = dict(
    timestamp=time.strftime("%Y-%m-%d %H:%M:%S"),
    total_training_time_s=round(elapsed, 1),
    num_samples=SAMPLE_SIZE,
    train_size=train_size,
    val_size=val_size,
    internal_test_size=test_size,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    encoder_trainable=ENCODER_TRAINABLE,
    history=history.history,
    gpu=gpu_name,
    internal_test_accuracy=test_acc,
)

metadata_path = os.path.join(BASE_DIR_STORAGE, f'training_metadata_roberta_{SAMPLE_SIZE}.json')
with open(metadata_path, 'w') as f:
    json.dump(metadata, f, indent=2)

print(f"Saved model + tokenizer + metadata in {BASE_DIR_STORAGE}")