In [None]:
pip install tensorflow transformers

In [None]:
import pandas as pd

# Read the cleaned review/summary table from the Kaggle input directory.
data = pd.read_csv(
    "/kaggle/input/cleaned-amazon-reviews/your_file.csv"
)

In [None]:
# Build the text-to-text columns: the source text gets the "summarize: " task
# prefix, the target is the reference summary.
# Nulls are replaced with '' first: "summarize: " + NaN would otherwise stay NaN
# and end up as the literal text "nan" once the column is cast to str.
data['t5_input'] = "summarize: " + data['cleaned_text'].fillna('').astype(str)
data['t5_output'] = data['cleaned_summary'].fillna('').astype(str)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the pairs for validation. A fixed random_state keeps the split
# identical across kernel restarts so validation losses are comparable between runs.
x_train, x_val, y_train, y_val = train_test_split(
    data['t5_input'],
    data['t5_output'],
    test_size=0.1,
    random_state=42,
)

In [None]:
from transformers import T5Tokenizer
import tensorflow as tf

tokenizer = T5Tokenizer.from_pretrained('t5-small')
max_length = 512  # Maximum number of source tokens; longer inputs are truncated
max_target_length = 150  # Maximum number of target tokens; longer summaries are truncated

# Replace nulls BEFORE casting to str: astype(str) turns NaN into the literal
# text "nan", after which fillna('') has nothing left to fill.
x_train = x_train.fillna('').astype(str)
y_train = y_train.fillna('').astype(str)

x_val = x_val.fillna('').astype(str)
y_val = y_val.fillna('').astype(str)

def tokenize(inputs, targets, tokenizer, max_length, max_target_length):
    """Encode parallel source/target strings into fixed-length training arrays.

    Args:
        inputs: list of source strings.
        targets: list of target strings, parallel to ``inputs``.
        tokenizer: tokenizer applied to both sides; it is called directly, via
            ``encode``, and must expose ``pad_token_id``.
        max_length: length the source encodings are padded/truncated to.
        max_target_length: length the decoder inputs and labels are
            padded/truncated to.

    Returns:
        A tuple ``(model_inputs, decoder_input_ids, labels)``:
        ``model_inputs`` is the tokenizer output for ``inputs``;
        ``decoder_input_ids`` is the array produced by ``pad_sequences`` where
        every row starts with the pad token followed by the target tokens;
        ``labels`` holds the target token ids with padding replaced by -100.
    """
    pad_token = tokenizer.pad_token_id

    # Tokenizing the inputs (source text)
    model_inputs = tokenizer(
        inputs,
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors="tf",
    )

    # Decoder inputs: the target tokens (without special tokens) prefixed by the
    # pad token, which T5 uses as its decoder start token.
    decoder_input_ids = [
        [pad_token] + tokenizer.encode(target, add_special_tokens=False)
        for target in targets
    ]
    # Fill with the tokenizer's pad id explicitly instead of relying on the
    # pad_sequences default of 0.
    decoder_input_ids = tf.keras.preprocessing.sequence.pad_sequences(
        decoder_input_ids,
        maxlen=max_target_length,
        padding="post",
        truncating="post",
        value=pad_token,
    )

    # Labels (target text); padding positions are marked with -100.
    labels = tokenizer(
        targets,
        max_length=max_target_length,
        truncation=True,
        padding="max_length",
        return_tensors="tf",
    ).input_ids
    labels = tf.where(labels == pad_token, -100, labels)

    return model_inputs, decoder_input_ids, labels

# Encode the training split.
x_train_tokenized, x_train_decoder_input_ids, y_train_tokenized = tokenize(
    inputs=list(x_train),
    targets=list(y_train),
    tokenizer=tokenizer,
    max_length=max_length,
    max_target_length=max_target_length,
)
# Encode the validation split with the same settings.
x_val_tokenized, x_val_decoder_input_ids, y_val_tokenized = tokenize(
    inputs=list(x_val),
    targets=list(y_val),
    tokenizer=tokenizer,
    max_length=max_length,
    max_target_length=max_target_length,
)

In [None]:
from transformers import TFT5ForConditionalGeneration

# Load the pretrained 't5-small' checkpoint into TFT5ForConditionalGeneration.
model = TFT5ForConditionalGeneration.from_pretrained(
    't5-small'
)

In [None]:
# Shapes of the encoder inputs, decoder inputs and labels.
print("x_train_tokenized['input_ids'].shape:", x_train_tokenized['input_ids'].shape)
print("x_train_decoder_input_ids.shape:", x_train_decoder_input_ids.shape)
print("y_train_tokenized.shape:", y_train_tokenized.shape)

# Dtypes of the same three arrays.
print("Data type of x_train_tokenized['input_ids']:", x_train_tokenized['input_ids'].dtype)
print("Data type of x_train_decoder_input_ids:", x_train_decoder_input_ids.dtype)
print("Data type of y_train_tokenized:", y_train_tokenized.dtype)

In [None]:
import numpy as np

# Function to check for invalid values in arrays
def check_invalid_values(array, name):
    """Print whether ``array`` contains NaN and/or infinite values.

    NaN and infinity are checked independently, so an array containing both
    produces both messages. Every message includes ``name`` so the output of
    several calls can be told apart.

    Args:
        array: array-like of numeric values; converted with ``np.asarray``.
        name: label used in the printed messages.
    """
    values = np.asarray(array)
    has_nan = bool(np.any(np.isnan(values)))
    has_inf = bool(np.any(np.isinf(values)))

    if has_nan:
        print(f"NaN values found in {name}")
    if has_inf:
        print(f"Infinite values found in {name}")
    if not (has_nan or has_inf):
        print(f"No NaN or inf found in {name}")

# Run the NaN/inf check on each of the three training arrays.
check_invalid_values(
    array=x_train_tokenized['input_ids'],
    name="x_train_tokenized['input_ids']",
)
check_invalid_values(
    array=x_train_decoder_input_ids,
    name="x_train_decoder_input_ids",
)
check_invalid_values(
    array=y_train_tokenized,
    name="y_train_tokenized",
)


In [None]:
import tensorflow as tf
from tensorflow.keras.losses import SparseCategoricalCrossentropy
import numpy as np

from tensorflow.keras.callbacks import ModelCheckpoint

# Save the weights whenever the validation loss improves. The file name embeds
# the epoch number and the validation loss. The deprecated `period` argument is
# omitted: the default save_freq='epoch' already checks once per epoch.
checkpoint_callback = ModelCheckpoint(
    filepath="{epoch:02d}-{val_loss:.2f}.h5",
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=True,
    mode='min'
)

# Replace -100 with 0 in the label data; custom_loss masks out positions whose
# label is 0.
y_train_tokenized = np.where(y_train_tokenized == -100, 0, y_train_tokenized)
y_val_tokenized = np.where(y_val_tokenized == -100, 0, y_val_tokenized)

# Define a custom loss function if needed
def custom_loss(y_true, y_pred):
    """Sparse categorical cross-entropy averaged over non-padding positions.

    Positions whose label is 0 are treated as padding and contribute nothing.
    The summed loss is divided by the number of non-padding positions rather
    than by the total number of positions, so the value does not shrink as the
    amount of padding in a batch grows.

    Args:
        y_true: integer label ids; 0 marks padding.
        y_pred: logits over the vocabulary for each position.

    Returns:
        Scalar tensor with the mean loss per non-padding position
        (0 when every position is padding).
    """
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
    loss = loss_object(y_true, y_pred)

    # Mask in the loss dtype so the multiplication also works under mixed precision.
    mask = tf.cast(tf.not_equal(y_true, 0), loss.dtype)
    loss *= mask

    # divide_no_nan returns 0 instead of NaN for an all-padding batch.
    return tf.math.divide_no_nan(tf.reduce_sum(loss), tf.reduce_sum(mask))

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)

# Compile the model with the custom loss function
model.compile(optimizer=optimizer, loss=custom_loss)

# Train the model. The attention mask produced by the tokenizer is passed along
# with input_ids so the encoder does not attend to the padding that
# padding="max_length" appended to every source sequence.
model.fit(
    {
        "input_ids": x_train_tokenized['input_ids'],
        "attention_mask": x_train_tokenized['attention_mask'],
        "decoder_input_ids": x_train_decoder_input_ids,
        "labels": y_train_tokenized
    },
    epochs=3,
    batch_size=8,
    # Validation inputs are a single dict (labels included), mirroring the
    # training inputs.
    validation_data={
        "input_ids": x_val_tokenized['input_ids'],
        "attention_mask": x_val_tokenized['attention_mask'],
        "decoder_input_ids": x_val_decoder_input_ids,
        "labels": y_val_tokenized
    },
    callbacks=[checkpoint_callback]
)


In [None]:
# Store the tokenizer files next to the fine-tuned model so the "t5_model"
# directory can be reloaded on its own with from_pretrained.
tokenizer.save_pretrained("t5_model")
model.save_pretrained("t5_model")