In [None]:
import os
import torch
from transformers import Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
from transformers import DataCollatorForSeq2Seq
from transformers import AdamW
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sklearn.model_selection import train_test_split
import pandas as pd
import csv
import sentencepiece
from tqdm import tqdm

In [None]:
# Raise the csv module's per-field size cap before reading; the file is parsed
# with engine='python' below.
csv.field_size_limit(10_000_000_000)

column_names = ['Article_Text', 'Links']

# The two columns are separated by U+2063 (INVISIBLE SEPARATOR) and the file
# has no header row, so the column names are supplied explicitly.
df = pd.read_csv(
    'data.csv',
    sep='\u2063',
    names=column_names,
    engine='python',
).reset_index(drop=True)

print("Size:", len(df))

df.head()

In [None]:
# Token budget: articles are cut to this many tokens and encoded sequences are
# padded/truncated to this length.
max_length = 1024

# Prefer the GPU, but fall back to the CPU so that moving the model to the
# device does not fail on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Pretrained checkpoint used for both the model weights and the tokenizer.
model_name = "t5-small"
model = T5ForConditionalGeneration.from_pretrained(model_name)
model.to(device)
tokenizer = T5Tokenizer.from_pretrained(model_name)

def tokenize_and_truncate(text):
    """Tokenize ``text`` and keep only the leading ``max_length`` tokens.

    Uses the module-level ``tokenizer`` and ``max_length``.

    Args:
        text: Text to tokenize.

    Returns:
        The first ``max_length`` items of ``tokenizer.tokenize(text)``.
    """
    return tokenizer.tokenize(text)[:max_length]

def truncate_text(text):
    """Return ``text`` shortened to its first ``max_length`` tokens.

    The text is tokenized with the module-level ``tokenizer``, cut after
    ``max_length`` tokens, and the kept tokens are converted back to a string.

    Args:
        text: Text to shorten.

    Returns:
        The string rebuilt from the kept tokens.
    """
    kept = tokenizer.tokenize(text)[:max_length]
    return tokenizer.convert_tokens_to_string(kept)

def check_words(row):
    """Return the candidate links that literally occur in the article text.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) providing an
            'Article_Text' string and a 'LinksList' iterable of candidate
            link strings.

    Returns:
        list: The entries of ``row['LinksList']`` that are substrings of
        ``row['Article_Text']``, in their original order. Empty and
        whitespace-only entries are dropped.
    """
    text = row['Article_Text']
    # `'' in text` is always True, so blank candidates have to be rejected
    # explicitly; otherwise they would be kept as (empty) links.
    return [link for link in row['LinksList'] if link.strip() and link in text]

# Keep only the rows whose 'Links' value is a string. Filtering first avoids
# tokenizing articles that are discarded anyway, and the explicit copy makes
# the column assignments below write to an independent frame rather than to a
# slice of the original one. The index is reset so that it is contiguous
# (0..n-1) again after rows were dropped.
df = df[df['Links'].apply(lambda x: isinstance(x, str))].copy()
df = df.reset_index(drop=True)

# Cut every article to the token budget, then store its token list as well.
df['Article_Text'] = df['Article_Text'].apply(truncate_text)
df['Tokenized_Text'] = df['Article_Text'].apply(tokenize_and_truncate)

# De-duplicate the links while keeping first-occurrence order. Going through
# set() would yield an arbitrary order that can differ between kernel
# restarts, making the training targets non-reproducible.
df['LinksList'] = df['Links'].apply(lambda x: list(dict.fromkeys(x.split(', '))))
# Keep only links that actually appear in the (truncated) article text and
# write them back as the comma-separated target string.
df['FilteredLinksList'] = df.apply(check_words, axis=1)
df['Links'] = df['FilteredLinksList'].apply(lambda x: ', '.join(x))

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one (source, target) text pair per item.

    Args:
        dataframe: pandas DataFrame holding one example per row.
        tokenizer: Tokenizer called as ``tokenizer(text=..., max_length=...,
            padding=..., truncation=..., return_tensors="pt")``; its result
            must provide "input_ids" and "attention_mask".
        source_tokens: Name of the column holding the source text.
        target_text: Name of the column holding the target text.
        max_length: Length every encoded sequence is padded/truncated to.
            Defaults to the module-level ``max_length`` as it was when the
            class was defined.
    """

    # Label value used to mark padded target positions so the loss skips them.
    IGNORE_INDEX = -100

    def __init__(self, dataframe, tokenizer, source_tokens, target_text, max_length=max_length):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_tokens = source_tokens
        self.target_text = target_text
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the underlying dataframe."""
        return len(self.data)

    def __getitem__(self, index):
        """Encode the example at position ``index``.

        Args:
            index: Zero-based position of the row.

        Returns:
            dict: "input_ids" and "attention_mask" for the source,
            "labels" for the target (padded positions set to -100) and
            "decoder_attention_mask" for the target.
        """
        # Look rows up by position, not by index label: a filtered or shuffled
        # dataframe keeps its original labels, while samplers hand out
        # positions in range(len(self)).
        source_text = self.data[self.source_tokens].iloc[index]
        target_text = str(self.data[self.target_text].iloc[index])

        # Encoding the source and target text
        inputs = self.tokenizer(
            text=source_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        targets = self.tokenizer(
            text=target_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # Drop only the leading batch dimension added by return_tensors="pt".
        labels = targets["input_ids"].squeeze(0)

        # Replace padding ids in the labels so padded positions do not
        # contribute to the loss.
        pad_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_id is not None:
            labels = labels.masked_fill(labels == pad_id, self.IGNORE_INDEX)

        # Formatting the inputs and targets
        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": labels,
            "decoder_attention_mask": targets["attention_mask"].squeeze(0)
        }

In [None]:
# Split the prepared frame into a training part and a held-out validation
# part. The fixed random_state keeps the split reproducible across runs.
# NOTE(review): the 10% hold-out ratio is an assumption — adjust as needed.
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# The split keeps the original (now shuffled) index labels; reset them so the
# rows are addressable as 0..len-1, which is how the DataLoader indexes the
# dataset.
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

train_dataset = CustomDataset(train_df, tokenizer, 'Article_Text', 'Links', max_length)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

optimizer = AdamW(model.parameters(), lr=5e-5)

In [None]:
# Directory that receives one checkpoint sub-folder per epoch.
path = "results/t5"
# Number of passes over the training data.
NUM_EPOCHS = 10

# Create the output directory when it is missing. exist_ok keeps this from
# failing if the directory appears between the check and the creation.
if not os.path.isdir(path):
    os.makedirs(path, exist_ok=True)
    print("The new directory is created!")

model.train()
for epoch in range(NUM_EPOCHS):
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{NUM_EPOCHS}", leave=True):
        optimizer.zero_grad()

        # Move the batch tensors to the device used for training.
        inputs = batch['input_ids'].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # With labels supplied, the loss is read from the model output.
        outputs = model(input_ids=inputs, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Save the model after each epoch
    model.save_pretrained(os.path.join(path, f'epoch_{epoch+1}'))