<a href="https://colab.research.google.com/github/aidanlowrie/text_link_predictor/blob/main/training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%pip install torch
%pip install transformers
%pip install pandas
%pip install scikit-learn
%pip install tqdm
%pip install sentencepiece

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m36.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m81.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m79.2 MB/s[0m eta [36m0:00:00[0m
Col

In [2]:
import os
import torch
from transformers import Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
from transformers import DataCollatorForSeq2Seq
from transformers import AdamW
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sklearn.model_selection import train_test_split
import pandas as pd
import csv
import sentencepiece
from tqdm import tqdm

In [3]:
from google.colab import drive

# Attach Google Drive to the runtime; the dataset pickle is later read from
# a path under this mount point.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [4]:
from psutil import virtual_memory

# Total memory reported by psutil, converted to decimal gigabytes (1e9 bytes).
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the cut-off this notebook uses to call a runtime "high-RAM".
is_high_ram = ram_gb >= 20
print('You are using a high-RAM runtime!' if is_high_ram else 'Not using a high-RAM runtime')

Your runtime has 54.8 gigabytes of available RAM

You are using a high-RAM runtime!


In [8]:
# Prefer the GPU, but fall back to the CPU so this cell (and model.to(device))
# does not fail outright on a runtime without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Pretrained checkpoint used for both the model weights and the tokenizer.
model_name = "t5-base"
model = T5ForConditionalGeneration.from_pretrained(model_name)
model.to(device)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Dataset of articles and their link targets, read from the mounted Drive.
# NOTE(review): unpickling can execute arbitrary code — only load files you
# produced yourself or otherwise trust.
df = pd.read_pickle('/content/drive/MyDrive/data.pkl')

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
# Default number of tokens every source/target sequence is padded or truncated to.
max_length = 1024


class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one (source, target) text pair per row.

    Args:
        dataframe: pandas DataFrame holding the source and target columns.
        tokenizer: Callable tokenizer; it is invoked with ``text``,
            ``max_length``, ``padding``, ``truncation`` and ``return_tensors``
            and must return ``input_ids`` and ``attention_mask`` tensors with a
            leading batch dimension of 1.
        source_tokens: Name of the column whose value is tokenized as the
            model input.
        target_text: Name of the column whose value (cast to ``str``) is
            tokenized as the training target.
        max_length: Length both sequences are padded/truncated to.
    """

    # Label value excluded from the loss (the default ``ignore_index`` of
    # PyTorch's cross-entropy, which Hugging Face seq2seq models rely on).
    IGNORE_INDEX = -100

    def __init__(self, dataframe, tokenizer, source_tokens, target_text, max_length=max_length):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_tokens = source_tokens
        self.target_text = target_text
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (examples) in the underlying frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenize and return the example at positional ``index``.

        Returns:
            dict with 1-D tensors of length ``max_length``:
            ``input_ids``, ``attention_mask``, ``labels`` (padding positions
            set to -100 so they do not contribute to the loss) and
            ``decoder_attention_mask``.
        """
        # Positional lookup: samplers hand out 0..len-1, which only coincides
        # with label-based .loc when the frame's index has been reset.
        source_tokens = self.data[self.source_tokens].iloc[index]
        target_text = str(self.data[self.target_text].iloc[index])

        # Encoding the source and target text
        inputs = self.tokenizer(
            text=source_tokens,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        targets = self.tokenizer(
            text=target_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # squeeze(0) removes only the batch dimension added by
        # return_tensors="pt"; a bare squeeze() would also drop the sequence
        # dimension when max_length == 1.
        decoder_attention_mask = targets["attention_mask"].squeeze(0)

        # Mask padded target positions so the loss is computed on real tokens
        # only; otherwise most of the loss is spent on predicting padding.
        labels = targets["input_ids"].squeeze(0).masked_fill(
            decoder_attention_mask == 0, self.IGNORE_INDEX
        )

        # Formatting the inputs and targets
        item = {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": labels,
            "decoder_attention_mask": decoder_attention_mask
        }

        return item

In [14]:
# Hold out 10% of the rows; the fixed seed keeps the split reproducible.
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)
# Renumber rows 0..n-1 (by assignment rather than in place, so re-running the
# cell is safe and the split frames are not mutated behind the reader's back).
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# One example per step: 'Article_Text' is the model input, 'Links' the target.
train_dataset = CustomDataset(train_df, tokenizer, 'Article_Text', 'Links', max_length)
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)

# transformers.AdamW is deprecated in favour of the PyTorch implementation.
# eps and weight_decay are set explicitly to the defaults the transformers
# version used (1e-6 and 0.0), since torch.optim.AdamW defaults differ.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

train_df.head()



Unnamed: 0,Article_Text,Links,Tokenized_Text,LinksList,FilteredLinksList
0,"Rotterdam is a town in Schenectady County, New...","New York, Mabee House, National Register of Hi...","[▁Rot, ter, dam, ▁is, ▁, a, ▁town, ▁in, ▁Sche,...","[African American, New York, Mohonasen High Sc...","[New York, Mabee House, National Register of H..."
1,"The Balboa, also known as ""Bal"" is a swing dan...","Balboa Pavilion, closed position, jazz, Rendez...","[▁The, ▁Bal, b, o, a, ,, ▁also, ▁known, ▁as, ▁...","[Balboa Pavilion, closed position, jazz, Rende...","[Balboa Pavilion, closed position, jazz, Rende..."
2,The Naked Spur is a 1953 American Western film...,"Ralph Meeker, Colorado, Kansas, William C. Mel...","[▁The, ▁N, aked, ▁Spur, ▁is, ▁, a, ▁1953, ▁Ame...","[Ralph Meeker, Bend of the River, Loew's, pito...","[Ralph Meeker, Colorado, Kansas, William C. Me..."
3,Eriphyle (Ancient Greek: Ἐριφύλη Eriphȳla) was...,"necklace of Harmonia, Polynices, Greek mytholo...","[▁Er, i, phy, le, ▁(, A, n, c, ient, ▁Greek, :...","[Achelous, necklace of Harmonia, Vanity Fair, ...","[necklace of Harmonia, Polynices, Greek mythol..."
4,"Skandinaviska Enskilda Banken AB (), abbreviat...","Skandinaviska Banken, Swedish, Wallenberg fami...","[▁Sk, and, in, avi, ska, ▁En, ski, l, d, a, ▁B...","[Skandinaviska Banken, Swedish, Chairman, Eest...","[Skandinaviska Banken, Swedish, Wallenberg fam..."


In [None]:
# Number of full passes over the training set (used by the loop and its label).
num_epochs = 10

# Directory that receives one checkpoint sub-folder per epoch.
# NOTE(review): this is a relative path, not under the mounted /content/drive,
# so checkpoints presumably do not survive a runtime reset — confirm.
path = "results/t5"
# Create the output directory on first use; exist_ok avoids a failure if it
# appears between the check and the creation.
if not os.path.exists(path):
    os.makedirs(path, exist_ok=True)
    print("The new directory is created!")

model.train()
for epoch in range(num_epochs):
    # Accumulate the loss as a detached tensor so reporting it does not force
    # a host/device synchronisation on every step.
    epoch_loss = 0.0
    num_steps = 0

    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{num_epochs}", leave=True):

        optimizer.zero_grad()
        inputs = batch['input_ids'].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Passing labels makes the model return the training loss directly.
        outputs = model(input_ids=inputs, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        epoch_loss = epoch_loss + loss.detach()
        num_steps += 1

    # Report progress once per epoch (skipped if the loader yielded nothing).
    if num_steps:
        print(f"Epoch {epoch+1}/{num_epochs} mean training loss: {float(epoch_loss) / num_steps:.4f}")

    # Save the model after each epoch
    model.save_pretrained(os.path.join(path, f'epoch_{epoch+1}'))

Training Epoch 1/10:   7%|▋         | 5058/76169 [21:37<5:03:53,  3.90it/s]