# Pretrained embeddings

In [None]:
import ast
import re

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from transformers import Adafactor
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

In [None]:
# Display whether PyTorch can see a CUDA device (the bare expression is
# echoed as the cell output).
torch.cuda.is_available()

In [None]:
# Pick the compute device: use the GPU when CUDA is available, otherwise
# fall back to the CPU, then report the choice.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

In [None]:
# Load the train/validation/test splits and the full data CSV from
# ../data into DataFrames.
train_data = pd.read_csv(filepath_or_buffer='../data/train.csv')
valid_data = pd.read_csv(filepath_or_buffer='../data/valid.csv')
test_data = pd.read_csv(filepath_or_buffer='../data/test.csv')
data = pd.read_csv(filepath_or_buffer='../data/data.csv')

In [None]:
# Load the pretrained 't5-base' checkpoint: first the seq2seq model, which
# is then moved onto the selected device, and then its matching tokenizer.
model = T5ForConditionalGeneration.from_pretrained('t5-base')
model = model.to(device)
tokenizer = T5Tokenizer.from_pretrained('t5-base')

In [None]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes (reference, translation) rows for T5.

    Each item pairs a prompt built from the row's ``reference`` column with
    the row's ``translation`` column as the generation target.

    Args:
        tokenizer: Hugging Face style tokenizer. It is called with
            ``max_length``, ``padding``, ``truncation`` and ``return_tensors``
            keyword arguments and should expose ``pad_token_id``.
        data: ``pandas.DataFrame`` with ``reference`` and ``translation``
            columns (rows are read positionally via ``.iloc``).
        max_length: Length every source and target sequence is padded or
            truncated to.
    """

    # Label id that the seq2seq cross-entropy loss skips.
    IGNORE_INDEX = -100

    def __init__(self, tokenizer, data, max_length=512):
        self.tokenizer = tokenizer
        self.data = data
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.data)

    def _encode(self, text):
        """Tokenize ``text`` to a fixed ``max_length`` with this dataset's tokenizer."""
        # Use the tokenizer passed to the constructor, not a module-level
        # global, so the dataset works with whatever tokenizer it was given.
        return self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for row ``idx``.

        All three values are 1-D tensors (the tokenizer's batch dimension is
        flattened away). Padding positions in ``labels`` are replaced with
        ``IGNORE_INDEX`` so they do not contribute to the training loss.
        """
        row = self.data.iloc[idx]
        source_encoding = self._encode(
            f"correct this to be polite: {row['reference']}"
        )
        target_encoding = self._encode(row['translation'])

        labels = target_encoding['input_ids'].flatten()
        pad_token_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_token_id is not None:
            # Without this, the model is also trained to predict the padding
            # that follows every target sentence.
            labels = labels.masked_fill(labels == pad_token_id, self.IGNORE_INDEX)

        return {
            'input_ids': source_encoding['input_ids'].flatten(),
            'attention_mask': source_encoding['attention_mask'].flatten(),
            'labels': labels
        }

In [None]:
# Wrap the training and validation frames in TextDataset objects that
# share the same tokenizer.
train_dataset, valid_dataset = (
    TextDataset(tokenizer, frame) for frame in (train_data, valid_data)
)

# Batch both datasets in groups of 8; only the training batches are shuffled.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=8)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=8)

In [None]:
# Set up the Adafactor optimizer (requires `Adafactor` to be imported from
# `transformers`). No learning-rate scheduler is created: with
# relative_step=False and warmup_init=False the optimizer does not compute
# its own step size, so the fixed lr=1e-3 given here is used throughout.
# scale_parameter=False keeps that learning rate from being rescaled by the
# parameter RMS, and beta1=None disables the first-moment estimate.
optimizer = Adafactor(
    model.parameters(),
    lr=1e-3,
    eps=(1e-30, 1e-3),
    clip_threshold=1.0,
    decay_rate=-0.8,
    beta1=None,
    weight_decay=0.0,
    relative_step=False,
    scale_parameter=False,
    warmup_init=False
)

In [None]:
# Fine-tune for a fixed number of passes over the training loader,
# performing one optimizer step per batch and printing each batch's loss.
num_epochs = 3
model.train()
for epoch in range(num_epochs):
    for batch in train_loader:
        # Move the three tensors of the batch onto the training device.
        input_ids, attention_mask, labels = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        )

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass: passing labels makes the model return its loss.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        loss = outputs.loss

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        print(f"Epoch: {epoch}, Loss: {loss.item()}")

In [None]:
# Accumulate the per-batch loss over the validation loader with the model
# in evaluation mode; gradients are not tracked anywhere in the loop.
model.eval()
total_loss = 0
with torch.no_grad():
    for batch in valid_loader:
        input_ids, attention_mask, labels = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        )
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        loss = outputs.loss
        total_loss += loss.item()

In [None]:
# Mean of the per-batch validation losses (summed loss divided by the
# number of batches in the validation loader).
num_valid_batches = len(valid_loader)
average_loss = total_loss / num_valid_batches
print(f"Validation Loss: {average_loss}")

In [None]:
# Persist the tokenizer and the fine-tuned model under ../models.
# NOTE: save_pretrained takes a directory path, so despite the '.pt'
# suffix these targets are expected to be folders, not single files.
tokenizer.save_pretrained(save_directory='../models/token-t5-base-politeness.pt')
model.save_pretrained(save_directory='../models/t5-base-politeness.pt')