In [1]:
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
# Smoke-test the MPS backend: when it is available, allocate a one-element
# tensor on it and show the tensor; otherwise report that it is missing.
if not torch.backends.mps.is_available():
    print("MPS device not found.")
else:
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print(x)

tensor([1.], device='mps:0')


In [4]:
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

# Read the csv file
df = pd.read_csv('output.csv')

# Show the header names: the columns used below carry a leading space
# (e.g. ' BibTeX Citation'), so they must be referenced exactly as printed.
print(df.columns)

# Prepare the dataset
# Rows missing either side of the pair cannot serve as training examples: a
# missing source would be rendered into the prompt as the text "nan", and a
# missing target is not text the tokenizer can encode.
citation_pairs = df[[' Plain Text Citation', ' BibTeX Citation']].dropna()
train_data = [
    {'input_text': f"transform citation: {plain_text}", 'target_text': bibtex}
    for plain_text, bibtex in zip(
        citation_pairs[' Plain Text Citation'], citation_pairs[' BibTeX Citation']
    )
]

# Split the dataset into training and testing sets
# A fixed random_state keeps the 95/5 split identical across kernel restarts,
# so evaluation results from different runs are comparable.
train_data, test_data = train_test_split(train_data, test_size=0.05, random_state=42)

# Initialize the tokenizer and model
# Both are loaded from the same 't5-small' checkpoint, tokenizer first.
tokenizer, model = (
    loader.from_pretrained('t5-small')
    for loader in (T5Tokenizer, T5ForConditionalGeneration)
)

# Tokenize the dataset
def _encode_split(examples):
    """Tokenize one split of examples and return ``(encodings, labels)``.

    Args:
        examples: List of dicts with ``input_text`` and ``target_text`` keys.

    Returns:
        A pair of tokenizer outputs: the encoded inputs, and the encoded
        targets whose padding positions in ``input_ids`` are set to -100.
    """
    encodings = tokenizer(
        [example['input_text'] for example in examples],
        truncation=True, padding=True, max_length=512,
    )
    labels = tokenizer(
        [example['target_text'] for example in examples],
        truncation=True, padding=True, max_length=512,
    )
    # Targets are padded to a common length, but padding must not contribute
    # to the loss: -100 is the label value the model's loss ignores.
    pad_id = tokenizer.pad_token_id
    labels['input_ids'] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels['input_ids']
    ]
    return encodings, labels


train_encodings, train_labels = _encode_split(train_data)
test_encodings, test_labels = _encode_split(test_data)

# Pick the compute device: MPS (Metal Performance Shaders) when that backend
# is available, otherwise the CPU, and report which one was chosen.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print("running on the MPS" if device.type == "mps" else "running on the CPU")

# Place the model's parameters on the chosen device
model.to(device)

# Create a PyTorch dataset
class CitationDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized source citations with tokenized targets.

    Args:
        encodings: Mapping from field name to a per-example sequence of values;
            it must contain an ``input_ids`` entry, which defines the length.
        labels: Mapping whose ``input_ids`` entry holds the target token ids,
            indexed in step with ``encodings``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the targets under ``labels``."""
        # Tensors are intentionally left on the CPU. The Trainer places each
        # collated batch on the device it trains on, so moving every single
        # example here only added a per-item device copy and tied the dataset
        # to a notebook-global ``device`` variable.
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return item

    def __len__(self):
        """Return the number of examples, taken from the ``input_ids`` field."""
        return len(self.encodings['input_ids'])

# Wrap each split's encodings and labels in a CitationDataset, train split first
train_dataset, test_dataset = (
    CitationDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (test_encodings, test_labels),
    )
)

# Define training arguments
# The settings are collected in one mapping so they can be read (and tuned) at
# a glance before being handed to TrainingArguments.
training_config = {
    'output_dir': './results',
    'num_train_epochs': 3,
    'per_device_train_batch_size': 4,  # Increase this value if possible
    'warmup_steps': 250,
    'weight_decay': 0.05,
    'logging_dir': './logs',
    'evaluation_strategy': "epoch",
}
training_args = TrainingArguments(**training_config)

# Initialize the Trainer with the model, its arguments and both dataset splits
trainer = Trainer(
    model,
    training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Start training; as the cell's last expression, the result is displayed
trainer.train()

Index(['''', ' DOI', ' BibTeX Citation', ' Plain Text Citation',
       ' Plain Text Citation Style'],
      dtype='object')


loading file https://huggingface.co/t5-small/resolve/main/spiece.model from cache at /Users/ysc4337/.cache/huggingface/transformers/65fc04e21f45f61430aea0c4fedffac16a4d20d78b8e6601d8d996ebefefecd2.3b69006860e7b5d0a63ffdddc01ddcd6b7c318a6f4fd793596552c741734c62d
loading file https://huggingface.co/t5-small/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/t5-small/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/t5-small/resolve/main/tokenizer_config.json from cache at /Users/ysc4337/.cache/huggingface/transformers/edbdb128668f5837a316e446d9d0dd59018a797db29e5bb3652db0e8fbe9cda5.679fa4e712151a8d260bfc3f42ace42f9309fe985622073b432663164029e77e
loading file https://huggingface.co/t5-small/resolve/main/tokenizer.json from cache at /Users/ysc4337/.cache/huggingface/transformers/06779097c78e12f47ef67ecb728810c2ae757ee0a9efe9390c6419783d99382d.8627f1bd5d270a9fd2e5a51c8bec3223896587cc3cfe13edeabb0992ab43c529
loading

running on the MPS



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

KeyboardInterrupt: 

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the tokenizer and the model from the same 't5-small' checkpoint,
# tokenizer first.
tokenizer, model = (
    loader.from_pretrained('t5-small')
    for loader in (T5Tokenizer, T5ForConditionalGeneration)
)

In [None]:
from transformers import Trainer, TrainingArguments

# Prepare the dataset
# The frame's headers are ' Plain Text Citation' and ' BibTeX Citation' (each
# with a leading space); it has no 'citation' or 'biblatex' column, so those
# keys raised KeyError on the first row.
# NOTE(review): assumes `df` is still the frame read from output.csv earlier
# in the notebook — confirm if this cell is meant for a different file.
train_data = [
    {'input_text': f"transform citation: {plain_text}", 'target_text': bibtex}
    for plain_text, bibtex in zip(df[' Plain Text Citation'], df[' BibTeX Citation'])
]

# Tokenize the dataset
train_encodings = tokenizer(
    [example['input_text'] for example in train_data],
    truncation=True, padding=True, max_length=512,
)
train_labels = tokenizer(
    [example['target_text'] for example in train_data],
    truncation=True, padding=True, max_length=512,
)

# Targets are padded to a common length, but padding must not contribute to
# the loss: -100 is the label value the model's loss ignores.
train_labels['input_ids'] = [
    [token_id if token_id != tokenizer.pad_token_id else -100 for token_id in sequence]
    for sequence in train_labels['input_ids']
]

# Create a PyTorch dataset
class CitationDataset(torch.utils.data.Dataset):
    """Indexable collection of tokenized citation examples.

    ``encodings`` maps field names to per-example sequences and must include
    ``input_ids``; ``labels`` supplies the target ids under its own
    ``input_ids`` entry.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        """Return how many examples the ``input_ids`` field contains."""
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        """Build the tensor dict for example ``idx``; targets go under ``labels``."""
        example = {}
        for field, values in self.encodings.items():
            example[field] = torch.tensor(values[idx])
        example['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return example

# Wrap the tokenized inputs and targets in a dataset
train_dataset = CitationDataset(train_encodings, train_labels)

# Define training arguments
# The settings are collected in one mapping so they can be read (and tuned) at
# a glance before being handed to TrainingArguments.
training_config = {
    'output_dir': './results',
    'num_train_epochs': 3,
    'per_device_train_batch_size': 2,
    'warmup_steps': 500,
    'weight_decay': 0.01,
    'logging_dir': './logs',
}
training_args = TrainingArguments(**training_config)

# Initialize the Trainer with the model, its arguments and the training set
trainer = Trainer(model, training_args, train_dataset=train_dataset)

# Train the model; as the cell's last expression, the result is displayed
trainer.train()

In [16]:
# Show which numpy packages (and versions) conda reports for this environment
!conda list numpy

# packages in environment at /Users/aerith/warlock/biblatex-transformer/.conda:
#
# Name                    Version                   Build  Channel
numpy                     1.26.4           py39h3b2db8e_0  
numpy-base                1.26.4           py39ha9811e2_0  


In [17]:
# Write the conda environment export to environment.yml in the working directory
!conda env export > environment.yml