In [5]:
import os
import json
import torch
import torch.nn as nn
import pandas as pd
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, PegasusConfig
import matplotlib.pyplot as plt
from rouge_score import rouge_scorer
from transformers import Trainer, TrainingArguments
from transformers import AdamW


In [6]:
#dataset and tokenizer building
#load our 10k data into a dataframe
papers = []
limit = 10
for root, dirs, files in os.walk("./data/mini_10k"):
    for f in files:
        fn = root+"/"+f
        with open(fn) as jsonfile:
            d = json.load(jsonfile)
        papers.append(d)
        
        if len(papers) > limit:
            break
    if len(papers) > limit:
        break


In [7]:
df = pd.DataFrame(papers)

In [8]:
model_name = 'google/pegasus-xsum'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
config = PegasusConfig.from_pretrained(model_name, output_hidden_states=True, output_attentions=True)  
amodel = PegasusForConditionalGeneration.from_pretrained(model_name, config=config).to(device)

In [9]:
# Train Test Split
train_pct = 0.6
test_pct = 0.2

train_sub = int(df.shape[0] * train_pct)
test_sub = int(df.shape[0] * test_pct) + train_sub

train_df = papers[0:train_sub]
test_df =  papers[train_sub:test_sub]
val_df =  papers[test_sub:]


In [10]:
class ourDataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.df = df
        

    def __getitem__(self, idx):
        enc = self.df[idx]["fulltext"]
        dec = self.df[idx]["summary"]
        enc = tokenizer(enc, truncation=True, padding=True)
        dec = tokenizer(dec, truncation=True, padding=True)
        item = {"input_ids": torch.tensor(enc["input_ids"], dtype=torch.int64)}
        
        z = torch.zeros(1024, dtype=torch.int64)
        for i,x in enumerate(dec['input_ids']):
            z[i] = x
        item['labels'] = z
        
#         for k in item:
#             print(k)
#             print(item[k].shape)
        return item

    def __len__(self):
        return len(self.df)

train_dataset = ourDataset(train_df)
val_dataset = ourDataset(val_df)
test_dataset = ourDataset(test_df)

In [11]:
model = amodel

In [12]:
model.train()
optimizer = AdamW(model.parameters(), lr=1e-5)

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=1,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

trainer = Trainer(
    model=model,                     # the instantiated 🤗 Transformers model to be trained
    args=training_args,              # training arguments, defined above
    train_dataset=train_dataset,     # training dataset
    eval_dataset=val_dataset         # evaluation dataset
)

trainer.train()

RuntimeError: CUDA error: device-side assert triggered

In [None]:
model.save_pretrained("data/ft_arxiv_pegasus")