In [None]:
# load training, validation and test data from MeetingBank json files
import json
import numpy as np
import pandas as pd

# Parse each MeetingBank split (train / validation / test) from its JSON file.
# NOTE(review): the next cell re-reads these same files through reformat_data
# and rebinds the three names, so this load looks redundant — confirm.
with open('/home/ubuntu/MeetPEFT/MeetingBank/train_segment_16k.json') as train_file:
    train_split = json.load(train_file)

with open('/home/ubuntu/MeetPEFT/MeetingBank/validation_segment_16k.json') as validation_file:
    validation_split = json.load(validation_file)

with open('/home/ubuntu/MeetPEFT/MeetingBank/test_segment_16k.json') as test_file:
    test_split = json.load(test_file)

print("finished loading json files")

In [None]:
def reformat_data(json_file):
    """Load a JSON list of records and pivot it into a column-oriented dict.

    Args:
        json_file: Path to a JSON file whose top-level value is a list of
            dicts (one dict per data point).

    Returns:
        A dict mapping every key of the first record to the list of that
        key's values across all records, in file order. An empty input list
        yields an empty dict.

    Raises:
        KeyError: If a later record lacks a key that the first record has.
    """
    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(json_file, 'r', encoding='utf-8') as f:
        data_list = json.load(f)

    print(len(data_list))

    # Nothing to pivot, and data_list[0] below would raise IndexError.
    if not data_list:
        return {}

    # The first record defines the column set; keys that only appear in later
    # records are not collected.
    print(data_list[0].keys())
    reformatted_data = {key: [] for key in data_list[0].keys()}

    # Append each record's value to the matching column list.
    for data_point in data_list:
        for key, column in reformatted_data.items():
            column.append(data_point[key])

    return reformatted_data

# Run every split file through reformat_data so each split becomes a dict of columns.
train_split, validation_split, test_split = [
    reformat_data(split_path)
    for split_path in (
        '/home/ubuntu/MeetPEFT/MeetingBank/train_segment_16k.json',
        '/home/ubuntu/MeetPEFT/MeetingBank/validation_segment_16k.json',
        '/home/ubuntu/MeetPEFT/MeetingBank/test_segment_16k.json',
    )
]

In [None]:
# combine the three splits into a single `datasets.DatasetDict`
import datasets

# Start from an empty DatasetDict and register one Dataset per split.
dataset = datasets.DatasetDict()
dataset['train'] = datasets.Dataset.from_dict(train_split)
dataset['validation'] = datasets.Dataset.from_dict(validation_split)
dataset['test'] = datasets.Dataset.from_dict(test_split)

In [None]:
# Show the split layout, then rebind the split names to the entries of
# `dataset` (from here on they are no longer the plain dicts returned by
# reformat_data).
print(dataset)
train_split, validation_split, test_split = (
    dataset['train'],
    dataset['validation'],
    dataset['test'],
)

## Inference without fine-tuning the model

In [None]:
from transformers import AutoTokenizer, PegasusXForConditionalGeneration

# Baseline without fine-tuning: summarize every test source with the pretrained model.
# NOTE(review): the weights are "pegasus-x-base" but the tokenizer is loaded from
# "pegasus-x-large" — presumably both checkpoints share one vocabulary; confirm.
model = PegasusXForConditionalGeneration.from_pretrained("google/pegasus-x-base")
tokenizer = AutoTokenizer.from_pretrained("google/pegasus-x-large")


for sample in test_split['source']:
    # truncation=True makes the max_length cap explicit (and matches how
    # preprocess_function tokenizes) instead of relying on the tokenizer's
    # warn-and-fallback behaviour when only max_length is given.
    inputs = tokenizer(sample, max_length=16384, truncation=True, return_tensors="pt")
    # Pass the attention mask explicitly rather than letting generate() infer it.
    summary_ids = model.generate(inputs["input_ids"], attention_mask=inputs["attention_mask"])
    result = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    print(result)

In [None]:
def generator(data_split):
    """Lazily yield an ``(id, summary, transcript)`` tuple for each record.

    Args:
        data_split: Iterable of records that can be indexed by the keys
            'id', 'summary' and 'transcript'.

    Yields:
        A 3-tuple holding the record's 'id', 'summary' and 'transcript' values.

    NOTE(review): other cells in this notebook read a 'source' column rather
    than 'transcript' — confirm the records actually carry 'transcript'.
    """
    wanted_fields = ('id', 'summary', 'transcript')
    for record in data_split:
        yield tuple(record[field] for field in wanted_fields)

# create the lazy record streams for the train and validation splits
train_generator, val_generator = generator(train_split), generator(validation_split)

In [None]:
import torch
from transformers import AutoTokenizer, PegasusModel, PegasusXForConditionalGeneration

tokenizer = AutoTokenizer.from_pretrained("google/pegasus-x-large")
# "google/pegasus-x-large" is a PEGASUS-X checkpoint, so it must be loaded with
# the PEGASUS-X class. The conditional-generation variant also carries the LM
# head needed to compute a summarization loss and to call generate(); the bare
# PegasusModel used here before is a different architecture and has neither.
model = PegasusXForConditionalGeneration.from_pretrained("google/pegasus-x-large")

In [None]:
prefix = "summarize: "


def preprocess_function(examples):
    """Tokenize a batch of source documents and their reference summaries.

    Args:
        examples: Batch mapping with a "source" column (documents) and a
            "summary" column (target texts).

    Returns:
        The tokenizer output for the prefixed sources (truncated to 16384
        tokens), with an added "labels" entry holding the input ids of the
        summaries (truncated to 1024 tokens).

    NOTE(review): the inference cell earlier in the notebook tokenizes the raw
    source without this prefix — confirm the prefix is intended here.
    """
    prompts = [prefix + document for document in examples["source"]]
    encoded = tokenizer(prompts, max_length=16384, truncation=True)

    # Targets go through `text_target` so they are tokenized as labels.
    encoded["labels"] = tokenizer(
        text_target=examples["summary"], max_length=1024, truncation=True
    )["input_ids"]
    return encoded

In [None]:
# Tokenize every split in batches and drop the original (non-tensor) columns:
# when the collator is used directly as a DataLoader collate_fn, leftover string
# columns reach the padding step and cannot be converted to tensors.
# Assumes every split has the same columns as the train split — TODO confirm.
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

In [None]:
from transformers import DataCollatorForSeq2Seq
# Seq2seq collator built from the current `tokenizer` and `model`.
data_collator = DataCollatorForSeq2Seq(
    model=model,
    tokenizer=tokenizer,
)

In [None]:
# initialize dataloader for training
from torch.utils.data import DataLoader
# Training batches are reshuffled each epoch.
train_dataloader = DataLoader(
    tokenized_dataset["train"],
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)
# Validation keeps dataset order: shuffling adds nothing to evaluation and makes
# per-example outputs impossible to line up across runs.
val_dataloader = DataLoader(
    tokenized_dataset["validation"],
    batch_size=8,
    shuffle=False,
    collate_fn=data_collator,
)

