In [14]:
# load training, validation and test data from MeetingBank json files

import json
import numpy as np
import pandas as pd

# load training data
with open('MeetingBank/train_text.json') as f:
    train_split = json.load(f)
with open('MeetingBank/validation_text.json') as f:
    validation_split = json.load(f)
with open('MeetingBank/test_text.json') as f:
    test_split = json.load(f)

print("finished loading json files")

finished loading json files


In [23]:
def reformat_data(json_file):
    with open(json_file, 'r') as f:
        data_list = json.load(f)
    
    print(len(data_list))
        
    # Initialize a dictionary to hold reformatted data
    print(data_list[0].keys())
    reformatted_data = {key: [] for key in data_list[0].keys()}

    # Iterate over each data point and aggregate values by column
    for data_point in data_list:
        for key in reformatted_data.keys():
            reformatted_data[key].append(data_point[key])
    
    return reformatted_data

# Load and reformat the data
train_split = reformat_data('MeetingBank/train_text.json')
validation_split = reformat_data('MeetingBank/validation_text.json')
test_split = reformat_data('MeetingBank/test_text.json')

1587
dict_keys(['source', 'summary', 'source_length'])
198
dict_keys(['source', 'summary', 'source_length'])
199
dict_keys(['source', 'summary', 'source_length'])


In [25]:
# combine them into a datasets object
# combine them into a datasets object
import datasets

dataset = datasets.DatasetDict({
    'train': datasets.Dataset.from_dict(train_split),
    'validation': datasets.Dataset.from_dict(validation_split),
    'test': datasets.Dataset.from_dict(test_split)
})

In [27]:
print(dataset)
train_split = dataset['train']
validation_split = dataset['validation']
test_split = dataset['test']

DatasetDict({
    train: Dataset({
        features: ['source', 'summary', 'source_length'],
        num_rows: 1587
    })
    validation: Dataset({
        features: ['source', 'summary', 'source_length'],
        num_rows: 198
    })
    test: Dataset({
        features: ['source', 'summary', 'source_length'],
        num_rows: 199
    })
})


In [28]:
def generator(data_split):
  for instance in data_split:
    yield instance['id'], instance['summary'], instance['transcript']

# create generators
train_generator = generator(train_split)
val_generator = generator(validation_split)

In [6]:
# load pegasus model
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"
model_ckpt = "google/pegasus-cnn_dailymail"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.encoder.embed_positions.weight', 'model.decoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [17]:
from transformers import AutoTokenizer, PegasusModel

tokenizer = AutoTokenizer.from_pretrained("google/pegasus-x-large")
model = PegasusModel.from_pretrained("google/pegasus-x-large")

tokenizer_config.json:   0%|          | 0.00/2.02k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/6.60M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.77k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.49k [00:00<?, ?B/s]

You are using a model of type pegasus_x to instantiate a model of type pegasus. This is not supported for all configurations of models and can yield errors.


pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

Some weights of PegasusModel were not initialized from the model checkpoint at google/pegasus-x-large and are newly initialized: ['model.decoder.layers.9.encoder_attn.out_proj.bias', 'model.decoder.layers.14.self_attn.q_proj.bias', 'model.decoder.layers.14.encoder_attn.k_proj.bias', 'model.encoder.layers.4.self_attn.k_proj.bias', 'model.decoder.layers.11.encoder_attn.k_proj.bias', 'model.decoder.layers.2.encoder_attn.out_proj.bias', 'model.decoder.layers.3.encoder_attn.k_proj.bias', 'model.encoder.layers.14.self_attn.v_proj.bias', 'model.decoder.layers.12.encoder_attn.v_proj.bias', 'model.encoder.layers.14.self_attn.k_proj.bias', 'model.decoder.layers.1.self_attn.v_proj.bias', 'model.decoder.layers.11.encoder_attn.out_proj.bias', 'model.decoder.layers.13.encoder_attn.v_proj.bias', 'model.decoder.layers.7.self_attn.k_proj.bias', 'model.decoder.layers.5.self_attn.q_proj.bias', 'model.encoder.layers.12.self_attn.out_proj.bias', 'model.encoder.layers.8.self_attn.q_proj.bias', 'model.encode

In [32]:
prefix = "summarize: "

def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["source"]]
    model_inputs = tokenizer(inputs, max_length=16384, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=1024, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [33]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/1587 [00:00<?, ? examples/s]

Map:   0%|          | 0/198 [00:00<?, ? examples/s]

Map:   0%|          | 0/199 [00:00<?, ? examples/s]

In [34]:
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
# initialize dataloader for training
from torch.utils.data import DataLoader
train_dataloader = DataLoader(tokenized_dataset["train"], batch_size=8, shuffle=True, collate_fn=data_collator)
val_dataloader = DataLoader(tokenized_dataset["validation"], batch_size=8, shuffle=True, collate_fn=data_collator)

