### Sync repo to your Google Drive account

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem. Later cells read from and write
# to paths underneath this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
import os

#!git clone https://github.com/academy-dt/nlp-text-summarisation '/content/drive/MyDrive/MS_DS/NLP/Final project/nlp-text-summarisation'
#os.chdir('/content/drive/MyDrive/MS_DS/NLP/Final project/nlp-text-summarisation')

!git clone https://github.com/academy-dt/nlp-text-summarisation '/content/drive/MyDrive/NLP/nlp-text-summarisation'
os.chdir('/content/drive/MyDrive/NLP/nlp-text-summarisation')

!git submodule init
!git submodule update

In [None]:
%pip install transformers
%pip install torch
%pip install rouge

### Load model

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import torch

# Checkpoint identifier shared by the tokenizer and the model, kept in one
# place so the two can never drift apart.
MODEL_NAME = "google/pegasus-xsum"

# Prefer the GPU when one is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# `Module.to` returns the module itself, so chaining it onto the load keeps
# `model` bound to the same object. Assigning the result (instead of ending the
# cell with a bare `model.to(device)` expression) also stops the notebook from
# printing the entire model architecture as cell output.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)


### Run model and summarize text

In [None]:
def summarize(text):
    """Generate a summary of `text` with the loaded Pegasus model.

    Relies on the notebook-level `tokenizer`, `model` and `device` objects.

    The length limits use values from the pegasus-xsum repo:
      * Tokenizer config: https://huggingface.co/google/pegasus-xsum/blob/main/tokenizer_config.json
      * Model config: https://huggingface.co/google/pegasus-xsum/blob/main/config.json

    Args:
        text: The source document as a string.

    Returns:
        The decoded summary string, with special tokens stripped.
    """
    # Collapse every run of whitespace (newlines included) into a single space.
    # Deleting newlines outright would glue the last word of one line to the
    # first word of the next ("foo\nbar" -> "foobar") and corrupt the input.
    normalized_text = " ".join(text.split())

    # Request truncation explicitly so the 512-token limit is enforced on
    # purpose rather than through the tokenizer's implicit fallback.
    input_ids = tokenizer.encode(
        normalized_text,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    ).to(device)

    # Cap the generated summary at 64 tokens.
    summary_ids = model.generate(input_ids, max_length=64)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [None]:
import json
from generators import get_cnn_dm_both_generator

# Input file on the mounted Drive that supplies (article, abstract) pairs.
# NOTE(review): presumably a CNN/DailyMail test shard, going by the generator's
# name - confirm.
test_data_path = '/content/drive/MyDrive/test_dataset/test_000.bin'

# One record per example: the source article, its reference abstract and the
# summary produced by the model.
output = []
for i, (article, abstract) in enumerate(get_cnn_dm_both_generator(test_data_path), start=1):
    # Progress marker, 1-based.
    print(f'#{i}')

    output.append({
        'article': article,
        'abstract': abstract,
        'pegasus_abstract': summarize(article),
    })

# Persist all records in one go. The path is relative, so the file is created
# in the current working directory.
with open('pegasus_output_000.json', 'w') as json_file:
    json.dump(output, json_file, indent=2)

### ROUGE evaluation

In [None]:
from rouge import Rouge

# Split the collected records into two parallel lists: the model's summaries
# (hypotheses) and the reference abstracts they are scored against.
summaries, abstracts = [], []
for record in output:
    summaries.append(record['pegasus_abstract'])
    abstracts.append(record['abstract'])

# Score the hypotheses against the references and print the raw result.
rouge = Rouge()
scores = rouge.get_scores(hyps=summaries, refs=abstracts)
print(scores)