### Script to generate summaries using the chunking-based BART-RR method

Set the `dataset` and `output_path` variables according to your requirements.  

In [None]:
# Dataset key passed to the data-loading helpers below.
# Options: "IN" (IN-Abs), "UK" (UK-Abs), "N2" (IN-Ext)
dataset = "N2"
# Directory the generated summaries are written to (one file per document).
output_path = "./output/"

In [None]:
import sys
from BART_utilities import *
sys.path.insert(0, '../')
import transformers
import pandas as pd
import numpy as np
import glob
import nltk
import torch
import math
import random
import re
import argparse
import os
from utilities import *

In [None]:
# Reading the test documents.
# names and data_source are indexed in parallel by the main loop:
# names[i] is used as the output file name for the document data_source[i].
names, data_source = get_summary_data_rhet_test(dataset)
# Sanity check: the two counts are expected to match.
print(len(names))
print(len(data_source))
# Looked up by document name in the main loop to get the required summary
# length, which is compared against a word count there.
dict_names = get_req_len_dict(dataset, "test") 

In [None]:
# Loading Model and tokenizer
from transformers import BartTokenizer, BartForConditionalGeneration, AdamW, BartConfig


tokenizer = BartTokenizer.from_pretrained('facebook/bart-large', add_prefix_space=True)

model = BartForConditionalGeneration.from_pretrained("facebook/bart-large")

# Marker tokens of the form "<label>"; nest_sentencesV3 prepends one such
# marker to every chunk it builds.
new_tokens = ['<F>', '<RLC>', '<A>', '<S>', '<P>', '<R>', '<RPC>']
# add_special_tokens is given a dict keyed by 'additional_special_tokens',
# not the bare list of tokens.

special_tokens_dict = {'additional_special_tokens': new_tokens}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
# The embedding matrix must be resized after the tokens are added so that the
# new token ids have embedding rows.
model.resize_token_embeddings(len(tokenizer))

#### Add the path to the fine-tuned model checkpoint

In [None]:
# Wraps the tokenizer and the base BART model in a LitModel. As written, no
# checkpoint is loaded here.
bart_model = LitModel(learning_rate = 2e-5, tokenizer = tokenizer, model = model)
# To use a fine-tuned checkpoint, replace the line above with the one below
# and substitute the real checkpoint path for "path to model":
# bart_model = LitModel.load_from_checkpoint("path to model",learning_rate = 2e-5, tokenizer = tokenizer, model = model).to("cuda")

In [None]:
def nest_sentencesV3(doc, chunk_length):
    '''
    Segment a document by rhetorical role, then chunk each segment if required.

    Sentences are grouped by their rhetorical-role label, each group is split
    with nest_sentencesMV2, and every resulting chunk is prefixed with its
    "<label>" marker before its parts are joined with '. '.

    input:  doc                 - input document, in the form accepted by
                                  get_doc_sens_and_labels
            chunk_length        - chunk length passed on to nest_sentencesMV2
    output: list of chunk strings, grouped by label in the order the labels
            first occur in the sentence-to-label mapping
    '''
    doc_sents, _, dict_sents_labels = get_doc_sens_and_labels(doc)

    # dict.fromkeys de-duplicates while keeping first-seen order. Iterating a
    # set of strings instead makes the chunk order (and therefore the final
    # summary) vary between runs because of string hash randomisation.
    labels = list(dict.fromkeys(dict_sents_labels.values()))

    # Bucket the sentences in a single pass instead of rescanning per label.
    sents_by_label = {label: [] for label in labels}
    for sent in doc_sents:
        if sent == '':
            continue
        sents_by_label[dict_sents_labels[sent]].append(sent)

    all_chunks = []
    for label in labels:
        chunks = nest_sentencesMV2(sents_by_label[label], chunk_length)
        tag = "<" + label + ">"
        # The role marker becomes the first element of every chunk.
        all_chunks.extend('. '.join([tag] + chunk) for chunk in chunks)

    return all_chunks


In [None]:
def generate_summary_gpu(nested_sentences,p):
    '''
    Function to generate summaries from the list containing chunks of the document
    input:  nested_sentences - chunks (list of strings)
            p - Number of words in summaries per word in the document
    output: list of decoded summary strings, one per generated sequence,
            in chunk order
    '''
    # Use the GPU when one is present, otherwise fall back to the CPU instead
    # of failing on a hard-coded 'cuda' device.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Move the model once, outside the loop; .to() returns the module itself.
    generator = bart_model.model.to(device)

    summaries = []
    for nested in nested_sentences:
        # Target length for this chunk, scaled from its space-separated word count.
        target_len = int(p * len(nested.split(" ")))
        input_tokenized = tokenizer.encode(nested, truncation=True, return_tensors='pt')
        input_tokenized = input_tokenized.to(device)
        summary_ids = generator.generate(input_tokenized,
                                         length_penalty=0.05,
                                         # never request a negative minimum for short chunks
                                         min_length=max(0, target_len - 5),
                                         max_length=target_len + 5)

        summaries.extend(
            tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False)
            for g in summary_ids
        )
    return summaries

In [None]:
import os
# exist_ok=True creates the directory (and any missing parents) and is a no-op
# when it already exists, avoiding the check-then-create race.
os.makedirs(output_path, exist_ok=True)

In [None]:
# main loop to generate and save summaries of each document in the test dataset
output = []  # NOTE(review): never populated in this cell, so the final print shows []
for i, doc in enumerate(data_source):
    name = names[i]
    input_len = len(doc.split(" "))
    req_len = dict_names[name]
    print(f"{i}: {name} - {input_len} : {req_len}", end = ", ")

    nested = nest_sentencesV3(doc,1024)
    # Summary words to produce per input word.
    p = req_len / input_len
    print(p)
    abs_summ = " ".join(generate_summary_gpu(nested,p))
    print(len(abs_summ.split(" ")))

    # Hard cap: keep only the first req_len space-separated words.
    summ_words = abs_summ.split(" ")
    if len(summ_words) > req_len:
        abs_summ = " ".join(summ_words[:req_len])
    print(len(abs_summ.split(" ")))

    # os.path.join works whether or not output_path ends with a separator.
    # The context manager closes the file even if the write fails, and the
    # explicit encoding keeps non-ASCII text from depending on the platform default.
    path = os.path.join(output_path, name)
    with open(path, 'w', encoding='utf-8') as file:
        file.write(abs_summ)

print(output)