In [1]:
import os
import json
import time
import torch
import random 
import numpy as np
import pandas as pd
from statistics import mean
from matplotlib import pyplot as plt

import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import BartConfig, BartTokenizer, BartForConditionalGeneration, AdamW, get_cosine_with_hard_restarts_schedule_with_warmup
from tqdm import tqdm, trange

import os
# Exported to the process environment before any tokenizer is created.
# NOTE(review): presumably meant to switch off the HuggingFace `tokenizers`
# parallelism / its fork warning — confirm it is still required here.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [15]:
# Read the spreadsheet for which predictions are needed; its `article` column
# is what the later cells turn into prompts.
df = pd.read_excel('../DATA/BART_Test_Merged_corrupted.xlsx')
# Alternative input file, left commented out:
#df = pd.read_excel('../Test_2_Records (2).xlsx')

In [16]:
# Preview the first two rows of the loaded frame.
df.head(2)

Unnamed: 0,highlights,distilbart-cnn-12-6,id,article,noisy,Pronoun_replaced,deleted_added_noise,CBART_noisy,CBART_Pronoun_replaces,CBART_deleted_added_noise
0,Experts question if packed out planes are put...,U.S consumer advisory group set up by the Dep...,92c514c913c0bdfe25341af9fd72b29db544099b,Ever noticed how plane seats appear to be gett...,experts did not question if packed out planes...,experts question if packed out planes are putt...,if packed out planes planes are putting passen...,u.s consumer advisory group set up by united ...,u.s consumer advisory group set up by a depart...,u.s consumer advisory advisory group group set...
1,Drunk teenage boy climbed into lion enclosure ...,"Rahul Kumar, 17, climbed into the enclosure a...",2003841c7dc0e7c5b1a248f9cd536d727f27a45a,A drunk teenage boy had to be rescued by secur...,drunk teenage boy did not climb into lion encl...,drunk teenage boy climbed into lion enclosure ...,drunk climbed lion enclosure at in west . kuma...,"rahul kumar, 17, did not climb into the enclo...","rahul kumar, 17, climbed into a enclosure at a...","rahul kumar, 17, into the at kamla nehru zoolo..."


In [17]:
# Column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 503 entries, 0 to 502
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   highlights                 503 non-null    object
 1   distilbart-cnn-12-6        503 non-null    object
 2   id                         503 non-null    object
 3   article                    503 non-null    object
 4   noisy                      503 non-null    object
 5   Pronoun_replaced           503 non-null    object
 6   deleted_added_noise        503 non-null    object
 7   CBART_noisy                503 non-null    object
 8   CBART_Pronoun_replaces     503 non-null    object
 9   CBART_deleted_added_noise  503 non-null    object
dtypes: object(10)
memory usage: 39.4+ KB


In [5]:
# Seed every random number generator this notebook relies on.
def seed_everything(seed):
    """
    Seed Python, NumPy and PyTorch so that repeated runs are reproducible.

    Arguments:

        seed {int} -- Value used to seed every generator.

    Side effects:

        Sets ``PYTHONHASHSEED`` in ``os.environ`` and switches cuDNN to
        deterministic mode with auto-tuning (benchmark) disabled.
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. subprocesses);
    # the hash seed of the already-running kernel is fixed at start-up.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one. Silently ignored
    # when CUDA is not available.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Fixed seed for this notebook, applied to all generators via seed_everything.
seed = 2021
seed_everything(seed)

In [6]:
# Load the tokenizer from the `facebook/bart-large-cnn` checkpoint id and the
# fine-tuned model weights from the local training output directory.
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
model = BartForConditionalGeneration.from_pretrained("../Training/BART_Model_article_BART/")

In [7]:
# Move the model to the first GPU when CUDA is available; otherwise leave it on
# the CPU instead of raising. The result is bound to `_` only to suppress the
# model repr in the cell output.
_ = model.to('cuda:0' if torch.cuda.is_available() else 'cpu')

In [8]:
# Re-check the first two rows right before inference.
# NOTE(review): the saved output of this cell shows a 2-row frame with a
# `distilBART` column, which does not match the file read by the load cell —
# restart the kernel and run all cells to refresh it.
df.head(2)

Unnamed: 0,highlights,distilBART,id,article
0,Chris Ramsey says he has no problem shaking ha...,Queens Park Rangers host Chelsea in Premier Le...,519e5c0f26ad35706573a1b18db79520ae00ad3e,Queens Park Rangers manager Chris Ramsey has r...
1,DB4 put British cars back on map going 140mph ...,Actor Peter Ustinov bought the Aston Martin DB...,132f2db92fcfb69b0f3b28dbc6324a103e3994b8,A classic Aston Martin once owned by Spartacus...


In [9]:
# Schema and row count of the frame that is about to be summarised.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   highlights  2 non-null      object
 1   distilBART  2 non-null      object
 2   id          2 non-null      object
 3   article     2 non-null      object
dtypes: object(4)
memory usage: 192.0+ bytes


In [10]:
# Build the prompt for the n-th row in the predefined 'summarize: <article>' format.
def prompt(n, frame=None):
    """
    Return the summarisation prompt for the article at positional row ``n``.

    Arguments:

        n {int} -- Positional (``iloc``) row index.
        frame {pandas.DataFrame} -- Frame with an ``article`` column. Defaults
            to the notebook-level ``df`` when omitted, which keeps existing
            ``prompt(n)`` calls working unchanged.

    Returns:

        str -- ``'summarize: '`` followed by the article text.

    Raises:

        IndexError -- if ``n`` is outside the frame.
    """
    source = df if frame is None else frame
    # Pick the column first so no full row Series is built just to read one
    # cell. The 1-tuple keeps `%` formatting safe whatever the cell contains.
    return 'summarize: %s' % (source['article'].iloc[n],)

In [11]:
# Predictions collected in row order, one decoded summary per prompt.
Pred = []

# Number of articles summarised per call to model.generate.
BATCH_SIZE = 32

# Walk over the dataset in consecutive batches.
for i in trange(0, len(df), BATCH_SIZE):

    # Clamp the upper bound so the final, shorter batch is handled directly.
    # This replaces a bare `except:` that would also have hidden unrelated
    # errors and swallowed KeyboardInterrupt.
    batch_end = min(i + BATCH_SIZE, len(df))
    TEXT = [prompt(j) for j in range(i, batch_end)]

    # Tokenise the batch (truncated and padded to 300 tokens) and place the
    # tensors on whichever device the model currently lives on.
    inputs = tokenizer(TEXT, max_length=300, padding="max_length", truncation=True, return_tensors='pt').to(model.device)

    # Generate summaries, capped at 300 tokens each.
    output = model.generate(**inputs, max_length=300)

    # Decode the generated token ids back to text for every summary in the batch.
    Pred.extend(tokenizer.batch_decode(output, skip_special_tokens=True))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.26s/it]


In [13]:
# Create a column named `Predictions` and assign the Pred list to it
# (Pred must hold exactly one entry per row of df).
df['Predictions'] = Pred

In [14]:
# Save the frame, including the predictions, to an Excel workbook without the index.
# NOTE(review): the file name refers to the two-record test set, while the load
# cell currently reads BART_Test_Merged_corrupted.xlsx — confirm the intended
# output path before re-running.
df.to_excel("../Test_2_records_Predicted.xlsx", index=False)