In [2]:
import os
import json
import time
import torch
import random 
import numpy as np
import pandas as pd
from statistics import mean
from matplotlib import pyplot as plt

import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, GPTNeoForCausalLM, AdamW, get_cosine_with_hard_restarts_schedule_with_warmup
from tqdm import tqdm, trange

import os
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [3]:
# Load the spreadsheet holding the rows the model has to produce predictions for.
TEST_FILE = '../DATA/BART_Test_Merged_corrupted.xlsx'
df = pd.read_excel(TEST_FILE)

In [4]:
# Preview the first two rows to check which columns were loaded.
df.head(2)

Unnamed: 0,highlights,distilbart-cnn-12-6,id,article,noisy,Pronoun_replaced,deleted_added_noise
0,Experts question if packed out planes are put...,U.S consumer advisory group set up by the Dep...,92c514c913c0bdfe25341af9fd72b29db544099b,Ever noticed how plane seats appear to be gett...,experts did not question if packed out planes...,experts question if packed out planes are putt...,if packed out planes planes are putting passen...
1,Drunk teenage boy climbed into lion enclosure ...,"Rahul Kumar, 17, climbed into the enclosure a...",2003841c7dc0e7c5b1a248f9cd536d727f27a45a,A drunk teenage boy had to be rescued by secur...,drunk teenage boy did not climb into lion encl...,drunk teenage boy climbed into lion enclosure ...,drunk climbed lion enclosure at in west . kuma...


In [9]:
# Print column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 503 entries, 0 to 502
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   highlights           503 non-null    object
 1   distilbart-cnn-12-6  503 non-null    object
 2   id                   503 non-null    object
 3   article              503 non-null    object
 4   noisy                503 non-null    object
 5   Pronoun_replaced     503 non-null    object
 6   deleted_added_noise  503 non-null    object
dtypes: object(7)
memory usage: 27.6+ KB


In [5]:
# Seed every random number generator the notebook relies on.
def seed_everything(seed):
    """
    Seed the RNGs used in this notebook so that results are reproducible.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and every CUDA
    device), exports ``PYTHONHASHSEED``, and switches cuDNN to deterministic
    mode with benchmarking turned off.

    Arguments:

        seed {int} -- Number of the seed
    """
    random.seed(seed)
    # PYTHONHASHSEED is read at interpreter start-up, so setting it here only
    # affects Python processes launched after this point (e.g. worker processes).
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all seeds every visible GPU rather than only the current one;
    # it is a silent no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # Trade some speed for repeatable cuDNN kernels.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed = 2021
seed_everything(seed)

In [6]:
# The tokenizer is fetched by its hub identifier, while the model weights are
# read from the local ../Training/GPT_Model/ directory.
TOKENIZER_NAME = "EleutherAI/gpt-neo-125M"
MODEL_DIR = "../Training/GPT_Model/"

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
model = GPTNeoForCausalLM.from_pretrained(MODEL_DIR)

In [68]:
# Pad on the left instead of the right: the generation loop strips the prompt
# by slicing off the first input-length tokens, which needs the prompt to end
# exactly where the generated text begins.
tokenizer.padding_side = 'left'

# GPT has no dedicated pad token, so the end-of-sequence token doubles as padding.
tokenizer.pad_token = tokenizer.eos_token

# Mirror the same choice in the model config so both sides agree on the pad id.
eos_id = model.config.eos_token_id
model.config.pad_token_id = eos_id

In [7]:
# Move the model to the first GPU.
# The result is bound to `_`, presumably so the notebook does not echo the
# returned model as the cell output.
_ = model.to('cuda:0')

In [8]:
# function to return the prompt at nth location in a predefined format for summarization.
def prompt(n, data=None, column='distilbart-cnn-12-6'):
    """
    Build the summarization prompt for the row at position ``n``.

    Arguments:

        n {int} -- positional row index (``iloc`` semantics, so negative
            values count from the end).
        data {pandas.DataFrame} -- frame to read from; when omitted the
            notebook-level ``df`` is used, so ``prompt(n)`` keeps working
            exactly as before.
        column {str} -- name of the column whose text is embedded in the prompt.

    Returns:

        str -- the cell text prefixed with ``'summarize: '``.

    Raises:

        IndexError -- if ``n`` lies outside the frame.
    """
    frame = df if data is None else data
    text = frame.iloc[n][column]
    # Wrap in a 1-tuple so %-formatting inserts the value as a single item
    # whatever its type.
    return 'summarize: %s' % (text,)

In [71]:
# creating an empty list for saving the Predictions from model.
Pred = []

# Assigning some Batch size for inference.
BATCH_SIZE = 16

# Every prompt is padded/truncated to exactly MAX_PROMPT_TOKENS tokens, and
# generate() counts the prompt towards MAX_TOTAL_TOKENS, so each summary can
# grow by at most MAX_TOTAL_TOKENS - MAX_PROMPT_TOKENS new tokens.
MAX_PROMPT_TOKENS = 500
MAX_TOTAL_TOKENS = 700

# looping through all the examples in the dataset, one batch at a time.
for i in trange(0, len(df), BATCH_SIZE):

    # Clamp the end of the range so the last batch simply stops at the final
    # row. This replaces a bare `except:` that relied on an out-of-range row
    # raising, and that would also have hidden any unrelated error.
    batch_end = min(i + BATCH_SIZE, len(df))
    TEXT = [prompt(j) for j in range(i, batch_end)]

    # tokenising the text and moving the tensors to the device the model is on.
    inputs = tokenizer(
        TEXT,
        max_length=MAX_PROMPT_TOKENS,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    ).to(model.device)

    # generating the summary using generate method in model.
    # NOTE(review): do_sample is not passed, so decoding is presumably
    # deterministic and `temperature` likely has no effect - confirm whether
    # sampling was intended.
    output = model.generate(
        **inputs,
        temperature=0.8,
        repetition_penalty=2.5,
        max_length=MAX_TOTAL_TOKENS,
    )

    # The returned sequences start with the prompt tokens; drop them so only
    # the newly generated part is decoded back to text.
    prompt_len = inputs['input_ids'].shape[-1]
    Pred.extend(tokenizer.batch_decode(output[:, prompt_len:], skip_special_tokens=True))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [05:03<00:00,  9.47s/it]


In [75]:
# Create a column named 'Predictions' holding the generated summaries from the Pred list.
# Pred is filled in row order by the inference loop, so entry k belongs to row k of df.
df['Predictions'] = Pred

In [76]:
# Preview the first two rows again, now including the 'Predictions' column.
df.head(2)

Unnamed: 0,highlights,distilbart-cnn-12-6,id,article,noisy,Pronoun_replaced,deleted_added_noise,Pred_1ctions,Predictions
0,Experts question if packed out planes are put...,U.S consumer advisory group set up by the Dep...,92c514c913c0bdfe25341af9fd72b29db544099b,Ever noticed how plane seats appear to be gett...,experts did not question if packed out planes...,experts question if packed out planes are putt...,if packed out planes planes are putting passen...,\nAACACACACACACACACACACACACACACACACACACACACAC...,\nU.S. Consumer Advisory Group set up by the D...
1,Drunk teenage boy climbed into lion enclosure ...,"Rahul Kumar, 17, climbed into the enclosure a...",2003841c7dc0e7c5b1a248f9cd536d727f27a45a,A drunk teenage boy had to be rescued by secur...,drunk teenage boy did not climb into lion encl...,drunk teenage boy climbed into lion enclosure ...,drunk climbed lion enclosure at in west . kuma...,\n\nTheCRCRCRCRCRCRCRCRCRCRCRCRCRCRCRCRCRCRCR...,\nRahul Kumar climbed into the enclosure at Ka...


In [77]:
# Write the frame, predictions included, to an Excel workbook without the row index.
# NOTE(review): the recorded preview of df shows an extra 'Pred_1ctions' column
# that no visible cell creates; it would have been written to this file as well
# and will be absent after a fresh Restart & Run All - confirm which is intended.
OUTPUT_FILE = "../DATA/GPT_Predictions_test.xlsx"
df.to_excel(OUTPUT_FILE, index=False)