In [1]:
import os
import json
import time
import torch
import random 
import numpy as np
import pandas as pd
from statistics import mean
from matplotlib import pyplot as plt

import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import T5Config, AutoTokenizer, T5ForConditionalGeneration, AdamW, get_cosine_with_hard_restarts_schedule_with_warmup
from tqdm import tqdm, trange

import os
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [2]:
# read the file for which we need the predictions.
df = pd.read_excel('../DATA/combined_test_bottlesum_results.xlsx')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   article       103 non-null    object
 1   Summary       103 non-null    object
 2   highlights    103 non-null    object
 3   noisy         103 non-null    object
 4   Pronoun_swap  103 non-null    object
 5   deleted       103 non-null    object
dtypes: object(6)
memory usage: 5.0+ KB


In [4]:
df.head(2)

Unnamed: 0,article,Summary,highlights,noisy,Pronoun_swap,deleted
0,"(CNN)James Best, best known for his portrayal ...","(cnn) james best, best known for his portraya...","James Best, who played the sheriff on ""The Duk...","james best, who played the sheriff on ""the duk...","james best, who played an sheriff on ""the duke...","james best, played the sheriff sheriff on duke..."
1,(CNN)The attorney for a suburban New York card...,(cnn) the attorney for a suburban new york ca...,A lawyer for Dr. Anthony Moschetto says the ch...,a lawyer for dr. anthony moschetto did not did...,the lawyer for dr. anthony moschetto says an c...,lawyer for anthony says the charges are basele...


In [5]:
# seeding everything.
def seed_everything(seed):
    """
    Seeds basic parameters for reproductibility of results
    
    Arguments:
    
        seed {int} -- Number of the seed
    """
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed = 2021
seed_everything(seed)

In [6]:
# load the tokenizer and finetuned model.
tokenizer = AutoTokenizer.from_pretrained("t5-large")
model = T5ForConditionalGeneration.from_pretrained("../Training/T5_Model_BottleSum_1-340/")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [10]:
# moving the model to GPU.
_ = model.to('cuda:0')

In [8]:
"""
def prompt(n):
    #print(df.iloc[n]['highlights'])
    return 'summarize: %s' % (df.iloc[n]['distilbart-cnn-12-6'])
"""

# function to return the prompt at nth loaction in a predefined format for summarization.
def prompt(n):
    return 'summarize: %s' % (df.iloc[n]['Summary'])

In [11]:
# creating an empty list for saving the Predictions from model.
Pred = []

# looping thought all the examples in the dataset.
for i in trange(len(df)):

    # accessing the prompt using prompt function.
    text = prompt(i)
    
    # tokenising the text using tokenizer. And moving to GPU.
    inputs = tokenizer(text, return_tensors='pt').to('cuda:0')
    
    # generating the summary using generate method in model.
    output = model.generate(**inputs, max_length=200)
    
    #print("\n\n", tokenizer.decode(output[0], skip_special_tokens=True))
    Pred.append(tokenizer.decode(output[0], skip_special_tokens=True))

  2%|█▉                                                                                                  | 2/103 [00:03<02:44,  1.63s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (528 > 512). Running this sequence through the model will result in indexing errors
100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 103/103 [03:11<00:00,  1.86s/it]


In [12]:
# checking the lengths of df and Pred.
len(df), len(Pred)

(103, 103)

In [13]:
df['Predictions'] = Pred

In [14]:
df

Unnamed: 0,article,Summary,highlights,noisy,Pronoun_swap,deleted,Predictions
0,"(CNN)James Best, best known for his portrayal ...","(cnn) james best, best known for his portraya...","James Best, who played the sheriff on ""The Duk...","james best, who played the sheriff on ""the duk...","james best, who played an sheriff on ""the duke...","james best, played the sheriff sheriff on duke...","""dukes of hazzard"" star james best dies at 91...."
1,(CNN)The attorney for a suburban New York card...,(cnn) the attorney for a suburban new york ca...,A lawyer for Dr. Anthony Moschetto says the ch...,a lawyer for dr. anthony moschetto did not did...,the lawyer for dr. anthony moschetto says an c...,lawyer for anthony says the charges are basele...,"anthony moschetto, 54, pleaded not guilty to c..."
2,(CNN)President Barack Obama took part in a rou...,obama took part in a roundtable discussion on...,"""No challenge poses more of a public threat th...","""no challenge did not pose more of a public th...","""no challenge poses more of the public threat ...","""no poses more of public threat than climate t...",obama says climate change is a public health i...
3,Moscow (CNN)A Russian TV channel aired Hillary...,moscow (cnn) a russian tv video rating it mat...,"Presidential hopeful's video, featuring gay co...","presidential hopeful's video, featuring gay co...","presidential hopeful's video, featuring gay co...","presidential video, featuring gay gay couple, ...",a russian tv channel says there are no precede...
4,(CNN)Marco Rubio is all in. The Republican se...,republican florida republican presidential he...,"Raul Reyes: In seeking Latino vote, Marco Rubi...","raul reyes: in seeking latino vote, marco rubi...","raul reyes: in seeking latino vote, marco rubi...","raul in in seeking vote, marco rubio his own e...",ruben navarrette: rubio is a newcomer to the p...
...,...,...,...,...,...,...,...
98,"Sanaa, Yemen (CNN)As the transport plane comes...","(cnn) to sanaa airport, the deep scars of the...",Almost 16 million people in Yemen are in need ...,almost 16 million people in yemen did not be i...,almost 16 million people in yemen are in need ...,almost 16 million people in yemen are are in n...,a u.n. mission to yemen is a rare opportunity ...
99,(CNN)A former U.S. Army enlistee who posted on...,"army enlistee who about """" was charged with t...","Alexander Blair, 28, of Topeka accused of know...","john t. booker jr., 20, of topeka accused of k...","alexander blair, 28, of topeka accused of know...","alexander blair, of accused of knowing about b...","john booker abdullah, a confidential informant..."
100,Chelsea took a commanding 3-1 lead back to wes...,chelsea took a commanding 3-1 lead west in ma...,Chelsea forward Tammy Abraham nets first-half ...,chelsea forward dominic solanke nets first-hal...,chelsea forward tammy abraham nets first-half ...,chelsea forward tammy abraham abraham first-ha...,chelsea took a commanding 3-1 lead at academy ...
101,Alejandro Valverde successfully defended his F...,alejandro valverde defended his wallonne titl...,Alejandro Valverde won ahead of Julian Alaphil...,alejandro valverde won ahead of julian alaphil...,alejandro valverde won ahead of julian alaphil...,valverde won ahead ahead of julian and and mic...,alejandro valverde defended his wallonne title...


In [15]:
# save the predictions to an excel or csv.
df.to_excel("../DATA/T5_Predictions_test_BottleSum_1-340.xlsx", index=False)

In [1]:
import pandas as pd
df = pd.read_excel("../DATA/BART_Validation_Merged_corrupted.xlsx")

In [2]:
df

Unnamed: 0,highlights,distilbart-cnn-12-6,id,article,noisy,Pronoun_replaced,deleted_added_noise
0,"Accident happens in Santa Ynez, California, ne...",Singer-songwriter David Crosby hit a jogger w...,0044e296ecfe3ba57a351ad2a36d034491e878ce,(CNN)Singer-songwriter David Crosby hit a jogg...,"accident happens in california, santa ynez, ne...","accident happens in santa ynez, california, ne...","accident happens happens in santa ynez, ynez, ..."
1,Sigma Alpha Epsilon is being tossed out by the...,Video shows party-bound Sigma Alpha Epsilon m...,00716be72be8cf48cc23ac3b4b8924e569628be2,(CNN)Sigma Alpha Epsilon is under fire for a v...,sigma alpha epsilon is being tossed out by yal...,sigma alpha epsilon is being tossed out by a u...,sigma alpha is being tossed out by the univers...
2,Religion professor Candida Moss appears in eac...,"Candida Moss was an adviser on the ""True Cros...",00ac882e1a7f4862fa9f1e863b738966625f554d,(CNN)I'm Candida Moss and I am professor of Ne...,religion professor candida moss did not appear...,religion professor candida moss appears in eac...,religion professor candida candida moss appear...
3,Two police officers were shot Wednesday in Fer...,In three-quarters of all U.S. cities with pop...,00c45eb98a06f9218170edf5767617cc20991840,(CNN)Ferguson is crumbling. The cowardly and r...,two police officers were shot wednesday in fer...,two police officers were shot wednesday in fer...,two police officers were shot wednesday in fer...
4,Clinton Foundation has taken money from foreig...,The Clinton Foundation admitted last month th...,00dc77ce6c1d10bf1160829109b4f7d2a450823b,"Coral Gables, Florida (CNN)Former President Bi...",bill clinton foundation has taken money from f...,clinton foundation has taken money from foreig...,clinton clinton foundation has taken money fro...
...,...,...,...,...,...,...,...
995,Former Hugh Hefner companion Holly Madison has...,"Holly Madison's memoir, ""Down the Rabbit Hole...",d468092a6d909f36d26f98bcc9f14f28f8074e4e,(CNN)Hugh Hefner's Playboy Mansion was a gilde...,former hugh hefner companion madison has a mem...,former hugh hefner companion holly madison has...,former hugh hefner companion holly madison a m...
996,Sturt Manning: Video shows ISIS destroying app...,ISIS has been busy trying to damage the famed...,d4cbc2386a97964145c7e6acb8fe18e16c12d5d7,"(CNN)Confucius said: ""Study the past if you wo...",sturt manning: video did not show did not bedi...,sturt manning: video shows isis destroying app...,manning: video shows isis destroying destroyin...
997,Fred Craddock revolutionized art of preaching ...,The Rev. Fred Craddock redefined the art of p...,d4e931e515096bd5350290b55738ccbad83c11cb,"(CNN)The Rev. Fred Craddock, the pulpit giant ...",fred craddock did not revolutionize art of pre...,fred craddock revolutionized art of preaching ...,fred craddock revolutionized art preaching . ....
998,The man known as Africa often helped employees...,Los Angeles police shot and killed a homeless...,d532c558cec4abfa5ae40ac56c5db5aa51cb6363,Los Angeles (CNN)No one knew him by his real n...,the man known as africa often did not help emp...,a man known as africa often helped employees a...,man known africa often employees employees at ...
