# Qualitative Analysis

In [1]:
abs_root = '/ssd_scratch/cvit/adhiraj_deshmukh'
abs_code = f'{abs_root}/ANLP-Project'
abs_data = f'{abs_code}/data'

## Load the Model

In [2]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# bertscore = load("bertscore", cache_dir=f'{abs_root}/hf_cache') 

# tokenizer =  AutoTokenizer.from_pretrained('facebook/bart-large-cnn', cache_dir=f'{abs_root}/hf_cache')
tokenizer =  AutoTokenizer.from_pretrained('facebook/bart-large-cnn', cache_dir=f'{abs_root}/hf_cache', add_prefix_space=True)

model = AutoModelForSeq2SeqLM.from_pretrained(f"{abs_root}/bart_cnn")
# model =  AutoModelForSeq2SeqLM.from_pretrained('facebook/bart-large-cnn', cache_dir=f'{abs_root}/hf_cache')

# model = model.eval()

## Tokenize a Sample Article

In [3]:
ARTICLE_TO_SUMMARIZE = """sebastian vettel is determined to ensure the return of a long-standing ritual at ferrari is not a one-off this season. fresh from ferrari's first victory in 35 grands prix in malaysia 11 days ago, and ending his own 20-race drought, vettel returned to a hero's welcome at the team's factory at maranello last week. the win allowed ferrari to revive a tradition not seen at their base for almost two years since their previous triumph in may 2013 at the spanish grand prix courtesy of fernando alonso. sebastian vettel reflected on his stunning win for ferrari at the malaysian grand prix during the press conference before the weekend's chinese grand prix in shanghai the four-time world champion shares a friendly discussion with mclaren star jenson button four-times world champion vettel said: 'it was a great victory we had in malaysia, great for us as a team, and for myself a very emotional day - my first win with ferrari. 'when i returned to the factory on wednesday, to see all the people there was quite special. there are a lot of people working there and as you can imagine they were very, very happy. 'the team hadn't won for quite a while, so they enjoyed the fact they had something to celebrate. there were a couple of rituals involved, so it was nice for them to get that feeling again.' asked as to the specific nature of the rituals, vettel replied: 'i was supposed to be there for simulator work anyway, but it was quite nice to receive the welcome after the win. ferrari's vettel and britta roeske arrive at the shanghai circuit along with a ferrari mechanic, vettel caught up with members of his old team red bull on thursday 'all the factory got together for a quick lunch. it was quite nice to have all the people together in one room - it was a big room! - so we were able to celebrate altogether for a bit. 'i also learned when you win with ferrari, at the entry gate, they raise a ferrari flag - and obviously it's been a long time since they last did that. 'some 10 years ago there were a lot of flags, especially at the end of a season, so this flag will stay there for the rest of the year. 'we will, of course, try and put up another one sometime soon.' inside the ferrari garage, vettel shares a discussion with team staff as he looks to build on his sepang win ferrari team principal maurizio arrivabene shares a conversation with vettel at the team's hospitality suite the feeling is that will not happen after this weekend's race in china as the conditions at the shanghai international circuit are expected to suit rivals mercedes. not that vettel believes his success will be a one-off, adding: 'for here and the next races, we should be able to confirm we have a strong package and a strong car. 'we will want to make sure we stay ahead of the people we were ahead of in the first couple of races, but obviously knowing mercedes are in a very, very strong position. 'in general, for the start of a season things can be up and down, and we want to make sure there is quite a lot of ups, not so many downs. 'but it's normal in some races you are more competitive than others. 'we managed to do a very good job in malaysia, but for here and the next races we have to be realistic about we want to achieve.' ferrari mechanics show their joy after vettel won the malaysian grand prix, helping record the team's first formula one win since 2013 at the spanish grand prix"""

In [None]:
tokenized_input = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=512, truncation=True, return_tensors='pt')

## Summarize

In [5]:
summary_ids = model.generate(tokenized_input['input_ids'], max_length=15, early_stopping=True)



In [6]:
print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])

['\'. . . . \' \' \' " " "']


In [7]:
for i, beam in enumerate(summary_ids):
    print(f'{i}: {tokenizer.decode(beam, skip_special_tokens=True)}')

0: '....'''" " "


## Example-2

In [8]:
text = "Since, 2010, project researchers have uncovered documents in portugal that have revealed who owned the ship."

# encode the text into tensor of integers using the appropriate tokenizer
inputs = tokenizer.encode("paraphrase: " + text, return_tensors="pt", max_length=512, truncation=True)

max_l = 512
num_b = 10
num_sub_b =1

# generate text until the output length (which includes the context length) reaches 50
beam_outputs = model.generate(inputs,max_length=max_l,num_beams=num_b,early_stopping=True,
    no_repeat_ngram_size=3,
    num_return_sequences=10,
    top_k=4, top_p=0.95
    # return_dict_in_generate=True,
)

for x, beam in enumerate(beam_outputs):
    print("{} {}".format(i, tokenizer.decode(beam, skip_special_tokens=True)))



0 arierryprojectproject project project project researcherscerscerscersResearchers Researchers Researchers ResearcherscersResearcherscers Researcherscers Researchers ResearchersResearchersResearchers ResearchersResearchers Researchers Scientists Scientists Scientists Researchers Scientists Researchers Researchers They They They Who They Who Who Who They They She She She Who Who She She They Who She Who She They She Who They She They They... ) ) ) :.. : ) : : :. : : ).. ; : : ; ; : ) ) ; :. ) : ; : ; ) :,,, :, : :, ; :,..,, ) : ) ] ] ] : ) ; ) ) ] : : ] > > >>>> >> > > <<<<><< <<>>< < << <>> <<.<<< </<<><<<</<<>.>>>...>>..... : ;.. ] ><<.. : :.. ;. :. ; ; ; ) ; ;. ) ;. ; ),, ; ;., ;. ; :. ;. :., : ;.. ) ; } : ; ] : ;  ; ; ] ; ;,,. :. ) :. : ;? : :? : ; > > : ; } > >. ) ). :? >>. ) ] ) ; ] ) :? ; :?..,. ;    : :   ;, ; ).. ). ;.. ). ), ;  ) ;  : ; { : ::... ; ; } } : : { :: : :;. ; }. :...?..?. ; ). :?. ::.. }. );. : ). ;. ). )?. )? }. ;? }. ; }.. ), ; :: ; ;; ; :; ; ; {:. ;, :? } }. ] ]

## Example-3

In [9]:
text = "Experts say China's air pollution exacts a tremendous toll on human health."

# encode the text into tensor of integers using the appropriate tokenizer
inputs = tokenizer.encode("paraphrase: " + text, return_tensors="pt", max_length=512, truncation=True)

max_l = 512
num_b = 10
num_sub_b =1

# generate text until the output length (which includes the context length) reaches 50
beam_outputs = model.generate(inputs,max_length=max_l,num_beams=num_b,early_stopping=True,
    no_repeat_ngram_size=3,
    num_return_sequences=10,
    top_k=4, top_p=0.95
    # return_dict_in_generate=True,
)

for x, beam in enumerate(beam_outputs):
    print("{} {}".format(i, tokenizer.decode(beam, skip_special_tokens=True)))

0 '.... ) ) ) :.. : ) : ) ) ;.... :. ) : :.... ) :. ) ). ). :.. : ).. :. : : : ). : :. :. ;. : ;. ; : ) ; :.. ;:.. ;...:. ).... )) ) ))..
0 '.... ) ) ) :.. : ) : ) ) ;.... :. ) : :.... ) :. ) ). ). :.. : ).. :. : : : ). : :. :. ;. : ;. ; : ) ; :.. ;:.. ;...:. ).... )) ) )):..
0 '.... ) ) ) :.. : ) : ) ) ;.... :. ) : :.... ) :. ) ). ). :.. : ).. :. : : : ). : :. :. ;. : ;. ; : ) ; :.. ;:.. ;...:. ).... )) ) )). )
0 '.... ) ) ) :.. : ) : ) ) ;.... :. ) : :.... ) :. ) ). ). :.. : ).. :. : : : ). : :. :. ;. : ;. ; : ) ; :.. ;:.. ;...:. ).... )) )..
0 '.... ) ) ) :.. : ) : ) ) ;.... :. ) : :.... ) :. ) ). ). :.. : ).. :. : : : ). : :. :. ;. : ;. ; : ) ; :.. ;:.. ;...:. ).:..: )..
0 '.... ) ) ) :.. : ) : ) ) ;.... :. ) : :.... ) :. ) ). ). :.. : ).. :. : : : ). : :. :. ;. : ;. ; : ) ; :.. ;:.. ;...:. ).... )) ) )
0 '.... ) ) ) :.. : ) : ) ) ;.... :. ) : :.... ) :. ) ). ). :.. : ).. :. : : : ). : :. :. ;. : ;. ; : ) ; :.. ;:.. ;...:. ).... )) ).
0 '.... ) ) ) :.. : ) : ) ) ;.... :. ) : :.... 

## Evaluate

In [10]:
# # load rouge for validation
# rouge = datasets.load_metric("rouge")

# def compute_metrics(pred):
#     labels_ids = pred.label_ids
#     pred_ids = pred.predictions

#     # all unnecessary tokens are removed
#     pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
#     labels_ids[labels_ids == -100] = tokenizer.pad_token_id
#     label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

#     rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

#     return {
#         "rouge2_precision": round(rouge_output.precision, 4),
#         "rouge2_recall": round(rouge_output.recall, 4),
#         "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
#     }

In [11]:
# import datasets
# from transformers import BertTokenizer, EncoderDecoderModel

# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# model = EncoderDecoderModel.from_pretrained("./checkpoint-16")
# model.to("cuda")

# test_data = datasets.load_dataset("cnn_dailymail", "3.0.0", split="test")

# # only use 16 training examples for notebook - DELETE LINE FOR FULL TRAINING
# test_data = test_data.select(range(16))

# batch_size = 16  # change to 64 for full evaluation

# # map data correctly
# def generate_summary(batch):
#     # Tokenizer will automatically set [BOS] <text> [EOS]
#     # cut off at BERT max length 512
#     inputs = tokenizer(batch["article"], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
#     input_ids = inputs.input_ids.to("cuda")
#     attention_mask = inputs.attention_mask.to("cuda")

#     outputs = model.generate(input_ids, attention_mask=attention_mask)

#     # all special tokens including will be removed
#     output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)

#     batch["pred"] = output_str

#     return batch

# results = test_data.map(generate_summary, batched=True, batch_size=batch_size, remove_columns=["article"])

# pred_str = results["pred"]
# label_str = results["highlights"]

# rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

# print(rouge_output)