In [None]:
# Important links
# https://github.com/huggingface/transformers/blob/master/examples/pytorch/summarization/README.md
# https://colab.research.google.com/github/huggingface/datasets/blob/master/notebooks/Overview.ipynb


In [None]:
! pip install datasets transformers



Make sure your version of Transformers is at least 4.11.0 since the functionality was introduced in that version:

In [None]:
import transformers

# Print the installed Transformers version so it can be checked against the minimum requirement.
print(transformers.__version__)

4.12.5


You can find a script version of this notebook to fine-tune your model in a distributed fashion using multiple GPUs or TPUs [here](https://github.com/huggingface/transformers/tree/master/examples/language-modeling).

## Preparing the dataset

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

In [None]:
from datasets import load_dataset, load_metric

In [None]:
from pprint import pprint

In [None]:
# A version number is required when loading the cnn_dailymail dataset; version 3.0.0 is used here.
# Nguyen et al. used the non-anonymized version (available only as version 3), so we use the same.
# Details: https://huggingface.co/datasets/cnn_dailymail
#'summarization': Versions 2.0.0 and 3.0.0 of the CNN / DailyMail Dataset can be used to train a model for abstractive and extractive summarization (Version 1.0.0 was developed for machine reading and comprehension and abstractive question answering). The model performance is measured by how high the output summary's ROUGE score for a given article is when compared to the highlight as written by the original article author. Zhong et al (2020) report a ROUGE-1 score of 44.41 when testing a model trained for extractive summarization. See the Papers With Code leaderboard for more models.

In [None]:
# Load the CNN / DailyMail corpus, configuration "3.0.0".
dataset = load_dataset(
    "cnn_dailymail",
    "3.0.0",
)

Reusing dataset cnn_dailymail (/root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234)


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Show the dataset summary: it is organised as a dictionary of splits
# (train / validation / test), each listing its features and row count.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})


In [None]:
# Each example in a split is a dictionary; fetch the first training example and display it.
doc_train_0 = dataset['train'][0]
doc_train_0

{'article': 'It\'s official: U.S. President Barack Obama wants lawmakers to weigh in on whether to use military force in Syria. Obama sent a letter to the heads of the House and Senate on Saturday night, hours after announcing that he believes military action against Syrian targets is the right step to take over the alleged use of chemical weapons. The proposed legislation from Obama asks Congress to approve the use of military force "to deter, disrupt, prevent and degrade the potential for future uses of chemical weapons or other weapons of mass destruction." It\'s a step that is set to turn an international crisis into a fierce domestic political battle. There are key questions looming over the debate: What did U.N. weapons inspectors find in Syria? What happens if Congress votes no? And how will the Syrian government react? In a televised address from the White House Rose Garden earlier Saturday, the president said he would take his case to Congress, not because he has to -- but bec

In [None]:
# Pull out just the 'article' text of the first training example and display it.
doc_train_0_article = dataset['train'][0]['article']
doc_train_0_article

'It\'s official: U.S. President Barack Obama wants lawmakers to weigh in on whether to use military force in Syria. Obama sent a letter to the heads of the House and Senate on Saturday night, hours after announcing that he believes military action against Syrian targets is the right step to take over the alleged use of chemical weapons. The proposed legislation from Obama asks Congress to approve the use of military force "to deter, disrupt, prevent and degrade the potential for future uses of chemical weapons or other weapons of mass destruction." It\'s a step that is set to turn an international crisis into a fierce domestic political battle. There are key questions looming over the debate: What did U.N. weapons inspectors find in Syria? What happens if Congress votes no? And how will the Syrian government react? In a televised address from the White House Rose Garden earlier Saturday, the president said he would take his case to Congress, not because he has to -- but because he want

In [None]:
!pip install --user -U nltk



In [None]:
# Note: nltk was ultimately not used in this notebook.
import nltk
from nltk.corpus import stopwords

In [None]:
# used spacy for all downstream text processing
!pip install -U spacy



In [None]:
import spacy

In [None]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
[K     |████████████████████████████████| 13.9 MB 4.5 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [None]:
# Load the small English spaCy pipeline downloaded above.
nlp = spacy.load("en_core_web_sm")

In [None]:
# All article texts of the training split.
doc_train_article = dataset['train']['article']

In [None]:
# Preview the first five training articles.
doc_train_article[:5]

['It\'s official: U.S. President Barack Obama wants lawmakers to weigh in on whether to use military force in Syria. Obama sent a letter to the heads of the House and Senate on Saturday night, hours after announcing that he believes military action against Syrian targets is the right step to take over the alleged use of chemical weapons. The proposed legislation from Obama asks Congress to approve the use of military force "to deter, disrupt, prevent and degrade the potential for future uses of chemical weapons or other weapons of mass destruction." It\'s a step that is set to turn an international crisis into a fierce domestic political battle. There are key questions looming over the debate: What did U.N. weapons inspectors find in Syria? What happens if Congress votes no? And how will the Syrian government react? In a televised address from the White House Rose Garden earlier Saturday, the president said he would take his case to Congress, not because he has to -- but because he wan

In [None]:
# All article texts of the validation split; the topic models below are built from these.
doc_validation_article = dataset['validation']['article']

In [None]:
# Wrap every validation article in its own single-element list.
doc_lst = [[article] for article in doc_validation_article]

In [None]:
# Preview the first five wrapped articles.
doc_lst[:5]

[['(CNN)Singer-songwriter David Crosby hit a jogger with his car Sunday evening, a spokesman said. The accident happened in Santa Ynez, California, near where Crosby lives. Crosby was driving at approximately 50 mph when he struck the jogger, according to California Highway Patrol Spokesman Don Clotworthy. The posted speed limit was 55. The jogger suffered multiple fractures, and was airlifted to a hospital in Santa Barbara, Clotworthy said. His injuries are not believed to be life threatening. "Mr. Crosby was cooperative with authorities and he was not impaired or intoxicated in any way. Mr. Crosby did not see the jogger because of the sun," said Clotworthy. According to the spokesman, the jogger and Crosby were on the same side of the road. Pedestrians are supposed to be on the left side of the road walking toward traffic, Clotworthy said. Joggers are considered pedestrians. Crosby is known for weaving multilayered harmonies over sweet melodies. He belongs to the celebrated rock grou

In [None]:
# Stream the articles through the pipeline with nlp.pipe, which processes texts in
# batches and is faster than calling nlp() once per document. list() materialises
# the resulting Doc objects so docs_nlp can still be indexed and iterated repeatedly.
docs_nlp = list(nlp.pipe(doc_validation_article))

In [None]:
# Preview the first five processed documents.
docs_nlp[:5]

[(CNN)Singer-songwriter David Crosby hit a jogger with his car Sunday evening, a spokesman said. The accident happened in Santa Ynez, California, near where Crosby lives. Crosby was driving at approximately 50 mph when he struck the jogger, according to California Highway Patrol Spokesman Don Clotworthy. The posted speed limit was 55. The jogger suffered multiple fractures, and was airlifted to a hospital in Santa Barbara, Clotworthy said. His injuries are not believed to be life threatening. "Mr. Crosby was cooperative with authorities and he was not impaired or intoxicated in any way. Mr. Crosby did not see the jogger because of the sun," said Clotworthy. According to the spokesman, the jogger and Crosby were on the same side of the road. Pedestrians are supposed to be on the left side of the road walking toward traffic, Clotworthy said. Joggers are considered pedestrians. Crosby is known for weaving multilayered harmonies over sweet melodies. He belongs to the celebrated rock group 

In [None]:
# Lemma of the first entity detected in the first validation article.
# Note: the recorded output shows the "(CNN)" dateline prefix fused into the entity text.
docs_nlp[0].ents[0].lemma_

'cnn)singer-songwriter David Crosby'

In [None]:
# Part-of-speech tags kept when building the topic-model vocabulary.
allowed_pos = [
    "NOUN",
    "ADJ",
    "PROPN",
    "VERB",
]

In [None]:
# Tokenisation: for every parsed document keep the lemma of each token that is
# not a stop word and whose part of speech is one of allowed_pos.
lemmas = [
    [tok.lemma_ for tok in parsed if not (tok.pos_ not in allowed_pos or tok.is_stop)]
    for parsed in docs_nlp
]

In [None]:
# for checking words distribution
import itertools

In [None]:
import pandas as pd

In [None]:
from collections import Counter

In [None]:
# Flatten the per-document lemma lists into one list of terms, in document order.
terms = [lemma for doc_lemmas in lemmas for lemma in doc_lemmas]
# Frequency of every term across the whole corpus.
count_terms = Counter(terms)
pd_terms = pd.Series(count_terms)
# Most frequent terms first.
sorted_count = pd_terms.sort_values(ascending=False)
sorted_count


say          59208
year         32283
time         18257
tell         14553
take         13649
             ...  
Xishan           1
Pollution        1
biomass          1
Guannan          1
toilete          1
Length: 97001, dtype: int64

In [None]:
# Spot-check the lemmatisation output for the first three documents.
lemmas[:3]

[['cnn)singer',
  'songwriter',
  'David',
  'Crosby',
  'hit',
  'jogger',
  'car',
  'Sunday',
  'evening',
  'spokesman',
  'say',
  'accident',
  'happen',
  'Santa',
  'Ynez',
  'California',
  'Crosby',
  'live',
  'Crosby',
  'drive',
  'mph',
  'strike',
  'jogger',
  'accord',
  'California',
  'Highway',
  'Patrol',
  'Spokesman',
  'Don',
  'Clotworthy',
  'post',
  'speed',
  'limit',
  'jogger',
  'suffer',
  'multiple',
  'fracture',
  'airlift',
  'hospital',
  'Santa',
  'Barbara',
  'Clotworthy',
  'say',
  'injury',
  'believe',
  'life',
  'threaten',
  'Mr.',
  'Crosby',
  'cooperative',
  'authority',
  'impaired',
  'intoxicate',
  'way',
  'Mr.',
  'Crosby',
  'jogger',
  'sun',
  'say',
  'Clotworthy',
  'accord',
  'spokesman',
  'jogger',
  'Crosby',
  'road',
  'pedestrian',
  'suppose',
  'left',
  'road',
  'walk',
  'traffic',
  'Clotworthy',
  'say',
  'jogger',
  'consider',
  'pedestrian',
  'Crosby',
  'know',
  'weave',
  'multilayere',
  'harmony',
 

In [None]:
# for topic modeling
!pip install gensim 



In [None]:
# for topic visualization
!pip install pyLDAvis



In [None]:
import pyLDAvis

  from collections import Iterable


In [None]:
import gensim

In [None]:
from gensim.test.utils import datapath

In [None]:
# create a dictionary
from gensim.corpora import Dictionary

In [None]:
# Build the gensim Dictionary (vocabulary) from the lemmatised documents.
dictionary1 = Dictionary(lemmas)

In [None]:
# Convert every lemmatised document into its bag-of-words (BoW) representation
# using the dictionary; together these form the document-term matrix.
corpus = list(map(dictionary1.doc2bow, lemmas))

In [None]:
# Vocabulary size: number of entries in the dictionary.
len(dictionary1)

97001

In [None]:
# Short alias for gensim's LdaModel class (no model is instantiated here;
# the models are created in the cells below).
LDA = gensim.models.LdaModel

In [None]:
# Fit an LDA topic model with 50 topics; random_state is fixed for repeatability.
lda_model_cnn = LDA(
    corpus=corpus,
    id2word=dictionary1,
    num_topics=50,
    random_state=1,
)

In [None]:
# Inspect the 20 highest-weighted words of each of the 50 topics.
lda_model_cnn.print_topics(
    num_topics=50,
    num_words=20,
)

[(0,
  '0.029*"United" + 0.013*"Manchester" + 0.010*"Gaal" + 0.010*"Arsenal" + 0.008*"season" + 0.008*"Old" + 0.008*"League" + 0.008*"Van" + 0.007*"Trafford" + 0.007*"van" + 0.007*"year" + 0.006*"club" + 0.006*"say" + 0.006*"look" + 0.006*"Louis" + 0.005*"Premier" + 0.005*"time" + 0.005*"Maria" + 0.004*"win" + 0.004*"play"'),
 (1,
  '0.009*"say" + 0.007*"water" + 0.006*"Earth" + 0.006*"surface" + 0.005*"image" + 0.005*"year" + 0.005*"light" + 0.005*"scientist" + 0.005*"air" + 0.005*"eclipse" + 0.004*"mile" + 0.004*"system" + 0.004*"show" + 0.004*"space" + 0.004*"sun" + 0.004*"solar" + 0.004*"create" + 0.004*"fly" + 0.004*"China" + 0.004*"large"'),
 (2,
  '0.016*"food" + 0.015*"say" + 0.013*"child" + 0.009*"health" + 0.007*"people" + 0.006*"year" + 0.006*"cent" + 0.005*"eat" + 0.005*"study" + 0.005*"day" + 0.005*"school" + 0.005*"find" + 0.005*"time" + 0.004*"help" + 0.004*"age" + 0.004*"good" + 0.004*"brain" + 0.004*"start" + 0.004*"research" + 0.004*"baby"'),
 (3,
  '0.020*"say" + 0.0

In [None]:
import pyLDAvis.gensim_models as gensimvis
# Enable inline display of pyLDAvis visualisations in the notebook.
pyLDAvis.enable_notebook()

In [None]:
# Prepare the pyLDAvis data for the 50-topic model.
cnn_topic50_vis = gensimvis.prepare(
    lda_model_cnn,
    corpus,
    dictionary1,
)

  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [None]:
# Display the 50-topic visualisation.
cnn_topic50_vis

In [None]:
# Save the 50-topic visualisation as an HTML file.
pyLDAvis.save_html(cnn_topic50_vis, 'cnn_topic50vis.html')

In [None]:
from google.colab import files
# Download the saved HTML file from the Colab runtime (Colab-only helper).
files.download('cnn_topic50vis.html')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Fit an LDA topic model with 100 topics; random_state is fixed for repeatability.
# NOTE(review): the recorded run printed warning traces from gensim internals
# (np.log / np.sum lines) for this fit — cause not established here, TODO confirm.
lda_model_cnn_100 = LDA(
    corpus=corpus,
    id2word=dictionary1,
    num_topics=100,
    random_state=1,
)

  diff = np.log(self.expElogbeta)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id,

In [None]:
# Inspect the 20 highest-weighted words of each of the 100 topics.
lda_model_cnn_100.print_topics(
    num_topics=100,
    num_words=20,
)

[(0,
  '0.018*"look" + 0.013*"dress" + 0.013*"wear" + 0.010*"fashion" + 0.009*"visit" + 0.009*"style" + 0.006*"year" + 0.005*"say" + 0.005*"collection" + 0.005*"black" + 0.005*"love" + 0.004*"event" + 0.004*"new" + 0.004*"good" + 0.004*"London" + 0.004*"model" + 0.004*"Kate" + 0.004*"designer" + 0.004*"hat" + 0.003*"Beckham"'),
 (1,
  '0.010*"say" + 0.009*"space" + 0.009*"metre" + 0.009*"water" + 0.009*"ft" + 0.007*"fly" + 0.007*"land" + 0.007*"year" + 0.007*"mission" + 0.007*"helicopter" + 0.006*"foot" + 0.006*"Mars" + 0.006*"plane" + 0.006*"aircraft" + 0.006*"air" + 0.006*"mile" + 0.005*"crew" + 0.005*"large" + 0.005*"painting" + 0.005*"crash"'),
 (2,
  '0.056*"food" + 0.037*"eat" + 0.017*"say" + 0.014*"meal" + 0.011*"weight" + 0.009*"shark" + 0.009*"day" + 0.008*"diet" + 0.008*"healthy" + 0.008*"child" + 0.008*"calorie" + 0.007*"health" + 0.006*"time" + 0.006*"drink" + 0.006*"lunch" + 0.005*"year" + 0.005*"insect" + 0.005*"restaurant" + 0.005*"egg" + 0.005*"fruit"'),
 (3,
  '0.025*"

In [None]:
# Prepare the pyLDAvis data for the 100-topic model.
cnn_topic100_vis = gensimvis.prepare(
    lda_model_cnn_100,
    corpus,
    dictionary1,
)

  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [None]:
# Display the 100-topic visualisation.
cnn_topic100_vis

In [None]:
# Save the 100-topic visualisation as an HTML file.
pyLDAvis.save_html(cnn_topic100_vis, 'cnn_topic100vis.html')

In [None]:
# Download the saved HTML file from the Colab runtime (Colab-only helper).
files.download('cnn_topic100vis.html')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Fit an LDA topic model with 200 topics; random_state is fixed for repeatability.
# NOTE(review): the recorded run printed warning traces from gensim internals
# (np.log / np.sum lines) for this fit — cause not established here, TODO confirm.
lda_model_cnn_200 = LDA(
    corpus=corpus,
    id2word=dictionary1,
    num_topics=200,
    random_state=1,
)

  diff = np.log(self.expElogbeta)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id,

In [None]:
# Inspect the 20 highest-weighted words of each of the 200 topics.
lda_model_cnn_200.print_topics(
    num_topics=200,
    num_words=20,
)

[(0,
  '0.024*"dress" + 0.020*"look" + 0.015*"wear" + 0.014*"visit" + 0.009*"Sarah" + 0.008*"site" + 0.007*"pair" + 0.007*"year" + 0.007*"style" + 0.007*"clothe" + 0.006*"say" + 0.006*"picture" + 0.006*"woman" + 0.006*"Kate" + 0.006*"love" + 0.005*"write" + 0.004*"mirror" + 0.004*"beauty" + 0.004*"model" + 0.004*"brand"'),
 (1,
  '0.040*"Mars" + 0.016*"Malaysia" + 0.016*"say" + 0.016*"Alaska" + 0.012*"search" + 0.011*"satellite" + 0.011*"Boeing" + 0.011*"aircraft" + 0.011*"flight" + 0.010*"orbit" + 0.010*"mission" + 0.010*"Kuala" + 0.010*"crew" + 0.009*"mh370" + 0.008*"formation" + 0.008*"Lumpur" + 0.008*"revolutionise" + 0.008*"Beijing" + 0.008*"chinese" + 0.008*"plane"'),
 (2,
  '0.022*"child" + 0.022*"love" + 0.019*"say" + 0.016*"age" + 0.013*"year" + 0.011*"start" + 0.011*"old" + 0.010*"mother" + 0.010*"time" + 0.010*"Ashton" + 0.009*"math" + 0.008*"family" + 0.008*"facial" + 0.007*"get" + 0.007*"theatre" + 0.007*"Life" + 0.007*"couple" + 0.006*"know" + 0.006*"want" + 0.006*"come"'

In [None]:
# Prepare the pyLDAvis data for the 200-topic model.
# Original author's note: "colab collapsed here"
# (presumably the Colab session crashed at this step — TODO confirm).
cnn_topic200_vis = gensimvis.prepare(
    lda_model_cnn_200,
    corpus,
    dictionary1,
)

  by='saliency', ascending=False).head(R).drop('saliency', 1)
