## Scrape any two articles of choice, summarize them and perform NLP tasks on the summaries

### Import Libraries

In [7]:
import requests #Requests information from url
from bs4 import BeautifulSoup  #Makes the page received more understandable(beautiful)
import gensim 
import docx
from gensim.summarization import summarize

### Scraping and summerizing article 1

In [8]:
url1 = 'https://www.npr.org/2021/05/26/1000448553/tiny-fund-scores-historic-win-in-battle-against-exxonmobil-over-future-of-oil'

In [10]:
requests.get(url1)
page = requests.get(url1).text
soup = BeautifulSoup(page)
header = soup.find('h1').get_text()
date = soup.find(name = 'span', attrs = {'class':'date'}).text
p_tags = soup.find_all('p')
text = [tags.get_text().strip() for tags in p_tags]
sentence_list = [sentence for sentence in text if not '\n' in sentence]
body = [sentence for sentence in sentence_list if '.' in sentence]
article = ''.join(body)
summary1 = summarize(article, 1)
doc = docx.Document()
doc.add_heading(header)
doc.add_picture('pic01.png')
doc.add_paragraph(date)
doc.add_paragraph(summary1)
doc.save('article1.docx')

In [12]:
summary1

'Pictured are pumps at an Exxon gas station in Charlotte, N.C. A tiny fund got two board members elected to the oil giant\'s board, delivering a historic defeat to ExxonMobil.In a dramatic boardroom battle on Wednesday, a tiny hedge fund fought with the energy giant ExxonMobil over the future of the oil and gas industry — and won.The brand-new activist hedge fund successfully placed at least two new candidates on the company\'s board of directors in hopes that they can use that position to push Exxon to take climate change more seriously.\nFor two more seats on the board, the vote was too close to call.Winning a seat for any directors at all is an unprecedented achievement by activist shareholders, who have spent decades trying to persuade companies to cut their carbon emissions.\nTo do it at ExxonMobil, once the world\'s most influential oil company, makes the feat all the more astonishing.And it demonstrates how arguments about the financial implications of climate change — specifica

### Tokenization

In [13]:
import spacy

### Load the language model

In [14]:
nlp = spacy.load('en_core_web_sm')

In [16]:
doc = nlp(summary1)

In [17]:
for token in doc:
    print(token.text, '--->', spacy.explain(token.text)) 

Pictured ---> None
are ---> None
pumps ---> None
at ---> None
an ---> None
Exxon ---> None
gas ---> None
station ---> None
in ---> None
Charlotte ---> None
, ---> punctuation mark, comma
N.C. ---> None
A ---> None
tiny ---> None
fund ---> None
got ---> None
two ---> None
board ---> None
members ---> None
elected ---> None
to ---> None
the ---> None
oil ---> None
giant ---> None
's ---> None
board ---> None
, ---> punctuation mark, comma
delivering ---> None
a ---> None
historic ---> None
defeat ---> None
to ---> None
ExxonMobil ---> None
. ---> punctuation mark, sentence closer
In ---> None
a ---> None
dramatic ---> None
boardroom ---> None
battle ---> None
on ---> None
Wednesday ---> None
, ---> punctuation mark, comma
a ---> None
tiny ---> None
hedge ---> None
fund ---> None
fought ---> None
with ---> None
the ---> None
energy ---> None
giant ---> None
ExxonMobil ---> None
over ---> None
the ---> None
future ---> None
of ---> None
the ---> None
oil ---> None
and ---> None
gas ---> No

— ---> None
just ---> None
0.02 ---> None
% ---> None
, ---> punctuation mark, comma
according ---> None
to ---> None
the ---> None
proxy ---> None
advisory ---> None
firm ---> None
ISS ---> None
. ---> punctuation mark, sentence closer

 ---> None
By ---> None
itself ---> None
, ---> punctuation mark, comma
it ---> None
had ---> None
no ---> None
chance ---> None
to ---> None
sway ---> None
the ---> None
company ---> None
. ---> punctuation mark, sentence closer
But ---> None
it ---> None
spent ---> None
months ---> None
building ---> None
support ---> None
for ---> None
its ---> None
case ---> case marking
. ---> punctuation mark, sentence closer

 ---> None
CalSTRS ---> None
, ---> punctuation mark, comma
the ---> None
California ---> None
teachers ---> None
pension ---> None
, ---> punctuation mark, comma
was ---> None
an ---> None
early ---> None
backer ---> None
, ---> punctuation mark, comma
citing ---> None
frustration ---> None
with ---> None
Exxon ---> None
's ---> None
lack 

### POS tagging 

In [18]:
for token in doc:
    print(token.text, '--->', token.pos_, '--->', spacy.explain(token.pos_))

Pictured ---> VERB ---> verb
are ---> AUX ---> auxiliary
pumps ---> NOUN ---> noun
at ---> ADP ---> adposition
an ---> DET ---> determiner
Exxon ---> PROPN ---> proper noun
gas ---> NOUN ---> noun
station ---> NOUN ---> noun
in ---> ADP ---> adposition
Charlotte ---> PROPN ---> proper noun
, ---> PUNCT ---> punctuation
N.C. ---> PROPN ---> proper noun
A ---> DET ---> determiner
tiny ---> ADJ ---> adjective
fund ---> NOUN ---> noun
got ---> VERB ---> verb
two ---> NUM ---> numeral
board ---> NOUN ---> noun
members ---> NOUN ---> noun
elected ---> VERB ---> verb
to ---> ADP ---> adposition
the ---> DET ---> determiner
oil ---> NOUN ---> noun
giant ---> NOUN ---> noun
's ---> PART ---> particle
board ---> NOUN ---> noun
, ---> PUNCT ---> punctuation
delivering ---> VERB ---> verb
a ---> DET ---> determiner
historic ---> ADJ ---> adjective
defeat ---> NOUN ---> noun
to ---> ADP ---> adposition
ExxonMobil ---> PROPN ---> proper noun
. ---> PUNCT ---> punctuation
In ---> ADP ---> adposition


that ---> DET ---> determiner
strategy ---> NOUN ---> noun
. ---> PUNCT ---> punctuation

 ---> SPACE ---> space
The ---> DET ---> determiner
company ---> NOUN ---> noun
says ---> VERB ---> verb
its ---> PRON ---> pronoun
core ---> NOUN ---> noun
strengths ---> NOUN ---> noun
are ---> AUX ---> auxiliary
in ---> ADP ---> adposition
oil ---> NOUN ---> noun
and ---> CCONJ ---> coordinating conjunction
gas ---> NOUN ---> noun
, ---> PUNCT ---> punctuation
and ---> CCONJ ---> coordinating conjunction
it ---> PRON ---> pronoun
argues ---> VERB ---> verb
that ---> SCONJ ---> subordinating conjunction
the ---> DET ---> determiner
world ---> NOUN ---> noun
simply ---> ADV ---> adverb
will ---> AUX ---> auxiliary
not ---> PART ---> particle
pivot ---> VERB ---> verb
away ---> ADV ---> adverb
from ---> ADP ---> adposition
those ---> DET ---> determiner
energy ---> NOUN ---> noun
sources ---> NOUN ---> noun
very ---> ADV ---> adverb
quickly ---> ADV ---> adverb
. ---> PUNCT ---> punctuation
Instea

shift ---> NOUN ---> noun
away ---> ADV ---> adverb
from ---> ADP ---> adposition
relying ---> VERB ---> verb
on ---> ADP ---> adposition
fossil ---> ADJ ---> adjective
fuels ---> NOUN ---> noun
, ---> PUNCT ---> punctuation
and ---> CCONJ ---> coordinating conjunction
agreed ---> VERB ---> verb
that ---> SCONJ ---> subordinating conjunction
Exxon ---> PROPN ---> proper noun
has ---> AUX ---> auxiliary
inadequately ---> ADV ---> adverb
prepared ---> VERB ---> verb
for ---> ADP ---> adposition
this ---> DET ---> determiner
future ---> NOUN ---> noun
. ---> PUNCT ---> punctuation
Heading ---> VERB ---> verb
into ---> ADP ---> adposition
Wednesday ---> PROPN ---> proper noun
's ---> PART ---> particle
meeting ---> NOUN ---> noun
, ---> PUNCT ---> punctuation
all ---> DET ---> determiner
eyes ---> NOUN ---> noun
were ---> AUX ---> auxiliary
on ---> ADP ---> adposition
the ---> DET ---> determiner
three ---> NUM ---> numeral
companies ---> NOUN ---> noun
with ---> ADP ---> adposition
the --

### Dependency Parsing

In [19]:
for token in doc:
    print(token.text, '--->', token.dep_, '--->', spacy.explain(token.dep_))

Pictured ---> nsubj ---> nominal subject
are ---> ROOT ---> None
pumps ---> attr ---> attribute
at ---> prep ---> prepositional modifier
an ---> det ---> determiner
Exxon ---> compound ---> compound
gas ---> compound ---> compound
station ---> pobj ---> object of preposition
in ---> prep ---> prepositional modifier
Charlotte ---> pobj ---> object of preposition
, ---> punct ---> punctuation
N.C. ---> punct ---> punctuation
A ---> det ---> determiner
tiny ---> amod ---> adjectival modifier
fund ---> nsubj ---> nominal subject
got ---> ROOT ---> None
two ---> nummod ---> numeric modifier
board ---> compound ---> compound
members ---> dobj ---> direct object
elected ---> acl ---> clausal modifier of noun (adjectival clause)
to ---> prep ---> prepositional modifier
the ---> det ---> determiner
oil ---> compound ---> compound
giant ---> poss ---> possession modifier
's ---> case ---> case marking
board ---> pobj ---> object of preposition
, ---> punct ---> punctuation
delivering ---> advcl 

slash ---> advcl ---> adverbial clause modifier
their ---> poss ---> possession modifier
emissions ---> dobj ---> direct object
to ---> prep ---> prepositional modifier
zero ---> pobj ---> object of preposition
, ---> punct ---> punctuation
but ---> cc ---> coordinating conjunction
Exxon ---> nsubj ---> nominal subject
has ---> aux ---> auxiliary
consistently ---> advmod ---> adverbial modifier
rejected ---> conj ---> conjunct
that ---> det ---> determiner
strategy ---> dobj ---> direct object
. ---> punct ---> punctuation

 ---> ROOT ---> None
The ---> det ---> determiner
company ---> nsubj ---> nominal subject
says ---> ROOT ---> None
its ---> poss ---> possession modifier
core ---> compound ---> compound
strengths ---> nsubj ---> nominal subject
are ---> ccomp ---> clausal complement
in ---> prep ---> prepositional modifier
oil ---> pobj ---> object of preposition
and ---> cc ---> coordinating conjunction
gas ---> conj ---> conjunct
, ---> punct ---> punctuation
and ---> cc ---> coo

we ---> nsubj ---> nominal subject
had ---> aux ---> auxiliary
tried ---> advcl ---> adverbial clause modifier
everything ---> dobj ---> direct object
else ---> advmod ---> adverbial modifier
, ---> punct ---> punctuation
" ---> punct ---> punctuation
says ---> ROOT ---> None
Aeisha ---> compound ---> compound
Mastagni ---> nsubj ---> nominal subject
, ---> punct ---> punctuation
a ---> det ---> determiner
portfolio ---> compound ---> compound
manager ---> appos ---> appositional modifier
with ---> prep ---> prepositional modifier
CalSTRS ---> poss ---> possession modifier
' ---> case ---> case marking
sustainable ---> amod ---> adjectival modifier
investment ---> nmod ---> modifier of nominal
and ---> cc ---> coordinating conjunction
stewardship ---> compound ---> compound
strategies ---> conj ---> conjunct
unit ---> pobj ---> object of preposition
. ---> punct ---> punctuation
The ---> det ---> determiner
influential ---> amod ---> adjectival modifier
proxy ---> amod ---> adjectival 

### Lemmatization

In [20]:
for token in doc:
    print(token.text, '--->', token.lemma_, '--->', spacy.explain(token.lemma_))

Pictured ---> picture ---> None
are ---> be ---> None
pumps ---> pump ---> None
at ---> at ---> None
an ---> an ---> None
Exxon ---> Exxon ---> None
gas ---> gas ---> None
station ---> station ---> None
in ---> in ---> None
Charlotte ---> Charlotte ---> None
, ---> , ---> punctuation mark, comma
N.C. ---> N.C. ---> None
A ---> a ---> None
tiny ---> tiny ---> None
fund ---> fund ---> None
got ---> get ---> None
two ---> two ---> None
board ---> board ---> None
members ---> member ---> None
elected ---> elect ---> None
to ---> to ---> None
the ---> the ---> None
oil ---> oil ---> None
giant ---> giant ---> None
's ---> 's ---> None
board ---> board ---> None
, ---> , ---> punctuation mark, comma
delivering ---> deliver ---> None
a ---> a ---> None
historic ---> historic ---> None
defeat ---> defeat ---> None
to ---> to ---> None
ExxonMobil ---> ExxonMobil ---> None
. ---> . ---> punctuation mark, sentence closer
In ---> in ---> None
a ---> a ---> None
dramatic ---> dramatic ---> None
boa

oil ---> oil ---> None
and ---> and ---> None
gas ---> gas ---> None
companies ---> company ---> None
are ---> be ---> None
investing ---> invest ---> None
in ---> in ---> None
renewable ---> renewable ---> None
energy ---> energy ---> None
and ---> and ---> None
pledging ---> pledge ---> None
to ---> to ---> None
slash ---> slash ---> None
their ---> their ---> None
emissions ---> emission ---> None
to ---> to ---> None
zero ---> zero ---> None
, ---> , ---> punctuation mark, comma
but ---> but ---> None
Exxon ---> Exxon ---> None
has ---> have ---> None
consistently ---> consistently ---> None
rejected ---> reject ---> None
that ---> that ---> None
strategy ---> strategy ---> None
. ---> . ---> punctuation mark, sentence closer

 ---> 
 ---> None
The ---> the ---> None
company ---> company ---> None
says ---> say ---> None
its ---> its ---> None
core ---> core ---> None
strengths ---> strength ---> None
are ---> be ---> None
in ---> in ---> None
oil ---> oil ---> None
and ---> and --

some ---> some ---> None
of ---> of ---> None
the ---> the ---> None
incumbent ---> incumbent ---> None
directors ---> director ---> None
, ---> , ---> punctuation mark, comma
we ---> we ---> None
were ---> be ---> None
very ---> very ---> None
intrigued ---> intrigue ---> None
— ---> — ---> None
because ---> because ---> None
we ---> we ---> None
had ---> have ---> None
tried ---> try ---> None
everything ---> everything ---> None
else ---> else ---> None
, ---> , ---> punctuation mark, comma
" ---> " ---> None
says ---> say ---> None
Aeisha ---> Aeisha ---> None
Mastagni ---> Mastagni ---> None
, ---> , ---> punctuation mark, comma
a ---> a ---> None
portfolio ---> portfolio ---> None
manager ---> manager ---> None
with ---> with ---> None
CalSTRS ---> calstr ---> None
' ---> ' ---> None
sustainable ---> sustainable ---> None
investment ---> investment ---> None
and ---> and ---> None
stewardship ---> stewardship ---> None
strategies ---> strategy ---> None
unit ---> unit ---> None
.

### Sentence boundary detection

In [21]:
list(doc.sents)

[Pictured are pumps at an Exxon gas station in Charlotte, N.C.,
 A tiny fund got two board members elected to the oil giant's board, delivering a historic defeat to ExxonMobil.,
 In a dramatic boardroom battle on Wednesday, a tiny hedge fund fought with the energy giant ExxonMobil over the future of the oil and gas industry — and won.,
 The brand-new activist hedge fund successfully placed at least two new candidates on the company's board of directors in hopes that they can use that position to push Exxon to take climate change more seriously.,
 ,
 For two more seats on the board, the vote was too close to call.,
 Winning a seat for any directors at all is an unprecedented achievement by activist shareholders, who have spent decades trying to persuade companies to cut their carbon emissions.,
 ,
 To do it at ExxonMobil, once the world's most influential oil company, makes the feat all the more astonishing.,
 And it demonstrates how arguments about the financial implications of climate

In [22]:
sentences = list(doc.sents)

In [23]:
for sentence in sentences:
    print(sentence)

Pictured are pumps at an Exxon gas station in Charlotte, N.C.
A tiny fund got two board members elected to the oil giant's board, delivering a historic defeat to ExxonMobil.
In a dramatic boardroom battle on Wednesday, a tiny hedge fund fought with the energy giant ExxonMobil over the future of the oil and gas industry — and won.
The brand-new activist hedge fund successfully placed at least two new candidates on the company's board of directors in hopes that they can use that position to push Exxon to take climate change more seriously.


For two more seats on the board, the vote was too close to call.
Winning a seat for any directors at all is an unprecedented achievement by activist shareholders, who have spent decades trying to persuade companies to cut their carbon emissions.


To do it at ExxonMobil, once the world's most influential oil company, makes the feat all the more astonishing.
And it demonstrates how arguments about the financial implications of climate change — specifi

### Named entity

In [24]:
for ent in doc.ents:
    print(ent.text, '--->', ent.label_, '--->', spacy.explain(ent.label_))

Exxon ---> ORG ---> Companies, agencies, institutions, etc.
Charlotte ---> GPE ---> Countries, cities, states
N.C. ---> GPE ---> Countries, cities, states
two ---> CARDINAL ---> Numerals that do not fall under another type
ExxonMobil ---> ORG ---> Companies, agencies, institutions, etc.
Wednesday ---> DATE ---> Absolute or relative dates or periods
ExxonMobil ---> ORG ---> Companies, agencies, institutions, etc.
at least two ---> CARDINAL ---> Numerals that do not fall under another type
Exxon ---> ORG ---> Companies, agencies, institutions, etc.
two ---> CARDINAL ---> Numerals that do not fall under another type
decades ---> DATE ---> Absolute or relative dates or periods
ExxonMobil ---> ORG ---> Companies, agencies, institutions, etc.
Exxon ---> ORG ---> Companies, agencies, institutions, etc.
Engine ---> ORG ---> Companies, agencies, institutions, etc.
1 ---> CARDINAL ---> Numerals that do not fall under another type
just last year ---> DATE ---> Absolute or relative dates or period

### Visualization

In [25]:
from spacy import displacy

In [26]:
displacy.render(doc, style='ent', jupyter=True)

In [27]:
displacy.render(doc, style='dep', jupyter=True)

### Putting it all together

In [28]:
lis = []

In [29]:
for token in doc:
    dic = {}
    dic['Token'] = token.text
    dic['POS'] = token.pos_
    dic['Tags'] = token.tag_
    dic['Dep'] = token.dep_
    dic['Explaination'] = spacy.explain(token.tag_)
    lis.append(dic)

In [30]:
import pandas as pd

In [31]:
data = pd.DataFrame(lis)

In [32]:
data

Unnamed: 0,Token,POS,Tags,Dep,Explaination
0,Pictured,VERB,VBN,nsubj,"verb, past participle"
1,are,AUX,VBP,ROOT,"verb, non-3rd person singular present"
2,pumps,NOUN,NNS,attr,"noun, plural"
3,at,ADP,IN,prep,"conjunction, subordinating or preposition"
4,an,DET,DT,det,determiner
...,...,...,...,...,...
905,on,ADP,IN,prep,"conjunction, subordinating or preposition"
906,fighting,VERB,VBG,pcomp,"verb, gerund or present participle"
907,climate,NOUN,NN,compound,"noun, singular or mass"
908,change,NOUN,NN,dobj,"noun, singular or mass"


#### You can reindex if you do not like the arrangement

In [33]:
l = []
for ent in doc.ents:
    d = {}
    d['entities'] = ent.text
    d['labels'] = ent.label_
    d['Explaination'] = spacy.explain(ent.label_)
    l.append(d)
    

In [34]:
entities = pd.DataFrame(l)

In [35]:
entities

Unnamed: 0,entities,labels,Explaination
0,Exxon,ORG,"Companies, agencies, institutions, etc."
1,Charlotte,GPE,"Countries, cities, states"
2,N.C.,GPE,"Countries, cities, states"
3,two,CARDINAL,Numerals that do not fall under another type
4,ExxonMobil,ORG,"Companies, agencies, institutions, etc."
5,Wednesday,DATE,Absolute or relative dates or periods
6,ExxonMobil,ORG,"Companies, agencies, institutions, etc."
7,at least two,CARDINAL,Numerals that do not fall under another type
8,Exxon,ORG,"Companies, agencies, institutions, etc."
9,two,CARDINAL,Numerals that do not fall under another type


In [36]:
entities.to_csv('Entities.csv', index = False) 

### Scraping and summerizing article 2

In [37]:
url2 = 'https://www.npr.org/2021/05/26/1000452074/if-you-didnt-look-up-this-morning-heres-what-you-missed'

In [39]:
requests.get(url2)
page = requests.get(url2).text
soup = BeautifulSoup(page)
header = soup.find('h1').get_text()
date = soup.find(name = 'span', attrs = {'class':'date'}).text
p_tags = soup.find_all('p')
text = [tags.get_text().strip() for tags in p_tags]
sentence_list = [sentence for sentence in text if not '\n' in sentence]
body = [sentence for sentence in sentence_list if '.' in sentence]
article = ''.join(body)
summary2 = summarize(article, 1)
doc = docx.Document()
doc.add_heading(header)
doc.add_picture('pic02.png')
doc.add_paragraph(date)
doc.add_paragraph(summary2)
doc.save('article2.docx')

In [40]:
summary2

'The lunar eclipse is seen in Santa Monica.\nFull moons that occur in May are sometimes known as flower moons.Maybe the sky was cloudy; maybe waking up in the middle of the night to look at the moon just sounds like lunacy.\nWhatever the reason, if you missed seeing last night\'s lunar eclipse, you\'re not alone.\nLuckily, there are plenty of photos and video of the rare sight.For those of us who slept through the Super Flower Blood Moon: Here\'s a quick time lapse taken early this morning 🌷🌕 pic.twitter.com/Q38GJ4WrWHThe supermoon — the Super Flower Blood Moon, to be exact — brought the first total lunar eclipse in nearly 2 1/2 years, treating sky watchers to the sight of the moon slipping into Earth\'s shadow while also appearing around 7% larger than it normally does.Residents watch the lunar eclipse Wednesday at Sanur beach in Indonesia\'s Bali.\nThe reddish-orange color of the supermoon is the result of all the sunrises and sunsets in Earth\'s atmosphere projected onto the surface

### Tokenization

In [41]:
import spacy

#### Load the language model

In [42]:
nlp = spacy.load('en_core_web_sm')

In [43]:
doc2 = nlp(summary2)

In [44]:
for token in doc2:
    print(token.text, '--->', spacy.explain(token.text))

The ---> None
lunar ---> None
eclipse ---> None
is ---> None
seen ---> None
in ---> None
Santa ---> None
Monica ---> None
. ---> punctuation mark, sentence closer

 ---> None
Full ---> None
moons ---> None
that ---> None
occur ---> None
in ---> None
May ---> None
are ---> None
sometimes ---> None
known ---> None
as ---> None
flower ---> None
moons ---> None
. ---> punctuation mark, sentence closer
Maybe ---> None
the ---> None
sky ---> None
was ---> None
cloudy ---> None
; ---> None
maybe ---> None
waking ---> None
up ---> None
in ---> None
the ---> None
middle ---> None
of ---> None
the ---> None
night ---> None
to ---> None
look ---> None
at ---> None
the ---> None
moon ---> None
just ---> None
sounds ---> None
like ---> None
lunacy ---> None
. ---> punctuation mark, sentence closer

 ---> None
Whatever ---> None
the ---> None
reason ---> None
, ---> punctuation mark, comma
if ---> None
you ---> None
missed ---> None
seeing ---> None
last ---> None
night ---> None
's ---> None
lunar 

moon ---> None
's ---> None
disk ---> None
will ---> None
remain ---> None
outside ---> None
the ---> None
umbra ---> None
, ---> punctuation mark, comma
so ---> None
for ---> None
all ---> None
intents ---> None
and ---> None
purposes ---> None
it ---> None
'll ---> None
be ---> None
very ---> None
much ---> None
like ---> None
a ---> None
total ---> None
eclipse ---> None
, ---> punctuation mark, comma
" ---> None
she ---> None
said ---> None
. ---> punctuation mark, sentence closer


### POS tagging

In [45]:
for token in doc2:
    print(token.text, '--->', token.pos_, '--->', spacy.explain(token.pos_))

The ---> DET ---> determiner
lunar ---> ADJ ---> adjective
eclipse ---> NOUN ---> noun
is ---> AUX ---> auxiliary
seen ---> VERB ---> verb
in ---> ADP ---> adposition
Santa ---> PROPN ---> proper noun
Monica ---> PROPN ---> proper noun
. ---> PUNCT ---> punctuation

 ---> SPACE ---> space
Full ---> ADJ ---> adjective
moons ---> NOUN ---> noun
that ---> DET ---> determiner
occur ---> VERB ---> verb
in ---> ADP ---> adposition
May ---> PROPN ---> proper noun
are ---> AUX ---> auxiliary
sometimes ---> ADV ---> adverb
known ---> VERB ---> verb
as ---> ADP ---> adposition
flower ---> NOUN ---> noun
moons ---> NOUN ---> noun
. ---> PUNCT ---> punctuation
Maybe ---> ADV ---> adverb
the ---> DET ---> determiner
sky ---> NOUN ---> noun
was ---> AUX ---> auxiliary
cloudy ---> ADJ ---> adjective
; ---> PUNCT ---> punctuation
maybe ---> ADV ---> adverb
waking ---> VERB ---> verb
up ---> ADP ---> adposition
in ---> ADP ---> adposition
the ---> DET ---> determiner
middle ---> NOUN ---> noun
of ---> 

bloodlike ---> ADJ ---> adjective
color ---> NOUN ---> noun
is ---> AUX ---> auxiliary
caused ---> VERB ---> verb
by ---> ADP ---> adposition
red ---> ADJ ---> adjective
- ---> PUNCT ---> punctuation
orange ---> NOUN ---> noun
light ---> NOUN ---> noun
refracted ---> VERB ---> verb
through ---> ADP ---> adposition
the ---> DET ---> determiner
Earth ---> PROPN ---> proper noun
's ---> PART ---> particle
atmosphere ---> NOUN ---> noun
. ---> PUNCT ---> punctuation

 ---> SPACE ---> space
The ---> DET ---> determiner
red ---> ADJ ---> adjective
hue ---> NOUN ---> noun
can ---> AUX ---> auxiliary
appear ---> VERB ---> verb
more ---> ADV ---> adverb
intense ---> ADJ ---> adjective
if ---> SCONJ ---> subordinating conjunction
more ---> ADJ ---> adjective
clouds ---> NOUN ---> noun
or ---> CCONJ ---> coordinating conjunction
dust ---> NOUN ---> noun
are ---> VERB ---> verb
in ---> ADP ---> adposition
the ---> DET ---> determiner
Earth ---> PROPN ---> proper noun
's ---> PART ---> particle
atm

, ---> PUNCT ---> punctuation
but ---> CCONJ ---> coordinating conjunction
only ---> ADV ---> adverb
the ---> DET ---> determiner
thinnest ---> ADJ ---> adjective
sliver ---> NOUN ---> noun
of ---> ADP ---> adposition
the ---> DET ---> determiner
moon ---> NOUN ---> noun
's ---> PART ---> particle
disk ---> NOUN ---> noun
will ---> AUX ---> auxiliary
remain ---> VERB ---> verb
outside ---> ADP ---> adposition
the ---> DET ---> determiner
umbra ---> NOUN ---> noun
, ---> PUNCT ---> punctuation
so ---> CCONJ ---> coordinating conjunction
for ---> ADP ---> adposition
all ---> DET ---> determiner
intents ---> NOUN ---> noun
and ---> CCONJ ---> coordinating conjunction
purposes ---> NOUN ---> noun
it ---> PRON ---> pronoun
'll ---> AUX ---> auxiliary
be ---> VERB ---> verb
very ---> ADV ---> adverb
much ---> ADV ---> adverb
like ---> ADP ---> adposition
a ---> DET ---> determiner
total ---> ADJ ---> adjective
eclipse ---> NOUN ---> noun
, ---> PUNCT ---> punctuation
" ---> PUNCT ---> punctu

### Dependency Parsing

In [46]:
for token in doc2:
    print(token.text, '--->', token.dep_, '--->', spacy.explain(token.dep_))

The ---> det ---> determiner
lunar ---> amod ---> adjectival modifier
eclipse ---> nsubjpass ---> nominal subject (passive)
is ---> auxpass ---> auxiliary (passive)
seen ---> ROOT ---> None
in ---> prep ---> prepositional modifier
Santa ---> compound ---> compound
Monica ---> pobj ---> object of preposition
. ---> punct ---> punctuation

 ---> ROOT ---> None
Full ---> amod ---> adjectival modifier
moons ---> nsubjpass ---> nominal subject (passive)
that ---> nsubj ---> nominal subject
occur ---> relcl ---> relative clause modifier
in ---> prep ---> prepositional modifier
May ---> pobj ---> object of preposition
are ---> auxpass ---> auxiliary (passive)
sometimes ---> advmod ---> adverbial modifier
known ---> ROOT ---> None
as ---> prep ---> prepositional modifier
flower ---> compound ---> compound
moons ---> pobj ---> object of preposition
. ---> punct ---> punctuation
Maybe ---> advmod ---> adverbial modifier
the ---> det ---> determiner
sky ---> nsubj ---> nominal subject
was ---> cc

the ---> det ---> determiner
world ---> pobj ---> object of preposition
, ---> punct ---> punctuation
but ---> cc ---> coordinating conjunction
the ---> det ---> determiner
full ---> amod ---> adjectival modifier
lunar ---> amod ---> adjectival modifier
eclipse ---> nsubj ---> nominal subject
was ---> conj ---> conjunct
visible ---> acomp ---> adjectival complement
in ---> prep ---> prepositional modifier
many ---> amod ---> adjectival modifier
parts ---> pobj ---> object of preposition
of ---> prep ---> prepositional modifier
the ---> det ---> determiner
world ---> pobj ---> object of preposition
. ---> punct ---> punctuation

 ---> ROOT ---> None
Observers ---> nsubj ---> nominal subject
had ---> ccomp ---> clausal complement
to ---> aux ---> auxiliary
look ---> xcomp ---> open clausal complement
fast ---> advmod ---> adverbial modifier
; ---> punct ---> punctuation
the ---> det ---> determiner
total ---> amod ---> adjectival modifier
eclipse ---> nsubj ---> nominal subject
lasted --

in ---> prep ---> prepositional modifier
more ---> amod ---> adjectival modifier
than ---> quantmod ---> modifier of quantifier
two ---> nummod ---> numeric modifier
years ---> pobj ---> object of preposition
is ---> aux ---> auxiliary
coinciding ---> ROOT ---> None
with ---> prep ---> prepositional modifier
a ---> det ---> determiner
supermoon ---> pobj ---> object of preposition
. ---> punct ---> punctuation
"Technically ---> advmod ---> adverbial modifier
, ---> punct ---> punctuation
the ---> det ---> determiner
November ---> compound ---> compound
event ---> nsubj ---> nominal subject
will ---> aux ---> auxiliary
be ---> ccomp ---> clausal complement
partial ---> acomp ---> adjectival complement
, ---> punct ---> punctuation
but ---> cc ---> coordinating conjunction
only ---> advmod ---> adverbial modifier
the ---> det ---> determiner
thinnest ---> amod ---> adjectival modifier
sliver ---> nsubj ---> nominal subject
of ---> prep ---> prepositional modifier
the ---> det ---> determ

### Lemmatization

In [47]:
for token in doc2:
    print(token.text, '--->', token.lemma_, '--->', spacy.explain(token.lemma_))

The ---> the ---> None
lunar ---> lunar ---> None
eclipse ---> eclipse ---> None
is ---> be ---> None
seen ---> see ---> None
in ---> in ---> None
Santa ---> Santa ---> None
Monica ---> Monica ---> None
. ---> . ---> punctuation mark, sentence closer

 ---> 
 ---> None
Full ---> full ---> None
moons ---> moon ---> None
that ---> that ---> None
occur ---> occur ---> None
in ---> in ---> None
May ---> May ---> None
are ---> be ---> None
sometimes ---> sometimes ---> None
known ---> know ---> None
as ---> as ---> None
flower ---> flower ---> None
moons ---> moon ---> None
. ---> . ---> punctuation mark, sentence closer
Maybe ---> maybe ---> None
the ---> the ---> None
sky ---> sky ---> None
was ---> be ---> None
cloudy ---> cloudy ---> None
; ---> ; ---> None
maybe ---> maybe ---> None
waking ---> wake ---> None
up ---> up ---> None
in ---> in ---> None
the ---> the ---> None
middle ---> middle ---> None
of ---> of ---> None
the ---> the ---> None
night ---> night ---> None
to ---> to ---

astronomical ---> astronomical ---> None
phenomena ---> phenomenon ---> None
coinciding ---> coincide ---> None
in ---> in ---> None
one ---> one ---> None
event ---> event ---> None
. ---> . ---> punctuation mark, sentence closer
"Blood ---> "Blood ---> None
" ---> " ---> None
: ---> : ---> punctuation mark, colon or ellipsis
The ---> the ---> None
moon ---> moon ---> None
takes ---> take ---> None
on ---> on ---> None
a ---> a ---> None
red ---> red ---> None
hue ---> hue ---> None
as ---> as ---> None
it ---> it ---> None
aligns ---> align ---> None
with ---> with ---> None
the ---> the ---> None
sun ---> sun ---> None
and ---> and ---> None
Earth ---> Earth ---> None
and ---> and ---> None
passes ---> pass ---> None
fully ---> fully ---> None
into ---> into ---> None
Earth ---> Earth ---> None
's ---> 's ---> None
shadow ---> shadow ---> None
, ---> , ---> punctuation mark, comma
or ---> or ---> None
umbra ---> umbra ---> None
. ---> . ---> punctuation mark, sentence closer

 ---> 

### Sentence boundary detection

In [48]:
list(doc2.sents)

[The lunar eclipse is seen in Santa Monica.,
 ,
 Full moons that occur in May are sometimes known as flower moons.,
 Maybe the sky was cloudy; maybe waking up in the middle of the night to look at the moon just sounds like lunacy.,
 ,
 Whatever the reason, if you missed seeing last night's lunar eclipse, you're not alone.,
 Luckily, there are plenty of photos and video of the rare sight.,
 For those of us who slept through the Super Flower Blood Moon: Here's a quick time lapse taken early this morning 🌷🌕 pic.twitter.com/Q38GJ4WrWHThe supermoon — the Super Flower Blood Moon, to be exact — brought the first total lunar eclipse in nearly 2 1/2 years, treating sky watchers to the sight of the moon slipping into Earth's shadow while also appearing around 7% larger than it normally does.,
 Residents watch the lunar eclipse Wednesday at Sanur beach in Indonesia's Bali.,
 The reddish-orange color of the supermoon is the result of all the sunrises and sunsets in Earth's atmosphere projected ont

In [49]:
sentences2 = list(doc2.sents)

In [50]:
for sentence in sentences2:
    print(sentence)

The lunar eclipse is seen in Santa Monica.


Full moons that occur in May are sometimes known as flower moons.
Maybe the sky was cloudy; maybe waking up in the middle of the night to look at the moon just sounds like lunacy.


Whatever the reason, if you missed seeing last night's lunar eclipse, you're not alone.

Luckily, there are plenty of photos and video of the rare sight.
For those of us who slept through the Super Flower Blood Moon: Here's a quick time lapse taken early this morning 🌷🌕 pic.twitter.com/Q38GJ4WrWHThe supermoon — the Super Flower Blood Moon, to be exact — brought the first total lunar eclipse in nearly 2 1/2 years, treating sky watchers to the sight of the moon slipping into Earth's shadow while also appearing around 7% larger than it normally does.
Residents watch the lunar eclipse Wednesday at Sanur beach in Indonesia's Bali.

The reddish-orange color of the supermoon is the result of all the sunrises and sunsets in Earth's atmosphere projected onto the surface o

### Named entity

In [51]:
for ent in doc2.ents:
    print(ent.text, '--->', ent.label_, '--->', spacy.explain(ent.label_))

Santa Monica ---> GPE ---> Countries, cities, states
May ---> DATE ---> Absolute or relative dates or periods
last night's ---> TIME ---> Times smaller than a day
early this morning ---> TIME ---> Times smaller than a day
🌕 ---> DATE ---> Absolute or relative dates or periods
first ---> ORDINAL ---> "first", "second", etc.
nearly 2 1/2 years ---> DATE ---> Absolute or relative dates or periods
Earth ---> LOC ---> Non-GPE locations, mountain ranges, bodies of water
around 7% ---> PERCENT ---> Percentage, including "%"
Wednesday ---> DATE ---> Absolute or relative dates or periods
Sanur ---> ORG ---> Companies, agencies, institutions, etc.
Indonesia ---> GPE ---> Countries, cities, states
Bali ---> GPE ---> Countries, cities, states
Earth ---> LOC ---> Non-GPE locations, mountain ranges, bodies of water
the Pacific Ocean ---> LOC ---> Non-GPE locations, mountain ranges, bodies of water
U.S. ---> GPE ---> Countries, cities, states
Mexico ---> GPE ---> Countries, cities, states
New Zealand

### Visualization

In [52]:
from spacy import displacy

In [56]:
displacy.render(doc2, style='ent', jupyter=True)

In [57]:
displacy.render(doc2, style='dep', jupyter=True)

### Putting it all together

In [58]:
lis2 = []

In [61]:
for token in doc2:
    dic = {}
    dic['Token'] = token.text
    dic['POS'] = token.pos_
    dic['Tags'] = token.tag_
    dic['Dep'] = token.dep_
    dic['Explaination'] = spacy.explain(token.tag_)
    lis2.append(dic)

In [62]:
import pandas as pd

In [63]:
data2 = pd.DataFrame(lis2)

In [64]:
data2

Unnamed: 0,Token,POS,Tags,Dep,Explaination
0,The,DET,DT,det,determiner
1,lunar,ADJ,JJ,amod,adjective
2,eclipse,NOUN,NN,nsubjpass,"noun, singular or mass"
3,is,AUX,VBZ,auxpass,"verb, 3rd person singular present"
4,seen,VERB,VBN,ROOT,"verb, past participle"
...,...,...,...,...,...
713,",",PUNCT,",",punct,"punctuation mark, comma"
714,"""",PUNCT,'',punct,closing quotation mark
715,she,PRON,PRP,nsubj,"pronoun, personal"
716,said,VERB,VBD,ROOT,"verb, past tense"


#### You can reindex if you do not like the arrangement

In [66]:
l2 = []
for ent in doc2.ents:
    d = {}
    d['entities'] = ent.text
    d['labels'] = ent.label_
    d['Explaination'] = spacy.explain(ent.label_)
    l2.append(d)

In [67]:
entities2 = pd.DataFrame(l2)

In [68]:
entities2

Unnamed: 0,entities,labels,Explaination
0,Santa Monica,GPE,"Countries, cities, states"
1,May,DATE,Absolute or relative dates or periods
2,last night's,TIME,Times smaller than a day
3,early this morning,TIME,Times smaller than a day
4,🌕,DATE,Absolute or relative dates or periods
...,...,...,...
56,Santa Monica,GPE,"Countries, cities, states"
57,Calif.,GPE,"Countries, cities, states"
58,first,ORDINAL,"""first"", ""second"", etc."
59,more than two years,DATE,Absolute or relative dates or periods


In [69]:
entities2.to_csv('Entities2.csv', index = False) 