# Section 22: Identifying Parts of Speech / Named Entities

### Parts of Speech

In [1]:
import spacy
import pandas as pd

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
emma_ja = "emma woodhouse handsome clever and rich with a comfortable home and happy disposition seemed to unite some of the best blessings of existence and had lived nearly twentyone years in the world with very little to distress or vex her she was the youngest of the two daughters of a most affectionate indulgent father and had in consequence of her sisters marriage been mistress of his house from a very early period her mother had died too long ago for her to have more than an indistinct remembrance of her caresses and her place had been supplied by an excellent woman as governess who had fallen little short of a mother in affection sixteen years had miss taylor been in mr woodhouses family less as a governess than a friend very fond of both daughters but particularly of emma between them it was more the intimacy of sisters even before miss taylor had ceased to hold the nominal office of governess the mildness of her temper had hardly allowed her to impose any restraint and the shadow of authority being now long passed away they had been living together as friend and friend very mutually attached and emma doing just what she liked highly esteeming miss taylors judgment but directed chiefly by her own"


In [4]:
spacy_doc_pos = nlp(emma_ja)

---
save POS in a dataframe  to see tokens and assignemnt

In [5]:
pos_df = pd.DataFrame(columns=['token', 'pos_tag']) # initialize df
pos_df.head()

Unnamed: 0,token,pos_tag


In [6]:
for token in spacy_doc_pos:
    # print('token text', token.text)
    # print('token pos', token.pos_)    
    pos_df = pd.concat([pos_df, pd.DataFrame.from_records([{'token': token.text, 'pos_tag': token.pos_}])] , ignore_index=True)

In [7]:
print(pos_df.head())

       token pos_tag
0       emma   PROPN
1  woodhouse   PROPN
2   handsome     ADJ
3     clever     ADJ
4        and   CCONJ


---
next we'll look at the most common tokens and their tags

In [8]:
pos_df_counts = pos_df.groupby(['token','pos_tag']).size().reset_index(name='counts').sort_values(by='counts',ascending=False)
print(pos_df_counts.head())

    token pos_tag  counts
88     of     ADP      14
49    had     AUX       9
54    her    PRON       9
111   the     DET       8
6     and   CCONJ       8


In [9]:
# this one provides total frequency of the tag
pos_counts = pos_df.groupby(['pos_tag']).size().reset_index(name='count')
print(pos_counts.sort_values(by='count',ascending=False))

# this provides unique frequency
pos_counts2 = pos_df_counts.groupby(['pos_tag'])['token'].count().sort_values(ascending=False)

print(pos_counts2)


   pos_tag  count
6     NOUN     44
1      ADP     28
2      ADV     22
0      ADJ     19
12    VERB     19
5      DET     18
9     PRON     18
3      AUX     16
4    CCONJ     11
10   PROPN      8
8     PART      5
7      NUM      3
11   SCONJ      3
pos_tag
NOUN     35
VERB     19
ADJ      18
ADV      18
PRON      9
ADP       8
PROPN     6
DET       5
AUX       4
CCONJ     3
NUM       3
SCONJ     3
PART      1
Name: token, dtype: int64


In [10]:
#printing out the top nouns
nouns = pos_df_counts[pos_df_counts.pos_tag=='NOUN'][:10]
print(nouns)

         token pos_tag  counts
48   governess    NOUN       3
46      friend    NOUN       3
130      years    NOUN       2
35        emma    NOUN       2
28   daughters    NOUN       2
103    sisters    NOUN       2
82      mother    NOUN       2
89      office    NOUN       1
78    mistress    NOUN       1
75    mildness    NOUN       1


### Named Entity Recognition

In [11]:
from spacy import displacy
from spacy import tokenizer
import re

In [12]:
google_text = "Google was founded on September 4, 1998, by computer scientists Larry Page and Sergey Brin while they were PhD students at Stanford University in California. Together they own about 14% of its publicly listed shares and control 56% of its stockholder voting power through super-voting stock. The company went public via an initial public offering (IPO) in 2004. In 2015, Google was reorganized as a wholly owned subsidiary of Alphabet Inc. Google is Alphabet's largest subsidiary and is a holding company for Alphabet's internet properties and interests. Sundar Pichai was appointed CEO of Google on October 24, 2015, replacing Larry Page, who became the CEO of Alphabet. On December 3, 2019, Pichai also became the CEO of Alphabet."

In [13]:
spacy_doc_ner = nlp(google_text)
print(spacy_doc_ner)

Google was founded on September 4, 1998, by computer scientists Larry Page and Sergey Brin while they were PhD students at Stanford University in California. Together they own about 14% of its publicly listed shares and control 56% of its stockholder voting power through super-voting stock. The company went public via an initial public offering (IPO) in 2004. In 2015, Google was reorganized as a wholly owned subsidiary of Alphabet Inc. Google is Alphabet's largest subsidiary and is a holding company for Alphabet's internet properties and interests. Sundar Pichai was appointed CEO of Google on October 24, 2015, replacing Larry Page, who became the CEO of Alphabet. On December 3, 2019, Pichai also became the CEO of Alphabet.


In [14]:
for word in spacy_doc_ner.ents:
    print(word.text, ">> " , word.label_) # print each word and its entity label

Google >>  ORG
September 4, 1998 >>  DATE
Larry Page >>  PERSON
Sergey Brin >>  PERSON
PhD >>  WORK_OF_ART
Stanford University >>  ORG
California >>  GPE
about 14% >>  PERCENT
56% >>  PERCENT
IPO >>  ORG
2004 >>  DATE
2015 >>  DATE
Google >>  ORG
Alphabet Inc. >>  ORG
Alphabet >>  ORG
Alphabet >>  ORG
Sundar Pichai >>  PERSON
Google >>  ORG
October 24, 2015 >>  DATE
Larry Page >>  PERSON
Alphabet >>  GPE
December 3, 2019 >>  DATE
Pichai >>  PERSON
Alphabet >>  GPE


In [15]:
displacy.render(spacy_doc_ner, style='ent') # render viz of entity recognition

--- 
next lets redo this with a cleaner version of the text


In [16]:
clean_google_text = re.sub(r'[^\w\s]','', google_text).lower()
print(clean_google_text)

google was founded on september 4 1998 by computer scientists larry page and sergey brin while they were phd students at stanford university in california together they own about 14 of its publicly listed shares and control 56 of its stockholder voting power through supervoting stock the company went public via an initial public offering ipo in 2004 in 2015 google was reorganized as a wholly owned subsidiary of alphabet inc google is alphabets largest subsidiary and is a holding company for alphabets internet properties and interests sundar pichai was appointed ceo of google on october 24 2015 replacing larry page who became the ceo of alphabet on december 3 2019 pichai also became the ceo of alphabet


In [17]:
clean_spacy_doc = nlp(clean_google_text)
for word in clean_spacy_doc.ents:
    print(word.text, '>>>', word.label_)

google >>> ORG
september 4 1998 >>> DATE
stanford university >>> ORG
california >>> GPE
about 14 >>> CARDINAL
56 >>> CARDINAL
2004 >>> DATE
2015 >>> DATE
alphabet inc google >>> ORG
google >>> ORG
october 24 2015 >>> DATE
larry >>> PERSON
december 3 2019 >>> DATE


In [18]:
displacy.render(clean_spacy_doc, style='ent')

we can see that the NER has different outcomes due to the lowercased and punctuation removal. So it's important to determine what stage of the NLP process we do each stage of processing!

### Practical Test

In [25]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd
import matplotlib.pyplot as plt

Load in our data and then pull out the titles for our work:

In [83]:
bbc_data = pd.read_csv("bbc_news.csv")
bbc_data.head()

Unnamed: 0.1,Unnamed: 0,index,title,pubDate,guid,link,description
0,0,6684,Can I refuse to work?,"Wed, 10 Aug 2022 15:46:18 GMT",https://www.bbc.co.uk/news/business-62147992,https://www.bbc.co.uk/news/business-62147992?a...,With much of the UK enduring another period of...
1,1,9267,'Liz Truss the Brief?' World reacts to UK poli...,"Mon, 17 Oct 2022 11:35:12 GMT",https://www.bbc.co.uk/news/world-63285480,https://www.bbc.co.uk/news/world-63285480?at_m...,The UK's political chaos has been watched arou...
2,2,7387,Rationing energy is nothing new for off-grid c...,"Wed, 31 Aug 2022 05:20:18 GMT",https://www.bbc.co.uk/news/uk-scotland-highlan...,https://www.bbc.co.uk/news/uk-scotland-highlan...,Scoraig in the north west Highlands has long h...
3,3,767,The hunt for superyachts of sanctioned Russian...,"Tue, 22 Mar 2022 14:37:01 GMT",https://www.bbc.co.uk/news/60739336,https://www.bbc.co.uk/news/60739336?at_medium=...,"Wealthy Russians sanctioned by the US, EU and ..."
4,4,3712,Platinum Jubilee: 70 years of the Queen in 70 ...,"Wed, 01 Jun 2022 23:17:33 GMT",https://www.bbc.co.uk/news/uk-61660128,https://www.bbc.co.uk/news/uk-61660128?at_medi...,A quick look back at the Queen's 70 years on t...


In [84]:
titles = pd.DataFrame(bbc_data['title'])
titles.head()

Unnamed: 0,title
0,Can I refuse to work?
1,'Liz Truss the Brief?' World reacts to UK poli...
2,Rationing energy is nothing new for off-grid c...
3,The hunt for superyachts of sanctioned Russian...
4,Platinum Jubilee: 70 years of the Queen in 70 ...


Next, we'll clean the data in the order:
1. lowercase
2. remove stopwords
3. remove punctuation
4. tokenize it
5. lemmatize it


In [85]:
titles['cleaned'] = titles['title'].str.lower()
titles.head()

Unnamed: 0,title,cleaned
0,Can I refuse to work?,can i refuse to work?
1,'Liz Truss the Brief?' World reacts to UK poli...,'liz truss the brief?' world reacts to uk poli...
2,Rationing energy is nothing new for off-grid c...,rationing energy is nothing new for off-grid c...
3,The hunt for superyachts of sanctioned Russian...,the hunt for superyachts of sanctioned russian...
4,Platinum Jubilee: 70 years of the Queen in 70 ...,platinum jubilee: 70 years of the queen in 70 ...


In [86]:
en_stopwords = stopwords.words('english')
titles['cleaned'] = titles['cleaned'].apply(lambda x: ' '.join([word for word in x.split() if word not in en_stopwords])) 
titles.head() # remove stopwords

Unnamed: 0,title,cleaned
0,Can I refuse to work?,refuse work?
1,'Liz Truss the Brief?' World reacts to UK poli...,'liz truss brief?' world reacts uk political t...
2,Rationing energy is nothing new for off-grid c...,rationing energy nothing new off-grid community
3,The hunt for superyachts of sanctioned Russian...,hunt superyachts sanctioned russian oligarchs
4,Platinum Jubilee: 70 years of the Queen in 70 ...,platinum jubilee: 70 years queen 70 seconds


In [87]:
titles['cleaned'] = titles['cleaned'].apply(lambda x: re.sub(r"([^\w\s])","",x)) # remove punct
titles.head()

Unnamed: 0,title,cleaned
0,Can I refuse to work?,refuse work
1,'Liz Truss the Brief?' World reacts to UK poli...,liz truss brief world reacts uk political turmoil
2,Rationing energy is nothing new for off-grid c...,rationing energy nothing new offgrid community
3,The hunt for superyachts of sanctioned Russian...,hunt superyachts sanctioned russian oligarchs
4,Platinum Jubilee: 70 years of the Queen in 70 ...,platinum jubilee 70 years queen 70 seconds


below we build two sets of tokenized titles in order to compare the results later

In [None]:
titles['cleaned'] = titles['cleaned'].apply(lambda x: word_tokenize(x)) # tokenize clean titles
titles['og_tokenized'] = titles['title'].apply(lambda x: word_tokenize(x)) # tokenize orig titles
titles.head()

Unnamed: 0,title,cleaned,og_tokenized
0,Can I refuse to work?,"[refuse, work]","[Can, I, refuse, to, work, ?]"
1,'Liz Truss the Brief?' World reacts to UK poli...,"[liz, truss, brief, world, reacts, uk, politic...","['Liz, Truss, the, Brief, ?, ', World, reacts,..."
2,Rationing energy is nothing new for off-grid c...,"[rationing, energy, nothing, new, offgrid, com...","[Rationing, energy, is, nothing, new, for, off..."
3,The hunt for superyachts of sanctioned Russian...,"[hunt, superyachts, sanctioned, russian, oliga...","[The, hunt, for, superyachts, of, sanctioned, ..."
4,Platinum Jubilee: 70 years of the Queen in 70 ...,"[platinum, jubilee, 70, years, queen, 70, seco...","[Platinum, Jubilee, :, 70, years, of, the, Que..."


In [89]:
lemmatizer = WordNetLemmatizer() # initialize lemmatizer
titles['cleaned'] = titles['cleaned'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
titles.head()

Unnamed: 0,title,cleaned,og_tokenized
0,Can I refuse to work?,"[refuse, work]","[Can, I, refuse, to, work, ?]"
1,'Liz Truss the Brief?' World reacts to UK poli...,"[liz, truss, brief, world, reacts, uk, politic...","['Liz, Truss, the, Brief, ?, ', World, reacts,..."
2,Rationing energy is nothing new for off-grid c...,"[rationing, energy, nothing, new, offgrid, com...","[Rationing, energy, is, nothing, new, for, off..."
3,The hunt for superyachts of sanctioned Russian...,"[hunt, superyachts, sanctioned, russian, oliga...","[The, hunt, for, superyachts, of, sanctioned, ..."
4,Platinum Jubilee: 70 years of the Queen in 70 ...,"[platinum, jubilee, 70, year, queen, 70, second]","[Platinum, Jubilee, :, 70, years, of, the, Que..."


next we will unpack these lists of lists into single lists for the POS tagging and NER

In [91]:
og_token_list = sum(titles['og_tokenized'],[])
cleaned_token_list = sum(titles['cleaned'],[])

In [95]:
# print(nlp) # spacy is already built / initialized but redoing so here
nlp = spacy.load("en_core_web_sm")

In [97]:
spacy_doc_og = nlp(' '.join(og_token_list))

In [100]:
pos_df_og = pd.DataFrame(columns=['token', 'pos_tag']) #initialize dataframe
pos_df_og.head()

Unnamed: 0,token,pos_tag


In [None]:
for token in spacy_doc_og:
    pos_df_og = pd.concat([pos_df_og, pd.DataFrame.from_records([{'token': token.text , 'pos_tag': token.pos_}])], ignore_index=True)
pos_df_og.head() # show og dataframe to see that all words are getting POS'd

Unnamed: 0,token,pos_tag
0,Can,AUX
1,I,PRON
2,refuse,VERB
3,to,PART
4,work,VERB


In [107]:
og_pos_stats = pos_df_og.groupby(by=['token', 'pos_tag']).size().reset_index(name='frequency').sort_values(by='frequency', ascending=False)
print(og_pos_stats)

          token pos_tag  frequency
95            :   PUNCT        543
8             '   PUNCT        300
2897         in     ADP        187
4082         to    PART        175
3268         of     ADP        172
...         ...     ...        ...
2304  crumbling    VERB          1
2305     crunch   PROPN          1
827      Jarrod   PROPN          1
826    Japanese     ADJ          1
0             !   PUNCT          1

[4368 rows x 3 columns]


Note how the puctuation and other junk was captured in the tokenization stage which made the pos tagging also apply to it which creates a ton of noise in our summary

---

Now we'll do the same thing but for the cleaned tokens

In [135]:
spacy_doc_cleaned = nlp(' '.join(cleaned_token_list)) # generate clean doc
pos_df_cleaned = pd.DataFrame(columns=['token', 'pos_tag']) # init DF
for token in spacy_doc_cleaned: # pos tag each token
    pos_df_cleaned = pd.concat([pos_df_cleaned, pd.DataFrame.from_records([{'token': token.text , 'pos_tag': token.pos_}])] , ignore_index=True)
clean_pos_stats = pos_df_cleaned.groupby(by=['token', 'pos_tag']).size().reset_index(name='frequency').sort_values(by='frequency', ascending=False)
print(clean_pos_stats)

        token pos_tag  frequency
30       2022     NUM         47
1162  england   PROPN         45
870       cup   PROPN         39
3059      say    VERB         37
3710       uk   PROPN         37
...       ...     ...        ...
2081   lookup    NOUN          1
2082     loom     ADJ          1
2083     loom    NOUN          1
732      coco     ADJ          1
3997      zuu   PROPN          1

[3998 rows x 3 columns]


Comparing the two results:

In [128]:
print(og_pos_stats.head(n=20))
clean_pos_stats.head(n=20)

        token pos_tag  frequency
95          :   PUNCT        543
8           '   PUNCT        300
2897       in     ADP        187
4082       to    PART        175
3268       of     ADP        172
22          -   PUNCT        166
4043      the     DET        163
1856      and   CCONJ        147
15         's    PART        143
97          ?   PUNCT        130
2655      for     ADP        112
18          ,   PUNCT         95
3287       on     ADP         86
1785        a     DET         81
4080       to     ADP         66
1585      The     DET         63
1655  Ukraine   PROPN         55
431       Cup   PROPN         50
551   England   PROPN         49
2938       is     AUX         49


Unnamed: 0,token,pos_tag,frequency
30,2022,NUM,47
1162,england,PROPN,45
870,cup,PROPN,39
3059,say,VERB,37
3710,uk,PROPN,37
3844,war,NOUN,34
2389,new,ADJ,31
3952,world,NOUN,30
3953,world,PROPN,26
3713,ukraine,PROPN,23


---

Next, we'll do the NER analysis and summary

---

In [129]:
ner_df = pd.DataFrame(columns=['token', 'ner_tag'])
ner_df.head()

Unnamed: 0,token,ner_tag


In [138]:
for token in spacy_doc_cleaned.ents:
    if pd.isna(token.label_) is False:
        ner_df = pd.concat([ner_df, pd.DataFrame.from_records(
            [{'token': token.text, 'ner_tag': token.label_}])], ignore_index=True)

In [139]:
ner_df.head()

Unnamed: 0,token,ner_tag
0,russian,NORP
1,70 year,DATE
2,70 second,TIME
3,bull,ORG
4,1,CARDINAL


In [141]:
ner_df_stats = ner_df.groupby(['token','ner_tag']).size().reset_index(name='frequency').sort_values(by='frequency', ascending=False)
ner_df_stats.head()

Unnamed: 0,token,ner_tag,frequency
65,2022,CARDINAL,61
1494,russian,NORP,50
1021,Ukraine,GPE,47
1250,first,ORDINAL,43
1011,UK,GPE,36
