In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy
import re
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load Data
bbc_data = pd.read_csv("bbc_news.csv")

In [3]:
bbc_data.head()

Unnamed: 0.1,Unnamed: 0,index,title,pubDate,guid,link,description
0,0,6684,Can I refuse to work?,"Wed, 10 Aug 2022 15:46:18 GMT",https://www.bbc.co.uk/news/business-62147992,https://www.bbc.co.uk/news/business-62147992?a...,With much of the UK enduring another period of...
1,1,9267,'Liz Truss the Brief?' World reacts to UK poli...,"Mon, 17 Oct 2022 11:35:12 GMT",https://www.bbc.co.uk/news/world-63285480,https://www.bbc.co.uk/news/world-63285480?at_m...,The UK's political chaos has been watched arou...
2,2,7387,Rationing energy is nothing new for off-grid c...,"Wed, 31 Aug 2022 05:20:18 GMT",https://www.bbc.co.uk/news/uk-scotland-highlan...,https://www.bbc.co.uk/news/uk-scotland-highlan...,Scoraig in the north west Highlands has long h...
3,3,767,The hunt for superyachts of sanctioned Russian...,"Tue, 22 Mar 2022 14:37:01 GMT",https://www.bbc.co.uk/news/60739336,https://www.bbc.co.uk/news/60739336?at_medium=...,"Wealthy Russians sanctioned by the US, EU and ..."
4,4,3712,Platinum Jubilee: 70 years of the Queen in 70 ...,"Wed, 01 Jun 2022 23:17:33 GMT",https://www.bbc.co.uk/news/uk-61660128,https://www.bbc.co.uk/news/uk-61660128?at_medi...,A quick look back at the Queen's 70 years on t...


In [4]:
bbc_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   1000 non-null   int64 
 1   index        1000 non-null   int64 
 2   title        1000 non-null   object
 3   pubDate      1000 non-null   object
 4   guid         1000 non-null   object
 5   link         1000 non-null   object
 6   description  1000 non-null   object
dtypes: int64(2), object(5)
memory usage: 54.8+ KB


In [5]:
titles = pd.DataFrame(bbc_data['title'])
titles.head()

Unnamed: 0,title
0,Can I refuse to work?
1,'Liz Truss the Brief?' World reacts to UK poli...
2,Rationing energy is nothing new for off-grid c...
3,The hunt for superyachts of sanctioned Russian...
4,Platinum Jubilee: 70 years of the Queen in 70 ...


In [6]:
# Clean Data
# 1. Add a column where everything is lower caser:
titles['lowercase'] = titles['title'].str.lower()

In [7]:
# 2. Add column where 'stop' words are removed:
en_stopwords = stopwords.words('english')
titles['no_stopwords'] = titles['lowercase'].apply(lambda x: ' '.join([word for word in x.split() if word not in en_stopwords]))

In [8]:
# 3. Add column where punctuation is removed:
titles['no_stopwords_no_punct'] = titles.apply(lambda x: re.sub(r"[^\w\s]", '', x['no_stopwords']), axis=1)

In [9]:
# 4. Add columns with tokenized titles:
titles['tokens_raw'] = titles.apply(lambda x: word_tokenize(x['title']), axis=1)
titles['tokens_clean'] = titles.apply(lambda x: word_tokenize(x['no_stopwords_no_punct']), axis=1)

In [10]:
# 5. Add lemmatizing column:
lemmatizer = WordNetLemmatizer()
titles['tokens_clean_lemmatized'] = titles['tokens_clean'].apply(lambda tokens: [lemmatizer.lemmatize(token) for token in tokens])

In [11]:
titles.head()

Unnamed: 0,title,lowercase,no_stopwords,no_stopwords_no_punct,tokens_raw,tokens_clean,tokens_clean_lemmatized
0,Can I refuse to work?,can i refuse to work?,refuse work?,refuse work,"[Can, I, refuse, to, work, ?]","[refuse, work]","[refuse, work]"
1,'Liz Truss the Brief?' World reacts to UK poli...,'liz truss the brief?' world reacts to uk poli...,'liz truss brief?' world reacts uk political t...,liz truss brief world reacts uk political turmoil,"['Liz, Truss, the, Brief, ?, ', World, reacts,...","[liz, truss, brief, world, reacts, uk, politic...","[liz, truss, brief, world, reacts, uk, politic..."
2,Rationing energy is nothing new for off-grid c...,rationing energy is nothing new for off-grid c...,rationing energy nothing new off-grid community,rationing energy nothing new offgrid community,"[Rationing, energy, is, nothing, new, for, off...","[rationing, energy, nothing, new, offgrid, com...","[rationing, energy, nothing, new, offgrid, com..."
3,The hunt for superyachts of sanctioned Russian...,the hunt for superyachts of sanctioned russian...,hunt superyachts sanctioned russian oligarchs,hunt superyachts sanctioned russian oligarchs,"[The, hunt, for, superyachts, of, sanctioned, ...","[hunt, superyachts, sanctioned, russian, oliga...","[hunt, superyachts, sanctioned, russian, oliga..."
4,Platinum Jubilee: 70 years of the Queen in 70 ...,platinum jubilee: 70 years of the queen in 70 ...,platinum jubilee: 70 years queen 70 seconds,platinum jubilee 70 years queen 70 seconds,"[Platinum, Jubilee, :, 70, years, of, the, Que...","[platinum, jubilee, 70, years, queen, 70, seco...","[platinum, jubilee, 70, year, queen, 70, second]"


In [12]:
# Create lists for just our tokens:
tokens_raw_list = sum(titles['tokens_raw'], []) # unpack our lists into a single list
tokens_raw_list

['Can',
 'I',
 'refuse',
 'to',
 'work',
 '?',
 "'Liz",
 'Truss',
 'the',
 'Brief',
 '?',
 "'",
 'World',
 'reacts',
 'to',
 'UK',
 'political',
 'turmoil',
 'Rationing',
 'energy',
 'is',
 'nothing',
 'new',
 'for',
 'off-grid',
 'community',
 'The',
 'hunt',
 'for',
 'superyachts',
 'of',
 'sanctioned',
 'Russian',
 'oligarchs',
 'Platinum',
 'Jubilee',
 ':',
 '70',
 'years',
 'of',
 'the',
 'Queen',
 'in',
 '70',
 'seconds',
 'Red',
 'Bull',
 'found',
 'guilty',
 'of',
 'breaking',
 'Formula',
 '1',
 "'s",
 'budget',
 'cap',
 'World',
 'Triathlon',
 'Championship',
 'Series',
 ':',
 'Flora',
 'Duffy',
 'beats',
 'Georgia',
 'Taylor-Brown',
 'to',
 'women',
 "'s",
 'title',
 'Terry',
 'Hall',
 ':',
 'Coventry',
 'scooter',
 'ride-out',
 'pays',
 'tribute',
 'to',
 'singer',
 'Post',
 'Office',
 'and',
 'Fujitsu',
 'to',
 'face',
 'inquiry',
 'over',
 'Horizon',
 'scandal',
 "'Pavement",
 'parking',
 'frightens',
 'me',
 "'",
 'UK',
 'interest',
 'rates',
 ':',
 'How',
 'will',
 'the'

In [13]:
tokens_clean_list = sum(titles['tokens_clean_lemmatized'], [])
tokens_clean_list

['refuse',
 'work',
 'liz',
 'truss',
 'brief',
 'world',
 'reacts',
 'uk',
 'political',
 'turmoil',
 'rationing',
 'energy',
 'nothing',
 'new',
 'offgrid',
 'community',
 'hunt',
 'superyachts',
 'sanctioned',
 'russian',
 'oligarch',
 'platinum',
 'jubilee',
 '70',
 'year',
 'queen',
 '70',
 'second',
 'red',
 'bull',
 'found',
 'guilty',
 'breaking',
 'formula',
 '1',
 'budget',
 'cap',
 'world',
 'triathlon',
 'championship',
 'series',
 'flora',
 'duffy',
 'beat',
 'georgia',
 'taylorbrown',
 'womens',
 'title',
 'terry',
 'hall',
 'coventry',
 'scooter',
 'rideout',
 'pay',
 'tribute',
 'singer',
 'post',
 'office',
 'fujitsu',
 'face',
 'inquiry',
 'horizon',
 'scandal',
 'pavement',
 'parking',
 'frightens',
 'me',
 'uk',
 'interest',
 'rate',
 'rise',
 'affect',
 'high',
 'could',
 'go',
 'stayed',
 'storm',
 'happens',
 'now',
 'six',
 'nation',
 'scotland',
 'best',
 'since',
 '99',
 'beat',
 'best',
 'ireland',
 'ever',
 'long',
 'liz',
 'truss',
 'survive',
 'prime',
 'm

In [14]:
# POS Tagging
nlp = spacy.load('en_core_web_sm')

In [15]:
spacy_doc = nlp(' '.join(tokens_raw_list))

In [16]:
pos_df = pd.DataFrame(columns = ['token', 'pos_tag'])
pos_df

Unnamed: 0,token,pos_tag


In [17]:
for token in spacy_doc:
    pos_df = pd.concat([pos_df, pd.DataFrame.from_records(
        [{'token': token.text, 'pos_tag': token.pos_}]
    )], ignore_index=True)

In [18]:
pos_df

Unnamed: 0,token,pos_tag
0,Can,AUX
1,I,PRON
2,refuse,VERB
3,to,PART
4,work,VERB
...,...,...
11742,sale,NOUN
11743,scams,NOUN
11744,",",PUNCT
11745,consumers,NOUN


In [19]:
pos_df_counts = pos_df.groupby(['token', 'pos_tag']).size().reset_index(name='counts').sort_values(by='counts', ascending=False)

In [20]:
pos_df_counts[:10]

Unnamed: 0,token,pos_tag,counts
95,:,PUNCT,543
8,',PUNCT,300
2897,in,ADP,187
4082,to,PART,175
3268,of,ADP,172
22,-,PUNCT,166
4043,the,DET,163
1856,and,CCONJ,147
15,'s,PART,143
97,?,PUNCT,130


In [21]:
nouns = pos_df_counts[pos_df_counts.pos_tag == 'NOUN'][:10]
nouns

Unnamed: 0,token,pos_tag,counts
4267,war,NOUN,35
3552,record,NOUN,15
3416,police,NOUN,14
4356,year,NOUN,14
4316,win,NOUN,14
3061,living,NOUN,13
4009,tax,NOUN,13
2326,day,NOUN,12
3368,people,NOUN,12
2031,boss,NOUN,11


In [22]:
verbs = pos_df_counts[pos_df_counts.pos_tag == 'VERB'][:10]
verbs

Unnamed: 0,token,pos_tag,counts
3687,says,VERB,30
9,',VERB,14
2670,found,VERB,13
4317,win,VERB,12
4324,wins,VERB,10
2713,get,VERB,9
2388,dies,VERB,9
3990,take,VERB,8
2982,killed,VERB,8
3686,say,VERB,8


In [23]:
adj = pos_df_counts[pos_df_counts.pos_tag == 'ADJ'][:10]
adj

Unnamed: 0,token,pos_tag,counts
3244,new,ADJ,28
1400,Russian,ADJ,21
2606,final,ADJ,16
19,-,ADJ,14
2625,first,ADJ,12
3199,more,ADJ,10
1994,big,ADJ,9
2835,high,ADJ,9
3000,last,ADJ,8
3304,other,ADJ,8


In [24]:
spacy_doc2 = nlp(' '.join(tokens_clean_list))
spacy_doc2



In [25]:
pos_df2 = pd.DataFrame(columns=['clean_token', 'pos_tag'])

In [26]:
pos_df2

Unnamed: 0,clean_token,pos_tag


In [27]:
for token in spacy_doc2:
    pos_df2 = pd.concat([pos_df2, pd.DataFrame.from_records(
        [{'clean_token': token.text, 'pos_tag': token.pos_}
        ])], ignore_index=True)

In [28]:
pos_df2

Unnamed: 0,clean_token,pos_tag
0,refuse,AUX
1,work,NOUN
2,liz,PROPN
3,truss,ADJ
4,brief,ADJ
...,...,...
7539,car,NOUN
7540,sale,NOUN
7541,scam,NOUN
7542,consumer,NOUN


In [29]:
pos_df2_counts = pos_df2.groupby(['clean_token', 'pos_tag']).size().reset_index(name='counts2').sort_values(by='counts2', ascending=False)

In [30]:
pos_df2_counts

Unnamed: 0,clean_token,pos_tag,counts2
30,2022,NUM,47
1162,england,PROPN,45
870,cup,PROPN,39
3056,say,VERB,37
3707,uk,PROPN,37
...,...,...,...
1559,halftime,NOUN,1
1560,hall,NOUN,1
1562,halloween,PROPN,1
1563,halted,VERB,1


In [31]:
clean_nouns = pos_df2_counts[pos_df2_counts.pos_tag == 'NOUN'][:10]
clean_nouns

Unnamed: 0,clean_token,pos_tag,counts2
3840,war,NOUN,34
3948,world,NOUN,30
2136,man,NOUN,22
907,day,NOUN,21
3973,year,NOUN,20
1158,energy,NOUN,17
2847,record,NOUN,17
3935,woman,NOUN,16
1130,election,NOUN,16
3870,week,NOUN,16


In [32]:
clean_propn = pos_df2_counts[pos_df2_counts.pos_tag == 'PROPN'][:10]
clean_propn

Unnamed: 0,clean_token,pos_tag,counts2
1162,england,PROPN,45
870,cup,PROPN,39
3707,uk,PROPN,37
3949,world,PROPN,26
3710,ukraine,PROPN,23
3763,v,PROPN,18
3699,u,PROPN,14
3917,win,PROPN,14
2775,queen,PROPN,13
2053,liverpool,PROPN,13


In [70]:
# Named Entity Recognition
ner_df = pd.DataFrame(columns=['token', 'ner_tag'])

for token in spacy_doc.ents:
    if pd.isna(token.label_) is False:
        ner_df = pd.concat([ner_df, pd.DataFrame.from_records(
            [{'token': token.text, 'ner_tag': token.label_}]
        )], ignore_index=True)

In [72]:
ner_df.head()

Unnamed: 0,token,ner_tag
0,Liz Truss,PERSON
1,UK,GPE
2,Rationing,PRODUCT
3,superyachts,CARDINAL
4,Russian,NORP


In [76]:
ner_df_counts = ner_df.groupby(['token', 'ner_tag']).size().reset_index(name='counts').sort_values(by='counts', ascending=False)

In [78]:
ner_df_counts.head(10)

Unnamed: 0,token,ner_tag,counts
965,Ukraine,GPE,47
955,UK,GPE,36
329,England,GPE,32
819,Russian,NORP,20
957,US,GPE,19
1031,World Cup 2022,EVENT,18
1058,first,ORDINAL,13
918,The Papers,WORK_OF_ART,13
378,France,GPE,12
226,China,GPE,11


In [84]:
people = ner_df_counts[ner_df_counts.ner_tag == 'PERSON'][:10]
people

Unnamed: 0,token,ner_tag,counts
257,Covid,PERSON,9
760,Queen,PERSON,8
757,Putin,PERSON,8
169,Boris Johnson,PERSON,6
563,Liz Truss,PERSON,6
788,Rishi Sunak,PERSON,5
581,Macron,PERSON,4
762,Quiz,PERSON,4
515,Jurgen Klopp,PERSON,4
325,Emma Raducanu,PERSON,4
