In [1]:
# Mount Google Drive at /content/drive so the cleaned_text folder read by
# later cells (under /content/drive/MyDrive) is accessible
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Clone the project repository into the current working directory
!git clone https://github.com/alexdseo/Visualization-App-on-World-Events

Cloning into 'Visualization-App-on-World-Events'...
remote: Enumerating objects: 186, done.[K
remote: Total 186 (delta 0), reused 0 (delta 0), pack-reused 186[K
Receiving objects: 100% (186/186), 1.49 GiB | 12.06 MiB/s, done.
Resolving deltas: 100% (33/33), done.


In [3]:
# Make the cloned repository the working directory
%cd Visualization-App-on-World-Events

/content/Visualization-App-on-World-Events


In [7]:
import spacy
#from spacy import displacy #visualize option
from collections import Counter
from pprint import pprint

In [8]:
# Ask spaCy to run on the GPU when one is available; the returned boolean is
# shown as the cell output. This runs before the pipeline is loaded below.
spacy.prefer_gpu()

True

In [9]:
# Load the small English spaCy pipeline
import en_core_web_sm
nlp = en_core_web_sm.load()
# Raise the per-document character limit (spaCy's documented default is
# 1,000,000) so each news source's concatenated text fits in a single Doc;
# the largest one processed below is about 1.57M characters.
nlp.max_length = 4000000

In [10]:
# Entity level: list every named-entity span together with its label
# The sample sentence is Grover-generated text
grover_sentence = 'In a press release dated March 28, 2004, United States Centers for Disease Control and Prevention Director Dr. Michael Osterholm announced the results of the detection and safety trial of SARS-CoV in the Chinese trans- plantation market.'
grover_txt = nlp(grover_sentence)
pprint([(entity.text, entity.label_) for entity in grover_txt.ents])

[('March 28, 2004', 'DATE'),
 ('United States Centers for Disease Control and Prevention', 'ORG'),
 ('Michael Osterholm', 'PERSON'),
 ('Chinese', 'NORP')]


In [11]:
# Token level: show each token with its IOB code and entity type
# Token.ent_iob_ uses the IOB scheme (not BILUO, so no 'L' or 'U' codes):
# B: The token begins an entity
# I: The token is inside an entity (continues one started by a 'B' token)
# O: The token is outside any entity
# The entity type (ent_type_) is an empty string for non-entity tokens
pprint([(t, t.ent_iob_, t.ent_type_) for t in grover_txt])

[(In, 'O', ''),
 (a, 'O', ''),
 (press, 'O', ''),
 (release, 'O', ''),
 (dated, 'O', ''),
 (March, 'B', 'DATE'),
 (28, 'I', 'DATE'),
 (,, 'I', 'DATE'),
 (2004, 'I', 'DATE'),
 (,, 'O', ''),
 (United, 'B', 'ORG'),
 (States, 'I', 'ORG'),
 (Centers, 'I', 'ORG'),
 (for, 'I', 'ORG'),
 (Disease, 'I', 'ORG'),
 (Control, 'I', 'ORG'),
 (and, 'I', 'ORG'),
 (Prevention, 'I', 'ORG'),
 (Director, 'O', ''),
 (Dr., 'O', ''),
 (Michael, 'B', 'PERSON'),
 (Osterholm, 'I', 'PERSON'),
 (announced, 'O', ''),
 (the, 'O', ''),
 (results, 'O', ''),
 (of, 'O', ''),
 (the, 'O', ''),
 (detection, 'O', ''),
 (and, 'O', ''),
 (safety, 'O', ''),
 (trial, 'O', ''),
 (of, 'O', ''),
 (SARS, 'O', ''),
 (-, 'O', ''),
 (CoV, 'O', ''),
 (in, 'O', ''),
 (the, 'O', ''),
 (Chinese, 'B', 'NORP'),
 (trans-, 'O', ''),
 (plantation, 'O', ''),
 (market, 'O', ''),
 (., 'O', '')]


In [12]:
# Collect the path of every article file stored as
# cleaned_text/<news source>/<file name>
import os

# Single definition of the data root so it only needs changing in one place
CLEANED_TEXT_DIR = '/content/drive/MyDrive/cleaned_text'

all_filepaths = []

for newssource in os.listdir(CLEANED_TEXT_DIR):
    source_dir = os.path.join(CLEANED_TEXT_DIR, newssource)
    # Skip macOS .DS_Store entries and any other stray top-level file:
    # calling os.listdir on a non-directory raises NotADirectoryError
    if newssource == ".DS_Store" or not os.path.isdir(source_dir):
        continue
    for filename in os.listdir(source_dir):
        if filename != ".DS_Store":
            # Paths keep the '/'-joined layout root/<source>/<file>
            all_filepaths.append(f"{CLEANED_TEXT_DIR}/{newssource}/{filename}")

# Sort for a deterministic processing order (os.listdir order is arbitrary)
all_filepaths.sort()

In [13]:
len(all_filepaths)

81

In [14]:
# Read every article and build one concatenated string overall plus one per
# news source (newlines inside an article are replaced by spaces)
texts_by_source = {"aljazeera": [], "cnn": [], "fox": []}
article_texts = []

for filepath in all_filepaths:
    # The context manager closes the handle right after reading; the explicit
    # encoding avoids depending on the platform's default locale
    with open(filepath, "r", encoding="utf-8") as article_file:
        article = article_file.read().replace('\n', ' ')

    article_texts.append(article)

    # The news source is the name of the file's parent directory, so this
    # does not depend on how deep the data root sits in the file system
    newssource = os.path.basename(os.path.dirname(filepath))
    if newssource in texts_by_source:
        texts_by_source[newssource].append(article)

# Every article is followed by one space so the last word of one article does
# not fuse with the first word of the next; joining once avoids repeatedly
# copying multi-megabyte strings with +=
all_texts = ''.join(article + ' ' for article in article_texts)
cnn_texts = ''.join(article + ' ' for article in texts_by_source["cnn"])
alja_texts = ''.join(article + ' ' for article in texts_by_source["aljazeera"])
fox_texts = ''.join(article + ' ' for article in texts_by_source["fox"])

# Number of files read (kept because the original cell defined this name)
count = len(all_filepaths)

In [15]:
# Report the size, in characters, of each concatenated corpus
for corpus_name, corpus_text in (
    ('all', all_texts),
    ('cnn', cnn_texts),
    ('fox', fox_texts),
    ('aljazeera', alja_texts),
):
    print(f'Length of {corpus_name} texts: {len(corpus_text)}')

Length of all texts: 3619982
Length of cnn texts: 1568460
Length of fox texts: 603849
Length of aljazeera texts: 1447673


In [None]:
#Run the pipeline on all texts at once (disabled)
#This runs out of memory, so each news source is processed separately instead
#all_article = nlp(all_texts)
#len(all_article.ents)

In [16]:
# Run the pretrained pipeline on the Fox texts (inference only; nothing is trained)
fox_article = nlp(fox_texts)
# Number of entities found in the Fox texts
len(fox_article.ents)

16005

In [17]:
# Run the pretrained pipeline on the CNN texts (inference only; nothing is trained)
cnn_article = nlp(cnn_texts)
# Number of entities found in the CNN texts
len(cnn_article.ents)

34036

In [18]:
# Run the pretrained pipeline on the Al Jazeera texts (inference only; nothing is trained)
alja_article = nlp(alja_texts)
# Number of entities found in the Al Jazeera texts
len(alja_article.ents)

34821

## Top mentioned Entities

In [20]:
# Frequency of each entity label in the Fox texts, most common first
labels_fox = [entity.label_ for entity in fox_article.ents]
label_freq_fox = Counter(labels_fox)
label_freq_fox.most_common()

[('GPE', 4397),
 ('PERSON', 2771),
 ('ORG', 2508),
 ('NORP', 2128),
 ('DATE', 2088),
 ('CARDINAL', 1003),
 ('LOC', 272),
 ('TIME', 226),
 ('ORDINAL', 125),
 ('WORK_OF_ART', 104),
 ('PRODUCT', 97),
 ('FAC', 67),
 ('MONEY', 63),
 ('PERCENT', 47),
 ('EVENT', 40),
 ('QUANTITY', 35),
 ('LAW', 21),
 ('LANGUAGE', 13)]

In [21]:
# Frequency of each entity label in the CNN texts, most common first
labels_cnn = [entity.label_ for entity in cnn_article.ents]
label_freq_cnn = Counter(labels_cnn)
label_freq_cnn.most_common()

[('GPE', 10094),
 ('ORG', 5519),
 ('PERSON', 5480),
 ('NORP', 5296),
 ('DATE', 3277),
 ('CARDINAL', 1798),
 ('LOC', 761),
 ('TIME', 420),
 ('ORDINAL', 217),
 ('QUANTITY', 210),
 ('FAC', 203),
 ('WORK_OF_ART', 192),
 ('PRODUCT', 171),
 ('MONEY', 153),
 ('PERCENT', 93),
 ('EVENT', 77),
 ('LAW', 49),
 ('LANGUAGE', 26)]

In [24]:
# Frequency of each entity label in the Al Jazeera texts, most common first
labels_alja = [entity.label_ for entity in alja_article.ents]
label_freq_alja = Counter(labels_alja)
label_freq_alja.most_common()

[('GPE', 12649),
 ('ORG', 5447),
 ('NORP', 5122),
 ('PERSON', 4981),
 ('CARDINAL', 2248),
 ('DATE', 1988),
 ('LOC', 801),
 ('ORDINAL', 257),
 ('TIME', 240),
 ('MONEY', 238),
 ('FAC', 175),
 ('WORK_OF_ART', 158),
 ('QUANTITY', 150),
 ('PRODUCT', 141),
 ('PERCENT', 92),
 ('EVENT', 78),
 ('LANGUAGE', 30),
 ('LAW', 26)]

## Top mentioned words

In [27]:
# 20 most frequent entity texts (any label) in the Fox texts
words_fox = [entity.text for entity in fox_article.ents]
word_freq_fox = Counter(words_fox)
word_freq_fox.most_common(20)

[('Ukraine', 1440),
 ('Russian', 1054),
 ('Russia', 989),
 ('Ukrainian', 558),
 ('Biden', 287),
 ('Putin', 273),
 ('NATO', 243),
 ('U.S.', 219),
 ('Kyiv', 208),
 ('Zelenskyy', 181),
 ('Mariupol', 175),
 ('Vladimir Putin', 132),
 ('Fox News', 129),
 ('Wednesday', 126),
 ('Volodymyr Zelenskyy', 124),
 ('Poland', 118),
 ('Friday', 115),
 ('Thursday', 110),
 ('US', 109),
 ('Monday', 101)]

In [28]:
# 20 most frequent entity texts (any label) in the CNN texts
words_cnn = [entity.text for entity in cnn_article.ents]
word_freq_cnn = Counter(words_cnn)
word_freq_cnn.most_common(20)

[('Ukraine', 2830),
 ('Russian', 2378),
 ('Russia', 1913),
 ('Ukrainian', 1368),
 ('US', 1157),
 ('CNN', 730),
 ('Kyiv', 529),
 ('NATO', 455),
 ('Putin', 385),
 ('Mariupol', 377),
 ('Zelensky', 372),
 ('Biden', 354),
 ('Poland', 267),
 ('Ukrainians', 248),
 ('two', 235),
 ('Friday', 232),
 ('Tuesday', 230),
 ('Thursday', 227),
 ('Wednesday', 211),
 ('Volodymyr Zelensky', 202)]

In [29]:
# 20 most frequent entity texts (any label) in the Al Jazeera texts
words_alja = [entity.text for entity in alja_article.ents]
word_freq_alja = Counter(words_alja)
word_freq_alja.most_common(20)

[('Ukraine', 3735),
 ('Russia', 2919),
 ('Russian', 2411),
 ('Ukrainian', 1214),
 ('US', 693),
 ('Moscow', 629),
 ('Kyiv', 598),
 ('Mariupol', 556),
 ('Putin', 455),
 ('Zelenskyy', 357),
 ('NATO', 342),
 ('UN', 280),
 ('EU', 247),
 ('UK', 235),
 ('Biden', 217),
 ('Kremlin', 214),
 ('China', 211),
 ('Al Jazeera', 200),
 ('Europe', 198),
 ('two', 195)]

## Top mentioned person

In [30]:
# 20 most frequent PERSON entities in the Fox texts
person_fox = [ent.text for ent in fox_article.ents if ent.label_ == 'PERSON']

Counter(person_fox).most_common(20)

[('Putin', 273),
 ('Biden', 223),
 ('Zelenskyy', 179),
 ('Vladimir Putin', 132),
 ('Volodymyr Zelenskyy', 124),
 ('Kirby', 37),
 ('Mariupol', 36),
 ('John Kirby', 23),
 ('Joe Biden', 23),
 ('Viadimir Putin', 19),
 ('Dmytro Kuleba', 18),
 ('Lviv', 18),
 ('Xi Jinping', 15),
 ('Greg Norman', 15),
 ('Boris Johnson', 15),
 ('Brie Stimson', 14),
 ('Antony Blinken', 13),
 ('Lawrence Richard', 13),
 ('Jens Stoltenberg', 13),
 ("Tyler O'Neil", 12)]

In [31]:
# 20 most frequent PERSON entities in the CNN texts
person_cnn = [ent.text for ent in cnn_article.ents if ent.label_ == 'PERSON']

Counter(person_cnn).most_common(20)

[('Putin', 385),
 ('Zelensky', 344),
 ('Biden', 276),
 ('Volodymyr Zelensky', 202),
 ('Joe Biden', 190),
 ('Vladimir Putin', 175),
 ('Mariupol', 99),
 ('Kharkiv', 94),
 ('Kirby', 75),
 ('Antony Blinken', 41),
 ('Lviv', 40),
 ('Blinken', 38),
 ('Sullivan', 36),
 ('Luhansk', 32),
 ('Emmanuel Macron', 32),
 ('John Kirby', 31),
 ('Johnson', 29),
 ('Jen Psaki', 27),
 ("Bookmark CNN's", 26),
 ('Boris Johnson', 26)]

In [32]:
# 20 most frequent PERSON entities in the Al Jazeera texts
person_alja = [ent.text for ent in alja_article.ents if ent.label_ == 'PERSON']

Counter(person_alja).most_common(20)

[('Putin', 455),
 ('Zelenskyy', 351),
 ('Mariupol', 138),
 ('Kharkiv', 137),
 ('Biden', 125),
 ('Vladimir Putin', 97),
 ('Volodymyr Zelenskyy', 78),
 ('Joe Biden', 70),
 ('Blinken', 55),
 ('Lavrov', 50),
 ('Dmytro Kuleba', 49),
 ('Iryna Vereshchuk', 43),
 ('Kyiv', 39),
 ('Dmitry Peskov', 34),
 ('Antony Blinken', 33),
 ('Sergey Lavrov', 33),
 ('Peskov', 33),
 ('Abramovich', 31),
 ('Boris Johnson', 30),
 ('Lviv', 30)]

## Top mentioned Nationalities or religious or political groups

In [33]:
# 20 most frequent NORP entities (nationalities, religious or political groups) in the Fox texts
norp_fox = [ent.text for ent in fox_article.ents if ent.label_ == 'NORP']

Counter(norp_fox).most_common(20)

[('Russian', 1045),
 ('Ukrainian', 499),
 ('Ukrainians', 83),
 ('Russians', 82),
 ('American', 55),
 ('Chinese', 32),
 ('Polish', 26),
 ('European', 21),
 ('Biden', 14),
 ('British', 14),
 ('Turkish', 12),
 ('Republican', 10),
 ('German', 10),
 ('Instagram', 9),
 ('Israeli', 9),
 ('Democrats', 8),
 ('Soviet', 8),
 ('French', 7),
 ('Belarusian', 6),
 ('Japanese', 6)]

In [34]:
# 20 most frequent NORP entities (nationalities, religious or political groups) in the CNN texts
norp_cnn = [ent.text for ent in cnn_article.ents if ent.label_ == 'NORP']

Counter(norp_cnn).most_common(20)

[('Russian', 2365),
 ('Ukrainian', 1220),
 ('Ukrainians', 248),
 ('Russians', 180),
 ('European', 109),
 ('American', 98),
 ('French', 98),
 ('Polish', 77),
 ('Chinese', 57),
 ('Zaporizhzhia', 50),
 ('German', 49),
 ('Western', 41),
 ('British', 37),
 ('Belarusian', 36),
 ('Turkish', 31),
 ('Americans', 28),
 ('Biden', 27),
 ('Republican', 25),
 ('Israeli', 20),
 ('Soviet', 19)]

In [35]:
# 20 most frequent NORP entities (nationalities, religious or political groups) in the Al Jazeera texts
norp_alja = [ent.text for ent in alja_article.ents if ent.label_ == 'NORP']

Counter(norp_alja).most_common(20)

[('Russian', 2390),
 ('Ukrainian', 1065),
 ('Ukrainians', 151),
 ('Russians', 120),
 ('French', 83),
 ('European', 78),
 ('Polish', 75),
 ('British', 69),
 ('Western', 66),
 ('German', 55),
 ('Zaporizhzhia', 55),
 ('Turkish', 54),
 ('Belarusian', 48),
 ('Chinese', 46),
 ('Sumy', 45),
 ('American', 28),
 ('Interfax', 25),
 ('Italian', 24),
 ('Israeli', 24),
 ('Japanese', 19)]

## Top mentioned organization

In [36]:
# 20 most frequent ORG (organization) entities in the Fox texts
org_fox = [ent.text for ent in fox_article.ents if ent.label_ == 'ORG']

Counter(org_fox).most_common(20)

[('NATO', 243),
 ('Fox News', 128),
 ('AP', 66),
 ('The Associated Press', 65),
 ('Kremlin', 48),
 ('Pentagon', 44),
 ('Reuters', 41),
 ('Fox News Digital', 36),
 ('White House', 35),
 ('Congress', 35),
 ('the Associated Press', 35),
 ('the White House', 29),
 ('UN', 23),
 ('State', 21),
 ('EU', 21),
 ('Kyiv', 20),
 ('Getty Images', 18),
 ('The White House', 18),
 ('Defense', 13),
 ('Satellite', 12)]

In [37]:
# 20 most frequent ORG (organization) entities in the CNN texts
org_cnn = [ent.text for ent in cnn_article.ents if ent.label_ == 'ORG']

Counter(org_cnn).most_common(20)

[('CNN', 730),
 ('NATO', 455),
 ('UN', 128),
 ('IAEA', 105),
 ('Pentagon', 95),
 ('EU', 91),
 ('White House', 83),
 ('Kremlin', 70),
 ('the White House', 68),
 ('State', 60),
 ('Congress', 56),
 ('Maxar Technologies', 55),
 ('Kyiv', 54),
 ('Telegram', 53),
 ('House', 37),
 ('Facebook', 35),
 ('Defense', 33),
 ('Kherson', 31),
 ('the European Union', 31),
 ('AFP/Getty Images', 27)]

In [38]:
# 20 most frequent ORG (organization) entities in the Al Jazeera texts
org_alja = [ent.text for ent in alja_article.ents if ent.label_ == 'ORG']

Counter(org_alja).most_common(20)

[('NATO', 342),
 ('UN', 280),
 ('EU', 241),
 ('Kremlin', 203),
 ('Al Jazeera', 196),
 ('Reuters', 150),
 ('Kyiv', 104),
 ('IAEA', 90),
 ('RUSSIA', 75),
 ('Telegram', 62),
 ('Pentagon', 58),
 ('State', 40),
 ('GMT', 40),
 ('Facebook', 40),
 ('the White House', 39),
 ('White House', 39),
 ('European Union', 28),
 ('UAE', 26),
 ('SWIFT', 26),
 ('Mykhailo Podolyak', 25)]

## Top mentioned Geopolitical entity

In [39]:
# 20 most frequent GPE (geopolitical) entities in the Fox texts
gpe_fox = [ent.text for ent in fox_article.ents if ent.label_ == 'GPE']

Counter(gpe_fox).most_common(20)

[('Ukraine', 1437),
 ('Russia', 988),
 ('U.S.', 219),
 ('Mariupol', 121),
 ('Poland', 118),
 ('US', 109),
 ('China', 100),
 ('Moscow', 92),
 ('Kyiv', 70),
 ('Ukrainian', 59),
 ('the United States', 50),
 ('Turkey', 38),
 ('Brussels', 33),
 ('Biden', 28),
 ('Germany', 25),
 ('UK', 23),
 ('Belarus', 22),
 ('Warsaw', 21),
 ('Chernobyl', 18),
 ('France', 16)]

> Biden is a person, not a geopolitical entity, so this is a misclassification by the NER model.

In [40]:
# 20 most frequent GPE (geopolitical) entities in the CNN texts
gpe_cnn = [ent.text for ent in cnn_article.ents if ent.label_ == 'GPE']

Counter(gpe_cnn).most_common(20)

[('Ukraine', 2830),
 ('Russia', 1913),
 ('US', 1156),
 ('Poland', 267),
 ('Mariupol', 243),
 ('China', 169),
 ('Moscow', 167),
 ('Kyiv', 164),
 ('Ukrainian', 148),
 ('the United States', 118),
 ('UK', 110),
 ('Germany', 88),
 ('Belarus', 77),
 ('Chernobyl', 77),
 ('Brussels', 74),
 ('France', 56),
 ('Kherson', 54),
 ('Donbas', 53),
 ('Warsaw', 51),
 ('Macron', 47)]

> Macron is a person, not a geopolitical entity, so this is a misclassification by the NER model.

In [41]:
# 20 most frequent GPE (geopolitical) entities in the Al Jazeera texts
gpe_alja = [ent.text for ent in alja_article.ents if ent.label_ == 'GPE']

Counter(gpe_alja).most_common(20)

[('Ukraine', 3709),
 ('Russia', 2919),
 ('US', 693),
 ('Moscow', 629),
 ('Mariupol', 372),
 ('UK', 235),
 ('China', 211),
 ('Poland', 185),
 ('Kyiv', 160),
 ('Ukrainian', 148),
 ('Belarus', 116),
 ('Germany', 114),
 ('Turkey', 112),
 ('Chernobyl', 92),
 ('France', 80),
 ('Japan', 74),
 ('Brussels', 61),
 ('the United States', 60),
 ('Washington', 60),
 ('UKRAINE', 57)]

## Top mentioned Location

In [42]:
# 20 most frequent LOC (location) entities in the Fox texts
loc_fox = [ent.text for ent in fox_article.ents if ent.label_ == 'LOC']

Counter(loc_fox).most_common(20)

[('Kyiv', 102),
 ('Europe', 57),
 ('West', 19),
 ('Biden', 9),
 ('the Black Sea', 6),
 ('MiGs', 5),
 ('Mariupol', 5),
 ('the Crimean Peninsula', 4),
 ('the Azov Sea', 3),
 ('Black Sea', 3),
 ('Snake Island', 3),
 ('Earth', 2),
 ('Pearl Harbor', 2),
 ('Katya Hill', 2),
 ('West 255th Street', 2),
 ('the Zaporizhzhia Oblast', 2),
 ('Daines', 1),
 ('western Mariupol', 1),
 ('Dnipro', 1),
 ('Pocivicbki oKynaHtu', 1)]

> Biden is a person, not a location, so this is a misclassification by the NER model.

In [43]:
# 20 most frequent LOC (location) entities in the CNN texts
loc_cnn = [ent.text for ent in cnn_article.ents if ent.label_ == 'LOC']

Counter(loc_cnn).most_common(20)

[('Kyiv', 287),
 ('Europe', 168),
 ('West', 38),
 ('Black Sea', 18),
 ('the Black Sea', 18),
 ('Biden', 14),
 ('the Irpin River', 14),
 ('Mariupol', 8),
 ('the Dnieper River', 6),
 ('Asia', 6),
 ('Africa', 6),
 ('Irpin River', 6),
 ('Kharkiv', 5),
 ('the Sea of Azov', 5),
 ('Gulf', 5),
 ('East', 4),
 ('the Middle East', 4),
 ('Berdyansk', 4),
 ('the East Room', 3),
 ('Eastern Europe', 3)]

> Biden is labeled as a location here too, which is a misclassification by the NER model.

In [47]:
# 20 most frequent LOC (location) entities in the Al Jazeera texts
loc_alja = [ent.text for ent in alja_article.ents if ent.label_ == 'LOC']

Counter(loc_alja).most_common(20)

[('Kyiv', 278),
 ('Europe', 198),
 ('West', 46),
 ('the Black Sea', 25),
 ('Black Sea', 24),
 ('Biden', 15),
 ('Mariupol', 15),
 ('the Middle East', 11),
 ('Eastern Europe', 9),
 ('Dnipro', 8),
 ('the Sea of Azov', 8),
 ('the Crimean Peninsula', 7),
 ('Africa', 6),
 ('Mars', 5),
 ('Asia', 5),
 ('Middle East', 4),
 ('Crimean Peninsula', 3),
 ('Mykolayiv', 3),
 ('Latin America', 3),
 ('Sumy', 3)]

> Next: Fix misclassified entities