# 1.6 

In [2]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

In [3]:
# Download English module

!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
     ---- ----------------------------------- 1.3/12.8 MB 4.8 MB/s eta 0:00:03
     --------- ------------------------------ 3.1/12.8 MB 7.7 MB/s eta 0:00:02
     ------------------ --------------------- 5.8/12.8 MB 8.6 MB/s eta 0:00:01
     ---------------------- ----------------- 7.3/12.8 MB 8.5 MB/s eta 0:00:01
     --------------------------- ------------ 8.9/12.8 MB 8.3 MB/s eta 0:00:01
     --------------------------------- ------ 10.7/12.8 MB 8.3 MB/s eta 0:00:01
     ---------------------------------------- 12.8/12.8 MB 8.7 MB/s eta 0:00:00
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [4]:
# Load spacy English module

NER = spacy.load("en_core_web_sm")

## Load 20th_Century_article

In [6]:
# Load the article

with open('20th_Century_article_Wiki.txt', 'r', errors='ignore') as file:
    data = file.read().replace('\n', '')

In [7]:
book = NER(data)

In [8]:
# Visualize identified entities

displacy.render(book[273:20000], style = "ent", jupyter = True)

## Get named entity list per sentence

In [10]:
df_sentences = [] # empty shell to store results

# Loop through sentences, get entity list for each sentence
for sent in book.sents:
    entity_list = [ent.text for ent in sent.ents]
    df_sentences.append({"sentence": sent, "entities": entity_list})
    
df_sentences = pd.DataFrame(df_sentences)

In [11]:
df_sentences.head(10)

Unnamed: 0,sentence,entities
0,"(20th, century, -, WikipediaJump, to, contentM...","[20th century - WikipediaJump, Navigation\tMai..."
1,"(Ø¬Ù‡Basa, Balià, ¦, ¬à, ¦, ¾à, ¦, ‚à, ¦, ²à, ...","[¦¬à¦, ¦‚à, ¦²à, ¦¾é–, ÑˆÒ¡Ð¾Ñ€Ñ‚ÑÐ°Ð‘ÐµÐ»Ð°Ñ€..."
2,"(LombardMagyarÐœÐ, °, ÐºÐµÐ´Ð¾Ð½ÑÐºÐ¸à´, ®, à´...","[à¥€áƒ›áƒáƒ, Ø§Ø²ÙØ±ÙˆÙ†ÛŒBahasa Melayué–, sÐœ..."
3,"(ScotsSeelterskSesotho, sa, LeboaShqipSicilian...","[ScotsSeelterskSesotho sa, Ñ€Ð²Ð]"
4,"(informationCite, this, pageGet, shortened, UR...","[URLDownload, Print, Download, PDFPrintable, W..."
5,"(It, was, the, first, photograph, taken, of, E...","[first, Earth]"
6,"(Millennium2ndÂ, , millenniumCenturies19thÂ, ...","[Births, Deaths Establishments, Disestablishme..."
7,"(Population, growth, was, also, unprecedented,...","[the century, around 1.6 billion, billion.[4]T..."
8,"(Unprecedented, advances, in, science, and, te...",[]
9,"(The, Earth, 's, sixth, mass, extinction, even...","[Earth, sixth, Holocene]"


## Load country names

In [13]:
# Import countries

country_df = pd.read_csv("countries_list_20th_century_1.5.csv")

In [14]:
country_df['country_name'] = country_df['country_name'].str.strip()

In [15]:
country_df.head()

Unnamed: 0.1,Unnamed: 0,country_name
0,1,Afghanistan
1,2,Albania
2,3,Algeria
3,4,Andorra
4,5,Angola


## Filtering entities from the article

In [17]:
# Function to filter out entities not of interest

def filter_entity(ent_list, country_df):
    return [ent for ent in ent_list 
            if ent in list(country_df['country_name'])]

In [18]:
# Check

filter_entity(["Germany", "CF", "2"], country_df)

['Germany']

In [19]:
#If the answer is blank, then I probably need to remove white spaces from country list (csv file) that I loaded

In [35]:
df_sentences['country_entities'] = df_sentences['entities'].apply(lambda x: filter_entity(x, country_df))

In [37]:
df_sentences['country_entities'].head(20)

0     []
1     []
2     []
3     []
4     []
5     []
6     []
7     []
8     []
9     []
10    []
11    []
12    []
13    []
14    []
15    []
16    []
17    []
18    []
19    []
Name: country_entities, dtype: object

In [42]:
# Filter out sentences that don't have any country entities

df_sentences_filtered = df_sentences[df_sentences['country_entities'].map(len) > 0]

In [44]:
df_sentences_filtered.tail(10)

Unnamed: 0,sentence,entities,country_entities
110,"(In, 1948, The, Nakba, was, ,, according, to, ...","[1948, Nakba, Arabs, Palestine, Jewish Militia...",[Palestine]
111,"(The, campaign, utilized, methods, of, intimid...","[Arab, the Soviet Union's, World War II, Easte...",[Cuba]
114,"(This, led, to, the, proxy, wars, with, the, W...","[Western, Korea, 1950â€“1953, Vietnam, 1957â€“...","[Vietnam, India, Pakistan]"
118,"(Mahatma, Gandhi, 's, nonviolence, and, Indian...","[Mahatma Gandhi's, Indian, the British Empire,...",[South Africa]
122,"(It, is, thought, to, be, the, largest, famine...","[Vietnam War, two million, Eastern, Western Bl...",[Afghanistan]
144,"(Tango, was, created, in, Argentina, and, beca...","[Tango, Argentina, Americas, Europe]",[Argentina]
239,"((, For, example, ,, oil, was, a, factor, in, ...","[Japan, the United States, 1941, OPEC, the Yom...",[Japan]
283,"(Economics, was, divided, into, two, general, ...","[two, 1970s, Western, the United States, Canad...","[Canada, Australia, New Zealand]"
303,"("", "", Americanization, "", :, Political, and, ...","[Australia, PDF]",[Australia]
369,"("", Russia, and, the, Russians, :, a, history,...","[Russia, Russians]",[Russia]


## Create relationships

In [47]:
# Defining relationships 

# window size = 5 : this defines how many sentences will be looked at simultaneously 
relationships = [] # create an empty list

for i in range(df_sentences_filtered.index[-1]):
    end_i = min(i+5, df_sentences_filtered.index[-1])
    country_list = sum((df_sentences_filtered.loc[i: end_i].country_entities), [])
    
    # Remove duplicated characters that are next to each other
    country_unique = [country_list[i] for i in range(len(country_list)) 
                   if (i==0) or country_list[i] != country_list[i-1]]
    
    if len(country_unique) > 1:
        for idx, a in enumerate(country_unique[:-1]):
            b = country_unique[idx + 1]
            relationships.append({"source": a, "target": b})

In [49]:
relationship_df = pd.DataFrame(relationships)

In [51]:
relationship_df

Unnamed: 0,source,target
0,France,Russia
1,Russia,Italy
2,Italy,Romania
3,Romania,Germany
4,Germany,Austria
...,...,...
219,Australia,New Zealand
220,Canada,Australia
221,Australia,New Zealand
222,Canada,Australia


In [53]:
# Sort the cases with a->b and b->a

relationship_df = pd.DataFrame(np.sort(relationship_df.values, axis = 1), columns = relationship_df.columns)
relationship_df.head(5)

Unnamed: 0,source,target
0,France,Russia
1,Italy,Russia
2,Italy,Romania
3,Germany,Romania
4,Austria,Germany


In [55]:
# Summarize the interactions

relationship_df["value"] = 1
relationship_df = relationship_df.groupby(["source","target"], sort=False, as_index=False).sum()

In [57]:
relationship_df.head(10)

Unnamed: 0,source,target,value
0,France,Russia,6
1,Italy,Russia,6
2,Italy,Romania,6
3,Germany,Romania,6
4,Austria,Germany,6
5,Austria,Hungary,6
6,Bulgaria,Hungary,6
7,Bulgaria,Germany,5
8,Germany,Italy,14
9,Germany,Spain,6


#### Export to local working directory

In [60]:
relationship_df.to_csv('country_relationship.csv')