# Task 1.6: Intro to NLP and Network Analysis
#### 1. Import Libraries
#### 2. Load 20th_century TXT file
#### 3. Data Wrangling
#### 4. Create NER object
#### 5. Split the sentence entities from the NER object
#### 6. Filter the entities so that you end up only with the ones from your countries list
#### 7. Create the relationships dataframe
#### 8. Save and export dataframe

### 1. Import Libraries

In [16]:
# Importing libraries
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

# Run this cell twice to get rid of import warning after a kernel restart

### 2. Load 20th_century text file

In [2]:
# Importing 20th_century text file
with open('20th_century_scrape.txt', 'r', errors='ignore') as file: 
   data = file.read().replace( '\n', ' ')

In [3]:
# Download English module

!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


#### Q4 comment: The 20th_century text file didn't need any data wrangling done since no special characters were found and names of countries in my list were the same as the names in the text.

### 4. Create NER object

In [4]:
# Load spacy English module

NER = spacy.load("en_core_web_sm")

In [5]:
book = NER(data)

In [6]:
# Visualize identified entities

displacy.render(book[20:200], style = "ent", jupyter = True)

### 5. Split the sentence entities from the NER object

In [7]:
df_sentences = []
sentences = []
entities = []

# Loop through sentences, get entity list for each sentence
for sent in book.sents:
    entity_list = [ent.text for ent in sent.ents]
    entities.append(entity_list)
    sentences.append(sent)
    df_sentences.append({'sentence': sent, 'entities': entity_list})

df = pd.DataFrame()
df["sentence"] = sentences
df["entities"] = entities
df

Unnamed: 0,sentence,entities
0,"( , From, Wikipedia, ,, the, free, encyclop...","[Wikipedia, The 20th century]"
1,"(The, World, Wars, sparked, tension, between, ...","[The World Wars, the Cold War, the Space Race,..."
2,"(These, advancements, have, played, a, signifi...","[the 21st century, today]"
3,"(Historic, events, in, the, 20th, century[edit...","[Historic, 20th, the 20th century]"
4,"(The, 1900s, saw, the, decade, herald, a, seri...","[The 1900s, the decade]"
...,...,...
1569,"(11th, , 1st, 10th, 9th, 8th, 7th, 6th, 5th,...","[11th 1st 10th 9th 8th, 7th, 6th, 5th, 2nd 1..."
1570,"(1st, 1st, 2nd)","[1st 1st, 2nd]"
1571,"(3rd, 4th, 5th, 6th, 7th, 8th, 9th, 10th, , ...","[3rd, 4th, 5th, 6th, 7th, 9th, 10th 2nd, 12t..."
1572,"(15th, 16th, 17th, 18th, 19th, 20th, , 3rd, ...",[18th 19th 20th 3rd 21st 22nd 23rd 24th 25th...


### 6. Filter the entities so that you end up only with the ones from your countries list

In [8]:
# Filtering entities from country list
def filter_entity(ent_list, countries_df):
    return_list = []
    filter_list = list(countries_df['Country'])
    for sublist in ent_list:
        # for each item in sublist
        filtered_sublist = []
        for item in sublist: 

            # if it is in the filter_list, add it to return_list
            if item in filter_list: 
                filtered_sublist.append(item)
        return_list.append(filtered_sublist)
    return return_list

# Create a dataframe of countries from Country_Scrape.txt file
df_countries = pd.DataFrame()
with open('Country_Scrape.txt', 'r', errors='ignore') as file: 
    data = file.read().strip()
    df_countries['Country'] = data.split("\n")

# use fn to filter
filtered_list = filter_entity(entities, df_countries)
filtered_list[:100]

[[],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['France', 'Russia'],
 ['Germany', 'Austria', 'Hungary', 'Bulgaria', 'Russia'],
 ['Germany', 'Russia'],
 ['Germany'],
 ['Germany'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['Germany'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['Germany', 'Italy'],
 ['Germany', 'Germany'],
 ['Germany', 'Germany'],
 ['Austria', 'Austria', 'Germany'],
 [],
 [],
 [],
 ['Spain'],
 [],
 [],
 [],
 [],
 [],
 ['Britain', 'France', 'Poland'],
 ['Poland'],
 ['Britain', 'France', 'Germany', 'Poland', 'Poland', 'Soviet Union'],
 [],
 [],
 [],
 [],
 ['Poland', 'Germany'],
 ['Estonia', 'Latvia', 'Lithuania', 'Finland'],
 ['Germany'],
 ['Poland', 'Luxembourg'],
 [],
 ['Belgium'],
 ['Denmark', 'Norway'],
 ['Norway'],
 ['Norway', 'Denmark', 'Sweden', 'Germany'],
 ['France'],
 [],
 ['France'],
 [],
 [],
 ['France'],
 [],
 ['Italy', 'Britain', 'Great Britain'],
 ['Britain'],
 ['Germany'],
 [],
 [],
 [],
 ['Britain'],
 [],
 [],
 ['Britai

In [9]:
df['country_entities'] = filtered_list
df.head(20)

Unnamed: 0,sentence,entities,country_entities
0,"( , From, Wikipedia, ,, the, free, encyclop...","[Wikipedia, The 20th century]",[]
1,"(The, World, Wars, sparked, tension, between, ...","[The World Wars, the Cold War, the Space Race,...",[]
2,"(These, advancements, have, played, a, signifi...","[the 21st century, today]",[]
3,"(Historic, events, in, the, 20th, century[edit...","[Historic, 20th, the 20th century]",[]
4,"(The, 1900s, saw, the, decade, herald, a, seri...","[The 1900s, the decade]",[]
5,"(From, 1914, to, 1918, ,, the, First, World, W...","[1914 to 1918, the First World War]",[]
6,"("", The, war, to, end, all, wars, "", :, World,...","[World War I, World War I Arrest, Sarajevo, Ar...",[]
7,"(The, war, and, by, extension, the, century, a...","[the century, Sarajevo, the Austro-Hungarian E...",[]
8,"(This, was, similar, to, how, the, 9/11, was, ...","[9/11, Bound, Slavic, Serbian, Russians, Serbs]",[]
9,"(Interwoven, alliances, ,, an, increasing, arm...","[Europe, The Allies, The Triple Entente, the B...","[France, Russia]"


In [10]:
df_countries_filtered = df[df['country_entities'].map(len) > 0]

In [11]:
df_countries_filtered

Unnamed: 0,sentence,entities,country_entities
9,"(Interwoven, alliances, ,, an, increasing, arm...","[Europe, The Allies, The Triple Entente, the B...","[France, Russia]"
10,"(Germany, ,, Austria, -, Hungary, ,, Bulgaria,...","[Germany, Austria, Hungary, Bulgaria, the Otto...","[Germany, Austria, Hungary, Bulgaria, Russia]"
11,"(The, Bolsheviks, negotiated, the, Treaty, of,...","[Bolsheviks, Germany, Russia]","[Germany, Russia]"
12,"(In, the, treaty, ,, Bolshevik, Russia, ceded,...","[Bolshevik Russia, Baltic, Germany, Kars Oblas...",[Germany]
13,"(It, also, recognized, the, independence, of, ...","[Germany, Allied, American, 1918.[6]",[Germany]
...,...,...,...
1248,"("", Stuck, in, Endless, Preliminaries, :, Viet...","[Stuck in Endless Preliminaries, Vietnam, the ...",[Vietnam]
1508,"("", Anti, -, American, Behavior, in, the, Midd...","[Anti-American Behavior, the Middle East, a Fi...",[Lebanon]
1514,"(The, Rise, of, China, and, India, :, A, New, ...","[India, New Asian]",[India]
1515,"(Singapore, :, World, Scientific, ., doi:10.11...","[Singapore, World Scientific]",[Singapore]


### 7. Create the relationship dataframe

In [12]:
# Defining relationships
relationships = [] 

for i in range(df_countries_filtered.index[-1]):
     end_i = min(i+5, df_countries_filtered.index[-1])
     country_list = sum((df_countries_filtered.loc[i: end_i].country_entities), [])

     # Remove duplicated characters that are next to each other
     country_unique = [country_list[i] for i in range(len(country_list))
                             if (i==0) or country_list[i] != country_list[i-1]]

     if len(country_unique) > 1:
        for idx, a in enumerate( country_unique[ :-1]):
          b = country_unique[idx + 1]
          relationships.append({"source": a, " target": b})

In [13]:
relationships[:50]

[{'source': 'France', ' target': 'Russia'},
 {'source': 'France', ' target': 'Russia'},
 {'source': 'Russia', ' target': 'Germany'},
 {'source': 'Germany', ' target': 'Austria'},
 {'source': 'Austria', ' target': 'Hungary'},
 {'source': 'Hungary', ' target': 'Bulgaria'},
 {'source': 'Bulgaria', ' target': 'Russia'},
 {'source': 'France', ' target': 'Russia'},
 {'source': 'Russia', ' target': 'Germany'},
 {'source': 'Germany', ' target': 'Austria'},
 {'source': 'Austria', ' target': 'Hungary'},
 {'source': 'Hungary', ' target': 'Bulgaria'},
 {'source': 'Bulgaria', ' target': 'Russia'},
 {'source': 'Russia', ' target': 'Germany'},
 {'source': 'Germany', ' target': 'Russia'},
 {'source': 'France', ' target': 'Russia'},
 {'source': 'Russia', ' target': 'Germany'},
 {'source': 'Germany', ' target': 'Austria'},
 {'source': 'Austria', ' target': 'Hungary'},
 {'source': 'Hungary', ' target': 'Bulgaria'},
 {'source': 'Bulgaria', ' target': 'Russia'},
 {'source': 'Russia', ' target': 'Germany'},

In [14]:
relationship_df = pd.DataFrame(relationships)

relationship_df

Unnamed: 0,source,target
0,France,Russia
1,France,Russia
2,Russia,Germany
3,Germany,Austria
4,Austria,Hungary
...,...,...
769,India,Singapore
770,India,Singapore
771,India,Singapore
772,India,Singapore


### 8. Save and export dataframe

In [15]:
relationship_df.to_csv('20th_century_relationship.csv', index=False)