## 1.6 Intro to NLP and Network Analysis

In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

In [2]:
# Download English module
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------- ----------------- 7.1/12.8 MB 43.7 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 44.6 MB/s eta 0:00:00
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
# Load spacy English module
NER = spacy.load("en_core_web_sm")

### Loading in the 20th century text file and countries dataframe

In [4]:
# Load the 20th_century text file

with open('20th_century_article.txt', 'r', errors='ignore') as file:
    data = file.read().replace('\n', '')

In [5]:
data



In [6]:
# Create path to prepare for loading countries dataframe
path = os.path.join(r'C:\Users\ariel\OneDrive\Desktop\School work\Data Visualizations with Python', 'countries.csv')

In [7]:
# Upload countries dataframe
df_countries = pd.read_csv(path)

### Cleaning countries dataframe

In [8]:
df_countries

Unnamed: 0,Country,Frequency
0,Albania,2
1,Algeria,1
2,American Samoa,0
3,Andorra,0
4,Angola,1
...,...,...
249,Vietnam\nY\nchange\n Yemen\nZ\nchange\n Zambia,0
250,Wales,0
251,Wallis and Futuna\nOther entities\nchange\n A...,0
252,West Papua\nIntegral parts of sovereign states...,0


In [9]:
# Remove unwanted characters
df_countries['Country'] = df_countries['Country'].str.replace(r'\n|change|Other entities|Disputed countries|Integral parts of sovereign states|See also', '', regex=True)

In [10]:
# Remove concatenated country names or unwanted parts
df_countries['Country'] = df_countries['Country'].str.replace(r'([A-Z])\s([A-Z])', r'\1 \2', regex=True)

In [11]:
# Replace all single letters (A-Z) surrounded by spaces with a space
df_countries['Country'] = df_countries['Country'].str.replace(r' [A-Z] ', ' ', regex=True)

In [12]:
# Clean up any unwanted extra spaces
df_countries['Country'] = df_countries['Country'].str.strip().replace(r'\s+', ' ', regex=True)

In [13]:
# Flatten the list of country names into separate rows
df_countries = df_countries.explode('Country')

In [24]:
# Identify rows where there are multiple countries listed
multiple_countries = df_expanded[df_countries['Country'].str.contains(',')]

In [25]:
print(multiple_countries)

54                                                 Congo
54                            Democratic Republic of the
55                                                 Congo
55                                       Republic of the
122                                                Korea
122                                                North
123                                                Korea
123                                                South
192                                         Saint Helena
192    Ascension and Tristan da Cunha South Georgia a...
232    TransnistriaPlaces sometimes considered countries
232    but not actual countries according to internat...
249                                              Vietnam
249                                                Yemen
249                                               Zambia
dtype: object


In [33]:
# Handle split country names
df_countries['Country'] = df_countries['Country'].replace({
    'Democratic Republic of the': 'Democratic Republic of the Congo',
    'Republic of the': 'Republic of the Congo',
    'North': 'North Korea',
    'South': 'South Korea'
})

In [34]:
# Identify rows where countries are listed together by commas
multi_countries = df_countries[df_countries['Country'].str.contains(',')]

In [35]:
df_expanded = df_countries['Country'].str.split(', ', expand=True).stack().reset_index(level=1, drop=True)

In [36]:
# Create a new DataFrame with expanded countries
df_expanded_df = pd.DataFrame({
    'Country': df_expanded,
    'Frequency': df_countries.loc[df_countries.index.repeat(df_expanded.groupby(level=0).size())]['Frequency'].values})

In [37]:
# Clean up rows with extra descriptive text
df_expanded_df = df_expanded_df[~df_expanded_df['Country'].str.contains('Places sometimes considered countries', na=False)]

In [38]:
df_cleaned = df_expanded_df

In [39]:
df_cleaned

Unnamed: 0,Country,Frequency
0,Albania,2
1,Algeria,1
2,American Samoa,0
3,Andorra,0
4,Angola,1
...,...,...
249,Zambia,0
250,Wales,0
251,Wallis and Futuna Antarctica,0
252,West Papua,0


### Evaluating 20th century text file for wrangling and cleaning 

In [42]:
# Identify and list special characters 
special_characters = re.findall(r'[^a-zA-Z\s,\.]', data)
special_characters = set(special_characters) 

In [43]:
# Compare country names with countries dataframe
countries_in_article = []
for country in df_cleaned['Country']:
    if country.lower() in data.lower():
        countries_in_article.append(country)

In [44]:
# Print special characters and countries found in the article
print("Special Characters in the Article: ", special_characters)
print("Countries Found in the Article: ", countries_in_article)

Special Characters in the Article:  {'2', '!', '"', '8', '4', '1', '9', '-', '6', '5', "'", '?', '7', '3', 'Â', '0'}
Countries Found in the Article:  ['Albania', 'Algeria', 'Angola', 'Australia', 'Austria', 'Bangladesh', 'Belarus', 'Belgium', 'Bulgaria', 'Cambodia', 'Canada', 'China', 'Congo', 'Congo', 'Cuba', 'Egypt', 'Estonia', 'Finland', 'Germany', 'Ghana', 'Greece', 'Guam', 'Guinea', 'Guinea-Bissau', 'Hong Kong', 'India', 'Iran', 'Iraq', 'Islands', 'Israel', 'Italy', 'Japan', 'Kenya', 'Korea', 'North', 'Korea', 'South', 'Latvia', 'Lebanon', 'Libya', 'Lithuania', 'Mexico', 'Moldova', 'Mongolia', 'Morocco', 'Mozambique', 'Netherlands', 'Niger', 'Nigeria', 'Panama', 'Papua New Guinea', 'Philippines', 'Poland', 'Russia', 'Seychelles', 'Singapore', 'Slovakia', 'Solomon Islands', 'South Africa', 'Spain', 'Sudan', 'Sweden', 'Thailand', 'Ukraine', 'United Kingdom', 'United States', 'Vietnam']


In [48]:
# List of countries from the article
countries_in_article = ['Albania', 'Algeria', 'Angola', 'Australia', 'Austria', 'Bangladesh', 
                        'Belarus', 'Belgium', 'Bulgaria', 'Cambodia', 'Canada', 'China', 'Congo',
                        'Congo', 'Cuba', 'Egypt', 'Estonia', 'Finland', 'Germany', 'Ghana', 'Greece',
                        'Guam', 'Guinea', 'Guinea-Bissau', 'Hong Kong', 'India', 'Iran', 'Iraq', 
                        'Islands', 'Israel', 'Italy', 'Japan', 'Kenya', 'Korea', 'North', 'Korea', 
                        'South', 'Latvia', 'Lebanon', 'Libya', 'Lithuania', 'Mexico', 'Moldova', 
                        'Mongolia', 'Morocco', 'Mozambique', 'Netherlands', 'Niger', 'Nigeria', 
                        'Panama', 'Papua New Guinea', 'Philippines', 'Poland', 'Russia', 'Seychelles', 
                        'Singapore', 'Slovakia', 'Solomon Islands', 'South Africa', 'Spain', 'Sudan', 
                        'Sweden', 'Thailand', 'Ukraine', 'United Kingdom', 'United States', 'Vietnam']

In [45]:
# Filter rows where 'Frequency' is greater than 0 in the countries dataframe
df_filtered = df_cleaned[df_cleaned['Frequency'] > 0]

In [47]:
print(df_filtered)

              Country  Frequency
0             Albania          2
1             Algeria          1
4              Angola          1
10          Australia          2
11            Austria          5
17         Bangladesh          2
19            Belarus          1
20            Belgium          2
34           Bulgaria          1
37           Cambodia          2
39             Canada          2
47              China         10
60               Cuba          3
69              Egypt          2
74            Estonia          2
79            Finland          4
88            Germany         42
89              Ghana          1
92             Greece          4
96               Guam          3
98             Guinea          4
99      Guinea-Bissau          1
103         Hong Kong          2
105             India          9
107              Iran          4
108              Iraq          1
110           Islands          8
112            Israel          4
113             Italy         14
115       

In [71]:
# Identify countries that are in the article but not in df_cleaned
countries_in_dataframe = df_cleaned['Country'].tolist()
countries_to_add = [country for country in countries_in_article if country not in countries_in_dataframe]

In [72]:
# Create a new dataframe with the countries to add, initializing Frequency to 0
df_to_add = pd.DataFrame({
    'Country': countries_to_add,
    'Frequency': [0] * len(countries_to_add)  
})

In [73]:
# Append the new countries to the df_cleaned dataframe
df_cleaned_updated = pd.concat([df_cleaned, df_to_add], ignore_index=True)

In [74]:
# Clean up the country names 
df_cleaned_updated['Country'] = df_cleaned_updated['Country'].str.strip().str.title()

In [75]:
# Check if any countries in the updated dataframe need cleaning (e.g., "North Korea" vs "Korea, North")
df_cleaned_updated['Country'] = df_cleaned_updated['Country'].replace({
    'Korea, North': 'North Korea',
    'Korea, South': 'South Korea',
    'Islands': 'Cayman Islands',  
    'Hong Kong': 'China' 
})

In [77]:
# Filter rows where 'Frequency' is greater than 0 in the countries dataframe
df_filtered2 = df_cleaned_updated[df_cleaned_updated['Frequency'] > 0]

In [78]:
df_filtered2

Unnamed: 0,Country,Frequency
0,Albania,2.0
1,Algeria,1.0
4,Angola,1.0
10,Australia,2.0
11,Austria,5.0
17,Bangladesh,2.0
19,Belarus,1.0
20,Belgium,2.0
34,Bulgaria,1.0
37,Cambodia,2.0


In [80]:
df_filtered2.loc[:, 'Frequency'] = df_filtered2['Frequency'].astype(int)

In [81]:
df_filtered2

Unnamed: 0,Country,Frequency
0,Albania,2
1,Algeria,1
4,Angola,1
10,Australia,2
11,Austria,5
17,Bangladesh,2
19,Belarus,1
20,Belgium,2
34,Bulgaria,1
37,Cambodia,2


There seems to be some vague language in some sense when referencing countries in certain context in the 20th century article. This needed to be wrangled to match the format of the countries dataframe for every instance that the countries are mentioned, so that I can get an accurate count of the times each country is mentioned in the text file.

In [170]:
# Export cleaned country list
directory = r'C:\Users\ariel\OneDrive\Desktop\School work\Data Visualizations with Python'

In [171]:
file_name = 'cleaned_countries_list.csv'

In [172]:
# Export cleaned country list 
file_path = os.path.join(directory, file_name)

In [173]:
# Save the DataFrame df_filtered2 to the CSV file
df_filtered2.to_csv(file_path, index=False)

### Add cleaned country names to a new text file and execute additional cleaning

In [94]:
# cleaned list of country names
cleaned_country_names = [
    'Albania', 'Algeria', 'Angola', 'Australia', 'Austria', 'Bangladesh', 
    'Belarus', 'Belgium', 'Bulgaria', 'Cambodia', 'Canada', 'China', 
    'Congo', 'Cuba', 'Egypt', 'Estonia', 'Finland', 'Germany', 'Ghana', 
    'Greece', 'Guam', 'Guinea', 'Guinea-Bissau', 'Hong Kong', 'India', 
    'Iran', 'Iraq', 'Islands', 'Israel', 'Italy', 'Japan', 'Kenya', 
    'North Korea', 'South Korea', 'Latvia', 'Lebanon', 'Libya', 'Lithuania', 
    'Mexico', 'Moldova', 'Mongolia', 'Morocco', 'Mozambique', 'Netherlands', 
    'Niger', 'Nigeria', 'Panama', 'Papua New Guinea', 'Philippines', 
    'Poland', 'Russia', 'Seychelles', 'Singapore', 'Slovakia', 'Solomon Islands', 
    'South Africa', 'Spain', 'Sudan', 'Sweden', 'Thailand', 'Ukraine', 
    'United Kingdom', 'United States', 'Vietnam'
]

In [96]:
replacements = {
    'Islands': 'Cayman Islands',  
    'Hong Kong': 'China' }

In [99]:
# Loop through the country names and replace them in the text
for country in cleaned_country_names:
    
    data = re.sub(r'\b' + re.escape(country) + r'\b', country, data)

In [100]:
# Save the cleaned text back to a file
with open('20thcentury_wrangled.txt', 'w', encoding='utf-8') as file:
    file.write(data)

In [101]:
# Open the new text file
with open('20thcentury_wrangled.txt', 'r', errors='ignore') as file:
    text = file.read().replace('\n', '')

In [118]:
# Remove content before the article starts (e.g., navigation)
cleaned_text = re.sub(r"Navigation.*?References", "", text, flags=re.DOTALL)

In [119]:
# Remove unwanted metadata, navigation, and sections that start with 'Main menu'
cleaned_text = re.sub(r"Main menu.*?References", "", text, flags=re.DOTALL)

In [120]:
# Removing tabs and extra spaces
cleaned_text = re.sub(r'\t+', ' ', cleaned_text)
cleaned_text = re.sub(r'\n+', ' ', cleaned_text)

In [121]:
# Remove sections like "View History" and other irrelevant text
cleaned_text = re.sub(r'(\[.*?\])', '', cleaned_text)

In [122]:
# Remove multiple spaces
cleaned_text = re.sub(r'\s+', ' ', cleaned_text)

In [123]:
# Remove specific phrases like "Jump to content" if needed
cleaned_text = re.sub(r"Jump to content.*?Pages for logged out editors", "", cleaned_text, flags=re.DOTALL)

In [124]:
# Clean up the structure by fixing punctuation and spaces
cleaned_text = re.sub(r'([a-zA-Z0-9])([.,;!?()])', r'\1 \2', cleaned_text) 

In [125]:
# Remove any HTML tags
cleaned_text = re.sub(r'<.*?>', '', cleaned_text)

In [126]:
cleaned_text



In [130]:
# Save the cleaned text to a new file
with open('cleaned_article.txt', 'w') as file:
    file.write(cleaned_text)

In [131]:
# Open the new text file
with open('cleaned_article.txt', 'r', errors='ignore') as file:
    text = file.read().replace('\n', '')

In [132]:
text



### Create book from text file and visualize identified entities using NER algorithm

In [133]:
book = NER(text)

In [134]:
# Visualize identified entities

displacy.render(book[273:20000], style = "ent", jupyter = True)

### Splitting sentence entities

In [135]:
df_sentences = [] 

# Loop through sentences, get entity list for each sentence
for sent in book.sents:
    entity_list = [ent.text for ent in sent.ents]
    df_sentences.append({"sentence": sent, "entities": entity_list})
    
df_sentences = pd.DataFrame(df_sentences)

In [136]:
df_sentences.head(10)

Unnamed: 0,sentence,entities
0,"(Key, events, of, the, 20th, century, -, Wikip...","[the 20th century - WikipediaJump, 20th]"
1,"(What, links, hereRelated, changesUpload, file...",[]
2,"(informationCite, this, pageGet, shortened, UR...","[URLDownload, Printexport Download, PDFPrintab..."
3,"(the, free, encyclopediaThe, 20th, century, ch...",[encyclopediaThe 20th century]
4,"(The, World, Wars, sparked, tension, between, ...","[The World Wars, the Cold War, the Space Race,..."
5,"(These, advancements, have, played, a, signifi...","[the 21st century, today, the 20th, Edwardian,..."
6,"(The, 1900s, saw, the, decade, herald, a, seri...","[The 1900s, the decade]"
7,"(1914, saw, the, completion, of, the, Panama, ...","[1914, the Panama Canal, 1914 to 1918, the Fir..."
8,"(19141918editMain, article, World, War, IArres...","[World War IArrest, Sarajevo, Archduke Franz F..."
9,"(The, war, was, precipitated, by, the, Assassi...","[Sarajevo, the Austro-Hungarian Empire's, Erzh..."


### Filtering entities to show only countries from the dataframe

In [150]:
df_filtered2['Country']

0               Albania
1               Algeria
4                Angola
10            Australia
11              Austria
17           Bangladesh
19              Belarus
20              Belgium
34             Bulgaria
37             Cambodia
39               Canada
47                China
62                 Cuba
71                Egypt
76              Estonia
81              Finland
90              Germany
91                Ghana
94               Greece
98                 Guam
100              Guinea
101       Guinea-Bissau
105               China
107               India
109                Iran
110                Iraq
112      Cayman Islands
114              Israel
115               Italy
117               Japan
122               Kenya
132              Latvia
133             Lebanon
137               Libya
140           Lithuania
155              Mexico
159            Mongolia
162             Morocco
163          Mozambique
167         Netherlands
181              Panama
182    Papua New

In [151]:
ent_list = df_cleaned_updated['Country'].tolist()

In [152]:
# Function to filter out entities not of interest

def filter_entity(ent_list, df_filtered2):
    return [ent for ent in ent_list 
            if ent in list(df_filtered2['Country'])]

In [153]:
# Check
filtered_countries = filter_entity(ent_list, df_filtered2)

In [154]:
filtered_countries

['Albania',
 'Algeria',
 'Angola',
 'Australia',
 'Austria',
 'Bangladesh',
 'Belarus',
 'Belgium',
 'Bulgaria',
 'Cambodia',
 'Canada',
 'Cayman Islands',
 'China',
 'Cuba',
 'Egypt',
 'Estonia',
 'Finland',
 'Germany',
 'Ghana',
 'Greece',
 'Guam',
 'Guinea',
 'Guinea-Bissau',
 'China',
 'India',
 'Iran',
 'Iraq',
 'Cayman Islands',
 'Israel',
 'Italy',
 'Japan',
 'Kenya',
 'Latvia',
 'Lebanon',
 'Libya',
 'Lithuania',
 'Mexico',
 'Mongolia',
 'Morocco',
 'Mozambique',
 'Netherlands',
 'Panama',
 'Papua New Guinea',
 'Philippines',
 'Poland',
 'Russia',
 'Seychelles',
 'Singapore',
 'Solomon Islands',
 'South Africa',
 'Spain',
 'Sweden',
 'Thailand',
 'Ukraine',
 'United Kingdom',
 'United States']

In [155]:
df_sentences['country_entities'] = df_sentences['entities'].apply(lambda x: filter_entity(x, df_filtered2))

In [156]:
df_sentences['country_entities'].head(20)

0                    []
1                    []
2                    []
3                    []
4                    []
5                    []
6                    []
7                    []
8                    []
9                    []
10    [Austria, Russia]
11    [Germany, Russia]
12            [Germany]
13            [Ukraine]
14            [Germany]
15                   []
16                   []
17                   []
18                   []
19                   []
Name: country_entities, dtype: object

In [157]:
# Filter out sentences that don't have any character entities

df_sentences_filtered = df_sentences[df_sentences['country_entities'].map(len) > 0]

In [158]:
df_sentences_filtered.tail(10)

Unnamed: 0,sentence,entities,country_entities
806,"("", The, United, States, declares, war, on, Ja...","[The United States, Japan]",[Japan]
825,"("", From, the, Jaws, of, Death, MacArthur, 's,...","[the Jaws of Death MacArthur's, Escape, Philip...",[Philippines]
988,"(Gutman, ,, Israel, 1990, .)","[Gutman, Israel, 1990]",[Israel]
1076,"("", Israel, 's, Nuclear, Weapons, 5, Things, Y...","[Israel, Nuclear Weapons 5 Things You Need]",[Israel]
1086,"("", Major, milestones, of, Iran, 's, nuclear, ...","[Iran, Iran, Al Jazeera]","[Iran, Iran]"
1134,"("", The, forgotten, violence, that, helped, In...",[India],[India]
1146,"("", The, Philippines, ,, 18981946, US, House, ...","[Philippines, 18981946, US House of Representa...",[Philippines]
1216,"(The, Moldovans, Romania, ,, Russia, ,, and, t...","[Moldovans, Romania, Russia, the Politics of C...",[Russia]
1547,"("", Anti, -, American, Behavior, in, the, Midd...","[Anti-American Behavior, the Middle East, a Fi...",[Lebanon]
1553,"(The, Rise, of, China, and, India, A, New, Asi...","[India, New Asian]",[India]


### Create relationships dataframe

In [159]:
# Defining relationships 

# window size = 5 : this defines how many sentences will be looked at simultaneously 
relationships = [] # create an empty list

for i in range(df_sentences_filtered.index[-1]):
    end_i = min(i+5, df_sentences_filtered.index[-1])
    country_list = sum((df_sentences_filtered.loc[i: end_i].country_entities), [])
    
    # Remove duplicated characters that are next to each other
    country_unique = [country_list[i] for i in range(len(country_list)) 
                   if (i==0) or country_list[i] != country_list[i-1]]
    
    if len(country_unique) > 1:
        for idx, a in enumerate(country_unique[:-1]):
            b = country_unique[idx + 1]
            relationships.append({"source": a, "target": b})

In [160]:
relationship_df = pd.DataFrame(relationships)

In [161]:
relationship_df

Unnamed: 0,source,target
0,Austria,Russia
1,Austria,Russia
2,Russia,Germany
3,Germany,Russia
4,Austria,Russia
...,...,...
555,Italy,Japan
556,Germany,Italy
557,Italy,Japan
558,Germany,Italy


In [162]:
# Sort the cases with a->b and b->a

relationship_df = pd.DataFrame(np.sort(relationship_df.values, axis = 1), columns = relationship_df.columns)
relationship_df.head(5)

Unnamed: 0,source,target
0,Austria,Russia
1,Austria,Russia
2,Germany,Russia
3,Germany,Russia
4,Austria,Russia


In [163]:
# Summarize the interactions

relationship_df["value"] = 1
relationship_df = relationship_df.groupby(["source","target"], sort=False, as_index=False).sum()

In [164]:
relationship_df.head(10)

Unnamed: 0,source,target,value
0,Austria,Russia,6
1,Germany,Russia,21
2,Germany,Ukraine,10
3,Germany,Italy,33
4,Austria,Germany,11
5,Germany,Spain,1
6,Poland,Spain,2
7,Germany,Poland,43
8,Estonia,Germany,5
9,Estonia,Latvia,12


This is very insightful to know the relationships in the text between the countries. This can give insights to how often enemies interact with each other throughout the text, or how frequently alliances work together. This relationship dataframe gives key clues to what relationships I should look into in more detail especially based on the value number. Germany and Poland have a high value number, so this is a relationship I will definitely want to look into further.

In [176]:
# Export relationships dataframe
file_path = os.path.join(r'C:\Users\ariel\OneDrive\Desktop\School work\Data Visualizations with Python', 'relationship_df.csv')

In [177]:
# Save the DataFrame to a CSV file using the path
relationship_df.to_csv(file_path, index=False)

In [174]:
#utf-8 encode cleaned text article file
with open('cleaned_article_utf8.txt', 'w', encoding='utf-8') as file:
    file.write(text)