# 1.6 ANSWERS 20th_CENTURY

In [1]:
import os
import re
import sys

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import scipy
import spacy
from spacy import displacy

In [2]:
# Download English module

!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ------------------------ --------------- 7.9/12.8 MB 40.4 MB/s eta 0:00:01
     --------------------------------------  12.6/12.8 MB 34.2 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 30.8 MB/s eta 0:00:00
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
# Instantiate the small English spaCy pipeline; it is called on the text later

model_name = "en_core_web_sm"
NER = spacy.load(model_name)

# Load the twentieth-century text file

In [4]:

# Read the whole article into memory as one UTF-8 decoded string
with open('20th_century_Wiki.txt', mode='r', encoding='utf-8') as source_file:
    text = source_file.read()

# 4. Evaluate and Wrangle the Text Data

In [5]:
# Step 1: Remove Special Characters
# Every character that is not an ASCII letter, digit, whitespace or one of
# . , - ( ) ' is replaced by a space.
# Apostrophes are deliberately kept (typographic ones are normalised to "'"
# first): stripping them turns "People's Republic of China" into
# "People s Republic of China", which no longer matches the apostrophe-bearing
# keys of country_name_mapping.
cleaned_text = re.sub(r"[^a-zA-Z0-9\s\.\,\-\(\)']", ' ', text.replace('\u2019', "'"))


In [6]:
# Historical / alternative country names -> the modern name used in the analysis.
#
# ORDER MATTERS: the dict is iterated in insertion order when the names are
# replaced in the text, so a name that contains another key must be listed
# BEFORE that shorter key (e.g. "Democratic Republic of Vietnam" before
# "Republic of Vietnam"); otherwise the shorter key rewrites part of the
# longer name first and the longer key can never match.
#
# NOTE: "Korea" also occurs inside "North Korea" and "South Korea"; whatever
# applies this mapping has to guard those compound names against rewriting.
country_name_mapping = {
    "People's Republic of Bangladesh": "Bangladesh",
    "French Indochina": "Vietnam",
    "Democratic Republic of Vietnam": "Vietnam",
    "Republic of Vietnam": "Vietnam",
    "Soviet Union": "Russia",
    "USSR": "Russia",
    "Union of Soviet Socialist Republics": "Russia",
    "German Empire": "Germany",
    "East Germany": "Germany",
    "West Germany": "Germany",
    "People's Republic of China": "China",
    "Republic of China": "Taiwan",
    "Ceylon": "Sri Lanka",
    "Rhodesia": "Zimbabwe",
    "Portuguese Guinea": "Guinea-Bissau",
    "South-West Africa": "Namibia",
    "British Honduras": "Belize",
    "Persia": "Iran",
    "Burma": "Myanmar",
    "Czechoslovakia": "Czech Republic",
    "Kampuchea": "Cambodia",
    "Zaire": "Democratic Republic of the Congo",
    "Yugoslavia": "Serbia",
    "Upper Volta": "Burkina Faso",
    "French Somaliland": "Djibouti",
    "Ivory Coast": "Côte d'Ivoire",
    "Dutch East Indies": "Indonesia",
    "Formosa": "Taiwan",
    "Gold Coast": "Ghana",
    "New Hebrides": "Vanuatu",
    "British Palestine": "Israel",
    "Trucial States": "United Arab Emirates",
    "Dahomey": "Benin",
    "Moldavian SSR": "Moldova",
    "Byelorussian SSR": "Belarus",
    "Ukrainian SSR": "Ukraine",
    "Latvian SSR": "Latvia",
    "Estonian SSR": "Estonia",
    "Lithuanian SSR": "Lithuania",
    "Karelo-Finnish SSR": "Russia",
    "Manchukuo": "China",
    "Korea": "South Korea",
    "South Yemen": "Yemen",
    "North Yemen": "Yemen",
}

In [7]:
# Replace historical country names with their modern equivalents in ONE pass.
#
# A single alternation is used instead of one re.sub per key because:
#   * the regex engine takes the leftmost match, so a long name such as
#     "Democratic Republic of Vietnam" is consumed whole rather than having
#     its tail "Republic of Vietnam" rewritten first, regardless of the
#     order of the keys in country_name_mapping;
#   * names that are already canonical are matched too and mapped to
#     themselves, so "South Korea" is not turned into "South South Korea"
#     by the "Korea" key, and re-running this cell leaves the text unchanged.
# "North Korea" is protected explicitly: it contains the key "Korea" but is a
# different country and must not become "North South Korea".
protected_names = set(country_name_mapping.values()) | {"North Korea"}
name_lookup = {name: name for name in protected_names}
name_lookup.update(country_name_mapping)

# Longest alternatives first, so that of two names starting at the same
# position the more specific one wins.
name_pattern = re.compile(
    r'\b(?:'
    + '|'.join(re.escape(name) for name in sorted(name_lookup, key=len, reverse=True))
    + r')\b'
)
cleaned_text = name_pattern.sub(lambda match: name_lookup[match.group(0)], cleaned_text)

In [8]:
# Persist the cleaned article text to disk as UTF-8
with open('cleaned_20th_century_Wiki.txt', mode='w', encoding='utf-8') as cleaned_file:
    cleaned_file.write(cleaned_text)

# 5. Create a Named Entity Recognition (NER) Object

In [10]:
# Create NER object: call the loaded spaCy pipeline on the cleaned text.
# The resulting `doc` is what the later cells read sentences (doc.sents) and
# per-sentence entities (sent.ents) from.
doc = NER(cleaned_text)

# 6. Split Sentence Entities from the NER Object

In [11]:
# Build one record per sentence: the sentence text plus a list of
# (entity text, entity label) tuples for the entities found inside it.
sentences = [
    {
        "sentence": span.text,
        "entities": [(entity.text, entity.label_) for entity in span.ents],
    }
    for span in doc.sents
]

# 7. Filter the Entities for Country Names

In [12]:
# Country names used to filter the recognised entities.
# Entities are matched by exact string equality, so every entry must be the
# plain name as it would appear in running text: no parenthetical notes
# ("Myanmar (formerly Burma)") and no inverted forms ("Korea, North").
# The list also contains every modern name that country_name_mapping maps to
# (e.g. "Myanmar", "South Korea", "Côte d'Ivoire",
# "Democratic Republic of the Congo"), otherwise mapped names would be dropped.
countries = [
    "Afghanistan", "Albania", "Algeria", "Andorra", "Angola", "Antigua and Barbuda", "Argentina", "Armenia",
    "Australia", "Austria", "Azerbaijan", "Bahamas", "Bahrain", "Bangladesh", "Barbados", "Belarus", "Belgium",
    "Belize", "Benin", "Bhutan", "Bolivia", "Bosnia and Herzegovina", "Botswana", "Brazil", "Brunei",
    "Bulgaria", "Burkina Faso", "Burundi", "Cabo Verde", "Cambodia", "Cameroon", "Canada", "Central African Republic",
    "Chad", "Chile", "China", "Colombia", "Comoros", "Congo", "Costa Rica", "Côte d'Ivoire",
    "Croatia", "Cuba", "Cyprus", "Czech Republic", "Democratic Republic of the Congo", "Denmark", "Djibouti",
    "Dominica", "Dominican Republic",
    "Ecuador", "Egypt", "El Salvador", "Equatorial Guinea", "Eritrea", "Estonia", "Eswatini",
    "Ethiopia", "Fiji", "Finland", "France", "Gabon", "Gambia", "Georgia", "Germany", "Ghana", "Greece",
    "Grenada", "Guatemala", "Guinea", "Guinea-Bissau", "Guyana", "Haiti", "Honduras", "Hungary", "Iceland",
    "India", "Indonesia", "Iran", "Iraq", "Ireland", "Israel", "Italy", "Jamaica", "Japan", "Jordan",
    "Kazakhstan", "Kenya", "Kiribati", "Kuwait", "Kyrgyzstan", "Laos",
    "Latvia", "Lebanon", "Lesotho", "Liberia", "Libya", "Liechtenstein", "Lithuania", "Luxembourg", "Madagascar",
    "Malawi", "Malaysia", "Maldives", "Mali", "Malta", "Marshall Islands", "Mauritania", "Mauritius", "Mexico",
    "Micronesia", "Moldova", "Monaco", "Mongolia", "Montenegro", "Morocco", "Mozambique", "Myanmar",
    "Namibia", "Nauru", "Nepal", "Netherlands", "New Zealand", "Nicaragua", "Niger", "Nigeria", "North Korea",
    "North Macedonia",
    "Norway", "Oman", "Pakistan", "Palau", "Panama", "Papua New Guinea", "Paraguay", "Peru", "Philippines",
    "Poland", "Portugal", "Qatar", "Romania", "Russia", "Rwanda", "Saint Kitts and Nevis", "Saint Lucia",
    "Saint Vincent and the Grenadines", "Samoa", "San Marino", "Sao Tome and Principe", "Saudi Arabia",
    "Senegal", "Serbia", "Seychelles", "Sierra Leone", "Singapore", "Slovakia", "Slovenia", "Solomon Islands",
    "Somalia", "South Africa", "South Korea", "South Sudan", "Spain", "Sri Lanka", "Sudan", "Suriname", "Sweden",
    "Switzerland", "Syria", "Taiwan", "Tajikistan", "Tanzania", "Thailand", "Timor-Leste", "Togo", "Tonga",
    "Trinidad and Tobago", "Tunisia", "Turkey", "Turkmenistan", "Tuvalu", "Uganda", "Ukraine",
    "United Arab Emirates", "United Kingdom", "United States", "Uruguay", "Uzbekistan", "Vanuatu",
    "Vatican City", "Venezuela", "Vietnam", "Yemen", "Zambia", "Zimbabwe"
]


In [13]:
# Filter entities for countries: keep only the sentences that mention at
# least one name from `countries`, and within them only those entities.
country_names = set(countries)  # set membership instead of a list scan per entity

filtered_sentences = []
for sent in sentences:
    country_entities = []
    for ent_text, ent_label in sent["entities"]:
        # Collapse runs of whitespace (double spaces, line breaks) and trim the
        # ends so that e.g. "United\nStates" still equals "United States".
        normalized_name = " ".join(ent_text.split())
        if normalized_name in country_names:
            country_entities.append((normalized_name, ent_label))
    if country_entities:
        filtered_sentences.append({"sentence": sent["sentence"], "entities": country_entities})

# 8. Create the Relationships DataFrame

In [14]:
# Create relationships DataFrame: one row per pair of DIFFERENT countries
# that occur together in a sentence, with the sentence kept as context.
relationship_columns = ["country_1", "country_2", "context"]

relationships = []
for sent in filtered_sentences:
    # De-duplicate while preserving first-appearance order: a country that is
    # mentioned twice in a sentence must not be paired with itself, nor
    # produce the same pair more than once for that sentence.
    entities = list(dict.fromkeys(ent[0] for ent in sent["entities"]))
    for i, first_country in enumerate(entities):
        for second_country in entities[i + 1:]:
            relationships.append(
                {"country_1": first_country, "country_2": second_country, "context": sent["sentence"]}
            )

# Explicit columns keep the expected schema even when no pair was found
# (an empty list alone would give a DataFrame with no columns at all).
df_relationships = pd.DataFrame(relationships, columns=relationship_columns)

# 9. Save and Export Your DataFrame

In [15]:
# Write the relationships table to CSV, leaving out the pandas row index
output_csv = 'country_relationships.csv'
df_relationships.to_csv(output_csv, index=False)