# Exercise 1.6 — Intro to NLP and Network Analysis
In this exercise, we’ll use Named Entity Recognition (NER) to extract country mentions 
from the *Key Events of the 20th Century* Wikipedia text, and create a relationships 
dataframe showing which countries appear together. This will prepare the data for 
network visualization in Exercise 1.7.


In [None]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import re

# Load SpaCy English model
nlp = spacy.load("en_core_web_sm")


In [1]:
# Load the 20th-century text
with open('../Data/20th_century.txt', 'r', encoding='utf-8', errors='ignore') as file:
    data = file.read().replace('\n', ' ')

print("Text length:", len(data))


Text length: 113010


In [2]:
# Quick inspection for unwanted symbols
print(data[:1000])


    Key events of the 20th century - Wikipedia                           Jump to content        Main menu      Main menu move to sidebar hide    		Navigation 	   Main pageContentsCurrent eventsRandom articleAbout WikipediaContact us      		Contribute 	   HelpLearn to editCommunity portalRecent changesUpload fileSpecial pages                    Search            Search                       Appearance                 Donate  Create account  Log in         Personal tools      Donate Create account Log in      		Pages for logged out editors learn more    ContributionsTalk                             Contents move to sidebar hide     (Top)      1 Historic events in the 20th century     Toggle Historic events in the 20th century subsection      1.1 World at the beginning of the century       1.1.1 "The war to end all wars": World War I (1914–1918)           1.2 Spanish flu       1.2.1 Russian Revolution and communism           1.3 Between the wars       1.3.1 Economic depression         1.3

In [3]:
import re

# Make a working copy
clean_text = data

# Remove menu/navigation words
clean_text = re.sub(r'(Jump to content|Main menu|Navigation|Create account|Log in|Donate|Search|Personal tools)', ' ', clean_text)

# Remove bracketed references like [1], [2], etc.
clean_text = re.sub(r'\[.*?\]', ' ', clean_text)

# Remove non-letter characters (numbers, punctuation, symbols)
clean_text = re.sub(r'[^A-Za-z\s]', ' ', clean_text)

# Collapse multiple spaces
clean_text = re.sub(r'\s+', ' ', clean_text).strip()

print(clean_text[:500])


Key events of the th century Wikipedia move to sidebar hide Main pageContentsCurrent eventsRandom articleAbout WikipediaContact us Contribute HelpLearn to editCommunity portalRecent changesUpload fileSpecial pages Appearance Pages for logged out editors learn more ContributionsTalk Contents move to sidebar hide Top Historic events in the th century Toggle Historic events in the th century subsection World at the beginning of the century The war to end all wars World War I Spanish flu Russian Rev


In [4]:
doc = nlp(data)


NameError: name 'nlp' is not defined

In [5]:
import spacy

# Load the English language model
nlp = spacy.load("en_core_web_sm")


In [6]:
doc = nlp(clean_text)


In [7]:
# Check a few entities detected
for ent in list(doc.ents)[:15]:
    print(ent.text, "-", ent.label_)


Wikipedia - ORG
Appearance Pages - ORG
ContributionsTalk Contents - PRODUCT
Top Historic - PERSON
Toggle Historic - PERSON
the th century - DATE
the beginning of the century - DATE
World War I - EVENT
Spanish - NORP
Russian Revolution - FAC
Global war World War II - WORK_OF_ART
Operation Overlord Final - FAC
the Pacific Background Japanese Expansion - ORG
The Holocaust The Nuclear Age - EVENT
The Cold War War - EVENT


In [8]:
import re

# Start from the raw scraped text
clean_text = data

# Remove HTML navigation/menu words and interface text
patterns_to_remove = [
    r'Jump to content', r'Main menu', r'Navigation', r'Create account',
    r'Log in', r'Donate', r'Search', r'Personal tools', r'Appearance',
    r'WikipediaContact', r'help', r'Contribute', r'Upload file', r'Special pages'
]
for p in patterns_to_remove:
    clean_text = re.sub(p, ' ', clean_text, flags=re.IGNORECASE)

# Remove any [bracketed references], numbers, and symbols
clean_text = re.sub(r'\[.*?\]', ' ', clean_text)
clean_text = re.sub(r'[^A-Za-z\s]', ' ', clean_text)

# Remove repeated whitespace
clean_text = re.sub(r'\s+', ' ', clean_text).strip()

print(clean_text[:500])


Key events of the th century Wikipedia move to sidebar hide Main pageContentsCurrent eventsRandom articleAbout us Learn to editCommunity portalRecent changes Pages for logged out editors learn more ContributionsTalk Contents move to sidebar hide Top Historic events in the th century Toggle Historic events in the th century subsection World at the beginning of the century The war to end all wars World War I Spanish flu Russian Revolution and communism Between the wars Economic depression The rise


In [9]:
import re

clean_text = data

# Remove common Wikipedia interface words and navigation clutter
patterns_to_remove = [
    r'Jump to content', r'Main menu', r'Navigation', r'Create account', r'Log in', r'Donate',
    r'Search', r'Personal tools', r'Appearance', r'Wikipedia', r'Contact us', r'Learn to edit',
    r'Community portal', r'Recent changes', r'Upload file', r'Special pages', r'logged out editors',
    r'move to sidebar hide', r'Contents', r'Random article', r'About us', r'Help', r'Contribute',
    r'Pages for', r'learn more', r'Top', r'Toggle', r'subsection'
]

for p in patterns_to_remove:
    clean_text = re.sub(p, ' ', clean_text, flags=re.IGNORECASE)

# Remove bracketed references like [1], [2], etc.
clean_text = re.sub(r'\[.*?\]', ' ', clean_text)

# Keep only letters and spaces
clean_text = re.sub(r'[^A-Za-z\s]', ' ', clean_text)

# Replace multiple spaces with a single space
clean_text = re.sub(r'\s+', ' ', clean_text).strip()

# Preview cleaned output
print(clean_text[:600])


Key events of the th century Main page Current events About ContributionsTalk Historic events in the th century Historic events in the th century World at the beginning of the century The war to end all wars World War I Spanish flu Russian Revolution and communism Between the wars Economic depression The rise of dictatorship Global war World War II The war in Europe Blitzkrieg Operation Barbarossa Turning tides Operation Overlord Final days The war in the Pacific Background Japanese Expansion Allied offensive Final days The Holocaust The Nuclear Age begins The post war world The end of empires


In [10]:
doc = nlp(clean_text)

# Check the first few detected entities
for ent in list(doc.ents)[:20]:
    print(ent.text, "-", ent.label_)


the th century - DATE
Historic - PERSON
the th century - DATE
the beginning of the century - DATE
World War I - EVENT
Spanish - NORP
Russian Revolution - FAC
Global war World War II - WORK_OF_ART
Operation Overlord Final - FAC
the Pacific Background Japanese Expansion - ORG
The Holocaust The Nuclear Age - EVENT
The Cold War War - EVENT
the Cold War Information - EVENT
the end of the century - DATE
Key - NORP
the th century - DATE
English - LANGUAGE
Tools Tools Actions - ORG
URLDownload - ORG
Download - GPE


In [11]:
import re

clean_text = data

# 1️⃣ Remove Wikipedia UI / navigation junk
patterns_to_remove = [
    r'Jump to content', r'Main menu', r'Navigation', r'Create account', r'Log in', r'Donate',
    r'Search', r'Personal tools', r'Appearance', r'Wikipedia', r'Contact us', r'Help',
    r'Learn to edit', r'Community portal', r'Recent changes', r'Upload file', r'Special pages',
    r'Pages for', r'logged out editors', r'learn more', r'move to sidebar hide', r'ContributionsTalk',
    r'Toggle', r'Top', r'Tools', r'Actions', r'URLDownload', r'Download', r'Article', r'Portal',
    r'View history', r'Talk', r'Contents', r'print export', r'Languages', r'Page information'
]

for p in patterns_to_remove:
    clean_text = re.sub(p, ' ', clean_text, flags=re.IGNORECASE)

# 2️⃣ Remove bracketed references like [1], [2], etc.
clean_text = re.sub(r'\[.*?\]', ' ', clean_text)

# 3️⃣ Keep only letters and spaces
clean_text = re.sub(r'[^A-Za-z\s]', ' ', clean_text)

# 4️⃣ Replace multiple spaces with a single


In [12]:
doc = nlp(clean_text)

for ent in list(doc.ents)[:20]:
    print(ent.text, "-", ent.label_)


century - DATE
the   th century - DATE
century - DATE
the beginning of the century - DATE
World War I                            - EVENT
World War II - EVENT
Europe - LOC
days - DATE
the Pacific               Background - ORG
Japanese - NORP
The Holocaust                The Nuclear Age - EVENT
The Cold War                           War - EVENT
the Cold War               Information - EVENT
the end of the century             See - DATE
century - DATE
English - LANGUAGE
PDFPrintable - ORG
Wikimedia CommonsWikidata - ORG
century - DATE
The World Wars - WORK_OF_ART


In [13]:
# Keep only relevant entity types
valid_labels = ["GPE", "LOC", "EVENT", "NORP", "ORG", "DATE"]

filtered_ents = [(ent.text.strip(), ent.label_) for ent in doc.ents if ent.label_ in valid_labels]

print("Total filtered entities:", len(filtered_ents))
print("Sample:", filtered_ents[:20])


Total filtered entities: 1665
Sample: [('century', 'DATE'), ('the   th century', 'DATE'), ('century', 'DATE'), ('the beginning of the century', 'DATE'), ('World War I', 'EVENT'), ('World War II', 'EVENT'), ('Europe', 'LOC'), ('days', 'DATE'), ('the Pacific               Background', 'ORG'), ('Japanese', 'NORP'), ('The Holocaust                The Nuclear Age', 'EVENT'), ('The Cold War                           War', 'EVENT'), ('the Cold War               Information', 'EVENT'), ('the end of the century             See', 'DATE'), ('century', 'DATE'), ('PDFPrintable', 'ORG'), ('Wikimedia CommonsWikidata', 'ORG'), ('century', 'DATE'), ('the Cold War', 'EVENT'), ('the Space Race', 'ORG')]


In [14]:
# Extract all geopolitical entities (GPE)
countries = [ent.text.strip() for ent in doc.ents if ent.label_ == "GPE"]

print("Sample countries:", countries[:20])
print("Total GPE mentions:", len(countries))


Sample countries: ['Sarajevo', 'Sarajevo', 'the British Empire', 'France', 'the Russian Empire', 'the German Empire', 'Russia', 'Germany', 'Russia', 'Germany', 'the Ottoman Empire', 'Ukraine', 'Germany', 'Germany', 'Yugoslavia', 'Czechoslovakia', 'Austro Hungarian Empire', 'the Soviet Union', 'the Soviet Union', 'Germany']
Total GPE mentions: 443


In [15]:
relationships = []
window = 5  # look at groups of 5 consecutive sentences

sentences = list(doc.sents)

for i in range(len(sentences) - window):
    segment = sentences[i:i + window]
    text_segment = ' '.join([sent.text for sent in segment])
    segment_doc = nlp(text_segment)
    gpes = [ent.text for ent in segment_doc.ents if ent.label_ in ["GPE", "LOC"]]
    gpes = list(set(gpes))  # unique countries within this window
    if len(gpes) > 1:
        for a in gpes:
            for b in gpes:
                if a != b:
                    relationships.append((a, b))


In [16]:
import pandas as pd

relationship_df = pd.DataFrame(relationships, columns=["source", "target"])
relationship_df["value"] = 1
relationship_df = (
    relationship_df.groupby(["source", "target"], as_index=False)["value"].sum()
    .sort_values("value", ascending=False)
)

relationship_df.head(10)


Unnamed: 0,source,target,value
2829,Germany,the Soviet Union,32
2484,Europe,the Soviet Union,32
8740,the Soviet Union,Germany,32
8737,the Soviet Union,Europe,32
2732,Germany,France,30
962,Britain,Germany,30
2618,France,Germany,30
2713,Germany,Britain,30
8998,the United States,Germany,29
3681,Italy,Germany,29


In [17]:
output_path = "../Data/relationships_countries_20th_century.csv"
relationship_df.to_csv(output_path, index=False)
print("Saved:", output_path)


Saved: ../Data/relationships_countries_20th_century.csv
