# Named Entity Recognition Analysis

# Importing Libraries

In [64]:
# Import libraries
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re
from collections import Counter
from fuzzywuzzy.process import extractOne

In [61]:
!pip install fuzzywuzzy



In [62]:
!pip install python-Levenshtein

Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.27.1 (from python-Levenshtein)
  Downloading levenshtein-0.27.1-cp39-cp39-win_amd64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.1->python-Levenshtein)
  Downloading rapidfuzz-3.12.2-cp39-cp39-win_amd64.whl.metadata (12 kB)
Downloading python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.27.1-cp39-cp39-win_amd64.whl (100 kB)
Downloading rapidfuzz-3.12.2-cp39-cp39-win_amd64.whl (1.6 MB)
   ---------------------------------------- 0.0/1.6 MB ? eta -:--:--
   ---------------------------------------- 1.6/1.6 MB 8.7 MB/s eta 0:00:00
Installing collected packages: rapidfuzz, Levenshtein, python-Levenshtein
Successfully installed Levenshtein-0.27.1 python-Levenshtein-0.27.1 rapidfuzz-3.12.2


In [4]:
# Download English module

!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ------------------ --------------------- 6.0/12.8 MB 33.5 MB/s eta 0:00:01
     ------------------------------ -------- 10.0/12.8 MB 34.4 MB/s eta 0:00:01
     --------------------------------------  12.6/12.8 MB 28.1 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 19.1 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.4.1
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [5]:
# Load spaCy's large English model
nlp = spacy.load("en_core_web_sm")

# Load the Twentieth-Century Text Data

In [6]:
# Load the text data
file_path = "20th_century_events_cleaned.txt"
with open(file_path, "r", encoding="utf-8") as file:
    text_data = file.read()

In [7]:
# Print a preview
print(text_data[:1000])  # First 1000 characters



Key events of the 20th century - Wikipedia

Personal tools
Donate Create account Log in
		Pages for logged out editors learn more
ContributionsTalk
Contents
move to sidebar
hide
(Top)
1
Historic events in the 20th century
Toggle Historic events in the 20th century subsection
1.1
World at the beginning of the century
1.1.1
"The war to end all wars": World War I (1914–1918)
1.1.2
Russian Revolution and communism
1.2
Between the wars
1.2.1
Economic depression
1.2.2
The rise of dictatorship
1.3
Global war: World War II (1939–1945)
1.3.1
The war in Europe
1.3.2
Blitzkrieg
1.3.3
Operation Barbarossa
1.3.4
Turning tides
1.3.5
Operation Overlord
1.3.6
Final days
1.3.7
The war in the Pacific
1.3.7.1
Background
1.3.8
Japanese Expansion
1.3.9
Allied offensive
1.3.10
Final days
1.3.11
The Holocaust
1.3.12
The Nuclear Age begins
1.4
The post-war world
1.4.1
The end of empires: decolonization
1.4.2
The Cold War (1947–1991)
1.4.3
War by proxy
1.4.4
The space race
1.4.5
The end of the Cold War
1.4.6


# Text Wrangling and Cleaning

In [51]:
# Remove special characters, multiple spaces, and normalize country names
def clean_text(text):
    text = re.sub(r"\[.*?\]", "", text)  # Remove any [edit] or brackets content
    text = re.sub(r"[^a-zA-Z0-9.,;!?\'\"\s]", "", text)  # Remove unwanted characters
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    text = text.replace("\n", " ")  # Remove newline characters
    
    return text

In [52]:
# Apply text cleaning
cleaned_text = clean_text(text_data)


In [53]:
# Save cleaned text to a new file
cleaned_file_path = "20th_century_events_cleaned_normalized.txt"
with open(cleaned_file_path, "w", encoding="utf-8") as file:
    file.write(cleaned_text)

In [54]:
# Print a preview
print(cleaned_text[:1000])

Key events of the 20th century Wikipedia Personal tools Donate Create account Log in Pages for logged out editors learn more ContributionsTalk Contents move to sidebar hide Top 1 Historic events in the 20th century Toggle Historic events in the 20th century subsection 1.1 World at the beginning of the century 1.1.1 "The war to end all wars" World War I 19141918 1.1.2 Russian Revolution and communism 1.2 Between the wars 1.2.1 Economic depression 1.2.2 The rise of dictatorship 1.3 Global war World War II 19391945 1.3.1 The war in Europe 1.3.2 Blitzkrieg 1.3.3 Operation Barbarossa 1.3.4 Turning tides 1.3.5 Operation Overlord 1.3.6 Final days 1.3.7 The war in the Pacific 1.3.7.1 Background 1.3.8 Japanese Expansion 1.3.9 Allied offensive 1.3.10 Final days 1.3.11 The Holocaust 1.3.12 The Nuclear Age begins 1.4 The postwar world 1.4.1 The end of empires decolonization 1.4.2 The Cold War 19471991 1.4.3 War by proxy 1.4.4 The space race 1.4.5 The end of the Cold War 1.4.6 Information and commu

#  Named Entity Recognition (NER) using spaCy

In [38]:
# Apply NLP processing
doc = nlp(cleaned_text)


In [39]:
# Extract named entities
entities = [(ent.text, ent.label_) for ent in doc.ents]

In [40]:
# Display first 20 entities
print(entities[:20])

[('the 20th century', 'DATE'), ('Wikipedia Personal', 'ORG'), ('Donate Create', 'PRODUCT'), ('ContributionsTalk Contents', 'PRODUCT'), ('1', 'CARDINAL'), ('the 20th century', 'DATE'), ('Toggle Historic', 'ORG'), ('the 20th century', 'DATE'), ('1.1', 'CARDINAL'), ('the beginning of the century', 'DATE'), ('World War I', 'EVENT'), ('19141918 1.1.2', 'DATE'), ('1.2', 'CARDINAL'), ('1.2.1', 'CARDINAL'), ('1.3', 'CARDINAL'), ('World War II', 'EVENT'), ('Europe', 'LOC'), ('Blitzkrieg 1.3.3 Operation Barbarossa 1.3.4', 'ORG'), ('Operation Overlord 1.3.6', 'EVENT'), ('Japanese Expansion', 'ORG')]


# Extract and Standardize Country Entities


In [55]:
# List of countries to filter (Updated for consistency)
countries = ["Germany", "Japan", "United States", "France", "Italy", "China", "Russia", "India", "United Kingdom", "Canada"]


In [65]:
# Standardize country mentions using fuzzy matching
def match_country(entity):
    match = extractOne(entity, countries)
    if match:  # Ensure a valid match before accessing its elements
        best_match, score = match
        return best_match if score > 80 else None  # Use a threshold of 80 for accuracy
    return None



In [66]:
# Filter and standardize country entities
country_mentions = [match_country(ent.text) for ent in doc.ents if match_country(ent.text)]

In [67]:
# Count occurrences
country_counts = Counter(country_mentions)

In [68]:
# Convert to DataFrame
df_countries = pd.DataFrame(country_counts.items(), columns=["Country", "Mentions"])


In [69]:
# Display the DataFrame
print(df_countries.head())  # Print first 5 rows

   Country  Mentions
0    Japan        64
1   France        16
2   Russia        22
3  Germany        98
4    Italy        12


# Create a Relationships DataFrame


In [70]:
# Extract sentences containing country mentions
sentences = [sent.text for sent in doc.sents if any(country in sent.text for country in countries)]

In [71]:
# Create relationships DataFrame
relationship_data = []
for sent in sentences:
    present_countries = [match_country(country) for country in countries if match_country(country) and country in sent]
    present_countries = list(set(present_countries))  # Remove duplicates
    if len(present_countries) > 1:
        relationship_data.append({"Sentence": sent, "Countries": ", ".join(present_countries)})

In [72]:
df_relationships = pd.DataFrame(relationship_data)

In [73]:
# Display the DataFrame
print(df_relationships.head())  # Print first 5 rows

                                            Sentence               Countries
0  After a period of diplomatic and military esca...          France, Russia
1  The Bolsheviks negotiated the Treaty of BrestL...         Germany, Russia
2  In the treaty, Bolshevik Russia ceded the Balt...         Germany, Russia
3  Germany, 1933 Fascism first appeared in Italy ...          Italy, Germany
4  The Nazi Party in Germany was dedicated to the...  Germany, United States


#  Save and Export the DataFrame

In [74]:
# Save to CSV
df_relationships.to_csv("country_relationships.csv", index=False)