In [78]:
import requests
import spacy

from bs4 import BeautifulSoup
import re

In [79]:
url = "https://en.wikipedia.org/wiki/Napoleon"

In [80]:
response = requests.get(url)

In [81]:
response.status_code

200

In [82]:
# Getting text in paragraphs and headings only
pattern = r'<p>(.*?)<\/p>|<h\d>(.*?)<\/h\d>'

In [83]:
matches = re.findall(pattern, response.text, re.DOTALL)

In [84]:
captured_text = ""

for match in matches:
    for word in match:
        if word:
            captured_text += word + "\n"

In [85]:
captured_text

'<b>Napoleon Bonaparte</b> (born Napoleone di Buonaparte;<sup id="cite_ref-FOOTNOTEDwyer2008axv_2-0" class="reference"><a href="#cite_note-FOOTNOTEDwyer2008axv-2">&#91;1&#93;</a></sup><sup id="cite_ref-3" class="reference"><a href="#cite_note-3">&#91;b&#93;</a></sup> 15 August 1769 – 5 May 1821), later known by his <a href="/wiki/Regnal_name" title="Regnal name">regnal name</a> <b>Napoleon&#160;I</b>, was a French emperor and military commander who rose to prominence during the <a href="/wiki/French_Revolution" title="French Revolution">French Revolution</a> and led <a href="/wiki/Military_career_of_Napoleon_Bonaparte" class="mw-redirect" title="Military career of Napoleon Bonaparte">successful campaigns</a> during the <a href="/wiki/French_Revolutionary_Wars" title="French Revolutionary Wars">Revolutionary Wars</a>. He was the leader of the <a href="/wiki/French_First_Republic" title="French First Republic">French Republic</a> as <a href="/wiki/French_Consulate" title="French Consulat

In [86]:
soup = BeautifulSoup(captured_text, 'html.parser')
soup

<b>Napoleon Bonaparte</b> (born Napoleone di Buonaparte;<sup class="reference" id="cite_ref-FOOTNOTEDwyer2008axv_2-0"><a href="#cite_note-FOOTNOTEDwyer2008axv-2">[1]</a></sup><sup class="reference" id="cite_ref-3"><a href="#cite_note-3">[b]</a></sup> 15 August 1769 – 5 May 1821), later known by his <a href="/wiki/Regnal_name" title="Regnal name">regnal name</a> <b>Napoleon I</b>, was a French emperor and military commander who rose to prominence during the <a href="/wiki/French_Revolution" title="French Revolution">French Revolution</a> and led <a class="mw-redirect" href="/wiki/Military_career_of_Napoleon_Bonaparte" title="Military career of Napoleon Bonaparte">successful campaigns</a> during the <a href="/wiki/French_Revolutionary_Wars" title="French Revolutionary Wars">Revolutionary Wars</a>. He was the leader of the <a href="/wiki/French_First_Republic" title="French First Republic">French Republic</a> as <a href="/wiki/French_Consulate" title="French Consulate">First Consul</a> fr

In [87]:
text = soup.get_text()
len(text)

94508

In [88]:
# Remove extra whitespace
text = re.sub(r'\s+', ' ', text)
len(text)

94364

In [89]:
# Remove Parentheses
text = re.sub(r'\[', ' ', text)
text = re.sub(r'\]', ' ', text)
text = re.sub(r'\(', ' ', text)
text = re.sub(r'\)', ' ', text)
text = re.sub(r'\.', ' ', text)
text = re.sub(r'\-', ' ', text)
text = re.sub(r'\,', ' ', text)
text = re.sub(r'  ', ' ', text)

len(text)

92168

In [91]:
text

'Napoleon Bonaparte born Napoleone di Buonaparte; 1 b 15 August 1769 – 5 May 1821  later known by his regnal name Napoleon I was a French emperor and military commander who rose to prominence during the French Revolution and led successful campaigns during the Revolutionary Wars He was the leader of the French Republic as First Consul from 1799 to 1804 then of the French Empire as Emperor of the French from 1804 until 1814 and briefly again in 1815 His political and cultural legacy endures as a celebrated and controversial leader He initiated many enduring reforms but has been criticized for his authoritarian rule He is considered one of the greatest military commanders in history and his wars and campaigns are still studied at military schools worldwide However historians still debate the degree to which he was responsible for the Napoleonic Wars in which between three and six million people died 2 3 Napoleon was born on the island of Corsica into a family descended from Italian nobil

In [92]:
# Remove leading and trailing whitespace
text = text.strip()
len(text)

92167

In [93]:
text

'Napoleon Bonaparte born Napoleone di Buonaparte; 1 b 15 August 1769 – 5 May 1821  later known by his regnal name Napoleon I was a French emperor and military commander who rose to prominence during the French Revolution and led successful campaigns during the Revolutionary Wars He was the leader of the French Republic as First Consul from 1799 to 1804 then of the French Empire as Emperor of the French from 1804 until 1814 and briefly again in 1815 His political and cultural legacy endures as a celebrated and controversial leader He initiated many enduring reforms but has been criticized for his authoritarian rule He is considered one of the greatest military commanders in history and his wars and campaigns are still studied at military schools worldwide However historians still debate the degree to which he was responsible for the Napoleonic Wars in which between three and six million people died 2 3 Napoleon was born on the island of Corsica into a family descended from Italian nobil

In [94]:
nlp = spacy.load("en_core_web_sm")

In [95]:
doc = nlp(text)

In [96]:
# Removing Stop Words
text = ' '.join([token.text for token in doc if not token.is_stop])
len(text)

65792

In [97]:
doc = nlp(text)

In [98]:
lemmas=[]

for token in doc:
  lemmas.append(token.lemma_)
  print(f"{token} | {token.lemma_}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
respective | respective
spheres | sphere
influence | influence
172 | 172
Napoleon | Napoleon
offered | offer
Alexander | Alexander
relatively | relatively
lenient | lenient
terms | term
— | —
demanding | demand
Russia | Russia
join | join
Continental | Continental
System | System
withdraw | withdraw
forces | force
Wallachia | Wallachia
Moldavia | Moldavia
hand | hand
Ionian | Ionian
Islands | Islands
France | France
contrast | contrast
Prussia | Prussia
treated | treat
harshly | harshly
lost | lose
half | half
territory | territory
population | population
underwent | underwent
year | year
occupation | occupation
costing | cost
1 | 1
4 | 4
billion | billion
francs | franc
Prussian | prussian
territory | territory
Napoleon | Napoleon
created | create
Kingdom | Kingdom
Westphalia | Westphalia
ruled | rule
young | young
brother | brother
Jérôme | Jérôme
Duchy | Duchy
Warsaw | Warsaw
173 | 173
174 | 174
Prussia | Prussia
humil

In [99]:
lemmas

['Napoleon',
 'Bonaparte',
 'bear',
 'Napoleone',
 'di',
 'Buonaparte',
 ';',
 '1',
 'b',
 '15',
 'August',
 '1769',
 '–',
 '5',
 '1821',
 '  ',
 'later',
 'know',
 'regnal',
 'Napoleon',
 'French',
 'emperor',
 'military',
 'commander',
 'rise',
 'prominence',
 'French',
 'Revolution',
 'lead',
 'successful',
 'campaign',
 'Revolutionary',
 'Wars',
 'leader',
 'French',
 'Republic',
 'Consul',
 '1799',
 '1804',
 'French',
 'Empire',
 'Emperor',
 'French',
 '1804',
 '1814',
 'briefly',
 '1815',
 'political',
 'cultural',
 'legacy',
 'endures',
 'celebrate',
 'controversial',
 'leader',
 'initiate',
 'endure',
 'reform',
 'criticize',
 'authoritarian',
 'rule',
 'consider',
 'great',
 'military',
 'commander',
 'history',
 'war',
 'campaign',
 'study',
 'military',
 'school',
 'worldwide',
 'historians',
 'debate',
 'degree',
 'responsible',
 'Napoleonic',
 'Wars',
 'million',
 'people',
 'die',
 '2',
 '3',
 'Napoleon',
 'bear',
 'island',
 'Corsica',
 'family',
 'descend',
 'italian',


In [100]:
unique_lemmas = set(lemmas)
unique_lemmas

{'blunder',
 'References',
 'model',
 'difficulty',
 'wipe',
 'insurrection',
 'India',
 'Nile',
 'Appearance',
 'jewish',
 'initial',
 'Karl',
 'unable',
 'squadron',
 'legacy',
 'Murat',
 'Vatican',
 'comprehensive',
 'turmoil',
 'obese',
 '426',
 'drop',
 'rhyme',
 'directory',
 'posterity',
 'Denuelle',
 'Enghien',
 'Koran',
 'feeling',
 'clumsy',
 'Eastern',
 'HMS',
 'twin',
 'height',
 'Carlyle',
 'fund',
 'Sir',
 'Coup',
 'Cádiz',
 'efficiently',
 'Farington',
 'climate',
 'vessel',
 'Syria',
 '256',
 'hang',
 'maternal',
 "l'armée",
 '325',
 'abdicate',
 'Laplace',
 'wedge',
 'scholarship',
 'Campo',
 '400',
 'orientalism',
 '1815',
 'affair',
 '403',
 'Iberian',
 'Assembly',
 'ensue',
 'coalitionarie',
 'civilian',
 'compassion',
 '255',
 '96',
 'junior',
 'Finisterre',
 'Campaign',
 'minor',
 '295',
 'reintroduction',
 'Corps',
 'Erfurt',
 'tall',
 '350',
 'Tsar',
 'Carl',
 'Lutheran',
 'undertake',
 'give',
 'contact',
 'classic',
 'formal',
 "l'Empereur",
 'uneasy',
 'elimi

In [101]:
len(unique_lemmas)

3346