In [1]:
import requests
from bs4 import BeautifulSoup
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [2]:
nltk.download('punkt')  # Download the required resource for word tokenization

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
nltk.download('wordnet')  # Ensure you have downloaded the required resources

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [4]:
nltk.download('stopwords')  # Ensure you have downloaded the required resources

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
url = 'https://en.wikipedia.org/wiki/Illuminati'

In [6]:
response = requests.get(url)
html_content = response.content

In [7]:
soup = BeautifulSoup(html_content, 'html.parser')

In [8]:
paragraphs = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])

In [9]:
print(paragraphs)

[<h2 class="vector-pinnable-header-label">Contents</h2>, <h1 class="firstHeading mw-first-heading" id="firstHeading"><span class="mw-page-title-main">Illuminati</span></h1>, <p class="mw-empty-elt">
</p>, <p>The <b>Illuminati</b> (<span class="rt-commentedText nowrap"><span class="IPA nopopups noexcerpt" lang="en-fonipa"><a href="/wiki/Help:IPA/English" title="Help:IPA/English">/<span style="border-bottom:1px dotted"><span title="/ɪ/: 'i' in 'kit'">ɪ</span><span title="/ˌ/: secondary stress follows">ˌ</span><span title="'l' in 'lie'">l</span><span title="/uː/: 'oo' in 'goose'">uː</span><span title="'m' in 'my'">m</span><span title="/ɪ/: 'i' in 'kit'">ɪ</span><span title="/ˈ/: primary stress follows">ˈ</span><span title="'n' in 'nigh'">n</span><span title="/ɑː/: 'a' in 'father'">ɑː</span><span title="'t' in 'tie'">t</span><span title="/i/: 'y' in 'happy'">i</span></span>/</a></span></span>; plural of <a href="/wiki/Latin" title="Latin">Latin</a> <i>illuminatus</i>, 'enlightened') is a n

In [10]:
full_text = "\n".join([p.get_text() for p in paragraphs])

In [11]:
print(full_text)

Contents
Illuminati


The Illuminati (/ɪˌluːmɪˈnɑːti/; plural of Latin illuminatus, 'enlightened') is a name given to several groups, both real and fictitious. Historically, the name usually refers to the Bavarian Illuminati, an Enlightenment-era secret society founded on 1 May 1776 in Bavaria, today part of Germany. The society's stated goals were to oppose superstition, obscurantism, religious influence over public life, and abuses of state power. "The order of the day," they wrote in their general statutes, "is to put an end to the machinations of the purveyors of injustice, to control them without dominating them."[1] The Illuminati—along with Freemasonry and other secret societies—were outlawed through edict by Charles Theodore, Elector of Bavaria, with the encouragement of the Catholic Church, in 1784, 1785, 1787 and 1790.[2] During subsequent years, the group was generally vilified by conservative and religious critics who claimed that the Illuminati continued underground and we

In [12]:
def cleaning_data_from_chars(data):
  cleaned_data = re.sub(r'[^a-zA-Z\s]', '', data)
  return cleaned_data

In [13]:
def lowercase_text(data):
    return data.lower()

In [14]:
def tokenize_text(data):
    # Tokenize the text into individual words
    tokens = word_tokenize(data)
    return tokens

In [15]:
def lemmatize_text(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in tokens]
    return lemmatized_words

In [16]:
def remove_stopwords(lemmatized_tokens):
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in lemmatized_tokens if word.lower() not in stop_words]
    return filtered_tokens

In [17]:
def get_unique_words(tokens):
    unique_words = set(tokens)
    return unique_words

In [18]:
cleaned_data = cleaning_data_from_chars(full_text)

In [19]:
print(cleaned_data)

Contents
Illuminati


The Illuminati lumnti plural of Latin illuminatus enlightened is a name given to several groups both real and fictitious Historically the name usually refers to the Bavarian Illuminati an Enlightenmentera secret society founded on  May  in Bavaria today part of Germany The societys stated goals were to oppose superstition obscurantism religious influence over public life and abuses of state power The order of the day they wrote in their general statutes is to put an end to the machinations of the purveyors of injustice to control them without dominating them The Illuminatialong with Freemasonry and other secret societieswere outlawed through edict by Charles Theodore Elector of Bavaria with the encouragement of the Catholic Church in    and  During subsequent years the group was generally vilified by conservative and religious critics who claimed that the Illuminati continued underground and were responsible for the French Revolution

It attracted literary men suc

In [20]:
lower_case_cleaned_data = lowercase_text(cleaned_data)

In [21]:
print(lower_case_cleaned_data)

contents
illuminati


the illuminati lumnti plural of latin illuminatus enlightened is a name given to several groups both real and fictitious historically the name usually refers to the bavarian illuminati an enlightenmentera secret society founded on  may  in bavaria today part of germany the societys stated goals were to oppose superstition obscurantism religious influence over public life and abuses of state power the order of the day they wrote in their general statutes is to put an end to the machinations of the purveyors of injustice to control them without dominating them the illuminatialong with freemasonry and other secret societieswere outlawed through edict by charles theodore elector of bavaria with the encouragement of the catholic church in    and  during subsequent years the group was generally vilified by conservative and religious critics who claimed that the illuminati continued underground and were responsible for the french revolution

it attracted literary men suc

In [22]:
tokenized_text = tokenize_text(lower_case_cleaned_data)

In [23]:
print(tokenized_text)

['contents', 'illuminati', 'the', 'illuminati', 'lumnti', 'plural', 'of', 'latin', 'illuminatus', 'enlightened', 'is', 'a', 'name', 'given', 'to', 'several', 'groups', 'both', 'real', 'and', 'fictitious', 'historically', 'the', 'name', 'usually', 'refers', 'to', 'the', 'bavarian', 'illuminati', 'an', 'enlightenmentera', 'secret', 'society', 'founded', 'on', 'may', 'in', 'bavaria', 'today', 'part', 'of', 'germany', 'the', 'societys', 'stated', 'goals', 'were', 'to', 'oppose', 'superstition', 'obscurantism', 'religious', 'influence', 'over', 'public', 'life', 'and', 'abuses', 'of', 'state', 'power', 'the', 'order', 'of', 'the', 'day', 'they', 'wrote', 'in', 'their', 'general', 'statutes', 'is', 'to', 'put', 'an', 'end', 'to', 'the', 'machinations', 'of', 'the', 'purveyors', 'of', 'injustice', 'to', 'control', 'them', 'without', 'dominating', 'them', 'the', 'illuminatialong', 'with', 'freemasonry', 'and', 'other', 'secret', 'societieswere', 'outlawed', 'through', 'edict', 'by', 'charles',

In [24]:
lemmatized_text = lemmatize_text(tokenized_text)

In [25]:
print(lemmatized_text)

['content', 'illuminati', 'the', 'illuminati', 'lumnti', 'plural', 'of', 'latin', 'illuminatus', 'enlightened', 'is', 'a', 'name', 'given', 'to', 'several', 'group', 'both', 'real', 'and', 'fictitious', 'historically', 'the', 'name', 'usually', 'refers', 'to', 'the', 'bavarian', 'illuminati', 'an', 'enlightenmentera', 'secret', 'society', 'founded', 'on', 'may', 'in', 'bavaria', 'today', 'part', 'of', 'germany', 'the', 'society', 'stated', 'goal', 'were', 'to', 'oppose', 'superstition', 'obscurantism', 'religious', 'influence', 'over', 'public', 'life', 'and', 'abuse', 'of', 'state', 'power', 'the', 'order', 'of', 'the', 'day', 'they', 'wrote', 'in', 'their', 'general', 'statute', 'is', 'to', 'put', 'an', 'end', 'to', 'the', 'machination', 'of', 'the', 'purveyor', 'of', 'injustice', 'to', 'control', 'them', 'without', 'dominating', 'them', 'the', 'illuminatialong', 'with', 'freemasonry', 'and', 'other', 'secret', 'societieswere', 'outlawed', 'through', 'edict', 'by', 'charles', 'theodo

In [26]:
lemmatized_text_without_stopwords = remove_stopwords(lemmatized_text)

In [27]:
print(lemmatized_text_without_stopwords)

['content', 'illuminati', 'illuminati', 'lumnti', 'plural', 'latin', 'illuminatus', 'enlightened', 'name', 'given', 'several', 'group', 'real', 'fictitious', 'historically', 'name', 'usually', 'refers', 'bavarian', 'illuminati', 'enlightenmentera', 'secret', 'society', 'founded', 'may', 'bavaria', 'today', 'part', 'germany', 'society', 'stated', 'goal', 'oppose', 'superstition', 'obscurantism', 'religious', 'influence', 'public', 'life', 'abuse', 'state', 'power', 'order', 'day', 'wrote', 'general', 'statute', 'put', 'end', 'machination', 'purveyor', 'injustice', 'control', 'without', 'dominating', 'illuminatialong', 'freemasonry', 'secret', 'societieswere', 'outlawed', 'edict', 'charles', 'theodore', 'elector', 'bavaria', 'encouragement', 'catholic', 'church', 'subsequent', 'year', 'group', 'wa', 'generally', 'vilified', 'conservative', 'religious', 'critic', 'claimed', 'illuminati', 'continued', 'underground', 'responsible', 'french', 'revolution', 'attracted', 'literary', 'men', 'jo

In [28]:
unique_words = get_unique_words(lemmatized_text_without_stopwords)

In [31]:
print(unique_words)
print(len(unique_words))

{'injustice', 'sect', 'presentday', 'ethos', 'blue', 'external', 'scottish', 'dice', 'predicated', 'delayed', 'religion', 'almost', 'subtle', 'purporting', 'significant', 'freedom', 'suspicion', 'childhood', 'brigido', 'signed', 'indiscretion', 'prime', 'generally', 'theory', 'thus', 'morse', 'reference', 'purse', 'gained', 'character', 'significantly', 'imported', 'among', 'erratic', 'philosophy', 'stated', 'university', 'key', 'half', 'contend', 'spurring', 'problematic', 'manoeuvre', 'remained', 'development', 'newspaper', 'kaspar', 'admitted', 'defend', 'denunciation', 'attached', 'piece', 'strain', 'dominated', 'mystical', 'invented', 'copy', 'target', 'perceived', 'roterodamus', 'committee', 'wrote', 'form', 'barruel', 'plot', 'authority', 'respectively', 'throughout', 'historically', 'appeared', 'contact', 'bode', 'legacy', 'extra', 'source', 'canon', 'sparta', 'threatened', 'illustrating', 'people', 'created', 'announced', 'law', 'von', 'ignored', 'mark', 'prevent', 'jeanbaptis

In [30]:
for word in unique_words:
  if len(word) <= 3:
    print (word)
  else:
    continue

key
law
von
day
use
end
way
yet
two
aim
f
men
sat
rev
man
saw
bee
spa
ran
owl
ii
lie
via
ha
one
set
wa
low
run
far
de
di
see
led
abb
le
la
new
lay
act
fee
ear
may
put
cit
spy
eye
u
jew
der
xiv
