In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/History_of_Islam"
html = urlopen(url).read()
soup = BeautifulSoup(html, features="html.parser")

# Remove script and style elements
for script in soup(["script", "style"]):
    script.extract()

# Get the text
text = soup.get_text()

# Break into lines and remove leading/trailing spaces
lines = (line.strip() for line in text.splitlines())
chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
cleaned_text = " ".join(chunks)

print(cleaned_text)


    History of Islam - Wikipedia                                    Jump to content        Main menu      Main menu move to sidebar hide    Navigation    Main pageContentsCurrent eventsRandom articleAbout WikipediaContact usDonate      Contribute    HelpLearn to editCommunity portalRecent changesUpload file                    Search            Search                              Create account  Log in         Personal tools      Create account Log in      Pages for logged out editors learn more    ContributionsTalk                             Contents move to sidebar hide     (Top)      1Timeline        2Early sources and historiography        3Origins of Islam        4Rashidun Caliphate        5Umayyad Caliphate        6Islamic world during the Abbasid Caliphate    Toggle Islamic world during the Abbasid Caliphate subsection      6.1Golden Baghdad Abbasids        6.2Rise of regional powers        6.3High Baghdad Abbasids        6.4Middle Baghdad Abbasids        6.5Late Baghdad Abbasid

In [3]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Download NLTK data )
nltk.download("punkt")
nltk.download("stopwords")

# Tokenization
tokens = word_tokenize(cleaned_text)

# Remove punctuation and convert to lowercase
tokens = [word.lower() for word in tokens if word.isalpha()]

# Remove punctuation
tokens = [word for word in tokens if word not in string.punctuation]

# Remove stop words
stop_words = set(stopwords.words("english"))
filtered_tokens = [word for word in tokens if word not in stop_words]

# Stemming
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]

# Get unique words using a set
unique_words = set(stemmed_tokens)

# Print the unique words
print("Unique words after stemming:")
for word in unique_words:
    print(word)



[nltk_data] Downloading package punkt to /Users/noornizar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/noornizar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unique words after stemming:
jawi
wikidataarticl
high
repeatedli
academ
septemb
ibadi
roy
fought
abbad
read
anoth
perhap
seri
ever
joseph
parliament
punjab
fatwa
welfar
striction
blue
qr
ariyanam
petersburg
sabzawari
tourism
intern
save
recit
test
ehlert
unquiet
resid
unlik
identifi
fernández
quraysh
place
chamber
africa
repress
commenc
ternat
ad
certain
conquest
prose
similar
focu
fundament
aleppo
ziyārīd
ideolog
moral
alik
textal
jhu
rum
trust
al
formid
mustafa
hut
demand
rememb
tlemcen
reinhart
pakistan
eric
physicist
indol
ummayad
cleric
fierro
feder
ʿabd
promin
contain
grain
suprem
ithaca
ṣafavi
bon
leftist
difficult
record
mongol
sa
mastoor
herbert
ilkhanid
van
blanshard
interest
gloriou
historian
transfer
ethnic
kāfir
troubl
malis
rout
archaeolog
lifetim
menu
jew
saint
historiographi
crucial
liter
mumtaz
albanian
regiment
roder
econom
meantim
saylac
guadalet
ancient
wayback
seiz
sampler
other
number
mohammada
implor
rise
rowman
warlord
collabor
gener
aqsa
opposit
lanka
ashirvadi