In [34]:
from bs4 import BeautifulSoup
import requests

url = "https://en.wikipedia.org/wiki/Main_Page"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
paragraphs = soup.find_all('p')
paragraph_text = [p.get_text().strip() for p in paragraphs]
print(paragraph_text)


["Grant's Canal was a military project to construct a canal through a bend in the Mississippi River opposite Vicksburg, Mississippi, during the American Civil War. Control of Vicksburg and the Mississippi was considered crucial by both the Union and the Confederacy.  In June\xa01862, Union officer Thomas Williams was sent to De Soto Point with his men to dig a canal to bypass the strong Confederate defenses around Vicksburg.  Disease and falling river levels prevented completion, and the project was abandoned until January\xa01863, when Ulysses S. Grant took an interest.  The upstream entrance of the canal was moved, but heavy rains and flooding interfered with the project.  Work was abandoned in March, and Grant eventually used other methods to capture Vicksburg.  In 1876, the Mississippi changed course, cutting across De Soto Point near the route of the old canal and isolating Vicksburg from the river.  The city's river access has since been restored.  Only a small section of the can

In [35]:
import re

def clean_data(data):
    cleaned_data = []
    for entry in data:
        cleaned_entry = re.sub(r'[^a-zA-Z0-9\s]', '', entry)  # Remove all non-alphanumeric characters
        cleaned_data.append(cleaned_entry.strip()) 
        cleaned_data.append(cleaned_entry)
    return cleaned_data

# Clean the data
cleaned_data = clean_data(paragraph_text)

# Print the cleaned data
print(cleaned_data)


['Grants Canal was a military project to construct a canal through a bend in the Mississippi River opposite Vicksburg Mississippi during the American Civil War Control of Vicksburg and the Mississippi was considered crucial by both the Union and the Confederacy  In June\xa01862 Union officer Thomas Williams was sent to De Soto Point with his men to dig a canal to bypass the strong Confederate defenses around Vicksburg  Disease and falling river levels prevented completion and the project was abandoned until January\xa01863 when Ulysses S Grant took an interest  The upstream entrance of the canal was moved but heavy rains and flooding interfered with the project  Work was abandoned in March and Grant eventually used other methods to capture Vicksburg  In 1876 the Mississippi changed course cutting across De Soto Point near the route of the old canal and isolating Vicksburg from the river  The citys river access has since been restored  Only a small section of the canal survives Full\xa0

In [36]:
normalized_data = [word.lower() for word in cleaned_data]

print(normalized_data)

['grants canal was a military project to construct a canal through a bend in the mississippi river opposite vicksburg mississippi during the american civil war control of vicksburg and the mississippi was considered crucial by both the union and the confederacy  in june\xa01862 union officer thomas williams was sent to de soto point with his men to dig a canal to bypass the strong confederate defenses around vicksburg  disease and falling river levels prevented completion and the project was abandoned until january\xa01863 when ulysses s grant took an interest  the upstream entrance of the canal was moved but heavy rains and flooding interfered with the project  work was abandoned in march and grant eventually used other methods to capture vicksburg  in 1876 the mississippi changed course cutting across de soto point near the route of the old canal and isolating vicksburg from the river  the citys river access has since been restored  only a small section of the canal survives full\xa0

In [37]:
words = []
for sentence in normalized_data:
    words.extend(sentence.split())

print(words)

['grants', 'canal', 'was', 'a', 'military', 'project', 'to', 'construct', 'a', 'canal', 'through', 'a', 'bend', 'in', 'the', 'mississippi', 'river', 'opposite', 'vicksburg', 'mississippi', 'during', 'the', 'american', 'civil', 'war', 'control', 'of', 'vicksburg', 'and', 'the', 'mississippi', 'was', 'considered', 'crucial', 'by', 'both', 'the', 'union', 'and', 'the', 'confederacy', 'in', 'june', '1862', 'union', 'officer', 'thomas', 'williams', 'was', 'sent', 'to', 'de', 'soto', 'point', 'with', 'his', 'men', 'to', 'dig', 'a', 'canal', 'to', 'bypass', 'the', 'strong', 'confederate', 'defenses', 'around', 'vicksburg', 'disease', 'and', 'falling', 'river', 'levels', 'prevented', 'completion', 'and', 'the', 'project', 'was', 'abandoned', 'until', 'january', '1863', 'when', 'ulysses', 's', 'grant', 'took', 'an', 'interest', 'the', 'upstream', 'entrance', 'of', 'the', 'canal', 'was', 'moved', 'but', 'heavy', 'rains', 'and', 'flooding', 'interfered', 'with', 'the', 'project', 'work', 'was', '

In [38]:
from nltk.stem import PorterStemmer

# Initialize PorterStemmer
stemmer = PorterStemmer()

# Stem each word
stemmed_words = [stemmer.stem(word) for word in words]

print(stemmed_words)


['grant', 'canal', 'wa', 'a', 'militari', 'project', 'to', 'construct', 'a', 'canal', 'through', 'a', 'bend', 'in', 'the', 'mississippi', 'river', 'opposit', 'vicksburg', 'mississippi', 'dure', 'the', 'american', 'civil', 'war', 'control', 'of', 'vicksburg', 'and', 'the', 'mississippi', 'wa', 'consid', 'crucial', 'by', 'both', 'the', 'union', 'and', 'the', 'confederaci', 'in', 'june', '1862', 'union', 'offic', 'thoma', 'william', 'wa', 'sent', 'to', 'de', 'soto', 'point', 'with', 'hi', 'men', 'to', 'dig', 'a', 'canal', 'to', 'bypass', 'the', 'strong', 'confeder', 'defens', 'around', 'vicksburg', 'diseas', 'and', 'fall', 'river', 'level', 'prevent', 'complet', 'and', 'the', 'project', 'wa', 'abandon', 'until', 'januari', '1863', 'when', 'ulyss', 's', 'grant', 'took', 'an', 'interest', 'the', 'upstream', 'entranc', 'of', 'the', 'canal', 'wa', 'move', 'but', 'heavi', 'rain', 'and', 'flood', 'interf', 'with', 'the', 'project', 'work', 'wa', 'abandon', 'in', 'march', 'and', 'grant', 'eventu

In [39]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

filtered_data = [word for word in stemmed_words if word.lower() not in stop_words]

print(filtered_data)


['grants', 'canal', 'military', 'project', 'construct', 'canal', 'bend', 'mississippi', 'river', 'opposite', 'vicksburg', 'mississippi', 'american', 'civil', 'war', 'control', 'vicksburg', 'mississippi', 'considered', 'crucial', 'union', 'confederacy', 'june', '1862', 'union', 'officer', 'thomas', 'williams', 'sent', 'de', 'soto', 'point', 'men', 'dig', 'canal', 'bypass', 'strong', 'confederate', 'defenses', 'around', 'vicksburg', 'disease', 'falling', 'river', 'levels', 'prevented', 'completion', 'project', 'abandoned', 'january', '1863', 'ulysses', 'grant', 'took', 'interest', 'upstream', 'entrance', 'canal', 'moved', 'heavy', 'rains', 'flooding', 'interfered', 'project', 'work', 'abandoned', 'march', 'grant', 'eventually', 'used', 'methods', 'capture', 'vicksburg', '1876', 'mississippi', 'changed', 'course', 'cutting', 'across', 'de', 'soto', 'point', 'near', 'route', 'old', 'canal', 'isolating', 'vicksburg', 'river', 'citys', 'river', 'access', 'since', 'restored', 'small', 'sectio

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\youssef\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [40]:
unique_words = set(filtered_data)

print(unique_words)

{'awards', 'sea', 'premiered', 'nominations', 'entrance', 'course', 'completion', 'imperial', 'work', 'network', 'grant', 'path', 'built', 'fushimi', 'considered', 'dig', 'methods', 'office', 'sites', 'written', 'garnered', 'citys', 'eight', 'williams', 'named', 'abandoned', 'wikipedias', 'level', 'segments', 'control', 'thomas', 'torii', 'mississippi', '15', 'popular', 'march', 'war', 'located', 'entertainment', 'confederacy', 'list', 'ninetythree', 'inaritaisha', 'isolating', 'news', 'york', 'depicted', 'hill', 'section', 'january', 'inariyama', 'one', 'organization', 'american', 'soto', 'levels', 'business', 'vicksburg', 'restored', 'photograph', 'largest', 'ft', 'tiangco', 'ulysses', 'give', 'kkai', 'interfered', 'public', 'mel', 'award', 'structure', 'events', 'around', 'hosts', 'full', 'academy', '24', 'construct', 'hosted', '1876', 'listed', '2014', 'grants', 'credit', 'basile', '2004', 'deity', 'opposite', 'philippine', 'sits', 'pmpc', 'flooding', 'festivals', 'received', 'heia