In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
import re
import requests
from bs4 import BeautifulSoup
import numpy as np

In [2]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
def clean_data_text(data):
    # Normalization
    cleaned_data = re.sub(r'[^a-zA-Z\s]', '', data)
    #Tokonization
    tokonizes=nltk.word_tokenize(cleaned_data.lower())
    #Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokonizes if word not in stop_words]
    #stemming
    stemer=PorterStemmer()
    stemer_words=[stemer.stem(word) for word in tokens]
    
    lematizer=WordNetLemmatizer()
    lematizer_words=[lematizer.lemmatize(word) for word in tokonizes]

    unique_words=list(set(stemer_words))

    return unique_words

In [None]:
def extract_text_from_html(url):
    # Extract HTML from URL
    response = requests.get(url)
    html = response.text

    # Extract text from HTML
    soup = BeautifulSoup(html, 'html.parser')
    text = ""
    for paragraph in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
        text += paragraph.get_text() + " "
    return text

In [4]:
def get_unique_words(tokens):
    return list(set(tokens))

In [None]:
# Example usage
url = "https://en.wikipedia.org/wiki/Python_(programming_language)"
html_text = extract_text_from_html(url)
print("html text:", html_text)

html text: Contents Python (programming language) 
 Python is a high-level, general-purpose programming language. Its design philosophy emphasizes code readability with the use of significant indentation.[31]
 Python is dynamically typed and garbage-collected. It supports multiple programming paradigms, including structured (particularly procedural), object-oriented and functional programming. It is often described as a "batteries included" language due to its comprehensive standard library.[32][33]
 Guido van Rossum began working on Python in the late 1980s as a successor to the ABC programming language and first released it in 1991 as Python 0.9.0.[34] Python 2.0 was released in 2000. Python 3.0, released in 2008, was a major revision not completely backward-compatible with earlier versions. Python 2.7.18, released in 2020, was the last release of Python 2.[35]
 Python consistently ranks as one of the most popular programming languages, and has gained widespread use in the machine le

In [None]:
unique_words = clean_data_text(html_text)
print("Unique words:", unique_words)

Unique words: ['vision', 'avail', 'denialofservic', 'instead', 'lie', 'exist', 'platform', 'b', 'python', 'give', 'normal', 'commerci', 'gain', 'combinatori', 'cinema', 'februari', 'made', 'crossplatform', 'django', 'schedul', 'preced', 'rather', 'mapper', 'notebook', 'extens', 'alex', 'kind', 'equat', 'creator', 'per', 'elect', 'set', 'dynam', 'claus', 'metaprogram', 'dunder', 'mathemat', 'openbsd', 'messag', 'pythonedit', 'surround', 'receiv', 'retent', 'swift', 'best', 'case', 'revis', 'rank', 'choic', 'earlier', 'methodsedit', 'among', 'academ', 'readingedit', 'cover', 'outdat', 'rewrit', 'resolut', 'compon', 'team', 'queri', 'match', 'evolv', 'name', 'aid', 'term', 'silent', 'relat', 'tail', 'highest', 'much', 'alter', 'amoeba', 'pycharm', 'network', 'enter', 'fact', 'less', 'janu', 'valu', 'connect', 'compliment', 'easili', 'modifi', 'life', 'mostli', 'differ', 'curli', 'rubi', 'rise', 'support', 'pypi', 'problog', 'paradigm', 'bestow', 'time', 'field', 'tkinter', 'prematur', 'pa