In [45]:
import requests
from bs4 import BeautifulSoup
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


In [46]:
def get_html_text(url):
    try:
        # Fetch HTML content from the URL
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for any HTTP error
        html_content = response.text
        
        # Parse HTML using BeautifulSoup
        soup = BeautifulSoup(html_content, 'html.parser')
        
        # Extract text from the parsed HTML
        text = soup.get_text()
        
        return text
    except requests.exceptions.RequestException as e:
        print("Error fetching URL:", e)
        return None


In [47]:
def tokenize(text):
    tokens = []
    current_token = ""
    for char in text:
        if char.isalnum():  # Check if the character is alphanumeric
            current_token += char
        else:
            if current_token:
                tokens.append(current_token.lower())  # Add the token to the list
                current_token = ""
            if char.strip():  # Check if the character is not whitespace
                tokens.append(char)  # Add non-alphanumeric characters as separate tokens
    if current_token:
        tokens.append(current_token.lower())  # Add the last token if any
    return tokens

In [48]:


def preprocess_text(text):
    # Remove non-alphanumeric characters and convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower()

    
    # Tokenize the text
    tokens = tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    
    # Stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
    
    return stemmed_tokens




In [54]:
# Example usage:
url = "https://edusites.uregina.ca/brandymogg/2021/06/11/i-am-a-baker-but-please-dont-ask-me-to-bake-a-cake-in-code/"

In [55]:

html_text = get_html_text(url)
if html_text:
    preprocessed_tokens = preprocess_text(html_text)
    unique_words = set(preprocessed_tokens)
    print("Unique words:", unique_words)
else:
    print("Failed to fetch or parse HTML content.")

Unique words: {'final', 'right', 'acknowledg', 'pennington', 'troubl', 'quotat', 'reward', 'dr', 'doesnt', 'squar', 'resourc', 'haili', 'manifesto', 'everyon', 'sort', 'video', 'next', 'figur', 'replyyour', 'back', 'imagin', 'day', 'realli', 'someth', 'first', 'recent', 'java', 'skill', 'last', 'jun', 'physic', 'knowledg', 'nice', 'entri', 'orient', 'done', 'blog', 'krengnektak', 'basic', 'wordpressorg', 'believ', 'ago', 'june', 'get', 'gone', 'critic', 'far', 'second', 'though', 'search', 'use', 'real', 'see', 'futur', 'websit', 'error', 'hit', 'certain', 'import', 'got', 'twitter', 'class', 'teacher', 'come', 'project', 'tri', 'anyon', 'make', 'stick', 'coveragenext', 'sinc', 'pretti', 'bribe', 'e', 'given', 'put', 'literaci', 'skip', 'dobak', 'watch', 'job', 'grade', 'ever', 'categori', 'us', 'rule', 'develop', 'game', 'insert', 'attempt', 'embarrassingli', 'amaz', 'khavari', 'give', 'notion', 'mogg', 'cleanportfolio', 'avail', 'includ', 'endin', 'share', 'life', 'especi', 'either',

In [57]:
for word in unique_words:
    if len(word) < 3 :
        print(word)

dr
e
us
