In [None]:
import requests
from bs4 import BeautifulSoup
import nltk
import spacy
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter
import textwrap

import matplotlib.pyplot as plt

In [None]:
# Load the small English spaCy pipeline once; every spaCy-based helper in this
# notebook reads this module-level `nlp` object.
nlp = spacy.load('en_core_web_sm')

# Download required NLTK datasets.
# All resources are fetched here, in the setup cell, so that a fresh
# "Restart Kernel -> Run All" does not depend on a later patch cell.
# Newer NLTK releases look up the `punkt_tab` / `averaged_perceptron_tagger_eng`
# names, while older releases use `punkt` / `averaged_perceptron_tagger`;
# requesting both keeps the notebook working across versions.
for nltk_resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
def scrape_web_page(url, timeout=10):
    """
    Fetches HTML content from the given URL.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, `requests.get` can block indefinitely on a stalled
            connection.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise None (non-200 status, timeout, connection error, or any
        other `requests` failure).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Network-level failures are reported the same way as a bad status
        # code so callers only have to check for None.
        return None

    if response.status_code != 200:
        return None
    return response.text

In [None]:
def extract_text_from_html(html_content):
    """
    Returns the text of every <p> element in the HTML, joined by single spaces.
    """
    parsed_page = BeautifulSoup(html_content, 'html.parser')
    paragraph_texts = []
    for paragraph_tag in parsed_page.find_all('p'):
        paragraph_texts.append(paragraph_tag.get_text())
    return ' '.join(paragraph_texts)

In [None]:
def tokenize_text(text):
    """
    Splits text with NLTK and returns a (word tokens, sentence tokens) pair.
    """
    return word_tokenize(text), sent_tokenize(text)

In [None]:
def remove_stop_words(words):
    """
    Returns the words whose lowercase form is not an English NLTK stop word.
    Original casing and order of the kept words are preserved.
    """
    english_stops = frozenset(stopwords.words('english'))
    kept_words = []
    for candidate in words:
        if candidate.lower() in english_stops:
            continue
        kept_words.append(candidate)
    return kept_words

In [None]:
def stem_words(words):
    """
    Applies the Porter stemmer to each word and returns the stems as a list.
    """
    porter = PorterStemmer()
    stems = []
    for token in words:
        stems.append(porter.stem(token))
    return stems

In [None]:
def lemmatize_text(text):
    """
    Runs the text through the spaCy pipeline and returns each token's lemma.
    """
    lemmas = []
    for spacy_token in nlp(text):
        lemmas.append(spacy_token.lemma_)
    return lemmas

In [None]:
def extract_named_entities(text):
    """
    Runs the text through the spaCy pipeline and returns a list of
    (entity text, entity label) tuples.
    """
    found_entities = []
    for entity in nlp(text).ents:
        found_entities.append((entity.text, entity.label_))
    return found_entities

In [None]:
def pos_tag_nltk(words):
    """
    Tags an already-tokenized list of words with NLTK's part-of-speech tagger.
    """
    tagged_words = nltk.pos_tag(words)
    return tagged_words

In [None]:
def pos_tag_spacy(text):
    """
    Runs the text through the spaCy pipeline and returns a list of
    (token text, part-of-speech tag) tuples.
    """
    tagged_tokens = []
    for spacy_token in nlp(text):
        tagged_tokens.append((spacy_token.text, spacy_token.pos_))
    return tagged_tokens

In [None]:
def word_frequency(words, top_n=10):
    """
    Counts how often each word occurs and returns the most common ones.

    Args:
        words: Iterable of hashable tokens to count. Counting is exact-match,
            so differently cased tokens are tallied separately.
        top_n: How many of the most frequent words to return. Defaults to 10;
            pass None to get every distinct word.

    Returns:
        A list of (word, count) tuples ordered from most to least frequent.
    """
    return Counter(words).most_common(top_n)

In [None]:
def perform_nlp_tasks(text):
    """
    Orchestrates all NLP tasks and returns results as a dictionary.

    Args:
        text: The raw text to analyze.

    Returns:
        A dict with the keys "words", "sentences", "filtered_words",
        "stemmed_words", "lemmatized_words", "entities", "pos_tags_nltk",
        "pos_tags_spacy" and "word_freq".
    """
    # NLTK-based steps: tokenize, then filter/stem/tag the filtered tokens.
    words, sentences = tokenize_text(text)
    filtered_words = remove_stop_words(words)
    stemmed_words = stem_words(filtered_words)
    pos_tags_nltk = pos_tag_nltk(filtered_words)
    word_freq = word_frequency(filtered_words)

    # spaCy-based steps: run the pipeline once and derive lemmas, entities and
    # POS tags from the same Doc, instead of re-parsing the text for each one.
    doc = nlp(text)
    lemmatized_words = [token.lemma_ for token in doc]
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    pos_tags_spacy = [(token.text, token.pos_) for token in doc]

    return {
        "words": words,
        "sentences": sentences,
        "filtered_words": filtered_words,
        "stemmed_words": stemmed_words,
        "lemmatized_words": lemmatized_words,
        "entities": entities,
        "pos_tags_nltk": pos_tags_nltk,
        "pos_tags_spacy": pos_tags_spacy,
        "word_freq": word_freq,
    }

In [None]:
# Address of the article to download and analyze.
url = "https://apnews.com/article/lakers-blazers-score-lebron-6ed76fdd53d949a38bc0eadab4981959"

In [None]:
# Download the page; `html_content` is None when the fetch does not succeed,
# which the analysis cell below checks before doing any work.
html_content = scrape_web_page(url)

In [None]:
import nltk

In [None]:
# Extra tagger resource, fetched in addition to 'averaged_perceptron_tagger'.
# NOTE(review): presumably the installed NLTK version looks up this name for
# nltk.pos_tag — confirm, and consider moving this download to the setup cell.
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [None]:
if html_content:
    # Pull the paragraph text out of the downloaded page.
    text = extract_text_from_html(html_content)

    # Save a copy of the text wrapped at 80 columns. The encoding is pinned to
    # UTF-8 because scraped text can contain characters that the platform
    # default encoding cannot represent.
    wrapped_text = textwrap.fill(text, width=80)
    with open("web_content.txt", "w", encoding="utf-8") as file:
        file.write(wrapped_text)

    # Read the text from the file
    with open("web_content.txt", "r", encoding="utf-8") as file:
        text_from_file = file.read()

    nlp_results = perform_nlp_tasks(text_from_file)

    # (section title, key in nlp_results) pairs, in report order.
    report_sections = [
        ("Tokenized Words", "words"),
        ("Sentences", "sentences"),
        ("Filtered Words (Without Stop Words)", "filtered_words"),
        ("Stemmed Words", "stemmed_words"),
        ("Lemmatized Words", "lemmatized_words"),
        ("Named Entities", "entities"),
        ("POS Tags (Using NLTK)", "pos_tags_nltk"),
        ("POS Tags (Using SpaCy)", "pos_tags_spacy"),
        ("Word Frequency", "word_freq"),
    ]

    # Each header goes on its own line, followed by its content; sections are
    # separated by a blank line and the report ends with a single newline.
    output = "\n\n".join(
        f"========== {title} ==========\n{nlp_results[key]}"
        for title, key in report_sections
    ) + "\n"

    print(output)

    # Save the output to a file
    with open("nlp_output.txt", "w", encoding="utf-8") as file:
        file.write(output)
else:
    print("Failed to retrieve the webpage.")










