In [1]:
import requests
from bs4 import BeautifulSoup
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [2]:
# One-time setup: uncomment and run the two lines below if the NLTK data used
# later in this notebook (the tokenizer models behind word_tokenize and the
# English stopword list) has not been downloaded on this machine yet.
# import nltk
# nltk.download()

## Fetch and Parse HTML

Use the `requests` library to fetch the HTML content of a webpage and then use BeautifulSoup to parse it.

In [3]:
# Page whose HTML is fetched and parsed.
url = "https://growdataskills.com/project-nlp"

# requests waits indefinitely by default; an explicit timeout (in seconds)
# keeps the cell from hanging forever if the server never answers.
response = requests.get(url, timeout=10)

# Fail loudly on 4xx/5xx responses so an error page is not silently parsed
# as if it were the real content.
response.raise_for_status()
html_content = response.content

# Parse the raw response bytes with the "html.parser" backend.
soup = BeautifulSoup(html_content, "html.parser")

## Extract Text Data

Once you have the parsed HTML, extract the relevant text data using methods such as `.find()`, `.find_all()`, and `.get_text()`.

In [4]:
# Example: gather every <p> tag found in the parsed page
paragraphs = soup.find_all('p')

# Reduce each tag to its text content, giving one string per paragraph tag
paragraph_texts = list(map(lambda tag: tag.get_text(), paragraphs))

## Text Preprocessing

Text preprocessing involves various steps to clean and normalize the extracted text.

In [5]:
# Convert to lowercase so words match the lowercase entries of the stopword list
lowercase_text = [text.lower() for text in paragraph_texts]

# Replace every character that is not a letter, digit, or whitespace with a
# space. Substituting a space (rather than deleting the character) keeps words
# that were separated only by punctuation apart: "data-driven" becomes
# "data driven" instead of "datadriven", and "don't" becomes "don t" (both
# pieces are stopwords and get removed) instead of the unmatched "dont".
cleaned_text = [re.sub(r'[^a-zA-Z0-9\s]', ' ', text) for text in lowercase_text]

# Tokenization: split each cleaned paragraph into a list of word tokens
tokenized_text = [word_tokenize(text) for text in cleaned_text]

# Remove stopwords (held in a set for fast membership checks)
stop_words = set(stopwords.words('english'))
filtered__text = [[word for word in tokens if word not in stop_words] for tokens in tokenized_text]

# Stemming: reduce each remaining word to its Porter stem
stemmer = PorterStemmer()
stemmed_text = [[stemmer.stem(word) for word in tokens] for tokens in filtered__text]

## Further Processing

You can perform additional steps such as removing empty tokens, converting the processed text back to sentences or paragraphs, and so on, based on your requirements.

In [6]:
# Remove empty tokens (anything that is empty or whitespace-only)
final_text = [[word for word in tokens if word.strip()] for tokens in stemmed_text]

# Convert tokens back to sentences: one space-joined string per paragraph
sentences = [' '.join(tokens) for tokens in final_text]

# Convert sentences back to paragraphs, separated by a blank line. Paragraphs
# that ended up with no tokens (e.g. an empty <p> tag, or one containing only
# stopwords) are skipped so they do not leave runs of blank lines in the output.
processed_paragraphs = '\n\n'.join(sentence for sentence in sentences if sentence)

## Save Processed Text

Finally, you can save the processed text to a file for further analysis.

In [7]:
# Write the processed text to disk as UTF-8 so it can be reused later.
with open('processed_text.txt', mode='w', encoding='utf-8') as file:
    file.write(processed_paragraphs)

# Echo the same text below the cell for a quick visual check.
print(processed_paragraphs)

shubhankit sirvaiya data scientist world wide technolog ex accentur growth strategi mentor 700 student around data scienc domain

empow aspir data profession soar upskil opportun

copyright 2023 grow data skill right reserv

welcom grow data skill chat support repres respond within hour

enquiri
