<a href="https://colab.research.google.com/github/arkapriyathecoderinprogress/WebDataETLPipeline/blob/main/WebDataETL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install beautifulsoup4 nltk



In [3]:
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from collections import Counter
import pandas as pd
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
class WebScraper:
  def __init__(self,url):
    self.url = url

  def extract_article_text(self):
    response = requests.get(self.url)
    html_content = response.content
    soup = BeautifulSoup(html_content, "html.parser")
    article_text = soup.get_text()
    return article_text

In [8]:
class TextProcessor:
  def __init__(self,nltk_stopwords):
    self.nltk_stopwords = nltk_stopwords

  def tokenize_and_clean(self,text):
    words = text.split()
    filtered_words = [word.lower() for word in words if word.isalpha() and word.lower() not in self.nltk_stopwords]
    return filtered_words

In [9]:
class ETLPipeline:
  def __init__(self,url):
    self.url = url
    self.nltk_stopwords = set(stopwords.words("english"))

  def run(self):
    scraper = WebScraper(self.url)
    article_text = scraper.extract_article_text()

    processor = TextProcessor(self.nltk_stopwords)
    filtered_words = processor.tokenize_and_clean(article_text)

    word_freq = Counter(filtered_words)
    df = pd.DataFrame(word_freq.items(), columns=["Words", "Frequencies"])
    df = df.sort_values(by="Frequencies", ascending=False)
    return df

In [10]:
if __name__ == "__main__":
  article_url = "https://www.investopedia.com/terms/m/montecarlosimulation.asp"
  pipeline = ETLPipeline(article_url)
  result_df = pipeline.run()
  print(result_df.head(20))

           Words  Frequencies
1          carlo           56
0          monte           53
80    simulation           35
211        price           21
63          best           21
333        daily           20
147       random           15
149         used           14
82          risk           13
84   simulations           13
59         rates           12
46     financial           12
138  probability           12
18          view           12
332     periodic           11
215          use           11
334      returns           10
34      personal            9
355     function            9
193      average            9
