In [8]:
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from collections import Counter
import pandas as pd
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ambik\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
class WebScraper:
    def __init__(self,url):
        self.url=url
    def extract_article_text(self):
        response=requests.get(self.url)
        html_content=response.content
        soup=BeautifulSoup(html_content,'html.parser')
        article_text=soup.get_text()
        return article_text

In [10]:
class TextProcessor:
    def __init__(self,nltk_stopwords):
        self.nltk_stopwords=nltk_stopwords
    def tokenize_and_clean(self,text):
        words=text.split()
        filtered_words=[word.lower() for word in words if word.isalpha() and word.lower() not in self.nltk_stopwords]
        return filtered_words

In [13]:
class ETLPipeline:
    def __init__(self,url):
        self.url=url
        self.nltk_stopwords=set(stopwords.words('english'))
    def run(self):
        scraper=WebScraper(self.url)
        article_text=scraper.extract_article_text()
        processor=TextProcessor(self.nltk_stopwords)
        filtered_words=processor.tokenize_and_clean(article_text)
        word_freq=Counter(filtered_words)
        df=pd.DataFrame(word_freq.items(),columns=['words','frequencies'])
        df=df.sort_values(by='frequencies',ascending=False)
        return df

In [15]:
if __name__ == "__main__":
    #article_url = "https://amankharwal.medium.com/what-is-time-series-analysis-in-data-science-f89aaa1c0814"
    article_url = input("Enter the URL of the article: ")
    pipeline = ETLPipeline(article_url)
    result_df = pipeline.run()
    print(result_df.head())

Enter the URL of the article:  https://thecleverprogrammer.com/2023/08/14/web-data-etl-pipeline-using-python/


       words  frequencies
1       data           35
0        web           15
2        etl           14
3   pipeline           14
87     class           10


In [23]:
result_df

Unnamed: 0,words,frequencies
1,data,35
0,web,15
2,etl,14
3,pipeline,14
87,class,10
...,...,...
50,relevant,1
51,information,1
52,raw,1
53,transformed,1
