In [1]:
import requests

In [2]:
from bs4 import BeautifulSoup

In [3]:
from nltk.corpus import stopwords

In [4]:
from collections import Counter

In [5]:
import pandas as pd

In [6]:
import nltk

In [7]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/hashstudioz/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
# Extracting text from any article on the web

class WebScraper:
    
    def __init__(self, url):
        self.url = url
        
    def extract_arcticle_text(self):
        response = requests.get(self.url)
        html_content = response.content
        soup = BeautifulSoup(html_content, 'html.parser')
        article_text = soup.get_text()
        return article_text

In [9]:
# Clean and preprocess the text extracted from the article

In [10]:
class TextProcessor:
    
    def __init__(self, nltk_stopwords):
        self.nltk_stopwords = nltk_stopwords
        
    def tokenize_and_clean(self, text):
        words = text.split()
        filtered_words = [word.lower() for word in words if word.isalpha() and word.lower() not in self.nltk_stopwords]
        return filtered_words

In [11]:
# The entire ETL (Extract, Transform, Load) process for extracting article text, processing it, and generating 
# a DataFrame of word frequencies

In [36]:
class ETLPipeline:
    
    def __init__(self, url):
        self.url = url
        self.nltk_stopwords = set(stopwords.words('english'))
        
    def run(self):
        scrapper = WebScraper(self.url)
        article_text = scrapper.extract_arcticle_text()
        
        processor = TextProcessor(self.nltk_stopwords)
        filtered_words = processor.tokenize_and_clean(article_text)
        
        word_freq = Counter(filtered_words)
        df = pd.DataFrame(word_freq.items(), columns=['Words', 'Total_Count'])
#         df = df.sort_values(by='Total_Count', ascending=False)
        return df

In [39]:
if __name__ == '__main__':
#     article_url = "https://amankharwal.medium.com/what-is-time-series-analysis-in-data-science-f89aaa1c0814"
    article_url = "https://thecleverprogrammer.com/2023/08/14/web-data-etl-pipeline-using-python/"
    pipeline = ETLPipeline(article_url)
    result_df = pipeline.run()
    filename = "Article_Scraping.csv"
    result_df.to_csv(filename)


['Words', 'Total_Count']
         Words  Total_Count
0          web           15
1         data           32
2          etl           14
3     pipeline           14
4        using           10
..         ...          ...
198   facebook            1
199  instagram            1
200     medium            1
201   linkedin            1
202  copyright            1

[203 rows x 2 columns]
