In [1]:
import os
os.chdir("../")
!pwd

/d/Projects/Quote_Recommender


In [2]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """Read-only bundle of settings for the data-ingestion stage.

    Instances are frozen: assigning to a field after construction raises
    an error, so a config can be shared without risk of being modified.
    """

    # Directory belonging to the data-ingestion stage.
    root_dir: Path
    # File the collected data is written to.
    save_path: Path
    # Tags for which data is collected.
    tags: list
    # Number of pages to sample for each tag.
    num_pages: int

In [3]:
from src.Quote_Recommender.constants import *
from src.Quote_Recommender.utils.common import read_yaml, create_directories

In [10]:
class ConfigManager:
    """Loads the project's YAML config and params files and builds stage configs."""

    def __init__(
            self,
            config_filepath = CONFIG_FILE_PATH,
            params_filepath = PARAMS_FILE_PATH
    ):
        """Read both YAML files and hand the artifacts root to create_directories.

        Args:
            config_filepath: Location of the config YAML file
                (defaults to CONFIG_FILE_PATH).
            params_filepath: Location of the params YAML file
                (defaults to PARAMS_FILE_PATH).
        """
        self.config = read_yaml(Path(config_filepath))
        self.params = read_yaml(Path(params_filepath))

        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings from the loaded YAML content.

        Passes the stage's root directory to create_directories, then
        combines the `data_ingestion` section of the config file (paths) with
        the `data_ingestion` section of the params file (tags, page count).

        Returns:
            A DataIngestionConfig whose path fields are pathlib.Path objects.
        """
        config = self.config.data_ingestion
        params = self.params.data_ingestion
        create_directories([config.root_dir])

        # The YAML values are passed through Path so the fields match the
        # types DataIngestionConfig declares for root_dir and save_path.
        return DataIngestionConfig(
            root_dir=Path(config.root_dir),
            save_path=Path(config.save_path),
            tags=params.tags,
            num_pages=params.num_pages,
        )

In [5]:
import os
import requests
import pandas as pd
from collections import OrderedDict
from bs4 import BeautifulSoup
import random
from src.Quote_Recommender.utils.common import get_size
from src.Quote_Recommender import logger

In [13]:
class DataIngestion:
    """Scrapes tagged quotes from Goodreads and stores them in a CSV file."""

    # Page numbers are sampled from 1.._MAX_PAGE (inclusive) for every tag.
    _MAX_PAGE = 100
    # Seconds to wait on a single HTTP request before giving up, so one
    # unresponsive connection cannot stall the whole run indefinitely.
    _REQUEST_TIMEOUT = 30

    def __init__(self, config = DataIngestionConfig):
        """Store the ingestion settings.

        Args:
            config: Object exposing `save_path`, `tags` and `num_pages`.
        """
        # NOTE(review): the default value is the DataIngestionConfig class
        # itself, not an instance; it looks like a type annotation was
        # intended. Kept unchanged so existing callers are unaffected.
        self.config = config

    def download_data(self):
        """Scrape quotes for every configured tag unless the file already exists.

        When `config.save_path` exists, only its size is logged. Otherwise a
        random selection of listing pages is fetched per tag and the
        collected rows (columns "Quotes", "Author", "Tags") are written to
        `config.save_path` as CSV, including the DataFrame index.
        """
        save_path = self.config.save_path
        if os.path.exists(save_path):
            logger.info(f"file already exists of size: {get_size(Path(save_path))}")
            return

        records = []
        for tag in self.config.tags:
            for page in self._pick_pages(self.config.num_pages):
                records.extend(self._scrape_page(tag, page))

        # Passing the column names keeps the header present even when nothing
        # was collected.
        data_frame = pd.DataFrame(records, columns=["Quotes", "Author", "Tags"])
        data_frame.to_csv(save_path)
        logger.info(f"data scraped/collected at {save_path}")

    def _pick_pages(self, num_pages):
        """Return `num_pages` distinct random page numbers from 1.._MAX_PAGE.

        Sampling without replacement yields exactly the requested number of
        pages; independent draws followed by de-duplication could return
        fewer. The count is clamped to the range 0.._MAX_PAGE.
        """
        count = max(0, min(num_pages, self._MAX_PAGE))
        return random.sample(range(1, self._MAX_PAGE + 1), count)

    def _scrape_page(self, tag, page):
        """Fetch one listing page for `tag` and return its quotes as row dicts.

        A non-OK HTTP response is logged and produces no rows. Connection
        errors and timeouts raised by `requests` are not caught here.
        """
        url = f"https://www.goodreads.com/quotes/tag/{tag}?page={page}"
        page_res = requests.get(url, timeout=self._REQUEST_TIMEOUT)
        if not page_res.ok:
            logger.warning(f"skipping {url}: HTTP status {page_res.status_code}")
            return []

        content = BeautifulSoup(page_res.content, 'html.parser')
        records = []
        for quote_html in content.find_all('div', {'class': 'quoteDetails'}):
            record = self._parse_quote(quote_html)
            if record is None:
                logger.warning(f"skipping a quote block without text or author on {url}")
                continue
            records.append(record)
        return records

    @staticmethod
    def _parse_quote(quote_html):
        """Turn one `quoteDetails` element into a row dict.

        Returns:
            A dict with keys "Quotes", "Author" and "Tags", or None when the
            element has no quote text or no author node. "Tags" is None when
            the element has no tag container.
        """
        text_node = quote_html.find('div', {'class': 'quoteText'})
        author_node = quote_html.find('span', {'class': 'authorOrTitle'})
        if text_node is None or author_node is None:
            return None

        tags_node = quote_html.find('div', {'class': 'greyText smallText left'})
        if tags_node is None:
            tags_ls = None
        else:
            # OrderedDict.fromkeys drops repeated tags while keeping their
            # first-seen order.
            tags_ls = list(OrderedDict.fromkeys(
                link.get_text() for link in tags_node.find_all('a')
            ))
            if 'attributed-no-source' in tags_ls:
                tags_ls.remove('attributed-no-source')

        return {
            # Only the first line of the text block is kept as the quote.
            "Quotes": text_node.get_text().strip().split('\n')[0],
            "Author": author_node.get_text().strip(),
            "Tags": tags_ls,
        }

In [14]:
# Run the data-ingestion stage: load the configuration, build the stage
# object from it, and download the data.
try:
    config = ConfigManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config = data_ingestion_config)
    data_ingestion.download_data()
except Exception:
    # Record the failure together with its traceback in the project log, then
    # re-raise the same exception; a bare `raise` re-raises the active
    # exception as-is.
    # NOTE(review): assumes `logger` is a standard logging.Logger.
    logger.exception("data ingestion failed")
    raise

[2023-06-16 12:14:16,928: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-06-16 12:14:16,932: INFO: common: yaml file: params.yaml loaded successfully]
[2023-06-16 12:16:46,246: INFO: 4196187431: data scraped/collected at artifacts/data_ingestion/data.csv]
