In [1]:
import base64
import json
import os
import pprint as pp
import random
import re
import uuid
from datetime import datetime
from typing import Optional

import bs4
import pandas
import requests
from bs4 import BeautifulSoup
from pydantic import BaseModel

In [2]:
class Livemint_reader:
    """Scrape story links from a Livemint listing page and read single articles.

    The instance remembers one listing URL (the Livemint home page unless
    another URL is given). ``find_articles_on_home_page`` lists the stories
    linked from that page; ``read_article`` downloads and parses one article.
    Both methods return ``None`` when the server answers with a non-OK status
    and let ``requests`` exceptions (connection errors, timeouts) propagate.
    """

    # Browser-like User-Agent sent with every request.
    __headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119 Safari/537.36'}

    # Seconds to wait for the server before giving up; without a timeout a
    # stalled connection would block the notebook indefinitely.
    __timeout = 30

    # CSS selectors for the story anchors on the listing page, in the order
    # their rows appear in the result: hero block first, then the news list.
    __home_selectors = ('.heroStory .imgStory a', 'li.newsBlock h3 a')

    def __init__(self, url=''):
        """Remember the listing page to scrape.

        Args:
            url: Listing page URL. Any falsy value (the default ``''``)
                selects ``https://www.livemint.com/``.
        """
        self.__base_url = url or 'https://www.livemint.com/'

    @staticmethod
    def __story_rows(anchors) -> list:
        """Turn story ``<a>`` tags into row dicts (uuid, title, link, link_id).

        Anchors without an ``href`` attribute are skipped: they cannot be
        followed and have no link to derive an id from.
        """
        rows = []
        for story in anchors:
            href = story.get('href')
            if href is None:
                continue

            # Prefer the anchor's leading text node. When the anchor is empty
            # or starts with a nested tag, fall back to all of its text
            # instead of failing.
            first_child = story.contents[0] if story.contents else None
            if isinstance(first_child, str):
                title = first_child.strip()
            else:
                title = story.get_text(' ', strip=True)

            rows.append({
                # Drawn from the `random` module, so ids follow random.seed().
                'uuid': uuid.UUID(int=random.getrandbits(128), version=4),
                'title': title,
                'link': href,
                # First 14 of the last 19 characters of the link; presumably
                # the numeric article id in front of '.html' -- TODO confirm.
                'link_id': href[-19:][:14],
            })
        return rows

    def find_articles_on_home_page(self) -> Optional[pandas.DataFrame]:
        """List the stories linked from the listing page.

        Note from the original author: made redundant with feed reading.

        Returns:
            A DataFrame with columns ``uuid``, ``site``, ``title``, ``link``,
            ``link_id`` (hero stories first, then the news list), or ``None``
            when the response status is not OK or no usable story link is
            found.
        """
        response = requests.get(
            url=self.__base_url,
            headers=Livemint_reader.__headers,
            timeout=Livemint_reader.__timeout,
        )
        if response.status_code != requests.codes.ok:
            return None

        soup = BeautifulSoup(response.content, 'html.parser')

        # Collect plain rows first and build the frame once at the end.
        rows = []
        for selector in Livemint_reader.__home_selectors:
            rows.extend(Livemint_reader.__story_rows(soup.select(selector)))
        if not rows:
            return None

        # Don't change these column names, they are important for merging later.
        top_headlines = pandas.DataFrame(rows, columns=['uuid', 'title', 'link', 'link_id'])
        top_headlines.insert(1, 'site', 'Livemint')
        return top_headlines

    def read_article(self, art_url: str) -> Optional[pandas.DataFrame]:
        """Download one article and extract its headline, subhead, body and date.

        The listing URL stored on the instance is left untouched, so the
        reader can still list the home page after reading articles.

        Args:
            art_url: URL of the article page.

        Returns:
            A one-row DataFrame with columns ``article_title``,
            ``article_subheader``, ``article_body``, ``article_date``; a part
            that cannot be found is left as ``''`` and reported with a printed
            message. ``None`` when the response status is not OK.
        """
        headline = ''
        subhead = ''
        body = ''
        art_date = ''

        art_res = requests.get(
            url=art_url,
            headers=Livemint_reader.__headers,
            timeout=Livemint_reader.__timeout,
        )
        if art_res.status_code != requests.codes.ok:
            print(f'Access to specific article {art_url} is failing')
            return None

        art_soup = BeautifulSoup(art_res.content, 'html.parser')

        # Headline: looked up by element id.
        head_h1 = art_soup.find('h1', {'id': 'article-0'})
        if head_h1:
            headline = head_h1.get_text()
        else:
            print(f'{art_url}: No headline found')

        # Subhead: matched on a class-name prefix. Original author's note:
        # this lookup may break often; fall back to a plain h2 if it does.
        sub_head_h2 = art_soup.find('h2', class_=re.compile(r'storyPage_summary.*'))
        if sub_head_h2:
            subhead = sub_head_h2.get_text()
        else:
            print(f'{art_url}: No subhead found')

        art_body_divs = art_soup.find_all('div', {'class': 'storyParagraph'})
        if art_body_divs:
            for para_div in art_body_divs:
                para = para_div.get_text()
                # Drop boilerplate paragraphs to keep the text short for LLMs.
                if 'Disclaimer' in para:
                    continue
                body = f'{body}\n{para}' if body else para
        else:
            print(f'{art_url}: No Article Body found')

        # Date: matched on a class-name prefix, like the subhead.
        art_date_div = art_soup.find('div', class_=re.compile(r'storyPage_date.*'))
        if art_date_div:
            art_date = art_date_div.get_text()
        else:
            print(f'{art_url}: No article date found')

        return pandas.DataFrame(
            [[headline, subhead, body, art_date]],
            columns=['article_title', 'article_subheader', 'article_body', 'article_date'],
        )


In [3]:
#Tester
#rdr = Livemint_reader()
#home_links = rdr.find_articles_on_home_page()
#pp.pprint(home_links)
#art1 = rdr.read_article(home_links.iloc[0, 3])
#pp.pprint(art1)