In [1]:
import requests
import pandas
import base64
import os
import bs4
from google import genai
from google.genai import types
from bs4 import BeautifulSoup 
from datetime import datetime
from pydantic import BaseModel
import json
from openpyxl import load_workbook
import uuid
import random
import pprint
import dotenv as env
env.load_dotenv('../keys.env')

True

In [2]:
def read_livemint_headlines(timeout=15):
    """Scrape the Livemint home page and return its top headlines.

    Two groups of anchors are read, in this order: the hero stories
    (``.heroStory .imgStory a``) and the regular news blocks
    (``li.newsBlock h3 a``). Anchors without an ``href`` are skipped because
    neither a link nor a link id can be derived from them.

    Args:
        timeout: Seconds to wait for the HTTP request. Without a timeout
            ``requests`` can block forever on a stalled connection.

    Returns:
        A ``pandas.DataFrame`` with the columns ``uuid``, ``site``, ``title``,
        ``link`` and ``link_id`` when at least one anchor matched a selector.
        ``None`` when nothing matched, when the request raised, or when the
        server did not answer with HTTP 200.
    """
    url = "https://www.livemint.com/"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as error:
        # Network failures are reported the same way as a bad status: None.
        print(f'Access to Livemint is failing: {error}')
        return None
    if response.status_code != requests.codes.ok:
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Collect plain records first and build the dataframe once at the end.
    rows = []
    matched_any = False
    for selector in ('.heroStory .imgStory a', 'li.newsBlock h3 a'):
        for story in soup.select(selector):
            matched_any = True
            link = story.get('href')
            if not link:
                continue

            # The headline is normally the first text node of the anchor. Fall
            # back to the full text when the anchor is empty or starts with a tag.
            first_child = story.contents[0] if story.contents else None
            if isinstance(first_child, str):
                title = first_child.strip()
            else:
                title = story.get_text(strip=True)

            rows.append({
                'uuid': uuid.uuid4(),
                'site': 'Livemint',
                'title': title,
                'link': link,
                # 14 characters starting 19 from the end of the link;
                # presumably the numeric article id in front of ".html" - TODO confirm.
                'link_id': link[-19:][:14],
            })

    if not matched_any:
        return None

    # Don't change these column names, they are important for merging later.
    return pandas.DataFrame(rows, columns=['uuid', 'site', 'title', 'link', 'link_id'])

In [3]:
def read_et_headlines(timeout=15):
    """Scrape the Economic Times home page and return its headlines.

    This function can produce duplicates in the returned dataframe: every
    anchor matched by the hero selector (``#investIdeas ul.newsList li a``) is
    also matched by the general selector (``ul.newsList li a``). Eliminate the
    duplicates using ``link_id`` in post-processing.

    Args:
        timeout: Seconds to wait for the HTTP request. Without a timeout
            ``requests`` can block forever on a stalled connection.

    Returns:
        A ``pandas.DataFrame`` with the columns ``uuid``, ``site``, ``title``,
        ``link`` and ``link_id`` when at least one anchor matched a selector.
        ``None`` when nothing matched, when the request raised, or when the
        server did not answer with HTTP 200.
    """
    base_url = 'https://economictimes.indiatimes.com'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119 Safari/537.36'
    }
    try:
        response = requests.get(f'{base_url}/', headers=headers, timeout=timeout)
    except requests.RequestException as error:
        # Network failures are reported the same way as a bad status: None.
        print(f'Access to ET is failing: {error}')
        return None
    if response.status_code != requests.codes.ok:
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Collect plain records first and build the dataframe once at the end.
    rows = []

    def add_row(title, link):
        # ET links are not consistent in design: some are site-relative, some
        # are already absolute. Only relative links get the site prefix.
        if link.startswith(('http://', 'https://')):
            absolute_link = link
        else:
            absolute_link = f'{base_url}{link}'
        rows.append({
            'uuid': uuid.uuid4(),
            'site': 'ET',
            'title': title,
            'link': absolute_link,
            # 9 characters starting 13 from the end of the link; presumably the
            # numeric article id in front of ".cms" - TODO confirm.
            'link_id': link[-13:][:9],
        })

    hero_stories = soup.select('#investIdeas ul.newsList li a')
    if not hero_stories:
        print('No hero stories found')
    for story in hero_stories:
        href = story.get('href')
        if not href:
            continue
        # Everything after "?" is tracking junk (source=homepage&medium=...).
        add_row(story.get_text(), href.split('?')[0])

    news_stories = soup.select('ul.newsList li a')
    if not news_stories:
        print('No topline stories found')
    for story in news_stories:
        href = story.get('href')
        if not href:
            continue
        text = story.get_text()
        link = href.split('?')[0]
        # No need to process section labels, empty anchors or timeslearn links.
        if text in ('', 'ET MARKETS') or 'timeslearn' in link:
            continue
        add_row(text, link)

    if not (hero_stories or news_stories):
        return None

    # Don't change these column names, they are important for merging later.
    return pandas.DataFrame(rows, columns=['uuid', 'site', 'title', 'link', 'link_id'])

In [4]:
def read_livemint_extra(timeout=15):
    """Older, block-by-block Livemint scraper, kept for reference.

    Original author's note: this function is being removed because the simpler
    Livemint scraper defined before it in this notebook is more comprehensive.

    It walks the first two ``contentBox`` blocks inside ``div.designCol3``
    (one hero story plus the ``newsBlock`` items of each) and then tries the
    ``div.rightBlockNew`` column.

    Known limitation (from the original notes): the right-hand column is
    likely rendered by JavaScript after the initial page load, so it is usually
    absent from the HTML that ``requests`` receives; reading it reliably would
    need a browser-driven fetch such as Selenium. A missing right block is
    therefore reported with a message instead of crashing.

    Args:
        timeout: Seconds to wait for the HTTP request. Without a timeout
            ``requests`` can block forever on a stalled connection.

    Returns:
        A ``pandas.DataFrame`` with the columns ``uuid``, ``site``, ``title``
        and ``link`` when the central or the right block was found, otherwise
        ``None`` (also on a failed request or a non-200 answer).
    """
    url = "https://www.livemint.com/"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        print('Access to Livemint is failing')
        return None
    if response.status_code != requests.codes.ok:
        print('Access to Livemint is failing')
        return None

    # Always better to grow lists first and form a dataframe from them.
    rel_titles = []
    rel_links = []
    uuids = []

    def record_first_anchor(node, selector):
        # Store title/link of the first anchor under `node` matching
        # `selector`; silently skip when the node, the anchor or its content
        # is missing instead of raising on an empty selection.
        anchors = node.select(selector) if node is not None else []
        if not anchors:
            return
        anchor = anchors[0]
        if not anchor.contents:
            return
        rel_titles.append(anchor.contents[0])
        rel_links.append(anchor.get('href'))
        uuids.append(uuid.uuid4())

    soup = BeautifulSoup(response.content, 'html.parser')

    central_block = soup.find("div", {"class": "designCol3"})
    if central_block:
        blocks = central_block.find_all('div', {'class': 'contentBox'})
        if blocks:
            for block in blocks[:2]:
                # At most one hero story per block; the original notes say it
                # only appears in the first block.
                record_first_anchor(block.find('li', {'class': 'heroStory'}), 'h2 a')
                for item in block.find_all('li', {'class': 'newsBlock'}):
                    record_first_anchor(item, 'h3 a')
        else:
            print('Vertical main content blocks not found')
    else:
        print('Main content block not found')

    right_block = soup.find('div', {'class': 'rightBlockNew'})
    if right_block:
        # NOTE(review): 'newsLinsting' looks like a typo for 'newsListing', but
        # it may be the class name the site really uses - verify before changing.
        for listing in right_block.find_all('ul', {'class': 'newsLinsting'}):
            record_first_anchor(listing.find('div', {'class': 'storyList'}), 'a')
    else:
        print('Right content block not found')

    if not (central_block or right_block):
        return None

    top_headlines = pandas.DataFrame({
        'uuid': uuids,
        'title': rel_titles,     # Don't change these column names, they are important for merging later
        'link': rel_links,
    })
    top_headlines.insert(1, 'site', 'Livemint')
    return top_headlines

In [5]:
# Don't change these field names; they are important for data processing later.

# One classified headline as returned by the model. classify_headlines passes
# list[Classifications] as the Gemini response_schema, so these field names and
# types define the structure of the JSON the model is forced to return.
class Classifications(BaseModel):
    title: str              # the original sentence (headline) that was classified
    classification: bool    # True: the sentence has a stock recommendation; False: it does not
    explanation: str        # the model's short reasoning (the prompt asks for 50 characters)

In [6]:
# Process the main headlines to see which ones contain stock recommendations.
# Produces a JSON string that follows the schema defined above in class Classifications.

def classify_headlines(headlines):
    """Ask Gemini which headlines contain an explicit stock recommendation.

    Args:
        headlines: The headline texts, as a ``pandas.Series`` or any other
            iterable of strings (do not pass a whole DataFrame - iterating it
            yields column names, not rows).

    Returns:
        The model's reply as a JSON string: a list of objects following the
        ``Classifications`` schema (``title``, ``classification``,
        ``explanation``). For empty input the string ``'[]'`` is returned
        without calling the API.
    """
    titles = [str(title) for title in headlines]
    print(f'Executing headline classification for {len(titles)} headlines')
    if not titles:
        # Nothing to classify: skip the API call, keep the reply json-loadable.
        return '[]'

    client = genai.Client(
            api_key=os.environ.get("GEMINI_API_KEY"),
        )

    # Model list: https://ai.google.dev/gemini-api/docs/models
    model = 'gemini-2.0-flash-lite'

    # The prompt text (including its line indentation) is left unchanged.
    system_prompt = """You are a text analyser to classify if a given sentence has a recommendation for acting on any stocks. 
                                                    You are given a json list of individual sentences. Operate on each sentence as per the following logic.
                                                    Looking only at the contents of each the sentences and identifying words like *recommend*, *buy*, and phrases like *what to buy*, *which to buy*, *buy or sell*, you classify 
                                                    if the sentence points to an explicit recommendation.
                                                    Your output should be a True or a False where True means that the sentence has a stock recommendation and False means it does not.
                                                    Explain your classifying logic for the given sentence in 50 characters.
                                                    Return your response of the original sentence, your classification and your logic against each sentence.
                                                    """

    generate_content_config = types.GenerateContentConfig(
        # NOTE(review): 0.8 is a fairly high temperature for a classification
        # task; a lower value may give more repeatable answers - confirm intent.
        temperature=0.8,
        system_instruction=[types.Part.from_text(text=system_prompt)],
        response_mime_type='application/json',
        response_schema=list[Classifications],    # force a structured reply from Gemini
    )

    # The prompt promises "a json list of individual sentences", so send a real
    # JSON array of strings. Series.to_json() would instead produce an object
    # keyed by the index labels, e.g. {"0": "...", "1": "..."}.
    payload = json.dumps(titles, ensure_ascii=False)
    contents = [
        types.Content(
            role="user",
            parts=[types.Part.from_text(text=payload)],
        ),
    ]

    model_reply = client.models.generate_content(model=model, contents=contents, config=generate_content_config)
    return model_reply.text

In [7]:
def read_livemint2(timeout=15):
    """Debug helper: print the link and title of each Livemint news-block headline.

    For every anchor matched by ``li.newsBlock h3 a`` the ``href`` is printed
    on one line and the headline text on the next. Nothing is returned.

    Args:
        timeout: Seconds to wait for the HTTP request. Without a timeout
            ``requests`` can block forever on a stalled connection.
    """
    url = "https://www.livemint.com/"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as error:
        print(f'Access to Livemint is failing: {error}')
        return
    if response.status_code != requests.codes.ok:
        # Don't parse an error page as if it were the home page.
        print('Access to Livemint is failing')
        return

    soup = BeautifulSoup(response.content, 'html.parser')
    for story in soup.select('li.newsBlock h3 a'):
        # The headline is normally the first text node of the anchor. Fall back
        # to the full text when the anchor is empty or starts with a tag.
        first_child = story.contents[0] if story.contents else None
        if isinstance(first_child, str):
            title = first_child.strip()
        else:
            title = story.get_text(strip=True)
        print(story.get('href'))
        print(title)

In [8]:
#one line tester
#hl = read_et_headlines()
#hl.drop_duplicates(subset=['link_id'], keep='first', inplace=True, ignore_index=True)
#hl.to_csv('et_hl_text.csv')
#print(hl.shape)

In [9]:
# One line tester.

#outcome = classify_headlines(pandas.Series(['BJPs jibe at Cong over National Herald case: Not Indira Gandhis emergency',
#                                            'Buy or sell: Sumeet Bagadia recommends three stocks to buy on Monday — 21 April'
#                                           ]))   #no longer valid as we are now sending a Series
#outdf = pandas.DataFrame(json.loads(outcome))
#outdf