In [1]:
import requests
import pandas
import base64
import os
import bs4
from google import genai
from google.genai import types
from bs4 import BeautifulSoup 
from datetime import datetime
from pydantic import BaseModel
import json
from openpyxl import load_workbook
import uuid
import random

In [2]:
# Record the pandas version this notebook was executed with, for reproducibility.
print(pandas.__version__)

2.2.3


In [8]:
def _append_story(anchors, rel_titles, rel_links, uuids):
    """Record the first anchor in ``anchors`` as one headline row.

    Appends the headline text, its ``href`` and a freshly generated version-4
    UUID to the three parallel column lists. Does nothing when ``anchors`` is
    empty or the anchor carries no usable text, so the three lists always stay
    the same length.
    """
    if not anchors:
        return
    anchor = anchors[0]

    # Prefer the anchor's leading text node; fall back to the full anchor text
    # when the first child is a nested tag or only whitespace.
    first_child = anchor.contents[0] if anchor.contents else None
    # bs4 text nodes subclass str; str() yields a plain string that does not
    # keep a reference to the parse tree.
    title = str(first_child).strip() if isinstance(first_child, str) else ''
    if not title:
        title = anchor.get_text(" ", strip=True)
    if not title:
        return

    rel_titles.append(title)
    rel_links.append(anchor.get('href'))
    uuids.append(uuid.UUID(int=random.getrandbits(128), version=4))


def read_livemint(timeout=30):
    """Scrape the top headlines from the Livemint front page.

    Looks at the first two ``contentBox`` blocks inside the ``designCol3``
    column and collects the hero story (if the block has one) followed by every
    ``newsBlock`` item.

    Args:
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A pandas DataFrame with columns ``uuid``, ``site``, ``title`` and
        ``link`` (one row per headline), or ``None`` when the page cannot be
        fetched or the expected layout blocks are missing. A message describing
        the failure is printed in the ``None`` case.
    """
    url = "https://www.livemint.com/"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119 Safari/537.36'
    }
    try:
        # Without a timeout requests waits indefinitely on an unresponsive server.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.exceptions.RequestException as exc:
        # Network-level failures follow the same "print and return None"
        # contract as a non-OK status code.
        print('Access to Livemint is failing')
        print(exc)
        return None

    if response.status_code != requests.codes.ok:
        print('Access to Livemint is failing')
        return None

    # Always better to grow plain lists first and build the DataFrame from them once.
    rel_titles = []
    rel_links = []
    uuids = []

    # Parse the page for the relevant links and story titles.
    soup = BeautifulSoup(response.content, 'html.parser')
    central_block = soup.find("div", {"class": "designCol3"})
    if central_block is None:
        print('Main content block not found')
        return None

    blocks = central_block.find_all('div', {'class': 'contentBox'})
    if not blocks:
        print('Vertical main content blocks not found')
        return None

    # Only the first two content blocks are scraped.
    for block in blocks[:2]:
        # There is at most one hero story, and it appears only in the first block.
        hero_link = block.find('li', {'class': 'heroStory'})
        if hero_link is not None:
            _append_story(hero_link.select('h2 a'), rel_titles, rel_links, uuids)

        # The remaining stories of the block (the hero story is handled above).
        for link in block.find_all('li', {'class': 'newsBlock'}):
            _append_story(link.select('h3 a'), rel_titles, rel_links, uuids)

    top_headlines = pandas.DataFrame({
        'uuid': uuids,
        'title': rel_titles,     # Don't change these column names, they are important for merging later
        'link': rel_links
    })
    top_headlines.insert(1, 'site', 'Livemint')

    # No scrape-date column is added here; the date is taken from the article itself.
    return top_headlines

In [15]:
# Don't change these names; they are important for data processing later.

# Schema for one classified headline. classify_headlines passes
# list[Classifications] to Gemini as the structured response schema.
# NOTE(review): documented with # comments rather than a class docstring because
# a docstring would presumably be emitted into the generated schema — confirm.
class Classifications(BaseModel):
    title: str              # the original headline sentence, echoed back by the model
    classification: bool    # True = the sentence contains a stock recommendation, False = it does not
    explanation: str        # the model's short reasoning (the prompt asks for about 50 characters)

In [13]:
# Process the main headlines to see which ones contain stock recommendations.
# Produces a JSON string that follows the schema defined in class Classifications above.

def classify_headlines(headlines, model='gemini-2.0-flash-lite'):
    """Ask Gemini which headlines contain an explicit stock recommendation.

    The headlines are joined with '|' and sent as a single user message; the
    model is forced to answer in the ``list[Classifications]`` JSON schema.

    Args:
        headlines: Iterable of headline strings. ``None`` or an empty iterable
            short-circuits without calling the API.
        model: Gemini model name to use.
            See https://ai.google.dev/gemini-api/docs/models for the list.

    Returns:
        The model reply as a JSON string: an array of objects with the keys
        ``title``, ``classification`` and ``explanation``. ``"[]"`` is returned
        for empty input.

    Raises:
        RuntimeError: If the ``GEMINI_API_KEY`` environment variable is not set.
    """
    headlines = [] if headlines is None else list(headlines)
    if not headlines:
        # Nothing to classify: return an empty JSON array so callers can still json.loads() it.
        return "[]"

    # SECURITY: the API key must come from the environment, never from the notebook
    # source. Any key that was previously committed in this file should be revoked.
    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "GEMINI_API_KEY environment variable is not set; "
            "export it before calling classify_headlines()."
        )
    client = genai.Client(api_key=api_key)

    system_prompt = """You are a text analyser to classify if a given sentence has a recommendation for acting on any stocks. 
                                                    You are fed a list of sentences, separated by '|'. 
                                                    Split the input sentence on the '|' delimiter and operate one by one on the split outputs.
                                                    Looking only at the contents of each the sentences and identifying words like *recommend*, *buy*, and phrases like *what to buy*, *which to buy*, *buy or sell*, you classify 
                                                    if the sentence points to an explicit recommendation.
                                                    Your output should be a True or a False where True means that the sentence has a stock recommendation and False means it does not.
                                                    Explain your classifying logic for the given sentence in 50 characters.
                                                    Return your response of the original sentence, your classification and your logic against each sentence.
                                                    """

    generate_content_config = types.GenerateContentConfig(
        temperature=0.8,
        system_instruction=[types.Part.from_text(text=system_prompt)],
        response_mime_type='application/json',
        response_schema=list[Classifications],    # force a structured reply from Gemini
    )

    # TODO: send a JSON list of strings instead of a '|'-joined sentence. Cheaper
    # models can mix up the sentence and the delimiter and miss some sentences,
    # and a headline that itself contains '|' would be split incorrectly.
    contents = [
        types.Content(
            role="user",
            parts=[types.Part.from_text(text="|".join(headlines))],
        ),
    ]

    model_reply = client.models.generate_content(
        model=model,
        contents=contents,
        config=generate_content_config,
    )
    return model_reply.text

In [10]:
# One-line tester: scrape the Livemint front page once and display the result.
# read_livemint() returns None (after printing a message) when the page cannot
# be fetched or the expected layout blocks are missing.
hl = read_livemint()
hl

Unnamed: 0,uuid,site,title,link
0,239cca47-e1f6-4748-8abf-4b61605fd0a1,Livemint,Trump tariffs: Can India navigate the global t...,https://www.livemint.com/economy/trump-tariffs...
1,6eb5785f-30a1-4486-b717-abcb220cef76,Livemint,US exempts THESE 20 items from Trump tariffs a...,https://www.livemint.com/news/us-news/from-sma...
2,fd408d87-581c-4983-aa8c-aa705214d844,Livemint,US Fed expects tariff-led inflation to hit 3.5...,https://www.livemint.com/economy/us-fed-eyes-t...
3,9f9650f3-7806-4433-920f-8defe664a741,Livemint,Russia hits Indian firm’s warehouse in Ukraine...,https://www.livemint.com/news/world/russian-mi...
4,30f3d101-e4f2-462b-b34a-66602a31caf0,Livemint,"Murshidabad Violence: Over 150 arrested, Govt ...",https://www.livemint.com/news/murshidabad-viol...
5,fdc94db8-0518-4db7-9c63-6d1f8c4fc59f,Livemint,TN CM MK Stalin says AIADMK an ‘old bonded sla...,https://www.livemint.com/politics/tamil-nadu-c...
6,dd4ec639-2929-47b2-ae7a-f0e81c9ba1d3,Livemint,ED moves to seize ₹661 crore assets of Congre...,https://www.livemint.com/news/national-herald-...


In [16]:
# One-line tester: classify the scraped titles and load the JSON reply into a DataFrame.
# Requires `hl` from the scraping tester cell to be a DataFrame with a 'title' column.

outcome = classify_headlines(hl['title'].tolist())
outdf = pandas.DataFrame(json.loads(outcome))
outdf

Unnamed: 0,title,classification,explanation
0,Trump tariffs: Can India navigate the global t...,False,No recommendation keywords found.
1,US exempts THESE 20 items from Trump tariffs a...,False,No recommendation keywords found.
2,US Fed expects tariff-led inflation to hit 3.5...,False,No recommendation keywords found.
3,Russia hits Indian firm’s warehouse in Ukraine...,False,No recommendation keywords found.
4,"Murshidabad Violence: Over 150 arrested, Govt ...",False,No recommendation keywords found.
5,TN CM MK Stalin says AIADMK an ‘old bonded sla...,False,No recommendation keywords found.
6,ED moves to seize ₹661 crore assets of Congre...,False,No recommendation keywords found.
