In [1]:
import requests
import pandas
import base64
import os
from google import genai
from google.genai import types
from bs4 import BeautifulSoup 
from datetime import datetime
from pydantic import BaseModel
import json
import os
import dotenv as env
# Load variables from ../keys.env into the process environment (path is relative to the
# notebook's working directory). get_value_table() later reads GEMINI_API_KEY from
# os.environ -- presumably that key is defined in this file; verify before running.
env.load_dotenv('../keys.env')

True

In [2]:
# Scrape one LiveMint article page into a one-row DataFrame of headline, subheader, body and date
def get_livemint_articles(art_url, timeout=30):
    """Download a LiveMint article page and extract its text fields.

    Parameters
    ----------
    art_url : str
        URL of the article page to fetch.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so a stalled server cannot hang the
        notebook indefinitely. Defaults to 30 seconds.

    Returns
    -------
    pandas.DataFrame or None
        A single-row frame with the columns ``article_title``,
        ``article_subheader``, ``article_body`` and ``article_date``. A field
        that cannot be located on the page is left as ``''`` and a message is
        printed. ``None`` is returned (after printing a message) when the
        request raises a ``requests.RequestException`` or the response status
        is not OK.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119 Safari/537.36'
    }
    try:
        art_res = requests.get(art_url, headers=headers, timeout=timeout)
    except requests.RequestException as err:
        # Connection errors and timeouts follow the same "return None" contract as a bad status.
        print(f'Access to specific article {art_url} is failing: {err}')
        return None
    if art_res.status_code != requests.codes.ok:
        print(f'Access to specific article {art_url} is failing (HTTP {art_res.status_code})')
        return None

    art_soup = BeautifulSoup(art_res.content, 'html.parser')

    def _text_or_blank(tag, label):
        # Return the tag's text, or '' (with a printed notice) when the tag was not found.
        if tag is not None:
            return tag.get_text()
        print(f'{art_url}: No {label} found')
        return ''

    # The h1 id might not be affected by changes in the web page.
    headline = _text_or_blank(art_soup.find('h1', {'id': 'article-0'}), 'headline')

    # This hashed class name might fail on a daily basis; fall back to a plain h2 lookup then.
    # The subheader can be used to improve sharpness of the LLM.
    subhead = _text_or_blank(art_soup.find('h2', {'class': 'storyPage_summary__Ge5SX'}), 'subhead')

    art_body_divs = art_soup.find_all('div', {'class': 'storyParagraph'})  # should work well and not fail
    if not art_body_divs:
        print(f'{art_url}: No Article Body found')
    # Reject boilerplate disclaimer paragraphs (and empty ones) to reduce tokens sent to the LLM.
    paragraphs = [
        para
        for para in (para_div.get_text() for para_div in art_body_divs)
        if para and 'Disclaimer' not in para
    ]
    body = '\n'.join(paragraphs)

    art_date = _text_or_blank(
        art_soup.find('div', {'class': 'storyPage_date__JS9qJ storyPage_top__RFRL3'}),
        'article date',
    )

    return pandas.DataFrame(
        [[headline, subhead, body, art_date]],
        columns=['article_title', 'article_subheader', 'article_body', 'article_date'],
    )

In [3]:
class Recommendations(BaseModel):
    # One stock recommendation extracted from an article. get_value_table() passes
    # list[Recommendations] as the Gemini response_schema, so field names, types and
    # order define the JSON the model is asked to return.
    # NOTE(review): documented with comments rather than a docstring because a class
    # docstring may be emitted into the generated JSON schema -- confirm before adding one.
    # Field meanings below are inferred from the prompt in get_value_table(); verify.
    reco_date: str        # article date; the prompt asks for yyyy-mm-dd, or 'Not Found'
    analyst: str          # name of the stock analyst
    house: str            # analyst's brokerage house
    stock_name: str       # stock name as explicitly mentioned in the article
    ticker: str           # NSE ticker; the prompt asks for None when it cannot be identified
    current_value: float  # existing / recommended buy price; the prompt uses -1.0 when absent
    target_value: float   # target price; the prompt uses -1.0 when absent
    sl_value: float       # stop-loss price; the prompt uses -1.0 when absent
    timeframe: str        # timeframe of the recommendation; the prompt asks for None when absent

In [None]:
# Reading the article and making sense of it

def get_value_table(article_frame, model='gemini-2.0-flash-lite', temperature=0.8):
    """Ask Gemini to extract stock recommendations from a scraped article.

    Parameters
    ----------
    article_frame : pandas.DataFrame or None
        Frame as produced by ``get_livemint_articles`` (columns
        ``article_title``, ``article_subheader``, ``article_body``,
        ``article_date``). It is serialised with ``to_json(orient="records")``
        and sent as the user message. ``None`` or an empty frame is accepted
        because the scraper returns ``None`` when the download fails.
    model : str, optional
        Gemini model name. See the list at
        https://ai.google.dev/gemini-api/docs/models
    temperature : float, optional
        Sampling temperature forwarded to the model.

    Returns
    -------
    str
        JSON text constrained to ``list[Recommendations]``. ``'[]'`` is
        returned without calling the API when there is no article to analyse,
        and also when the model reply carries no text, so that callers can
        always ``json.loads`` the result.
    """
    # Nothing to analyse (e.g. the scraper returned None): skip the API call entirely.
    if article_frame is None or article_frame.empty:
        return '[]'

    client = genai.Client(
            api_key=os.environ.get("GEMINI_API_KEY"),
        )

    # NOTE(review): the prompt is kept verbatim, including its leading indentation and
    # typos ("explicity", "multple", "NASDQ"), so model behaviour is unchanged by this
    # edit. Clean it up together with a before/after comparison of model output.
    system_prompt = """You are a text analyser searching for explicit recommendations on a stock in an article.
                                                    You are given data in a json format. The json format also names the fields of the data.
                                                    Scan 'article_title' field in the json the name of individual stocks.
                                                    You may also find the name of the stock analyst in 'article_title'.
                                                    Scan 'article_subheader' field for the name of individual stocks.
                                                    You may also find the name of the stock analyst and their brokerage house in the 'article_subheader'.
                                                    You may also find the name of the stock analyst and their brokerage house in the 'article_body' field.
                                                    There will also be an 'article_body' field .
                                                    Reading through the contents of the article_title, article_subheader and article_body fields, conduct the following steps:
                                                    
                                                    1. Identify the stock names being recommended in the three fields. These names will be explicity mentioned.
                                                    Ignore names and values of indices like NIFTY, BANKNIFTY, NIFTY REALTY, SENSEX, Dow Jones, S&P 500, NASDQ, NYSE, HANGSENG.
                                                    Only collect individual stock names explicity named in the article
                                                    
                                                    2. Identify the existing price or recommended buying price of the stock as given in these three fields. It may be present as a single value, or as a range (such as 100-125).
                                                    It may also not be present in the article. Use only explicity mentioned values. Convert the range to its mid point. Take -1.0 as the value if not mentioned.
                                                    
                                                    3. Identify the target price of the stock. It may be a single value or a range or multiple values. It will be marked by the words *Target Price* or *Target*.
                                                    Use only explicity mentioned values. Convert a range to its lower value. If there are multple targets, take the highest target. Take -1.0 as the value if not mentioned.
                                                    
                                                    4. Identify the stoploss price of the trade. It may be a single value or a range. It will be marked by the words *Stoploss* or *Stop Loss* or *Exit*.
                                                    Use only explicity mentioned values. convert a range to its upper value. Take -1.0 as the value if not mentioned.
                                                    
                                                    5. Identify the ticker name of the stock identified in 1. Use your inherent knowledge of NSE ticker names if the stock ticker is not explicitly mentioned.
                                                    If you cannot identify the ticker name, say None.
                                                    
                                                    6. Identify any timeframe mentioned for these recommendations. These timeframes can be a month, a week, or a quarter. If no timeframe is mentioned, say None. 
                                                    
                                                    Finally, take the date from the 'article_date' field of the input json and convert it into yyyy-mm-dd format. If this field does not exist, use 'Not Found' as the date.
                                                    
                                                    Return your response in the form of article date, analyst name, brokerage house, stock name, NSE ticker, buy price, target price, stoploss price and timeframe
                                                    for each of the stock name mentioned in the article.
                                                    """

    generate_content_config = types.GenerateContentConfig(
        temperature=temperature,
        system_instruction=[
            types.Part.from_text(text=system_prompt),
        ],
        response_mime_type='application/json',
        response_schema=list[Recommendations],    # force response in a structured format from Gemini
    )
    contents = [
        types.Content(
            role="user",
            parts=[
                # The frame is sent as JSON records so field names travel with the text.
                types.Part.from_text(text=article_frame.to_json(orient="records")),
            ],
        ),
    ]

    model_reply = client.models.generate_content(model=model, contents=contents, config=generate_content_config)
    reply_text = model_reply.text
    if reply_text is None:
        # No text part in the reply; report it instead of handing None to json.loads.
        print('Gemini returned no text for this article; treating it as no recommendations')
        return '[]'
    return reply_text

In [1]:
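# Manual smoke test (uncomment to run): scrape one article with get_livemint_articles, then pass the resulting frame to get_value_table.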

#out = get_livemint_articles('https://www.livemint.com/market/stock-market-news/stocks-to-buy-or-sell-dharmesh-shah-of-icici-securities-suggests-buying-ultratech-cement-hal-shares-on-21-april-2025-11745088225280.html')
#print(out)
#out_df = pandas.DataFrame([out], columns=['article_title', 'article_subheader', 'article_body', 'article_date'])
#results = get_value_table(out)
#print(json.loads(results))