In [97]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
from nltk.sentiment import SentimentIntensityAnalyzer
import requests
import re
import tqdm

# List of FNO Stocks

In [98]:
fno_list = pd.read_csv('fno_stocks_list.csv')

# Corporate boilerplate removed from 'Stock Name' to obtain the short name that
# is searched for in news text. Removal is sequential and case-sensitive, in the
# order listed here.
drop_words = [' Limited', ' Ltd', ' Industries', 'The ', ' (India)', ' (india)',' Enterprises',' Enterprise', ' Company', ' Laboratories', ' Corporation']


def _short_stock_name(full_name):
    """Strip every entry of ``drop_words`` from ``full_name`` and lower-case the result."""
    for boilerplate in drop_words:
        full_name = full_name.replace(boilerplate, '')
    return full_name.lower()


fno_list['Stock'] = fno_list['Stock Name'].map(_short_stock_name)
fno_list['Symbol'] = fno_list['Symbol'].map(lambda ticker: ticker.lower())

symbols = fno_list['Symbol'].values
stocks = fno_list['Stock'].values

# Sorted, de-duplicated union of lower-cased symbols and short names.
# NOTE: this name shadows the builtin `filter`; it is kept because the tagging
# cell further down reads it under this name.
filter = sorted(set(symbols) | set(stocks))


# Request Data from Pulse

In [169]:
# Page that is scraped for headlines.
link = 'https://pulse.zerodha.com'

# Browser-like user agent sent with every request made by this notebook
# (also read by get_article_text).
request_headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2.1 Safari/605.1.15'
}

# requests waits indefinitely unless a timeout is given; bound the wait so a
# stalled connection cannot hang the kernel.
response = requests.get(link, headers=request_headers, timeout=30)
# Fail loudly on 4xx/5xx instead of silently parsing an error page, which would
# otherwise yield empty selections and an empty news_df further down.
response.raise_for_status()
html = BeautifulSoup(response.text, 'html')

# Remove the "similar article" lists nested under a main article so that their
# titles/dates are not picked up by the selectors below.
for ul in html.select('ul.similar'):
    ul.decompose()


# One element list per field; the next cell zips these into a DataFrame, so it
# relies on all four selections having the same length and order.
headlines = html.select('h2.title')
descriptions = html.select('div.desc')
sources = html.select('span.feed')
dates = html.select('span.date')

In [170]:
# Headline text (stripped, lower-cased) and the URL each headline links to.
headlines_list = [title.text.strip().lower() for title in headlines]
source_links = [title.find('a')['href'] for title in headlines]

# Short description shown under each headline.
descriptions_list = [blurb.text.strip().lower() for blurb in descriptions]

# Feed name with the leading '— ' separator removed.
sources_list = [feed.text.strip().lower().replace('— ', '') for feed in sources]

# The timestamp is taken from the element's `title` attribute.
dates_list = [stamp['title'] for stamp in dates]

# One row per article; every list must have the same length.
news_df = pd.DataFrame(
    {
        'Date': dates_list,
        'Headlines': headlines_list,
        'Description': descriptions_list,
        'Source': sources_list,
        'Source_Link': source_links,
    }
)


In [171]:
# Upper bound, in seconds, on how long a single article download may take.
ARTICLE_REQUEST_TIMEOUT = 15


def _fetch_article_soup(source_link):
    """Download ``source_link`` and return the parsed page; raises on network or HTTP errors."""
    article_response = requests.get(
        source_link, headers=request_headers, timeout=ARTICLE_REQUEST_TIMEOUT
    )
    article_response.raise_for_status()
    return BeautifulSoup(article_response.text, 'html')


def _join_paragraphs(container, stop_class=None):
    """Concatenate the stripped text of every <p> under ``container``.

    Paragraphs are joined without a separator. When ``stop_class`` is given,
    collection stops at the first <p> whose class list is exactly
    ``[stop_class]``.
    """
    pieces = []
    for paragraph in container.find_all('p'):
        if stop_class is not None and paragraph.get('class') == [stop_class]:
            break
        pieces.append(paragraph.text.strip())
    return ''.join(pieces)


def _extract_moneycontrol(soup):
    """Text of the article wrapper, up to the first 'benefitText' paragraph."""
    wrapper = soup.find('div', {'class': 'content_wrapper arti-flow'})
    return _join_paragraphs(wrapper, stop_class='benefitText')


def _extract_bloomberg_quint(soup):
    """Text of every <p> on the page, up to the first 'benefitText' paragraph."""
    return _join_paragraphs(soup, stop_class='benefitText')


def _extract_economic_times(soup):
    """Text of the <article> element, cut at the WhatsApp-channel footer."""
    article_text = soup.find('article').text.strip()
    return article_text.split('(You can now subscribe to our ETMarkets WhatsApp channel)')[0]


def _extract_hindu_business(soup):
    """Text of the articleBody div, cut at the 'COMMents' marker."""
    body = soup.find('div', {'itemprop': 'articleBody'})
    return _join_paragraphs(body).split('COMMents')[0]


def _extract_zee_business(soup):
    """Text of the 'field-item even' div."""
    return _join_paragraphs(soup.find('div', {'class': 'field-item even'}))


def _extract_finshots(soup):
    """Text of the post-content div, cut at the share-on-WhatsApp footer."""
    body = _join_paragraphs(soup.find('div', {'class': 'post-content'}))
    return body.split("Don't forget to share this story on WhatsApp")[0]


# Maps the lower-cased feed name (the 'Source' column) to its page-specific extractor.
_ARTICLE_EXTRACTORS = {
    'moneycontrol': _extract_moneycontrol,
    'bloomberg quint': _extract_bloomberg_quint,
    'economic times': _extract_economic_times,
    'the hindu business': _extract_hindu_business,
    'zee business': _extract_zee_business,
    'finshots': _extract_finshots,
}


def get_article_text(source, source_link):
    """Download an article and return its body text.

    Parameters
    ----------
    source : str
        Lower-cased feed name; selects which page layout to parse.
    source_link : str
        URL of the article.

    Returns
    -------
    str
        The extracted article text on success.
        ``"Error ocurred"`` when the download or the parsing fails
        (timeout, HTTP error status, expected container missing, ...).
        ``'New Website. Create a rapper for <source>'`` when no extractor
        exists for ``source``; no request is made in that case.

    Notes
    -----
    Both sentinel strings are kept byte-for-byte (spelling included) because
    other cells may match on them.
    NOTE(review): 'rapper' presumably means 'wrapper' - confirm before renaming.
    """
    extractor = _ARTICLE_EXTRACTORS.get(source)
    if extractor is None:
        return 'New Website. Create a rapper for {}'.format(source)

    try:
        return extractor(_fetch_article_soup(source_link))
    except Exception:
        # Best-effort scrape: one broken page must not abort the whole run.
        # `Exception` rather than a bare `except` so that KeyboardInterrupt and
        # SystemExit still stop a long-running loop.
        return "Error ocurred"

# Filter Articles for FNO Stocks

In [172]:
# One whole-word pattern per stock name / symbol of interest, compiled once.
tag_patterns = [
    (term, re.compile(r'\b{}\b'.format(re.escape(term))))
    for term in filter
]


def _matching_tags(desc):
    """Return the terms found in ``desc`` as a single ', '-separated string ('' if none)."""
    return ', '.join(term for term, pattern in tag_patterns if pattern.search(desc))


# Tag each article with the stocks mentioned in its description. The tags are
# stored as one comma-separated string per row, not as a list of strings.
news_df['Tags'] = news_df['Description'].map(_matching_tags)

# Drop rows that have no tags, i.e. no mention of a stock of interest, and
# renumber the remaining rows from 0.
news_df = news_df[news_df['Tags'] != ''].reset_index(drop=True)

In [173]:
# Scrape the full text of every remaining article (one HTTP request per row).
news_df['Complete_Article'] = [
    get_article_text(article_source, article_link)
    for article_source, article_link in zip(news_df['Source'], news_df['Source_Link'])
]

In [174]:
# Display the tagged articles together with their scraped full text.
news_df

Unnamed: 0,Date,Headlines,Description,Source,Source_Link,Tags,Complete_Article
0,"11:18 PM, 29 Feb 2024",hasmukh adhia completes five year term as bob'...,"adhia, a former gujarat cadre ias officer, con...",bloomberg quint,https://www.ndtvprofit.com/business/hasmukh-ad...,bank of baroda,Bank of Baroda on Thursday announced the compl...
1,"11:18 PM, 29 Feb 2024",bnp paribas sells shares of 12 companies worth...,"bnp paribas has sold shares of 12 companies, i...",bloomberg quint,https://www.ndtvprofit.com/markets/bnp-paribas...,"bharat heavy electricals, punjab national bank",Financial services company BNP Paribas on Thur...
2,"10:51 PM, 29 Feb 2024","jsw steel incorporates subsidiary for hot, col...",the arm was incorporated on feb. 27 in mumbai ...,bloomberg quint,https://www.ndtvprofit.com/business/jsw-steel-...,jsw steel,JSW Steel on Thursday announced incorporating ...
3,"10:40 PM, 29 Feb 2024",trade setup for friday: 15 things to know befo...,"based on the oi percentage, 130 stocks were on...",moneycontrol,https://www.moneycontrol.com/news/business/mar...,"alkem, bata india, motherson, zydus lifesciences",The market is expected to remain rangebound as...
4,"10:24 PM, 29 Feb 2024",india likely to auction 18 critical mineral bl...,"oil-to-metals conglomerate vedanta ltd, state-...",the hindu business,https://www.thehindu.com/business/Economy/indi...,"coal india, vedanta",India will auction 18 critical mineral blocks ...
5,"10:14 PM, 29 Feb 2024",potential antitrust issues in the reliance-dis...,"questions loom over the reliance-disney deal, ...",bloomberg quint,https://www.ndtvprofit.com/law-and-policy/pote...,reliance,The merger between Reliance Industries Ltd.'s ...
6,"10:06 PM, 29 Feb 2024",pidilite industries promotes sudhanshu vats as...,pidilite industries appoints sudhanshu vats as...,bloomberg quint,https://www.ndtvprofit.com/markets/pidilite-in...,pidilite,Pidilite Industries Ltd. has promoted Sudhansh...
7,"09:04 PM, 29 Feb 2024",paytm: yes bank to likely acquire majority mer...,paytm has also likely applied for a third-part...,bloomberg quint,https://www.ndtvprofit.com/business/paytm-yes-...,axis bank,One97 Communications Ltd. has likely signed a ...
8,"08:53 PM, 29 Feb 2024",biocon biologics inks licence pact with jansse...,biocon biologics has signed a licence agreemen...,bloomberg quint,https://www.ndtvprofit.com/business/biocon-bio...,biocon,Biocon Biologics on Thursday said it has inked...
9,"08:50 PM, 29 Feb 2024",govt may not immediately look at divestment in...,"without providing a specific timeline, neeraj ...",moneycontrol,https://www.moneycontrol.com/news/telecom/govt...,idea,The Telecom Department is likely to introduce ...


 # Pass Description through ChatGPT and get an output on price outlook.