In [0]:
%pip install yfinance
%pip install tqdm

Getting all Nifty 50 Stock codes

In [0]:
# NSE symbols of the Nifty 50 constituents, without any exchange suffix.
nifty50codes = [
    'WIPRO', 'BAJFINANCE', 'TATASTEEL', 'HCLTECH', 'INDUSINDBK',
    'JSWSTEEL', 'NESTLEIND', 'ONGC', 'COALINDIA', 'ICICIBANK',
    'INFY', 'HINDALCO', 'M&M', 'BAJAJ-AUTO', 'BAJAJFINSV',
    'ULTRACEMCO', 'CIPLA', 'RELIANCE', 'LT', 'TATAMOTORS',
    'NTPC', 'DRREDDY', 'MARUTI', 'ITC', 'EICHERMOT',
    'SUNPHARMA', 'JIOFIN', 'APOLLOHOSP', 'TCS', 'BHARTIARTL',
    'TATACONSUM', 'SBIN', 'ADANIPORTS', 'ETERNAL', 'ASIANPAINT',
    'HINDUNILVR', 'ADANIENT', 'SBILIFE', 'TECHM', 'POWERGRID',
    'TRENT', 'HEROMOTOCO', 'TITAN', 'GRASIM', 'HDFCBANK',
    'KOTAKBANK', 'HDFCLIFE', 'BEL', 'SHRIRAMFIN', 'AXISBANK',
]

Adding the `.NS` (NSE) suffix to the stock codes

In [0]:
# Append the ".NS" suffix to every code. The endswith guard keeps this cell
# idempotent: because it rebinds its own input, re-running it without the
# guard would produce codes such as "WIPRO.NS.NS".
nifty50codes = [code if code.endswith(".NS") else code + ".NS" for code in nifty50codes]
print(nifty50codes)

Fetching Stock information and saving to S3

In [0]:
import yfinance as yf
import pandas as pd
from datetime import datetime,timedelta
import os
from pyspark.sql.functions import lit
from tqdm import tqdm
# List of Nifty 50 stocks or your custom list
tickers = nifty50codes

# Run date taken from the driver's local clock; it is the default `start` of
# fetch_stock_data and appears in the completion message.
today = datetime.today().strftime("%Y-%m-%d")
# NOTE(review): `lastday` is not referenced in the cells visible here — kept in
# case a later cell relies on it.
lastday = (datetime.today()+timedelta(days=1)).strftime("%Y-%m-%d")

# Output location: one Delta table per ticker is written beneath this S3 prefix.
# There is deliberately no os.makedirs() call: os only sees the local filesystem,
# so it would just create a stray "s3:/..." directory on the driver. The Spark
# writer creates the S3 prefix itself on first write.
output_dir = "s3://stock-analysis-yk/stockdatadelta"

def fetch_stock_data(ticker, start=today, end=None):
    """Download price history for one ticker from Yahoo Finance.

    Args:
        ticker: Symbol passed to ``yf.Ticker`` (e.g. ``"WIPRO.NS"``).
        start: First date to request; defaults to the value ``today`` held
            when this function was defined.
        end: Last date to request, or ``None`` to leave it to yfinance.

    Returns:
        A pandas DataFrame of the history with the index moved into a regular
        column, a ``Ticker`` column added, and ``Stock Splits`` renamed to
        ``Stock_splits``.
    """
    print(f"Fetching data for {ticker}")
    history = yf.Ticker(ticker).history(start=start, end=end)
    history["Ticker"] = ticker
    # Rename the column whose name contains a space, then flatten the index.
    return history.rename(columns={'Stock Splits':'Stock_splits'}).reset_index()

# Fetch each ticker's data and append it to that ticker's own Delta table.
saved_count = 0
for ticker in tqdm(tickers):
    stock_pdf = fetch_stock_data(ticker)
    if stock_pdf.empty:
        # No rows came back (e.g. nothing traded on the requested date). An
        # empty frame gives Spark nothing to infer a schema from and nothing to
        # append, so skip this ticker instead of aborting the whole run.
        print(f"No data returned for {ticker}; skipping")
        continue
    df = spark.createDataFrame(stock_pdf)
    # "header" is a CSV option with no meaning for a Delta write, so it is not passed.
    # NOTE(review): mode("append") adds the same rows again if this cell is
    # re-run for the same date — confirm whether de-duplication is needed.
    df.write.format("delta").mode("append").save(f"{output_dir}/{ticker.replace('.NS','')}")
    saved_count += 1
print(f"Saved stock data for {saved_count} of {len(tickers)} tickers on {today}")

Fetching news headlines for per-stock sentiment analysis

In [0]:
%pip install vaderSentiment
%pip install lxml

In [0]:

import requests
from bs4 import BeautifulSoup
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd
from datetime import datetime
import os
import lxml
from tqdm import tqdm
# News feed URL (modify keywords as needed)
# NOTE(review): RSS_URL is not referenced by the functions visible in this cell,
# which build a per-symbol query instead — kept in case other cells use it.
RSS_URL = "https://news.google.com/rss/search?q=stock+market+india&hl=en-IN&gl=IN&ceid=IN:en"

# Output
today = datetime.today().strftime("%Y-%m-%d")
# S3 prefix of the Delta table holding the sentiment summary. There is
# deliberately no os.makedirs() call: os only sees the local filesystem, so it
# would just create a stray "s3:/..." directory on the driver. The Spark writer
# creates the S3 prefix itself on write.
output_dir = "s3://stock-analysis-yk/SentimentData"

# Shared VADER analyzer used by analyze_sentiment.
analyzer = SentimentIntensityAnalyzer()

def get_google_news_headlines(symbol):
    """Return the headline titles Google News RSS lists for a stock symbol.

    The search text is "<symbol> stock india", restricted to the English/India
    edition of the feed.

    Args:
        symbol: Ticker text to search for, e.g. ``"WIPRO.NS"`` or ``"M&M.NS"``.

    Returns:
        A list of headline strings, one per ``<item>`` that has a ``<title>``;
        empty when the feed contains no such items.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Let requests URL-encode the query. Splicing the symbol into the URL by
    # hand breaks on symbols containing "&" ("M&M.NS" truncated q to "M").
    response = requests.get(
        "https://news.google.com/rss/search",
        params={
            "q": f"{symbol} stock india",
            "hl": "en-IN",
            "gl": "IN",
            "ceid": "IN:en",
        },
        timeout=30,  # never let one stalled request hang the whole ticker loop
    )
    soup = BeautifulSoup(response.content, features="xml")
    # find_all is the current name for the deprecated findAll alias; items
    # without a <title> are skipped rather than raising AttributeError.
    return [item.title.text for item in soup.find_all("item") if item.title is not None]

def analyze_sentiment(text_list):
    """Score each text with the module-level VADER ``analyzer``.

    Args:
        text_list: Iterable of strings to score.

    Returns:
        A pandas DataFrame with one row per input text and the columns
        ``text``, ``compound``, ``pos``, ``neg`` and ``neu``. Empty input gives
        an empty frame that still carries those columns, so callers can index
        them without a KeyError.
    """
    columns = ["text", "compound", "pos", "neg", "neu"]
    results = []
    for text in text_list:
        score = analyzer.polarity_scores(text)
        results.append({
            "text": text,
            "compound": score["compound"],
            "pos": score["pos"],
            "neg": score["neg"],
            "neu": score["neu"],
        })
    # Explicit columns keep the schema stable even when results is empty.
    return pd.DataFrame(results, columns=columns)

def get_sentiment_score(symbol):
    """Summarise today's Google News headline sentiment for one symbol.

    Args:
        symbol: Ticker text passed to ``get_google_news_headlines``.

    Returns:
        A one-row pandas DataFrame with the columns ``Ticker``, ``Date``
        (today, ``YYYY-MM-DD``), ``Sentiment_Score``, ``Positive``,
        ``Negative`` and ``Source``. The three scores are means over all
        headlines rounded to 3 decimals, or NaN when no headlines were found.
    """
    headlines = get_google_news_headlines(symbol)
    if headlines:
        scores = analyze_sentiment(headlines)
        sentiment_score = round(scores["compound"].mean(), 3)
        positive = round(scores["pos"].mean(), 3)
        negative = round(scores["neg"].mean(), 3)
    else:
        # No headlines means no score columns to average. Report the ticker
        # with NaN scores instead of failing the whole run with a KeyError.
        sentiment_score = positive = negative = float("nan")
    sentiment_summary = {
        "Ticker": symbol,
        "Date": datetime.today().strftime("%Y-%m-%d"),
        "Sentiment_Score": sentiment_score,
        "Positive": positive,
        "Negative": negative,
        "Source": "Google"
    }
    return pd.DataFrame([sentiment_summary])
# Score every ticker, then combine the one-row summaries with a single concat.
# Concatenating inside the loop re-copies the growing frame on every iteration
# and leaves every row with index 0; ignore_index gives a clean 0..n-1 index.
sentiment_frames = [get_sentiment_score(ticker) for ticker in tqdm(nifty50codes)]
df = pd.concat(sentiment_frames, ignore_index=True)

# Replace the stored summary with today's run.
# "header" is a CSV option with no meaning for a Delta write, so it is not passed.
sparkdf = spark.createDataFrame(df)
sparkdf.write.format("delta").mode("overwrite").save(output_dir)

In [0]:
# Show the combined per-ticker sentiment summary assembled in the previous cell.
print(df)