In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from ta.momentum import RSIIndicator
from ta.trend import MACD
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score
import os

In [2]:
# Ticker symbols to analyse; each one is later passed to yf.Ticker and used as a CSV file stem.
stocks = [
    "RELIANCE.NS",
    "HDFCBANK.NS",
    "INFY.NS",
]

# Price-history frames keyed by ticker symbol; populated by the loading cell.
stock_data = dict()

In [3]:
# Load the full price history for every ticker, using "<ticker>.csv" in the
# working directory as a cache so the download only happens once.
for stock in stocks:
    csv_path = f"{stock}.csv"
    if os.path.exists(csv_path):
        # Cached copy: the first column holds the dates and becomes the index.
        stock_data[stock] = pd.read_csv(csv_path, index_col=0, parse_dates=True)
    else:
        ticker = yf.Ticker(stock)
        history = ticker.history(period="max")
        stock_data[stock] = history
        if history.empty:
            # A failed download yields an empty frame; writing it out would
            # poison the cache, because later runs would load the empty CSV
            # instead of retrying the download.
            print(f"Warning: no data downloaded for {stock}; not writing {csv_path}")
        else:
            history.to_csv(csv_path)

In [4]:
# Print describe() summary statistics of the OHLC and volume columns, one ticker at a time.
for stock in stocks:
    print(f"\nBasic Statistics for {stock}:")
    price_volume = stock_data[stock][["Open", "High", "Low", "Close", "Volume"]]
    print(price_volume.describe())


Basic Statistics for RELIANCE.NS:
              Open         High          Low        Close        Volume
count  7428.000000  7428.000000  7428.000000  7428.000000  7.428000e+03
mean    330.606448   334.200081   326.786256   330.358959  5.872732e+07
std     418.435364   422.302549   414.404913   418.221028  1.014932e+08
min       3.447046     3.462817     3.425643     3.462817  0.000000e+00
25%      19.602716    19.868367    19.320005    19.591828  1.347409e+07
50%     183.592594   185.615889   180.853095   183.490685  2.377707e+07
75%     410.233114   413.387463   406.147393   408.630997  6.121514e+07
max    1599.022925  1603.358288  1580.137072  1595.484985  1.448889e+09

Basic Statistics for HDFCBANK.NS:
              Open         High          Low        Close        Volume
count  7431.000000  7431.000000  7431.000000  7431.000000  7.431000e+03
mean    460.769056   465.090833   456.226109   460.741536  6.049395e+06
std     560.059051   564.701241   555.357269   560.046262  7.77269

In [7]:
# Candlestick + volume figure: one subplot row per ticker in `stocks`,
# with price on the primary y-axis and volume on a secondary y-axis.
n_rows = len(stocks)  # derive the grid from the ticker list instead of hard-coding 3

fig = make_subplots(
    rows=n_rows,
    cols=1,
    subplot_titles=[f"{stock} Closing Price and Volume" for stock in stocks],
    vertical_spacing=0.1,
    specs=[[{'secondary_y': True}] for _ in range(n_rows)]  # Enable secondary_y for each subplot
)

# NOTE: the loop variables (`i`, `stock`, `df`) stay bound after the loop, as with
# any top-level Python loop; later cells reference a bare `df`, so keep these names.
for i, stock in enumerate(stocks, 1):
    df = stock_data[stock]
    # Add candlestick chart for stock price
    fig.add_trace(
        go.Candlestick(
            x=df.index,
            open=df["Open"],
            high=df["High"],
            low=df["Low"],
            close=df["Close"],
            name=stock,
            increasing_line_color="#00CC96",
            decreasing_line_color="#EF553B",
        ),
        row=i,
        col=1,
        secondary_y=False,
    )
    # Add volume bar chart (semi-transparent so the candles stay readable)
    fig.add_trace(
        go.Bar(
            x=df.index,
            y=df["Volume"],
            name=f"{stock} Volume",
            marker_color="#636EFA",
            opacity=0.3,
        ),
        row=i,
        col=1,
        secondary_y=True,
    )

# Update layout for a professional look
# NOTE(review): the title says "Last 5 Years", but nothing in this cell restricts the
# date range of stock_data — confirm the data is sliced elsewhere or adjust the title.
fig.update_layout(
    title="Nifty50 Stocks: Price and Volume (Last 5 Years)",
    height=400 * n_rows,  # 400 px per row (1200 for three tickers)
    showlegend=False,
    template="plotly_dark",
)
# `xaxis_rangeslider_visible=False` in update_layout only configures the first x-axis;
# update_xaxes applies the setting to the x-axis of every subplot.
fig.update_xaxes(rangeslider_visible=False)
fig.update_yaxes(title_text="Price (INR)", secondary_y=False)
fig.update_yaxes(title_text="Volume", secondary_y=True)

# Save and display the plot
fig.write_html("nifty50_eda.html")
fig.show()

In [8]:
def preprocess_stock_data(df):
    """
    Clean a stock price frame and add the next-day prediction target.

    The input frame is not modified; a new frame is built and returned.

    Steps:
      * drop the "Dividends" and "Stock Splits" columns when present,
      * drop every row that contains a missing value,
      * convert the index to datetime,
      * add "Tomorrow" (the next row's Close) and "Target"
        (1 if Tomorrow > Close, else 0).

    Parameters
    ----------
    df : pandas.DataFrame
        Price data with at least a "Close" column and an index that
        pd.to_datetime can parse.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with the "Tomorrow" and "Target" columns added.
        The last row has no following close, so its "Tomorrow" is NaN and
        its "Target" is 0; drop that row before training on "Target".
    """
    # drop() returns a new frame, so the caller's data is left untouched;
    # errors="ignore" skips columns that are absent. The explicit copy() after
    # dropna() makes the column assignments below unambiguous, avoiding
    # SettingWithCopyWarning.
    cleaned = (
        df.drop(columns=["Dividends", "Stock Splits"], errors="ignore")
        .dropna()
        .copy()
    )

    # Convert index to datetime
    cleaned.index = pd.to_datetime(cleaned.index)

    # Add target variable: 1 if tomorrow's close > today's close, else 0
    cleaned["Tomorrow"] = cleaned["Close"].shift(-1)
    cleaned["Target"] = (cleaned["Tomorrow"] > cleaned["Close"]).astype(int)

    return cleaned

In [9]:
# Display the frame currently bound to `df` (bare last expression -> rich output).
# NOTE(review): `df` is not assigned in this cell; presumably it is the variable left
# over from the plotting loop (the last ticker in `stocks`) — confirm on a fresh kernel.
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1996-01-01 00:00:00+05:30,0.510868,0.511998,0.508358,0.511998,204800,0.0,0.0
1996-01-02 00:00:00+05:30,0.510868,0.513379,0.509927,0.509927,204800,0.0,0.0
1996-01-03 00:00:00+05:30,0.513379,0.513379,0.513379,0.513379,102400,0.0,0.0
1996-01-04 00:00:00+05:30,0.508358,0.510868,0.508358,0.509990,307200,0.0,0.0
1996-01-05 00:00:00+05:30,0.503965,0.503965,0.503965,0.503965,51200,0.0,0.0
...,...,...,...,...,...,...,...
2025-07-28 00:00:00+05:30,1513.900024,1519.300049,1482.500000,1516.000000,6439855,0.0,0.0
2025-07-29 00:00:00+05:30,1512.900024,1517.199951,1496.099976,1513.699951,7070448,0.0,0.0
2025-07-30 00:00:00+05:30,1519.900024,1521.699951,1506.000000,1519.000000,5719051,0.0,0.0
2025-07-31 00:00:00+05:30,1509.699951,1527.099976,1497.000000,1509.000000,6851407,0.0,0.0


In [11]:
# Add technical-indicator features to the frame currently bound to `df`.
# NOTE(review): `df` is not assigned in this cell; presumably it is the frame left
# over from an earlier cell — confirm, and consider binding it explicitly.

# Work on a copy: the object `df` refers to on entry may still be referenced elsewhere
# (e.g. from stock_data), and assigning columns to it directly would silently change
# that frame as well.
df = df.copy()

# MACD line and its signal line (MACD is constructed with its default parameters)
macd = MACD(df["Close"])
df["MACD"] = macd.macd()
df["MACD_Signal"] = macd.macd_signal()

# Moving Averages (SMA and EMA)
df["SMA_20"] = df["Close"].rolling(window=20).mean()
df["EMA_20"] = df["Close"].ewm(span=20, adjust=False).mean()

# Volume-based features
df["Volume_MA_20"] = df["Volume"].rolling(window=20).mean()
df["Volume_Ratio"] = df["Volume"] / df["Volume_MA_20"]

# Drop rows with NaN values after adding indicators
df = df.dropna()

In [13]:
# Display `df` again; the output now includes the MACD, moving-average and
# volume-ratio columns, with the NaN rows already dropped.
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,MACD,MACD_Signal,SMA_20,EMA_20,Volume_MA_20,Volume_Ratio
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1996-02-15 00:00:00+05:30,0.638272,0.646431,0.615051,0.644297,2201600,0.0,0.0,0.031331,0.018905,0.539952,0.559042,921600.00,2.388889
1996-02-16 00:00:00+05:30,0.644297,0.652707,0.620385,0.646117,2764800,0.0,0.0,0.034222,0.021968,0.547561,0.567335,1034240.00,2.673267
1996-02-19 00:00:00+05:30,0.646117,0.633879,0.631368,0.632749,1024000,0.0,0.0,0.035031,0.024581,0.554251,0.573565,1075200.00,0.952381
1996-02-20 00:00:00+05:30,0.632749,0.630741,0.622582,0.622707,614400,0.0,0.0,0.034464,0.026557,0.560518,0.578245,1072640.00,0.572792
1996-02-21 00:00:00+05:30,0.622707,0.622707,0.622707,0.622707,0,0.0,0.0,0.033627,0.027971,0.566895,0.582480,1052160.00,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-07-28 00:00:00+05:30,1513.900024,1519.300049,1482.500000,1516.000000,6439855,0.0,0.0,-15.451989,-4.638383,1591.704999,1577.189198,6944604.45,0.927318
2025-07-29 00:00:00+05:30,1512.900024,1517.199951,1496.099976,1513.699951,7070448,0.0,0.0,-18.410624,-7.392831,1587.049994,1571.142603,7089180.00,0.997358
2025-07-30 00:00:00+05:30,1519.900024,1521.699951,1506.000000,1519.000000,5719051,0.0,0.0,-20.096039,-9.933473,1582.474994,1566.176640,6942450.50,0.823780
2025-07-31 00:00:00+05:30,1509.699951,1527.099976,1497.000000,1509.000000,6851407,0.0,0.0,-21.985228,-12.343824,1576.989996,1560.731246,7022036.10,0.975701
