In [2]:
import yfinance as yf
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from ta.momentum import RSIIndicator
from ta.trend import MACD
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
import os

In [3]:
# Define the Nifty50 stocks to analyze
stocks = ["RELIANCE.NS", "HDFCBANK.NS", "INFY.NS"]
stock_data = {}

In [4]:
for stock in stocks:
    csv_file = f"{stock}.csv"
    if os.path.exists(csv_file):
        stock_data[stock] = pd.read_csv(csv_file, index_col=0, parse_dates=True)
        if stock_data[stock].empty:
            print(f"Warning: {csv_file} is empty. Fetching new data...")
            ticker = yf.Ticker(stock)
            stock_data[stock] = ticker.history(period="max")
            stock_data[stock].to_csv(csv_file)
    else:
        ticker = yf.Ticker(stock)
        stock_data[stock] = ticker.history(period="max")
        stock_data[stock].to_csv(csv_file)
    if stock_data[stock] is None or stock_data[stock].empty:
        raise ValueError(f"Failed to load data for {stock}. Check network or ticker validity.")
    print(f"Raw data for {stock} shape: {stock_data[stock].shape}")

Raw data for RELIANCE.NS shape: (253, 5)
Raw data for HDFCBANK.NS shape: (253, 5)
Raw data for INFY.NS shape: (253, 5)


  stock_data[stock] = pd.read_csv(csv_file, index_col=0, parse_dates=True)
  stock_data[stock] = pd.read_csv(csv_file, index_col=0, parse_dates=True)
  stock_data[stock] = pd.read_csv(csv_file, index_col=0, parse_dates=True)


In [5]:
def describe_csv(stock, df):
    """
    Print detailed information about the CSV file for a given stock.
    """
    print(f"\n=== CSV Description for {stock} ===")
    print("1. File Name:", f"{stock}.csv")
    print("2. Columns and Data Types:")
    print(df.dtypes)
    print("\n3. Number of Rows:", len(df))
    print("4. Date Range:", f"{df.index.min()} to {df.index.max()}")
    print("5. Missing Values:")
    print(df.isnull().sum())
    print("\n6. Basic Statistics:")
    print(df[["Open", "High", "Low", "Close", "Volume"]].describe())
    print("\n7. First 5 Rows:")
    print(df.head())
    print("\n" + "="*50 + "\n")

for stock in stocks:
    describe_csv(stock, stock_data[stock])


=== CSV Description for RELIANCE.NS ===
1. File Name: RELIANCE.NS.csv
2. Columns and Data Types:
Close     object
High      object
Low       object
Open      object
Volume    object
dtype: object

3. Number of Rows: 253
4. Date Range: 2024-08-01 to Ticker
5. Missing Values:
Close     1
High      1
Low       1
Open      1
Volume    1
dtype: int64

6. Basic Statistics:
          Open    High                Low              Close       Volume
count      252     252                252                252          252
unique     238     244                242                244          252
top     1431.0  1523.0  1410.699951171875  1278.199951171875  RELIANCE.NS
freq         3       2                  2                  3            1

7. First 5 Rows:
                         Close                High                 Low  \
Price                                                                    
Ticker             RELIANCE.NS         RELIANCE.NS         RELIANCE.NS   
Date               

In [6]:
fig = make_subplots(
    rows=3,
    cols=1,
    subplot_titles=[f"{stock} Closing Price and Volume" for stock in stocks],
    vertical_spacing=0.1,
    specs=[[{'secondary_y': True}] for _ in range(3)]  # Enable secondary_y for each subplot
)

for i, stock in enumerate(stocks, 1):
    df = stock_data[stock]
    # Add candlestick chart for stock price
    fig.add_trace(
        go.Candlestick(
            x=df.index,
            open=df["Open"],
            high=df["High"],
            low=df["Low"],
            close=df["Close"],
            name=stock,
            increasing_line_color="#00CC96",
            decreasing_line_color="#EF553B",
        ),
        row=i,
        col=1,
        secondary_y=False,
    )
    # Add volume bar chart
    fig.add_trace(
        go.Bar(
            x=df.index,
            y=df["Volume"],
            name=f"{stock} Volume",
            marker_color="#636EFA",
            opacity=0.3,
        ),
        row=i,
        col=1,
        secondary_y=True,
    )

# Update layout for a professional look
fig.update_layout(
    title="Nifty50 Stocks: Price and Volume (Last 5 Years)",
    height=1200,
    showlegend=False,
    template="plotly_dark",
    xaxis_rangeslider_visible=False,
)
fig.update_yaxes(title_text="Price (INR)", secondary_y=False)
fig.update_yaxes(title_text="Volume", secondary_y=True)

# Save and display the plot
fig.write_html("nifty50_eda.html")
fig.show()

In [7]:
def preprocess_stock_data(df):
    """
    Clean and preprocess stock data, adding technical indicators.
    """
    try:
        # Convert index to datetime
        df.index = pd.to_datetime(df.index)
        
        # Remove unnecessary columns
        if "Dividends" in df.columns:
            del df["Dividends"]
        if "Stock Splits" in df.columns:
            del df["Stock Splits"]
        
        # Initial drop of missing values
        df = df.dropna()
        print(f"Shape after initial dropna: {df.shape}, NaN count: {df.isna().sum().sum()}")
        
        # Check if data is sufficient
        if len(df) < 10:  # Reduced minimum rows for testing
            raise ValueError(f"Insufficient data after initial cleaning: {len(df)} rows.")
        
        # Add target variable: 1 if tomorrow's close > today's close, else 0
        df["Tomorrow"] = df["Close"].shift(-1)
        df["Target"] = (df["Tomorrow"] > df["Close"]).astype(int)
        df["Tomorrow"] = df["Tomorrow"].fillna(method="ffill")
        df["Target"] = df["Target"].fillna(method="ffill")
        df = df.dropna(subset=["Tomorrow", "Target"])
        print(f"Shape after target addition: {df.shape}, NaN count: {df.isna().sum().sum()}")
        
        # Add technical indicators with reduced windows
        df["RSI_14"] = RSIIndicator(df["Close"], window=7).rsi()  # Reduced from 14 to 7
        macd = MACD(df["Close"], window_slow=26, window_fast=12, window_sign=9)  # Default MACD windows
        df["MACD"] = macd.macd()
        df["MACD_Signal"] = macd.macd_signal()
        df["SMA_10"] = df["Close"].rolling(window=10).mean()  # Reduced from 20 to 10
        df["EMA_10"] = df["Close"].ewm(span=10, adjust=False).mean()  # Reduced from 20 to 10
        
        # Volume-based features
        df["Volume_MA_10"] = df["Volume"].rolling(window=10).mean()  # Reduced from 20 to 10
        df["Volume_Ratio"] = df["Volume"] / df["Volume_MA_10"].replace(0, np.nan)  # Avoid division by zero
        df["Volatility"] = df["Close"].rolling(window=10).std()  # New feature: volatility
        
        # Fill NaN values with reasonable defaults
        df["RSI_14"] = df["RSI_14"].fillna(50)  # Neutral RSI value
        df["MACD"] = df["MACD"].fillna(0)
        df["MACD_Signal"] = df["MACD_Signal"].fillna(0)
        df["SMA_10"] = df["SMA_10"].fillna(method="ffill").fillna(df["Close"].mean())
        df["EMA_10"] = df["EMA_10"].fillna(method="ffill").fillna(df["Close"].mean())
        df["Volume_MA_10"] = df["Volume_MA_10"].fillna(0)
        df["Volume_Ratio"] = df["Volume_Ratio"].fillna(1.0)  # Neutral ratio
        df["Volatility"] = df["Volatility"].fillna(0)  # Fill NaN volatility with 0
        
        # Final drop of any remaining NaN values
        df = df.dropna()
        print(f"Shape after adding indicators: {df.shape}, NaN count: {df.isna().sum().sum()}")
        
        if df.empty:
            raise ValueError("DataFrame is empty after adding indicators.")
        
        return df
    except Exception as e:
        print(f"Error preprocessing data: {str(e)}")
        return None


In [8]:
# Preprocess data for each stock (using earliest available data)
processed_data = {}
for stock in stocks:
    processed_df = preprocess_stock_data(stock_data[stock])  # Removed date filter to use all data
    if processed_df is not None:
        processed_data[stock] = processed_df
    else:
        raise ValueError(f"Preprocessing failed for {stock}. Check data or indicators.")


Error preprocessing data: Unknown datetime string format, unable to parse: Ticker, at position 0



Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.



ValueError: Preprocessing failed for RELIANCE.NS. Check data or indicators.