In [2]:
import yfinance as yf
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from ta.momentum import RSIIndicator
from ta.trend import MACD
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
import os

In [3]:
# Define the Nifty50 stocks to analyze
stocks = ["RELIANCE.NS", "HDFCBANK.NS", "INFY.NS"]
stock_data = {}

In [4]:
for stock in stocks:
    csv_file = f"{stock}.csv"
    if os.path.exists(csv_file):
        stock_data[stock] = pd.read_csv(csv_file, index_col=0, parse_dates=True)
        if stock_data[stock].empty:
            print(f"Warning: {csv_file} is empty. Fetching new data...")
            ticker = yf.Ticker(stock)
            stock_data[stock] = ticker.history(period="max")
            stock_data[stock].to_csv(csv_file)
    else:
        ticker = yf.Ticker(stock)
        stock_data[stock] = ticker.history(period="max")
        stock_data[stock].to_csv(csv_file)
    if stock_data[stock] is None or stock_data[stock].empty:
        raise ValueError(f"Failed to load data for {stock}. Check network or ticker validity.")
    print(f"Raw data for {stock} shape: {stock_data[stock].shape}")

Raw data for RELIANCE.NS shape: (253, 5)
Raw data for HDFCBANK.NS shape: (253, 5)
Raw data for INFY.NS shape: (253, 5)


  stock_data[stock] = pd.read_csv(csv_file, index_col=0, parse_dates=True)
  stock_data[stock] = pd.read_csv(csv_file, index_col=0, parse_dates=True)
  stock_data[stock] = pd.read_csv(csv_file, index_col=0, parse_dates=True)


In [5]:
def describe_csv(stock, df):
    """
    Print detailed information about the CSV file for a given stock.
    """
    print(f"\n=== CSV Description for {stock} ===")
    print("1. File Name:", f"{stock}.csv")
    print("2. Columns and Data Types:")
    print(df.dtypes)
    print("\n3. Number of Rows:", len(df))
    print("4. Date Range:", f"{df.index.min()} to {df.index.max()}")
    print("5. Missing Values:")
    print(df.isnull().sum())
    print("\n6. Basic Statistics:")
    print(df[["Open", "High", "Low", "Close", "Volume"]].describe())
    print("\n7. First 5 Rows:")
    print(df.head())
    print("\n" + "="*50 + "\n")

for stock in stocks:
    describe_csv(stock, stock_data[stock])


=== CSV Description for RELIANCE.NS ===
1. File Name: RELIANCE.NS.csv
2. Columns and Data Types:
Close     object
High      object
Low       object
Open      object
Volume    object
dtype: object

3. Number of Rows: 253
4. Date Range: 2024-08-01 to Ticker
5. Missing Values:
Close     1
High      1
Low       1
Open      1
Volume    1
dtype: int64

6. Basic Statistics:
          Open    High                Low              Close       Volume
count      252     252                252                252          252
unique     238     244                242                244          252
top     1431.0  1523.0  1410.699951171875  1278.199951171875  RELIANCE.NS
freq         3       2                  2                  3            1

7. First 5 Rows:
                         Close                High                 Low  \
Price                                                                    
Ticker             RELIANCE.NS         RELIANCE.NS         RELIANCE.NS   
Date               

In [10]:
# Create a professional-looking plot for each stock
fig = make_subplots(
    rows=3,
    cols=1,
    subplot_titles=[f"{stock} Closing Price and Volume" for stock in stocks],
    vertical_spacing=0.1,
    specs=[[{'secondary_y': True}] for _ in range(3)]  # Enable secondary_y for each subplot
)

for i, stock in enumerate(stocks, 1):
    df = stock_data[stock]
    # Add candlestick chart for stock price
    fig.add_trace(
        go.Candlestick(
            x=df.index,
            open=df["Open"],
            high=df["High"],
            low=df["Low"],
            close=df["Close"],
            name=stock,
            increasing_line_color="#00CC96",
            decreasing_line_color="#EF553B",
        ),
        row=i,
        col=1,
        secondary_y=False,
    )
    # Add volume bar chart
    fig.add_trace(
        go.Bar(
            x=df.index,
            y=df["Volume"],
            name=f"{stock} Volume",
            marker_color="#636EFA",
            opacity=0.3,
        ),
        row=i,
        col=1,
        secondary_y=True,
    )

# Update layout for a professional look
fig.update_layout(
    title="Nifty50 Stocks: Price and Volume (Last 5 Years)",
    height=1200,
    showlegend=False,
    template="plotly_dark",
    xaxis_rangeslider_visible=False,
)
fig.update_yaxes(title_text="Price (INR)", secondary_y=False)
fig.update_yaxes(title_text="Volume", secondary_y=True)

# Save and display the plot
fig.write_html("nifty50_eda.html")
fig.show()



In [11]:
# Step 2: Data Cleaning and Preprocessing
def preprocess_stock_data(df):
    """
    Clean and preprocess stock data, adding technical indicators.
    """
    try:
        # Convert index to datetime
        df.index = pd.to_datetime(df.index)
        
        # Remove unnecessary columns
        if "Dividends" in df.columns:
            del df["Dividends"]
        if "Stock Splits" in df.columns:
            del df["Stock Splits"]
        
        # Initial drop of missing values
        df = df.dropna()
        print(f"Shape after initial dropna: {df.shape}, NaN count: {df.isna().sum().sum()}")
        
        # Check if data is sufficient
        if len(df) < 10:  # Reduced minimum rows for testing
            raise ValueError(f"Insufficient data after initial cleaning: {len(df)} rows.")
        
        # Add target variable: 1 if tomorrow's close > today's close, else 0
        df["Tomorrow"] = df["Close"].shift(-1)
        df["Target"] = (df["Tomorrow"] > df["Close"]).astype(int)
        df["Tomorrow"] = df["Tomorrow"].fillna(method="ffill")
        df["Target"] = df["Target"].fillna(method="ffill")
        df = df.dropna(subset=["Tomorrow", "Target"])
        print(f"Shape after target addition: {df.shape}, NaN count: {df.isna().sum().sum()}")
        
        # Add technical indicators with reduced windows
        df["RSI_14"] = RSIIndicator(df["Close"], window=7).rsi()  # Reduced from 14 to 7
        macd = MACD(df["Close"], window_slow=26, window_fast=12, window_sign=9)  # Default MACD windows
        df["MACD"] = macd.macd()
        df["MACD_Signal"] = macd.macd_signal()
        df["SMA_10"] = df["Close"].rolling(window=10).mean()  # Reduced from 20 to 10
        df["EMA_10"] = df["Close"].ewm(span=10, adjust=False).mean()  # Reduced from 20 to 10
        
        # Volume-based features
        df["Volume_MA_10"] = df["Volume"].rolling(window=10).mean()  # Reduced from 20 to 10
        df["Volume_Ratio"] = df["Volume"] / df["Volume_MA_10"].replace(0, np.nan)  # Avoid division by zero
        df["Volatility"] = df["Close"].rolling(window=10).std()  # New feature: volatility
        
        # Fill NaN values with reasonable defaults
        df["RSI_14"] = df["RSI_14"].fillna(50)  # Neutral RSI value
        df["MACD"] = df["MACD"].fillna(0)
        df["MACD_Signal"] = df["MACD_Signal"].fillna(0)
        df["SMA_10"] = df["SMA_10"].fillna(method="ffill").fillna(df["Close"].mean())
        df["EMA_10"] = df["EMA_10"].fillna(method="ffill").fillna(df["Close"].mean())
        df["Volume_MA_10"] = df["Volume_MA_10"].fillna(0)
        df["Volume_Ratio"] = df["Volume_Ratio"].fillna(1.0)  # Neutral ratio
        df["Volatility"] = df["Volatility"].fillna(0)  # Fill NaN volatility with 0
        
        # Final drop of any remaining NaN values
        df = df.dropna()
        print(f"Shape after adding indicators: {df.shape}, NaN count: {df.isna().sum().sum()}")
        
        if df.empty:
            raise ValueError("DataFrame is empty after adding indicators.")
        
        return df
    except Exception as e:
        print(f"Error preprocessing data: {str(e)}")
        return None

# Preprocess data for each stock (using earliest available data)
processed_data = {}
for stock in stocks:
    processed_df = preprocess_stock_data(stock_data[stock])  # Removed date filter to use all data
    if processed_df is not None:
        processed_data[stock] = processed_df
    else:
        raise ValueError(f"Preprocessing failed for {stock}. Check data or indicators.")

# Visualize technical indicators for one stock (e.g., RELIANCE.NS)
stock = "RELIANCE.NS"
if stock not in processed_data or processed_data[stock] is None:
    raise ValueError(f"No valid data for {stock} after preprocessing.")
df = processed_data[stock]
fig = make_subplots(
    rows=4,
    cols=1,
    subplot_titles=("Closing Price with SMA/EMA", "RSI", "MACD", "Volume"),
    vertical_spacing=0.1,
    shared_xaxes=True,
    specs=[[{'secondary_y': False}], [{'secondary_y': False}], [{'secondary_y': False}], [{'secondary_y': False}]]
)

# Closing Price with SMA and EMA
fig.add_trace(
    go.Scatter(x=df.index, y=df["Close"], name="Close", line=dict(color="#00CC96")),
    row=1, col=1,
)
fig.add_trace(
    go.Scatter(x=df.index, y=df["SMA_10"], name="SMA 10", line=dict(color="#EF553B")),
    row=1, col=1,
)
fig.add_trace(
    go.Scatter(x=df.index, y=df["EMA_10"], name="EMA 10", line=dict(color="#FFA15A")),
    row=1, col=1,
)

# RSI
fig.add_trace(
    go.Scatter(x=df.index, y=df["RSI_14"], name="RSI 7", line=dict(color="#636EFA")),
    row=2, col=1,
)
fig.add_hline(y=70, line_dash="dash", line_color="red", row=2, col=1)
fig.add_hline(y=30, line_dash="dash", line_color="green", row=2, col=1)

# MACD
fig.add_trace(
    go.Scatter(x=df.index, y=df["MACD"], name="MACD", line=dict(color="#00CC96")),
    row=3, col=1,
)
fig.add_trace(
    go.Scatter(x=df.index, y=df["MACD_Signal"], name="Signal", line=dict(color="#EF553B")),
    row=3, col=1,
)

# Volume
fig.add_trace(
    go.Scatter(x=df.index, y=df["Volume"], name="Volume", line=dict(color="#636EFA", width=0.5, dash="dot")),
    row=4, col=1,
)

# Update layout
fig.update_layout(
    title=f"{stock}: Technical Indicators",
    height=1200,
    template="plotly_dark",
    showlegend=True,
)
fig.write_html(f"{stock}_indicators.html")
fig.show()

# Step 3: Model Training and Prediction
def predict(train, test, predictors, model, threshold=0.5):
    """
    Train model and predict next-day movement with optimized threshold.
    """
    model.fit(train[predictors], train["Target"])
    preds_proba = model.predict_proba(test[predictors])[:, 1]
    preds = np.where(preds_proba >= threshold, 1, 0)
    preds = pd.Series(preds, index=test.index, name="Predictions")
    combined = pd.concat([test["Target"], preds], axis=1)
    return combined, preds_proba

def backtest(data, model, predictors, scale_pos_weight):
    """
    Backtest the model with train/validation/test split and threshold optimization.
    """
    # Calculate scale_pos_weight based on class imbalance
    neg, pos = np.bincount(data["Target"])
    scale_pos_weight_val = neg / pos if pos > 0 else 1.0
    
    # Split data: 70% train, 15% validation, 15% test
    train_data, temp_data = train_test_split(data, train_size=0.7, shuffle=False)
    val_data, test_data = train_test_split(temp_data, train_size=0.5, shuffle=False)
    
    # Define parameter grid for GridSearchCV
    param_grid = {
        'n_estimators': [100, 300, 500],
        'min_child_weight': [10, 20, 50]  # XGBoost uses min_child_weight instead of min_samples_split
    }
    
    # Initialize and train model with GridSearchCV
    xgb_model = XGBClassifier(scale_pos_weight=scale_pos_weight_val, random_state=1)
    grid_search = GridSearchCV(xgb_model, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
    grid_search.fit(train_data[predictors], train_data["Target"])
    best_model = grid_search.best_estimator_
    print(f"Best parameters for {data.name}: {grid_search.best_params_}")
    
    # Optimize threshold on validation set
    _, val_probs = predict(val_data, val_data, predictors, best_model, threshold=0.5)
    best_threshold = 0.5
    best_score = 0.0
    for thresh in np.arange(0.5, 0.9, 0.1):
        preds = np.where(val_probs >= thresh, 1, 0)
        score = 0.5 * precision_score(val_data["Target"], preds) + 0.5 * accuracy_score(val_data["Target"], preds)
        if score > best_score:
            best_score = score
            best_threshold = thresh
    print(f"Optimal threshold for {data.name}: {best_threshold}")
    
    # Apply best model and threshold to test set
    predictions, _ = predict(test_data, test_data, predictors, best_model, threshold=best_threshold)
    return predictions

# Define predictors
predictors = ["Close", "Volume", "RSI_14", "MACD", "MACD_Signal", "SMA_10", "EMA_10", "Volume_Ratio", "Volatility"]

# Train and backtest for each stock
predictions = {}
for stock in stocks:
    if stock in processed_data and processed_data[stock] is not None:
        processed_data[stock].name = stock  # Set name for GridSearchCV and threshold optimization
        predictions[stock] = backtest(processed_data[stock], XGBClassifier(), predictors, scale_pos_weight=1.0)
    else:
        print(f"Skipping {stock} due to preprocessing failure.")

# Evaluate model performance
for stock in stocks:
    if stock in predictions and predictions[stock] is not None:
        print(f"\nModel Performance for {stock}:")
        print("Precision Score:", precision_score(predictions[stock]["Target"], predictions[stock]["Predictions"]))
        print("Accuracy Score:", accuracy_score(predictions[stock]["Target"], predictions[stock]["Predictions"]))
        print(predictions[stock]["Predictions"].value_counts())

# Step 4: Trading Strategy
def trading_strategy(predictions, df):
    """
    Implement a simple trading strategy based on predictions and compute cumulative returns.
    """
    # Create a copy and preserve the name attribute
    df = df.copy()
    df.name = df.name  # Ensure name is carried over from the input df
    
    df["Predictions"] = predictions["Predictions"]
    df["Position"] = df["Predictions"].shift(1)  # Buy/sell on next day
    df["Returns"] = df["Close"].pct_change()
    df["Strategy_Returns"] = df["Returns"] * df["Position"]
    
    # Handle NaN and infinite values
    df["Strategy_Returns"] = df["Strategy_Returns"].fillna(0)
    df["Strategy_Returns"] = df["Strategy_Returns"].replace([np.inf, -np.inf], 0)
    
    # Compute cumulative returns
    df["Cumulative_Returns"] = (1 + df["Strategy_Returns"]).cumprod()
    
    # Compute buy-and-hold returns for benchmarking
    df["Buy_Hold_Returns"] = (1 + df["Returns"]).cumprod()
    
    # Handle initial NaN in cumulative returns
    df["Cumulative_Returns"] = df["Cumulative_Returns"].fillna(1.0)
    df["Buy_Hold_Returns"] = df["Buy_Hold_Returns"].fillna(1.0)
    
    # Debugging: Print summary of returns
    print(f"\nTrading Strategy Summary for {df.name}:")
    print("Number of NaN in Strategy Returns:", df["Strategy_Returns"].isna().sum())
    print("Number of NaN in Cumulative Returns:", df["Cumulative_Returns"].isna().sum())
    print("Cumulative Returns (last 5):")
    print(df["Cumulative_Returns"].tail())
    
    return df

# Apply trading strategy and visualize results
for stock in stocks:
    if stock in processed_data and processed_data[stock] is not None:
        processed_data[stock].name = stock  # Set name for debugging
        df = trading_strategy(predictions[stock], processed_data[stock])
        
        # Plot cumulative returns with buy-and-hold benchmark
        fig = go.Figure()
        fig.add_trace(
            go.Scatter(
                x=df.index,
                y=df["Cumulative_Returns"],
                name="Strategy Returns",
                line=dict(color="#00CC96"),
            )
        )
        fig.add_trace(
            go.Scatter(
                x=df.index,
                y=df["Buy_Hold_Returns"],
                name="Buy and Hold Returns",
                line=dict(color="#EF553B", dash="dash"),
            )
        )
        fig.update_layout(
            title=f"{stock}: Cumulative Returns of Trading Strategy vs. Buy and Hold",
            yaxis_title="Cumulative Returns",
            template="plotly_dark",
            showlegend=True,
            height=600,
            xaxis=dict(rangeslider=dict(visible=False))  # Disable rangeslider for simplicity
        )
        fig.update_xaxes(title_text="Date")
        fig.write_html(f"{stock}_strategy_returns.html")
        fig.show()
    else:
        print(f"Skipping trading strategy visualization for {stock} due to data issues.")

Error preprocessing data: Unknown datetime string format, unable to parse: Ticker, at position 0



Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.



ValueError: Preprocessing failed for RELIANCE.NS. Check data or indicators.

In [13]:
# -*- coding: utf-8 -*-
"""
Nifty50 Stock Price Prediction
This script performs EDA, describes CSV files, preprocesses data, predicts next-day price movements
for three Nifty50 stocks using XGBoost and technical indicators, and visualizes trading strategy results.
"""

import yfinance as yf
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from ta.momentum import RSIIndicator
from ta.trend import MACD
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
import os

# Step 1: Exploratory Data Analysis (EDA)
# Define the Nifty50 stocks to analyze
stocks = ["RELIANCE.NS", "HDFCBANK.NS", "INFY.NS"]
stock_data = {}

# Fetch historical data for each stock
for stock in stocks:
    csv_file = f"{stock}.csv"
    if os.path.exists(csv_file):
        try:
            stock_data[stock] = pd.read_csv(csv_file, index_col=0, parse_dates=True, date_format="%Y-%m-%d")
            if stock_data[stock].empty or stock_data[stock].index[0] == "Ticker":
                print(f"Warning: {csv_file} is invalid or empty. Fetching new data...")
                os.remove(csv_file)  # Delete invalid file
        except Exception as e:
            print(f"Error reading {csv_file}: {str(e)}. Fetching new data...")
            if os.path.exists(csv_file):
                os.remove(csv_file)
    if not os.path.exists(csv_file) or stock_data.get(stock) is None:
        ticker = yf.Ticker(stock)
        stock_data[stock] = ticker.history(period="max")
        stock_data[stock].to_csv(csv_file)
    if stock_data[stock] is None or stock_data[stock].empty:
        raise ValueError(f"Failed to load data for {stock}. Check network or ticker validity.")
    print(f"Raw data for {stock} shape: {stock_data[stock].shape}")
    print(f"Raw data head for {stock}:\n{stock_data[stock].head()}")

# Describe each CSV file
def describe_csv(stock, df):
    """
    Print detailed information about the CSV file for a given stock.
    """
    print(f"\n=== CSV Description for {stock} ===")
    print("1. File Name:", f"{stock}.csv")
    print("2. Columns and Data Types:")
    print(df.dtypes)
    print("\n3. Number of Rows:", len(df))
    print("4. Date Range:", f"{df.index.min()} to {df.index.max()}")
    print("5. Missing Values:")
    print(df.isnull().sum())
    print("\n6. Basic Statistics:")
    print(df[["Open", "High", "Low", "Close", "Volume"]].describe())
    print("\n7. First 5 Rows:")
    print(df.head())
    print("\n" + "="*50 + "\n")

for stock in stocks:
    describe_csv(stock, stock_data[stock])

# Create a professional-looking plot for each stock
fig = make_subplots(
    rows=3,
    cols=1,
    subplot_titles=[f"{stock} Closing Price and Volume" for stock in stocks],
    vertical_spacing=0.1,
    specs=[[{'secondary_y': True}] for _ in range(3)]  # Enable secondary_y for each subplot
)

for i, stock in enumerate(stocks, 1):
    df = stock_data[stock]
    # Add candlestick chart for stock price
    fig.add_trace(
        go.Candlestick(
            x=df.index,
            open=df["Open"],
            high=df["High"],
            low=df["Low"],
            close=df["Close"],
            name=stock,
            increasing_line_color="#00CC96",
            decreasing_line_color="#EF553B",
        ),
        row=i,
        col=1,
        secondary_y=False,
    )
    # Add volume bar chart
    fig.add_trace(
        go.Bar(
            x=df.index,
            y=df["Volume"],
            name=f"{stock} Volume",
            marker_color="#636EFA",
            opacity=0.3,
        ),
        row=i,
        col=1,
        secondary_y=True,
    )

# Update layout for a professional look
fig.update_layout(
    title="Nifty50 Stocks: Price and Volume (Last 5 Years)",
    height=1200,
    showlegend=False,
    template="plotly_dark",
    xaxis_rangeslider_visible=False,
)
fig.update_yaxes(title_text="Price (INR)", secondary_y=False)
fig.update_yaxes(title_text="Volume", secondary_y=True)

# Save and display the plot
fig.write_html("nifty50_eda.html")
fig.show()

# Step 2: Data Cleaning and Preprocessing
def preprocess_stock_data(df):
    """
    Clean and preprocess stock data, adding technical indicators.
    """
    try:
        # Convert index to datetime
        df.index = pd.to_datetime(df.index)
        
        # Remove unnecessary columns
        if "Dividends" in df.columns:
            del df["Dividends"]
        if "Stock Splits" in df.columns:
            del df["Stock Splits"]
        
        # Initial drop of missing values
        df = df.dropna()
        print(f"Step 1 - Shape after initial dropna: {df.shape}, NaN count: {df.isna().sum().sum()}")
        
        # Check if data is sufficient
        if len(df) < 5:  # Relaxed from 10 to 5 for testing
            raise ValueError(f"Insufficient data after initial cleaning: {len(df)} rows.")
        
        # Add target variable: 1 if tomorrow's close > today's close, else 0
        df["Tomorrow"] = df["Close"].shift(-1)
        df["Target"] = (df["Tomorrow"] > df["Close"]).astype(int)
        df = df.dropna(subset=["Tomorrow", "Target"])
        print(f"Step 2 - Shape after target addition: {df.shape}, NaN count: {df.isna().sum().sum()}")
        
        # Add technical indicators with reduced windows
        df["RSI_14"] = RSIIndicator(df["Close"], window=7).rsi()
        macd = MACD(df["Close"], window_slow=26, window_fast=12, window_sign=9)
        df["MACD"] = macd.macd()
        df["MACD_Signal"] = macd.macd_signal()
        df["SMA_10"] = df["Close"].rolling(window=10).mean()
        df["EMA_10"] = df["Close"].ewm(span=10, adjust=False).mean()
        
        # Volume-based features
        df["Volume_MA_10"] = df["Volume"].rolling(window=10).mean()
        df["Volume_Ratio"] = df["Volume"] / df["Volume_MA_10"].replace(0, np.nan)
        df["Volatility"] = df["Close"].rolling(window=10).std()
        
        # Fill NaN values with reasonable defaults before final drop
        df["RSI_14"] = df["RSI_14"].fillna(50)
        df["MACD"] = df["MACD"].fillna(0)
        df["MACD_Signal"] = df["MACD_Signal"].fillna(0)
        df["SMA_10"] = df["SMA_10"].fillna(method="ffill").fillna(df["Close"].mean())
        df["EMA_10"] = df["EMA_10"].fillna(method="ffill").fillna(df["Close"].mean())
        df["Volume_MA_10"] = df["Volume_MA_10"].fillna(0)
        df["Volume_Ratio"] = df["Volume_Ratio"].fillna(1.0)
        df["Volatility"] = df["Volatility"].fillna(0)
        print(f"Step 3 - Shape after indicator calc and NaN fill: {df.shape}, NaN count: {df.isna().sum().sum()}")
        
        # Final drop of any remaining NaN values
        df = df.dropna()
        print(f"Step 4 - Shape after final dropna: {df.shape}, NaN count: {df.isna().sum().sum()}")
        
        if df.empty:
            raise ValueError("DataFrame is empty after adding indicators.")
        
        return df
    except Exception as e:
        print(f"Error preprocessing data: {str(e)}")
        return None

# Preprocess data for each stock (using earliest available data)
processed_data = {}
for stock in stocks:
    processed_df = preprocess_stock_data(stock_data[stock])  # Removed date filter to use all data
    if processed_df is not None:
        processed_data[stock] = processed_df
    else:
        raise ValueError(f"Preprocessing failed for {stock}. Check data or indicators.")

# Visualize technical indicators for one stock (e.g., RELIANCE.NS)
stock = "RELIANCE.NS"
if stock not in processed_data or processed_data[stock] is None:
    raise ValueError(f"No valid data for {stock} after preprocessing.")
df = processed_data[stock]
fig = make_subplots(
    rows=4,
    cols=1,
    subplot_titles=("Closing Price with SMA/EMA", "RSI", "MACD", "Volume"),
    vertical_spacing=0.1,
    shared_xaxes=True,
    specs=[[{'secondary_y': False}], [{'secondary_y': False}], [{'secondary_y': False}], [{'secondary_y': False}]]
)

# Closing Price with SMA and EMA
fig.add_trace(
    go.Scatter(x=df.index, y=df["Close"], name="Close", line=dict(color="#00CC96")),
    row=1, col=1,
)
fig.add_trace(
    go.Scatter(x=df.index, y=df["SMA_10"], name="SMA 10", line=dict(color="#EF553B")),
    row=1, col=1,
)
fig.add_trace(
    go.Scatter(x=df.index, y=df["EMA_10"], name="EMA 10", line=dict(color="#FFA15A")),
    row=1, col=1,
)

# RSI
fig.add_trace(
    go.Scatter(x=df.index, y=df["RSI_14"], name="RSI 7", line=dict(color="#636EFA")),
    row=2, col=1,
)
fig.add_hline(y=70, line_dash="dash", line_color="red", row=2, col=1)
fig.add_hline(y=30, line_dash="dash", line_color="green", row=2, col=1)

# MACD
fig.add_trace(
    go.Scatter(x=df.index, y=df["MACD"], name="MACD", line=dict(color="#00CC96")),
    row=3, col=1,
)
fig.add_trace(
    go.Scatter(x=df.index, y=df["MACD_Signal"], name="Signal", line=dict(color="#EF553B")),
    row=3, col=1,
)

# Volume
fig.add_trace(
    go.Scatter(x=df.index, y=df["Volume"], name="Volume", line=dict(color="#636EFA", width=0.5, dash="dot")),
    row=4, col=1,
)

# Update layout
fig.update_layout(
    title=f"{stock}: Technical Indicators",
    height=1200,
    template="plotly_dark",
    showlegend=True,
)
fig.write_html(f"{stock}_indicators.html")
fig.show()

Raw data for RELIANCE.NS shape: (7428, 7)
Raw data head for RELIANCE.NS:
                               Open      High       Low     Close     Volume  \
Date                                                                           
1996-01-01 00:00:00+05:30  4.618591  4.643373  4.588175  4.635488  104121369   
1996-01-02 00:00:00+05:30  4.624223  4.646753  4.565646  4.599440  168743308   
1996-01-03 00:00:00+05:30  4.674916  4.887822  4.624225  4.634363  209323879   
1996-01-04 00:00:00+05:30  4.590429  4.605073  4.529599  4.591555  216900264   
1996-01-05 00:00:00+05:30  4.573532  4.573532  4.520587  4.560014  166708467   

                           Dividends  Stock Splits  
Date                                                
1996-01-01 00:00:00+05:30        0.0           0.0  
1996-01-02 00:00:00+05:30        0.0           0.0  
1996-01-03 00:00:00+05:30        0.0           0.0  
1996-01-04 00:00:00+05:30        0.0           0.0  
1996-01-05 00:00:00+05:30        0.0           0

Step 1 - Shape after initial dropna: (7428, 5), NaN count: 0
Step 2 - Shape after target addition: (7427, 7), NaN count: 0
Step 3 - Shape after indicator calc and NaN fill: (7427, 15), NaN count: 0
Step 4 - Shape after final dropna: (7427, 15), NaN count: 0
Step 1 - Shape after initial dropna: (7431, 5), NaN count: 0
Step 2 - Shape after target addition: (7430, 7), NaN count: 0
Step 3 - Shape after indicator calc and NaN fill: (7430, 15), NaN count: 0
Step 4 - Shape after final dropna: (7430, 15), NaN count: 0
Step 1 - Shape after initial dropna: (7431, 5), NaN count: 0
Step 2 - Shape after target addition: (7430, 7), NaN count: 0
Step 3 - Shape after indicator calc and NaN fill: (7430, 15), NaN count: 0
Step 4 - Shape after final dropna: (7430, 15), NaN count: 0



Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.


Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.


Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.


Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.


Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.


Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.



In [14]:
# Step 3: Model Training and Prediction
def predict(train, test, predictors, model, threshold=0.5):
    """
    Train model and predict next-day movement with optimized threshold.
    """
    model.fit(train[predictors], train["Target"])
    preds_proba = model.predict_proba(test[predictors])[:, 1]
    preds = np.where(preds_proba >= threshold, 1, 0)
    preds = pd.Series(preds, index=test.index, name="Predictions")
    combined = pd.concat([test["Target"], preds], axis=1)
    return combined, preds_proba

def backtest(data, model, predictors, scale_pos_weight):
    """
    Backtest the model with train/validation/test split and threshold optimization.
    """
    # Calculate scale_pos_weight based on class imbalance
    neg, pos = np.bincount(data["Target"])
    scale_pos_weight_val = neg / pos if pos > 0 else 1.0
    
    # Split data: 70% train, 15% validation, 15% test
    train_data, temp_data = train_test_split(data, train_size=0.7, shuffle=False)
    val_data, test_data = train_test_split(temp_data, train_size=0.5, shuffle=False)
    
    # Define parameter grid for GridSearchCV
    param_grid = {
        'n_estimators': [100, 300, 500],
        'min_child_weight': [10, 20, 50]  # XGBoost uses min_child_weight instead of min_samples_split
    }
    
    # Initialize and train model with GridSearchCV
    xgb_model = XGBClassifier(scale_pos_weight=scale_pos_weight_val, random_state=1)
    grid_search = GridSearchCV(xgb_model, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
    grid_search.fit(train_data[predictors], train_data["Target"])
    best_model = grid_search.best_estimator_
    print(f"Best parameters for {data.name}: {grid_search.best_params_}")
    
    # Optimize threshold on validation set
    _, val_probs = predict(val_data, val_data, predictors, best_model, threshold=0.5)
    best_threshold = 0.5
    best_score = 0.0
    for thresh in np.arange(0.5, 0.9, 0.1):
        preds = np.where(val_probs >= thresh, 1, 0)
        score = 0.5 * precision_score(val_data["Target"], preds) + 0.5 * accuracy_score(val_data["Target"], preds)
        if score > best_score:
            best_score = score
            best_threshold = thresh
    print(f"Optimal threshold for {data.name}: {best_threshold}")
    
    # Apply best model and threshold to test set
    predictions, _ = predict(test_data, test_data, predictors, best_model, threshold=best_threshold)
    return predictions

# Define predictors
predictors = ["Close", "Volume", "RSI_14", "MACD", "MACD_Signal", "SMA_10", "EMA_10", "Volume_Ratio", "Volatility"]

# Train and backtest for each stock
predictions = {}
for stock in stocks:
    if stock in processed_data and processed_data[stock] is not None:
        processed_data[stock].name = stock  # Set name for GridSearchCV and threshold optimization
        predictions[stock] = backtest(processed_data[stock], XGBClassifier(), predictors, scale_pos_weight=1.0)
    else:
        print(f"Skipping {stock} due to preprocessing failure.")

# Evaluate model performance
for stock in stocks:
    if stock in predictions and predictions[stock] is not None:
        print(f"\nModel Performance for {stock}:")
        print("Precision Score:", precision_score(predictions[stock]["Target"], predictions[stock]["Predictions"]))
        print("Accuracy Score:", accuracy_score(predictions[stock]["Target"], predictions[stock]["Predictions"]))
        print(predictions[stock]["Predictions"].value_counts())

Best parameters for RELIANCE.NS: {'min_child_weight': 10, 'n_estimators': 300}
Optimal threshold for RELIANCE.NS: 0.5
Best parameters for HDFCBANK.NS: {'min_child_weight': 50, 'n_estimators': 500}
Optimal threshold for HDFCBANK.NS: 0.6
Best parameters for INFY.NS: {'min_child_weight': 50, 'n_estimators': 500}
Optimal threshold for INFY.NS: 0.6

Model Performance for RELIANCE.NS:
Precision Score: 1.0
Accuracy Score: 1.0
Predictions
1    580
0    535
Name: count, dtype: int64

Model Performance for HDFCBANK.NS:
Precision Score: 0.918854415274463
Accuracy Score: 0.7919282511210762
Predictions
0    696
1    419
Name: count, dtype: int64

Model Performance for INFY.NS:
Precision Score: 0.9122340425531915
Accuracy Score: 0.7623318385650224
Predictions
0    739
1    376
Name: count, dtype: int64


In [2]:
# Step 4: Trading Strategy
def trading_strategy(predictions, df):
    """
    Implement a simple trading strategy based on predictions and compute cumulative returns.
    """
    # Create a copy and preserve the name attribute
    df = df.copy()
    df.name = df.name if hasattr(df, 'name') else None  # Safely check and set name
    if df.name is None:
        df.name = "Unknown"  # Default name if not set
    
    df["Predictions"] = predictions["Predictions"]
    df["Position"] = df["Predictions"].shift(1)  # Buy/sell on next day
    df["Returns"] = df["Close"].pct_change()
    df["Strategy_Returns"] = df["Returns"] * df["Position"]
    
    # Handle NaN and infinite values
    df["Strategy_Returns"] = df["Strategy_Returns"].fillna(0)
    df["Strategy_Returns"] = df["Strategy_Returns"].replace([np.inf, -np.inf], 0)
    
    # Compute cumulative returns
    df["Cumulative_Returns"] = (1 + df["Strategy_Returns"]).cumprod()
    
    # Compute buy-and-hold returns for benchmarking
    df["Buy_Hold_Returns"] = (1 + df["Returns"]).cumprod()
    
    # Handle initial NaN in cumulative returns
    df["Cumulative_Returns"] = df["Cumulative_Returns"].fillna(1.0)
    df["Buy_Hold_Returns"] = df["Buy_Hold_Returns"].fillna(1.0)
    
    # Debugging: Print summary of returns
    print(f"\nTrading Strategy Summary for {df.name}:")
    print("Number of NaN in Strategy Returns:", df["Strategy_Returns"].isna().sum())
    print("Number of NaN in Cumulative Returns:", df["Cumulative_Returns"].isna().sum())
    print("Cumulative Returns (last 5):")
    print(df["Cumulative_Returns"].tail())
    
    return df

# Apply trading strategy and visualize results
# Ensure stocks is defined here if running this section independently
stocks = ["RELIANCE.NS", "HDFCBANK.NS", "INFY.NS"]  # Redefine if needed
for stock in stocks:
    if stock in processed_data and processed_data[stock] is not None:
        processed_data[stock].name = stock  # Set name for debugging
        df = trading_strategy(predictions[stock], processed_data[stock])
        
        # Plot cumulative returns with buy-and-hold benchmark
        fig = go.Figure()
        fig.add_trace(
            go.Scatter(
                x=df.index,
                y=df["Cumulative_Returns"],
                name="Strategy Returns",
                line=dict(color="#00CC96"),
            )
        )
        fig.add_trace(
            go.Scatter(
                x=df.index,
                y=df["Buy_Hold_Returns"],
                name="Buy and Hold Returns",
                line=dict(color="#EF553B", dash="dash"),
            )
        )
        fig.update_layout(
            title=f"{stock}: Cumulative Returns of Trading Strategy vs. Buy and Hold",
            yaxis_title="Cumulative Returns",
            template="plotly_dark",
            showlegend=True,
            height=600,
            xaxis=dict(rangeslider=dict(visible=False))  # Disable rangeslider for simplicity
        )
        fig.update_xaxes(title_text="Date")
        fig.write_html(f"{stock}_strategy_returns.html")
        fig.show()
    else:
        print(f"Skipping trading strategy visualization for {stock} due to data issues.")

NameError: name 'processed_data' is not defined