In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from ta.momentum import RSIIndicator
from ta.trend import MACD
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings

warnings.filterwarnings("ignore")
import os

In [2]:
stocks = ["RELIANCE.NS", "HDFCBANK.NS", "INFY.NS"]
stock_data = {}

In [3]:
for stock in stocks:
    if os.path.exists(f"{stock}.csv"):
        stock_data[stock] = pd.read_csv(f"{stock}.csv", index_col=0, parse_dates=True)
    else:
        ticker = yf.Ticker(stock)
        stock_data[stock] = ticker.history(period="max")
        stock_data[stock].to_csv(f"{stock}.csv")

In [4]:
for stock in stocks:
    print(f"\nBasic Statistics for {stock}:")
    print(stock_data[stock][["Open", "High", "Low", "Close", "Volume"]].describe())


Basic Statistics for RELIANCE.NS:
              Open         High          Low        Close        Volume
count  7428.000000  7428.000000  7428.000000  7428.000000  7.428000e+03
mean    330.606448   334.200081   326.786256   330.358959  5.872732e+07
std     418.435364   422.302549   414.404913   418.221028  1.014932e+08
min       3.447046     3.462817     3.425643     3.462817  0.000000e+00
25%      19.602716    19.868367    19.320005    19.591828  1.347409e+07
50%     183.592594   185.615889   180.853095   183.490685  2.377707e+07
75%     410.233114   413.387463   406.147393   408.630997  6.121514e+07
max    1599.022925  1603.358288  1580.137072  1595.484985  1.448889e+09

Basic Statistics for HDFCBANK.NS:
              Open         High          Low        Close        Volume
count  7431.000000  7431.000000  7431.000000  7431.000000  7.431000e+03
mean    460.769056   465.090833   456.226109   460.741536  6.049395e+06
std     560.059051   564.701241   555.357269   560.046262  7.77269

In [5]:
# Create a professional-looking plot for each stock
fig = make_subplots(
    rows=3,
    cols=1,
    subplot_titles=[f"{stock} Closing Price and Volume" for stock in stocks],
    vertical_spacing=0.1,
    specs=[[{'secondary_y': True}] for _ in range(3)]  # Enable secondary_y for each subplot
)

for i, stock in enumerate(stocks, 1):
    df = stock_data[stock]
    # Add candlestick chart for stock price
    fig.add_trace(
        go.Candlestick(
            x=df.index,
            open=df["Open"],
            high=df["High"],
            low=df["Low"],
            close=df["Close"],
            name=stock,
            increasing_line_color="#00CC96",
            decreasing_line_color="#EF553B",
        ),
        row=i,
        col=1,
        secondary_y=False,
    )
    # Add volume bar chart
    fig.add_trace(
        go.Bar(
            x=df.index,
            y=df["Volume"],
            name=f"{stock} Volume",
            marker_color="#636EFA",
            opacity=0.3,
        ),
        row=i,
        col=1,
        secondary_y=True,
    )

# Update layout for a professional look
fig.update_layout(
    title="Nifty50 Stocks: Price and Volume (Last 5 Years)",
    height=1200,
    showlegend=False,
    template="plotly_dark",
    xaxis_rangeslider_visible=False,
)
fig.update_yaxes(title_text="Price (INR)", secondary_y=False)
fig.update_yaxes(title_text="Volume", secondary_y=True)

# Save and display the plot
fig.write_html("nifty50_eda.html")
fig.show()

In [6]:
# Step 2: Data Cleaning and Preprocessing
def preprocess_stock_data(df):
    """
    Clean and preprocess stock data, adding technical indicators.
    """
    try:
        # Convert index to datetime
        df.index = pd.to_datetime(df.index)
        
        # Remove unnecessary columns
        if "Dividends" in df.columns:
            del df["Dividends"]
        if "Stock Splits" in df.columns:
            del df["Stock Splits"]
        
        # Initial drop of missing values
        df = df.dropna()
        print(f"Shape after initial dropna: {df.shape}, NaN count: {df.isna().sum().sum()}")
        
        # Check if data is sufficient
        if len(df) < 10:  # Reduced minimum rows for testing
            raise ValueError(f"Insufficient data after initial cleaning: {len(df)} rows.")
        
        # Add target variable: 1 if tomorrow's close > today's close, else 0
        df["Tomorrow"] = df["Close"].shift(-1)
        df["Target"] = (df["Tomorrow"] > df["Close"]).astype(int)
        df["Tomorrow"] = df["Tomorrow"].fillna(method="ffill")
        df["Target"] = df["Target"].fillna(method="ffill")
        df = df.dropna(subset=["Tomorrow", "Target"])
        print(f"Shape after target addition: {df.shape}, NaN count: {df.isna().sum().sum()}")
        
        # Add technical indicators with reduced windows
        df["RSI_14"] = RSIIndicator(df["Close"], window=7).rsi()  # Reduced from 14 to 7
        macd = MACD(df["Close"], window_slow=26, window_fast=12, window_sign=9)  # Default MACD windows
        df["MACD"] = macd.macd()
        df["MACD_Signal"] = macd.macd_signal()
        df["SMA_10"] = df["Close"].rolling(window=10).mean()  # Reduced from 20 to 10
        df["EMA_10"] = df["Close"].ewm(span=10, adjust=False).mean()  # Reduced from 20 to 10
        
        # Volume-based features
        df["Volume_MA_10"] = df["Volume"].rolling(window=10).mean()  # Reduced from 20 to 10
        df["Volume_Ratio"] = df["Volume"] / df["Volume_MA_10"].replace(0, np.nan)  # Avoid division by zero
        
        # Fill NaN values with reasonable defaults
        df["RSI_14"] = df["RSI_14"].fillna(50)  # Neutral RSI value
        df["MACD"] = df["MACD"].fillna(0)
        df["MACD_Signal"] = df["MACD_Signal"].fillna(0)
        df["SMA_10"] = df["SMA_10"].fillna(method="ffill").fillna(df["Close"].mean())
        df["EMA_10"] = df["EMA_10"].fillna(method="ffill").fillna(df["Close"].mean())
        df["Volume_MA_10"] = df["Volume_MA_10"].fillna(0)
        df["Volume_Ratio"] = df["Volume_Ratio"].fillna(1.0)  # Neutral ratio
        
        # Final drop of any remaining NaN values
        df = df.dropna()
        print(f"Shape after adding indicators: {df.shape}, NaN count: {df.isna().sum().sum()}")
        
        if df.empty:
            raise ValueError("DataFrame is empty after adding indicators.")
        
        return df
    except Exception as e:
        print(f"Error preprocessing data: {str(e)}")
        return None

# Preprocess data for each stock (using earliest available data)
processed_data = {}
for stock in stocks:
    processed_df = preprocess_stock_data(stock_data[stock])  # Removed date filter to use all data
    if processed_df is not None:
        processed_data[stock] = processed_df
    else:
        raise ValueError(f"Preprocessing failed for {stock}. Check data or indicators.")

Shape after initial dropna: (7428, 5), NaN count: 0
Shape after target addition: (7428, 7), NaN count: 0
Shape after adding indicators: (7428, 14), NaN count: 0
Shape after initial dropna: (7431, 5), NaN count: 0
Shape after target addition: (7431, 7), NaN count: 0
Shape after adding indicators: (7431, 14), NaN count: 0
Shape after initial dropna: (7431, 5), NaN count: 0
Shape after target addition: (7431, 7), NaN count: 0
Shape after adding indicators: (7431, 14), NaN count: 0


In [7]:
# Visualize technical indicators for one stock (e.g., RELIANCE.NS)
stock = "RELIANCE.NS"
if stock not in processed_data or processed_data[stock] is None:
    raise ValueError(f"No valid data for {stock} after preprocessing.")
df = processed_data[stock]
fig = make_subplots(
    rows=4,
    cols=1,
    subplot_titles=("Closing Price with SMA/EMA", "RSI", "MACD", "Volume"),
    vertical_spacing=0.1,
    shared_xaxes=True,
    specs=[[{'secondary_y': False}], [{'secondary_y': False}], [{'secondary_y': False}], [{'secondary_y': False}]]
)

# Closing Price with SMA and EMA
fig.add_trace(
    go.Scatter(x=df.index, y=df["Close"], name="Close", line=dict(color="#00CC96")),
    row=1, col=1,
)
fig.add_trace(
    go.Scatter(x=df.index, y=df["SMA_10"], name="SMA 10", line=dict(color="#EF553B")),
    row=1, col=1,
)
fig.add_trace(
    go.Scatter(x=df.index, y=df["EMA_10"], name="EMA 10", line=dict(color="#FFA15A")),
    row=1, col=1,
)

# RSI
fig.add_trace(
    go.Scatter(x=df.index, y=df["RSI_14"], name="RSI 7", line=dict(color="#636EFA")),
    row=2, col=1,
)
fig.add_hline(y=70, line_dash="dash", line_color="red", row=2, col=1)
fig.add_hline(y=30, line_dash="dash", line_color="green", row=2, col=1)

# MACD
fig.add_trace(
    go.Scatter(x=df.index, y=df["MACD"], name="MACD", line=dict(color="#00CC96")),
    row=3, col=1,
)
fig.add_trace(
    go.Scatter(x=df.index, y=df["MACD_Signal"], name="Signal", line=dict(color="#EF553B")),
    row=3, col=1,
)

# Volume
fig.add_trace(
    go.Scatter(x=df.index, y=df["Volume"], name="Volume", line=dict(color="#636EFA", width=0.5, dash="dot")),
    row=4, col=1,
)

# Update layout
fig.update_layout(
    title=f"{stock}: Technical Indicators",
    height=1200,
    template="plotly_dark",
    showlegend=True,
)
fig.write_html(f"{stock}_indicators.html")
fig.show()

In [8]:
# Step 3: Model Training and Prediction
def predict(train, test, predictors, model):
    """
    Train model and predict next-day movement.
    """
    model.fit(train[predictors], train["Target"])
    preds = model.predict_proba(test[predictors])[:, 1]
    preds = np.where(preds >= 0.6, 1, 0)  # Threshold at 0.6
    preds = pd.Series(preds, index=test.index, name="Predictions")
    combined = pd.concat([test["Target"], preds], axis=1)
    return combined

def backtest(data, model, predictors, start=2500, step=250):
    """
    Backtest the model over the dataset.
    """
    all_predictions = []
    for i in range(start, data.shape[0], step):
        train = data.iloc[0:i].copy()
        test = data.iloc[i:(i + step)].copy()
        predictions = predict(train, test, predictors, model)
        all_predictions.append(predictions)
    return pd.concat(all_predictions)

# Define predictors
predictors = ["Close", "Volume", "RSI_14", "MACD", "MACD_Signal", "SMA_10", "EMA_10", "Volume_Ratio"]

# Train and backtest for each stock
model = RandomForestClassifier(n_estimators=200, min_samples_split=50, random_state=1)
predictions = {}
for stock in stocks:
    if stock in processed_data and processed_data[stock] is not None:
        predictions[stock] = backtest(processed_data[stock], model, predictors)
    else:
        print(f"Skipping {stock} due to preprocessing failure.")

# Evaluate model performance
for stock in stocks:
    if stock in predictions and predictions[stock] is not None:
        print(f"\nModel Performance for {stock}:")
        print("Precision Score:", precision_score(predictions[stock]["Target"], predictions[stock]["Predictions"]))
        print("Accuracy Score:", accuracy_score(predictions[stock]["Target"], predictions[stock]["Predictions"]))
        print(predictions[stock]["Predictions"].value_counts())


Model Performance for RELIANCE.NS:
Precision Score: 0.5143540669856459
Accuracy Score: 0.4882305194805195
Predictions
0    4510
1     418
Name: count, dtype: int64

Model Performance for HDFCBANK.NS:
Precision Score: 0.49310344827586206
Accuracy Score: 0.4838775096329345
Predictions
0    4641
1     290
Name: count, dtype: int64

Model Performance for INFY.NS:
Precision Score: 0.5157894736842106
Accuracy Score: 0.48793348205232207
Predictions
0    4266
1     665
Name: count, dtype: int64


In [12]:
def trading_strategy(predictions, df):
    """
    Implement a simple trading strategy based on predictions and compute cumulative returns.
    """
    # Create a copy and preserve the name attribute
    df = df.copy()
    df.name = df.name  # Ensure name is carried over from the input df
    
    df["Predictions"] = predictions["Predictions"]
    df["Position"] = df["Predictions"].shift(1)  # Buy/sell on next day
    df["Returns"] = df["Close"].pct_change()
    df["Strategy_Returns"] = df["Returns"] * df["Position"]
    
    # Handle NaN and infinite values
    df["Strategy_Returns"] = df["Strategy_Returns"].fillna(0)
    df["Strategy_Returns"] = df["Strategy_Returns"].replace([np.inf, -np.inf], 0)
    
    # Compute cumulative returns
    df["Cumulative_Returns"] = (1 + df["Strategy_Returns"]).cumprod()
    
    # Compute buy-and-hold returns for benchmarking
    df["Buy_Hold_Returns"] = (1 + df["Returns"]).cumprod()
    
    # Handle initial NaN in cumulative returns
    df["Cumulative_Returns"] = df["Cumulative_Returns"].fillna(1.0)
    df["Buy_Hold_Returns"] = df["Buy_Hold_Returns"].fillna(1.0)
    
    # Debugging: Print summary of returns
    print(f"\nTrading Strategy Summary for {df.name}:")
    print("Number of NaN in Strategy Returns:", df["Strategy_Returns"].isna().sum())
    print("Number of NaN in Cumulative Returns:", df["Cumulative_Returns"].isna().sum())
    print("Cumulative Returns (last 5):")
    print(df["Cumulative_Returns"].tail())
    
    return df

# Apply trading strategy and visualize results
for stock in stocks:
    if stock in processed_data and processed_data[stock] is not None:
        processed_data[stock].name = stock  # Set name for debugging
        df = trading_strategy(predictions[stock], processed_data[stock])
        
        # Plot cumulative returns with buy-and-hold benchmark
        fig = go.Figure()
        fig.add_trace(
            go.Scatter(
                x=df.index,
                y=df["Cumulative_Returns"],
                name="Strategy Returns",
                line=dict(color="#00CC96"),
            )
        )
        fig.add_trace(
            go.Scatter(
                x=df.index,
                y=df["Buy_Hold_Returns"],
                name="Buy and Hold Returns",
                line=dict(color="#EF553B", dash="dash"),
            )
        )
        fig.update_layout(
            title=f"{stock}: Cumulative Returns of Trading Strategy vs. Buy and Hold",
            yaxis_title="Cumulative Returns",
            template="plotly_dark",
            showlegend=True,
            height=600,
            xaxis=dict(rangeslider=dict(visible=False))  # Disable rangeslider for simplicity
        )
        fig.update_xaxes(title_text="Date")
        fig.write_html(f"{stock}_strategy_returns.html")
        fig.show()
    else:
        print(f"Skipping trading strategy visualization for {stock} due to data issues.")

AttributeError: 'DataFrame' object has no attribute 'name'