In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sb
from scipy.stats import skew
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from datetime import date, timedelta
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score
import math
from tensorflow.keras.layers import Bidirectional

In [2]:
# Ticker symbol to analyse. Surrounding whitespace is stripped so the lookup
# uses exactly what the user meant, and an empty entry is rejected here
# instead of failing later inside the data fetch.
symbol = input("enter symbol here : ").strip()
if not symbol:
    raise ValueError("No symbol entered; please provide a ticker such as '^NSEI'.")

##### Fetch Live stock data 

In [3]:
import yfinance as yf

print(f"Fetching data for '{symbol}' using yfinance...")

try:
    # Create a Ticker object for the requested symbol
    sym_data = yf.Ticker(symbol)

    # Fetch historical market data: the last 3 months of daily bars
    data = sym_data.history(period="3mo", interval="1d")

    # history() may come back empty (e.g. for an unknown symbol); stop here
    # rather than letting later cells fail on a frame with no rows.
    if data.empty:
        raise ValueError(f"no price data returned for '{symbol}'")

    # Move the timestamp index into a regular column. Its name after
    # reset_index() varies ('Date', 'Datetime' or 'index'), but it is always
    # the first column, so normalise it to 'Datetime' for the later cells.
    data = data.reset_index()
    data = data.rename(columns={data.columns[0]: "Datetime"})

    # Keep the timestamp plus the four price columns used downstream
    data = data[["Datetime", "Open", "High", "Low", "Close"]]

    # Display the most recent prices
    print("\nHere are the last 5 data points:")
    print(data[["Open", "High", "Low", "Close"]].tail().round(2))

except Exception as e:
    print(f"\nAn error occurred: {e}")
    # Every later cell needs `data`, so surface the failure instead of continuing
    raise

Fetching data for '^NSEI' using yfinance...

Here are the last 5 data points:
        Open      High       Low     Close
60  25142.00  25153.65  25054.90  25083.75
61  25064.15  25084.85  24859.15  24870.10
62  24949.15  25021.55  24894.35  24967.75
63  24899.50  24919.65  24689.60  24712.05
64  24695.80  24702.65  24481.60  24500.90


##### Necessary data pre-processing steps

In [None]:
# Descriptive statistics, transposed so each column of `data` becomes one row
summary_table = data.describe().T
print("summary statistics  \n", summary_table)

##### Stock Price Visualization 

In [None]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# Resolve the x-axis values before plotting: prefer a 'Datetime' column,
# then a datetime index, and otherwise fall back to plain row positions.
# Nothing is written back into `data`, so re-running this cell is harmless.
if "Datetime" in data.columns:
    timestamps = pd.to_datetime(data["Datetime"])
elif isinstance(data.index, pd.DatetimeIndex):
    timestamps = data.index.to_series()
else:
    timestamps = None

fig, ax = plt.subplots(figsize=(14, 6))

if timestamps is None:
    ax.plot(data.index, data["Close"], label="Close Price", linewidth=1.2)
    ax.set_xlabel("Observation")
else:
    ax.plot(timestamps, data["Close"], label="Close Price", linewidth=1.2)

    # Show the time of day only when the bars actually carry one; for daily
    # bars an HH:MM label would read 00:00 on every tick.
    is_intraday = bool((timestamps != timestamps.dt.normalize()).any())
    tick_format = "%Y-%m-%d %H:%M" if is_intraday else "%Y-%m-%d"

    # Format ticks in the data's own timezone so dates are not shifted
    tick_tz = timestamps.dt.tz
    ax.xaxis.set_major_locator(mdates.AutoDateLocator(tz=tick_tz))
    ax.xaxis.set_major_formatter(mdates.DateFormatter(tick_format, tz=tick_tz))
    ax.set_xlabel("Date and Time" if is_intraday else "Date")

plt.xticks(rotation=45, ha="right")
ax.set_ylabel("Close Price (INR)")
ax.set_title(f"{symbol} - Close Price Over Time")
ax.legend()
fig.tight_layout()
plt.show()

##### A scatter plot shows the association between two variables (how strongly they depend on each other)

In [None]:
# Open price against Close price, with points coloured by the Open value
fig, ax = plt.subplots(figsize=(8, 4))
sb.scatterplot(data=data, x="Open", y="Close", hue="Open", palette="coolwarm", ax=ax)
ax.set_title("Scatter Plot: Open vs Close Prices")
plt.show()

##### A heatmap of the pairwise correlations between all numerical columns, showing how strongly they move together

In [None]:
# Restrict the frame to its float64/int64 columns before correlating
numeric_cols = data.select_dtypes(include=['float64', 'int64'])
correlations = numeric_cols.corr()

# Annotated heatmap of the pairwise correlations
fig, ax = plt.subplots(figsize=(6, 4))
sb.heatmap(correlations, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title("Heatmap of Correlations")
plt.show()

###### Rule of thumb:-
- `-0.5 < skew < 0.5` → fairly symmetrical
- Otherwise → consider transformation

In [None]:
# Report the skewness of each price column, one line per column
for price_col in ("Close", "Open", "High", "Low"):
    print(f"Skewness ({price_col}):", skew(data[price_col]))

#### Skewness Analysis 
- Market data and other live time series often have strongly skewed distributions; this is common and expected because of high volatility and other market factors.

In [None]:
# Volume:
# Extremely skewed — a few trading days have unusually high volumes.
# This can cause LSTM to overemphasize rare spikes.
# Solution : Log transformation to reduce skewness.

## Volume is not used by the model at present; if it is added later, apply a log transformation first to reduce its skewness.

In [None]:
## Adding a trend column to the data
# Trend is 1 when the next row's Close is higher than this row's Close and
# 0 when it is not. The last row has no following row, so its trend is left
# as NaN (unknown) instead of being silently labelled 0 ("down").
next_close = data['Close'].shift(-1)
data['Trend'] = (next_close > data['Close']).astype(float).where(next_close.notna())

In [None]:
# Round to two decimals for readability, then show the most recent rows
data = data.round(2)

print("Overview of dataset after adding some features \n\n")
print(data.tail())

### LSTM MODEL

##### 1️⃣ Preparing Your Data for LSTM
###### LSTMs expect 3D input: [samples, timesteps, features]
where
```
samples = number of training examples
timesteps = how many past days to look back
features = number of columns (Open, High, Low, Close)
```

Step 1: Apply transformations & scaling

In [None]:
# Min-max scale the four price columns; the column order here fixes the
# column order of `scaled_data` (Open=0, High=1, Low=2, Close=3).
# NOTE(review): the scaler is fitted on the whole series before the
# train/test split, so test-period extremes influence the scaling — confirm
# this is acceptable.
price_columns = ["Open", "High", "Low", "Close"]
scaler = MinMaxScaler()
scaled_data = scaler.fit(data[price_columns]).transform(data[price_columns])

Step 2: Create sequences(timesteps)

In [None]:
def create_sequences(dataset, time_steps=90, target_col=3):
    """Build sliding-window samples for a sequence model.

    Each sample is a window of ``time_steps`` consecutive rows; its target is
    the value of ``target_col`` in the row immediately after the window.

    Parameters
    ----------
    dataset : array-like, shape (n_rows, n_features)
        Feature matrix, one row per time step.
    time_steps : int, default 90
        Number of past rows in each input window.
    target_col : int, default 3
        Column index of the value to predict. 3 is 'Close' when the columns
        are ordered [Open, High, Low, Close].

    Returns
    -------
    X : np.ndarray, shape (n_rows - time_steps, time_steps, n_features)
    y : np.ndarray, shape (n_rows - time_steps,)
        Both have zero samples (but keep their remaining dimensions) when the
        dataset is not longer than ``time_steps``.

    Raises
    ------
    ValueError
        If ``dataset`` is not 2-D or ``time_steps`` is less than 1.
    """
    dataset = np.asarray(dataset)
    if dataset.ndim != 2:
        raise ValueError("dataset must be 2-D: (n_rows, n_features)")
    if time_steps < 1:
        raise ValueError("time_steps must be at least 1")

    n_samples = len(dataset) - time_steps
    if n_samples <= 0:
        # Too few rows for even one window: return correctly shaped empties
        empty_X = np.empty((0, time_steps, dataset.shape[1]), dtype=dataset.dtype)
        empty_y = np.empty((0,), dtype=dataset.dtype)
        return empty_X, empty_y

    # Window i covers rows [i, i + time_steps); its target is row i + time_steps
    X = np.stack([dataset[i:i + time_steps] for i in range(n_samples)])
    y = dataset[time_steps:, target_col].copy()
    return X, y

# Number of past rows in each input window
time_steps = 60
X, y = create_sequences(scaled_data, time_steps)

# Fail fast with a clear message when the history is too short to form even
# one window; otherwise the problem only surfaces later as an obscure shape
# error when the model is built.
if len(X) == 0:
    raise ValueError(
        f"Need more than {time_steps} rows to build a sequence, "
        f"but only {len(scaled_data)} are available."
    )


Step 3: Train-test split (no shuffle)

In [None]:
# Chronological 80/20 split: the first 80% of windows train the model and
# the remainder is held out (no shuffling, so order in time is preserved).
train_size = int(len(X) * 0.8)
X_train, X_test = np.split(X, [train_size])
y_train, y_test = np.split(y, [train_size])

print(f"Training data shape: {X_train.shape}, {y_train.shape}")
print(f"Testing data shape: {X_test.shape}, {y_test.shape}")

Step 4: LSTM model (Keras - RNN variant) 

In [None]:
# (timesteps, features) of a single input window
window_shape = X_train.shape[1:3]

# Two stacked LSTM layers, each followed by dropout, and a single-unit
# output layer for the predicted price.
model = Sequential(
    [
        LSTM(50, return_sequences=True, input_shape=window_shape),
        Dropout(0.2),
        LSTM(50),
        Dropout(0.2),
        Dense(1),  # Predict close price
    ]
)
model.compile(optimizer="adam", loss="mean_squared_error")

# The held-out windows double as the validation set during training
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=50,
    batch_size=32,
    verbose=1,
)

Model evaluation metrics

In [None]:
def _unscale_close(scaled_close):
    """Map scaled values back to prices using the scaler's column 3."""
    # The scaler was fitted on every feature column, so pad the values into
    # a zero matrix of that width, invert it, and read column 3 back out.
    padded = np.zeros((len(scaled_close), scaled_data.shape[1]))
    padded[:, 3] = scaled_close
    return scaler.inverse_transform(padded)[:, 3]


# Predictions on the held-out windows (still in scaled units)
y_pred = model.predict(X_test)

# Bring both the actual and the predicted values back to price units
y_test_inv = _unscale_close(y_test)
y_pred_inv = _unscale_close(y_pred[:, 0])


In [None]:
# Error metrics computed on the un-scaled (price) values
rmse = math.sqrt(mean_squared_error(y_test_inv, y_pred_inv))
mae = mean_absolute_error(y_test_inv, y_pred_inv)
r2 = r2_score(y_test_inv, y_pred_inv)

for metric_label, metric_value in (("RMSE:", rmse), ("MAE:", mae), ("R2 score:", r2)):
    print(metric_label, metric_value)

In [None]:
# Overlay the actual and the predicted prices for the held-out windows
fig, ax = plt.subplots(figsize=(12, 6))
for series, series_label in ((y_test_inv, "Actual Price"), (y_pred_inv, "Predicted Price")):
    ax.plot(series, label=series_label)

ax.set_title(f"Actual vs Predicted Stock Price {symbol}")
ax.set_xlabel("Time")
ax.set_ylabel("Price (INR)")
ax.legend()
plt.show()

In [None]:
# --- 1. User Input for Forecast Period ---
# Anything that is not a positive whole number falls back to 30 minutes.
forecast_minutes = None
try:
    forecast_minutes = int(input("Enter the number of minutes to forecast (e.g., 15, 30, 60): "))
except ValueError:
    pass

if forecast_minutes is None or forecast_minutes <= 0:
    print("Invalid input. Please enter a positive number. Defaulting to 30 minutes.")
    forecast_minutes = 30

In [None]:
# Length of one bar, in minutes, assumed by this forecast.
# NOTE(review): the fetch cell requests interval="1d"; with daily bars each
# predicted step is really one trading day, not 5 minutes — confirm the
# intended interval.
BAR_MINUTES = 5

# Whole bars that fit in the requested horizon, but never fewer than one
# (a horizon shorter than one bar would otherwise produce an empty forecast).
future_steps = max(1, forecast_minutes // BAR_MINUTES)
print(
    f"Forecasting for {forecast_minutes} minutes, which is {future_steps} {BAR_MINUTES}-minute steps."
)

# Column order the scaler was fitted on. The Close position is derived from
# this list rather than from `data`, whose layout includes non-feature columns.
FEATURE_COLUMNS = ["Open", "High", "Low", "Close"]
close_idx = FEATURE_COLUMNS.index("Close")

# --- 1. Determine the number of features dynamically ---
n_features = scaled_data.shape[1]
print(f"Data has {n_features} features.")
if n_features != len(FEATURE_COLUMNS):
    raise ValueError(
        f"scaled_data has {n_features} columns but {len(FEATURE_COLUMNS)} feature columns are expected."
    )

# --- 2. Iterative (one step at a time) forecast ---
lookback = time_steps  # same window length the model was trained with
if len(scaled_data) < lookback:
    raise ValueError(
        f"Need at least {lookback} rows to seed the forecast, but only {len(scaled_data)} are available."
    )

# Seed the rolling window with the most recent `lookback` scaled rows
last_sequence = scaled_data[-lookback:].copy()
forecast_predictions = []

for _ in range(future_steps):
    # The model expects a batch dimension: (1, lookback, n_features)
    X_input = last_sequence.reshape(1, lookback, n_features)

    # Scaled prediction for the next bar; verbose=0 avoids one progress bar per step
    pred_scaled = float(model.predict(X_input, verbose=0)[0, 0])
    forecast_predictions.append(pred_scaled)

    # Only Close is predicted, so the next row reuses the latest values of
    # the other features and overwrites Close with the prediction.
    next_input_row = last_sequence[-1].copy()
    next_input_row[close_idx] = pred_scaled

    # Slide the window: drop the oldest row, append the new one
    last_sequence = np.vstack([last_sequence[1:], next_input_row])

# --- 3. Inverse Transform and Display Forecast ---
# The scaler inverts full feature rows, so place the predictions in the Close
# column of a zero matrix with the scaler's width and read that column back.
forecast_transformed = np.zeros((len(forecast_predictions), n_features))
forecast_transformed[:, close_idx] = forecast_predictions
forecast_prices = scaler.inverse_transform(forecast_transformed)[:, close_idx]

# Anchor the forecast timestamps on the last known bar. If the frame carries
# no timestamps at all, fall back to the current time so the table can still
# be labelled.
if "Datetime" in data.columns:
    last_timestamp = pd.to_datetime(data["Datetime"].iloc[-1])
elif isinstance(data.index, pd.DatetimeIndex):
    last_timestamp = data.index[-1]
else:
    last_timestamp = pd.Timestamp.now().floor("min")

future_timestamps = pd.date_range(
    start=last_timestamp + pd.Timedelta(minutes=BAR_MINUTES),
    periods=future_steps,
    freq=f"{BAR_MINUTES}min",
)

# Create and display a forecast DataFrame
forecast_df = pd.DataFrame(
    {"Timestamp": future_timestamps, "Predicted Close": forecast_prices}
)
print(f"\nPredicted Closing Prices for the Next {forecast_minutes} Minutes:")
print(forecast_df)


# --- 4. Enhanced Visualization ---
# plt.figure(figsize=(15, 7))

# Plot recent historical data using the actual datetime index
# plt.plot(
#     data["Datetime"].iloc[-200:],
#     data["Close"].iloc[-200:],
#     label="Historical Close Price",
#     color="blue",
# )

# Plot the forecasted data with future timestamps
# plt.plot(
#     forecast_df["Timestamp"],
#     forecast_df["Predicted Close"],
#     label="Forecasted Close Price",
#     color="red",
#     marker="o",
#     linestyle="--",
# )

# plt.title(f"Actual vs. Forecasted Stock Price for {symbol}")
# plt.xlabel("Date and Time")
# plt.ylabel("Price (INR)")
# plt.legend()
# plt.grid(True)

# Format the x-axis to show dates and times nicely
# plt.gca().xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m-%d %H:%M"))
# plt.gca().xaxis.set_major_locator(mdates.AutoDateLocator())
# plt.gcf().autofmt_xdate()  # Rotates the dates for better readability

# plt.show()