# Downloading dataset

In [None]:
from alpha_vantage.timeseries import TimeSeries
import pandas as pd

import os

# Read the Alpha Vantage API key from the environment so a real key never has
# to be written into the notebook; the placeholder is kept only as a fallback.
API_KEY = os.environ.get("ALPHAVANTAGE_API_KEY", "your api key")

# Initialize Alpha Vantage TimeSeries API (responses are returned as pandas objects)
ts = TimeSeries(key=API_KEY, output_format="pandas")

# Fetch the full daily price history for BHEL on the BSE
stock_data, meta_data = ts.get_daily(symbol="BHEL.BSE", outputsize="full")

# Rename columns for readability
stock_data = stock_data.rename(columns={
    "1. open": "Open",
    "2. high": "High",
    "3. low": "Low",
    "4. close": "Close",
    "5. volume": "Volume"
})

# Convert index to datetime and put the rows in chronological order. Later
# cells window and split the series by position, so the order must not depend
# on whatever order the API happened to return.
stock_data.index = pd.to_datetime(stock_data.index)
stock_data = stock_data.sort_index()

# Display the first few rows
print(stock_data.head())

# Save to CSV for later use
stock_data.to_csv("bhel_stock_data.csv")


# Fetching news for different Indian companies and performing sentiment analysis

In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Download the VADER lexicon used by SentimentIntensityAnalyzer, then create the
# single analyzer instance (`sia`) that get_top_news_sentiment scores headlines with.
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

def get_top_news_sentiment(company_name):
    """Fetch the first Google News RSS item for a company and score its headline.

    The headline is scored with the module-level VADER analyzer ``sia`` and the
    result is written to ``<company_name>_news_sentiment.csv`` (lower-cased,
    spaces replaced by underscores).

    Args:
        company_name: Free-text company name used as the search query.

    Returns:
        A one-row DataFrame with columns ``headline``, ``Date`` (YYYY-MM-DD),
        ``sentiment_score`` (VADER compound score) and ``sentiment_label``.
        An empty DataFrame is returned, and no file is written, when the
        request fails or the feed contains no items.
    """
    from urllib.parse import quote_plus

    # quote_plus turns spaces into "+" like the previous str.replace did, and
    # also escapes characters such as "&" that would otherwise break the query.
    query_url = quote_plus(company_name)
    url = f"https://news.google.com/rss/search?q={query_url}+when:1d&hl=en-IN&gl=IN&ceid=IN:en"

    # Without a timeout a stalled connection would hang the notebook forever.
    response = requests.get(url, timeout=10)
    if not response.ok:
        # Report HTTP failures as such instead of as "no news found".
        print(f"❌ News request for {company_name} failed (HTTP {response.status_code}).")
        return pd.DataFrame()

    soup = BeautifulSoup(response.content, features="xml")
    item = soup.find("item")

    if item is None:
        print(f"❌ No news found for {company_name}.")
        return pd.DataFrame()

    # Extract details from the first <item> of the feed
    title = item.title.text
    pub_date = item.pubDate.text
    pub_date_fmt = datetime.strptime(pub_date, "%a, %d %b %Y %H:%M:%S %Z").strftime("%Y-%m-%d")

    # Sentiment analysis: label the compound score using +/-0.05 cut-offs
    score = sia.polarity_scores(title)['compound']
    sentiment = (
        "Positive" if score >= 0.05 else
        "Negative" if score <= -0.05 else
        "Neutral"
    )

    # Save to DataFrame
    df = pd.DataFrame([{
        "headline": title,
        "Date": pub_date_fmt,
        "sentiment_score": score,
        "sentiment_label": sentiment
    }])

    filename = f"{company_name.lower().replace(' ', '_')}_news_sentiment.csv"
    df.to_csv(filename, index=False)
    print(f"✅ Top news for {company_name} saved to: {filename}")
    return df

# Example run: fetch and score the top headline for BHEL. Any other company
# name can be passed instead; the output CSV name is derived from that name.
company = "bhel"
top_news_df = get_top_news_sentiment(company)
print(top_news_df)


# Merging the stock dataset and news sentiment dataset of each company

In [None]:
import pandas as pd

# Load stock data (one row per trading day)
df_stock = pd.read_csv("bhel_stock_data.csv", parse_dates=["date"])

# Load news sentiment data (one row per fetched headline)
df_news = pd.read_csv("bhel_news_sentiment.csv", parse_dates=["Date"])

# Shift each news date back by one day so the headline is attached to the
# previous day's stock row.
df_news["Date"] = df_news["Date"] - pd.Timedelta(days=1)

# Reduce both keys to plain dates so they compare equal when merging
df_stock["date"] = df_stock["date"].dt.date
df_news["Date"] = df_news["Date"].dt.date

# Outer merge keeps news whose shifted date has no stock row (e.g. a weekend),
# so its sentiment can still be carried forward below.
df_merged = pd.merge(df_stock, df_news, left_on="date", right_on="Date", how="outer")

# Forward-fill missing news sentiment from earlier rows.
# (.ffill() replaces the deprecated fillna(method="ffill") spelling.)
df_merged["headline"] = df_merged["headline"].ffill()
df_merged["sentiment_score"] = df_merged["sentiment_score"].ffill()

# Fill any remaining NaN values with 0
df_merged["sentiment_score"] = df_merged["sentiment_score"].fillna(0)

# Rows that came only from the news side carry no price data; drop them now
# that their sentiment has been propagated, so later cells never see NaN prices.
df_merged = df_merged.dropna(subset=["Open", "High", "Low", "Close"], how="all")

# Drop duplicate Date column
df_merged = df_merged.drop(columns=["Date"])

# Save updated dataset
df_merged.to_csv("bhel_stock_sentiment_data.csv", index=False)

print("\n✅ Merging Completed Successfully!")
print(df_merged.tail())  # Show last few rows


# Anomaly Detection

In [None]:
# Anomaly Detection:-
import pandas as pd
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler

# Load the merged stock + sentiment dataset and index it by date
df = pd.read_csv("bhel_stock_sentiment_data.csv")
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date')

price_columns = ["Open", "High", "Low", "Close"]
line_colors = {"Open": "blue", "High": "green", "Low": "orange", "Close": "blue"}

# Rows without price data cannot be scored or plotted, so remove them up front
df = df.dropna(subset=price_columns)

# Fit one Isolation Forest per OHLC column; fit_predict marks outliers with -1
iso_forests = {}
for column in price_columns:
    iso_forests[column] = IsolationForest(contamination=0.01, random_state=42)
    df[f"Anomaly_{column}"] = iso_forests[column].fit_predict(df[[column]])

# One figure per price column: the series with its flagged points in red
for column in price_columns:
    flagged = df[df[f"Anomaly_{column}"] == -1]
    plt.figure(figsize=(12, 6))
    plt.plot(df.index, df[column], label=f'{column} Price', color=line_colors[column], linewidth=2)
    plt.scatter(flagged.index, flagged[column], color='red', label=f'Anomalies - {column}', zorder=5)
    plt.title(f"{column} Price with Anomalies")
    plt.xlabel("Date")
    plt.ylabel(f"{column} Price")
    plt.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

# Keep only rows that every forest considered normal, then drop the flag columns
anomaly_columns = [f"Anomaly_{column}" for column in price_columns]
df_clean = df[(df[anomaly_columns] == 1).all(axis=1)].drop(columns=anomaly_columns)

# Select only numeric columns (exclude strings like news/headlines)
numeric_cols = df_clean.select_dtypes(include=['float64', 'int64']).columns

# Split BEFORE scaling: the scaler learns min/max from the first 80% of rows
# only, so nothing about the held-out rows leaks into the training features.
train_size = int(len(df_clean) * 0.8)
train_raw = df_clean.iloc[:train_size]
test_raw = df_clean.iloc[train_size:]

scaler = MinMaxScaler()
train = pd.DataFrame(
    scaler.fit_transform(train_raw[numeric_cols]),
    columns=numeric_cols,
    index=train_raw.index
)
test = pd.DataFrame(
    scaler.transform(test_raw[numeric_cols]),
    columns=numeric_cols,
    index=test_raw.index
)

# Full scaled frame, kept under its previous name
df_scaled = pd.concat([train, test])

# Save the preprocessed data
train.to_csv("bhel_train_data.csv")
test.to_csv("bhel_test_data.csv")


# Model Implementation

In [None]:
# LSTM Model:-
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import os
import random
import numpy as np
import tensorflow as tf

# Seed every random number generator in use with the same value
seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)
for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    seed_fn(seed)

# Set TensorFlow's determinism-related environment variables
for flag in ('TF_DETERMINISTIC_OPS', 'TF_CUDNN_DETERMINISTIC'):
    os.environ[flag] = '1'

# Load the preprocessed splits (after anomaly removal), keeping OHLC + sentiment
feature_columns = ['Open', 'High', 'Low', 'Close', 'sentiment_score']
train = pd.read_csv("bhel_train_data.csv", index_col="date", parse_dates=True)[feature_columns]
test = pd.read_csv("bhel_test_data.csv", index_col="date", parse_dates=True)[feature_columns]

# Fit the scaler on the training split only, then apply it to both splits
scaler = MinMaxScaler().fit(train)
train_scaled = scaler.transform(train)
test_scaled = scaler.transform(test)

# Prepare the data for LSTM
def create_dataset(data, look_back=60):
    """Build sliding-window samples from a 2-D array of rows over time.

    Each sample is the ``look_back`` rows preceding row ``end``; its target is
    the first four columns of row ``end`` (OHLC, not sentiment).

    Returns:
        Tuple ``(X, y)`` of numpy arrays built from the windows and targets.
    """
    ends = range(look_back, len(data))
    windows = [data[end - look_back:end] for end in ends]
    targets = [data[end, :4] for end in ends]
    return np.array(windows), np.array(targets)

# Prepare train and test windows; create_dataset already returns X with
# shape (samples, timesteps, features), so no reshape is needed.
X_train, y_train = create_dataset(train_scaled)
X_test, y_test = create_dataset(test_scaled)

# Build the LSTM model: two stacked 50-unit LSTM layers with dropout
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.2))
model.add(LSTM(units=50, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(units=4))  # Predict Open, High, Low, Close

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Early stopping to prevent overfitting
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model. Validation data is carved out of the training windows
# (validation_split uses the last 10% of the arrays) instead of the test set:
# early stopping picks its weights on the validation data, so validating on
# the test set would bias the metrics reported afterwards.
history = model.fit(X_train, y_train, epochs=100, batch_size=32,
                    validation_split=0.1,
                    callbacks=[early_stop])

# Predict on the untouched test windows
y_pred = model.predict(X_test)

# Inverse transform with dummy sentiment_score
def add_dummy_sentiment(data):
    """Return ``data`` with one extra all-zero column appended on the right.

    The extra column stands in for sentiment so the 4-column OHLC array has
    the 5-column width the scaler was fitted with.
    """
    zero_column = np.zeros((data.shape[0], 1))
    return np.hstack((data, zero_column))

# Pad predictions and targets to the scaler's 5-column width, undo the
# scaling, and keep only the four OHLC columns.
y_pred_padded = add_dummy_sentiment(y_pred)
y_test_padded = add_dummy_sentiment(y_test)

y_pred_inverse = scaler.inverse_transform(y_pred_padded)[:, :4]
y_test_inverse = scaler.inverse_transform(y_test_padded)[:, :4]

# Evaluation (errors are in the units of the train/test CSV values)
mse = mean_squared_error(y_test_inverse, y_pred_inverse)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test_inverse, y_pred_inverse)
r2 = r2_score(y_test_inverse, y_pred_inverse)

print(f"MSE: {mse}")
print(f"MAE: {mae}")
# R² is a goodness-of-fit score (it can be negative), not an accuracy
print(f"R² Score: {r2 * 100}%")
print(f"RMSE: {rmse}")

# Plot training and validation loss
plt.figure(figsize=(10, 6))
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title("Training and Validation Loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()
plt.show()

# Plot training vs testing data for OHLC prices, one panel per column
plt.figure(figsize=(12, 6))
for panel, column in enumerate(['Open', 'High', 'Low', 'Close'], start=1):
    plt.subplot(2, 2, panel)
    plt.plot(train.index, train[column], label=f"Train {column}", color='blue')
    plt.plot(test.index, test[column], label=f"Test {column}", color='orange')
    plt.title(f"Training vs Testing {column} Price")
    plt.xlabel("Date")
    plt.ylabel(f"{column} Price")
    plt.legend()

plt.tight_layout()
plt.show()


# CNN Model:-
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Dense, Dropout, Flatten
from tensorflow.keras.callbacks import EarlyStopping
import os
import random
import tensorflow as tf

# Seed Python, NumPy and TensorFlow with one shared value
seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed); np.random.seed(seed); tf.random.set_seed(seed)

# Set TensorFlow's determinism-related environment variables
os.environ.update({'TF_DETERMINISTIC_OPS': '1', 'TF_CUDNN_DETERMINISTIC': '1'})

# Load the preprocessed splits (after anomaly removal)
train = pd.read_csv("bhel_train_data.csv", index_col="date", parse_dates=True)
test = pd.read_csv("bhel_test_data.csv", index_col="date", parse_dates=True)

# Keep the four price columns plus the sentiment score as model features
model_features = ['Open', 'High', 'Low', 'Close', 'sentiment_score']
train, test = train[model_features], test[model_features]

# Scale with statistics learned from the training split only
scaler = MinMaxScaler()
scaler.fit(train)
train_scaled = scaler.transform(train)
test_scaled = scaler.transform(test)

# Prepare the data for CNN
def create_dataset(data, look_back=60):
    """Turn a 2-D time-ordered array into (window, next-row OHLC) pairs.

    For every row index ``end`` from ``look_back`` onwards, the sample is rows
    ``end - look_back`` .. ``end - 1`` and the target is the first four
    columns of row ``end``.

    Returns:
        ``(X, y)`` as numpy arrays.
    """
    pairs = [
        (data[end - look_back:end], data[end, :4])
        for end in range(look_back, len(data))
    ]
    X = [window for window, _ in pairs]
    y = [target for _, target in pairs]
    return np.array(X), np.array(y)

# Create train and test windows. Conv1D expects 3-D input
# (samples, timesteps, features), which create_dataset already produces.
X_train, y_train = create_dataset(train_scaled)
X_test, y_test = create_dataset(test_scaled)

# Build the CNN model: two Conv1D blocks with dropout, then a dense head
model = Sequential()
model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.2))
model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(50, activation='relu'))
model.add(Dense(4))  # Predict OHLC

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Early stopping
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model. Validation data comes from the tail of the training windows
# rather than from the test set: early stopping selects weights on it, so
# using the test set there would bias the metrics reported afterwards.
history = model.fit(X_train, y_train, epochs=100, batch_size=32,
                    validation_split=0.1,
                    callbacks=[early_stop])

# Predict on the untouched test windows
y_pred = model.predict(X_test)

# Inverse transform with dummy sentiment
def add_dummy_sentiment(data):
    """Append a zero-filled placeholder column to a 2-D array.

    Used so a 4-column OHLC array matches the 5-column layout expected by the
    fitted scaler's ``inverse_transform``.
    """
    row_count = data.shape[0]
    return np.column_stack((data, np.zeros(row_count)))

# Pad to the scaler's 5-column width, undo the scaling, keep the OHLC columns
y_pred_padded = add_dummy_sentiment(y_pred)
y_test_padded = add_dummy_sentiment(y_test)

y_pred_inverse = scaler.inverse_transform(y_pred_padded)[:, :4]
y_test_inverse = scaler.inverse_transform(y_test_padded)[:, :4]

# Evaluation (errors are in the units of the train/test CSV values)
mse = mean_squared_error(y_test_inverse, y_pred_inverse)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test_inverse, y_pred_inverse)
r2 = r2_score(y_test_inverse, y_pred_inverse)

print(f"MSE: {mse}")
print(f"MAE: {mae}")
# R² is a goodness-of-fit score (it can be negative), not an accuracy
print(f"R² Score: {r2 * 100}%")
print(f"RMSE: {rmse}")

# Plot training and validation loss
plt.figure(figsize=(10, 6))
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title("Training and Validation Loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()
plt.show()

# Plot training vs testing data for OHLC prices, one panel per column
plt.figure(figsize=(12, 6))
for panel, column in enumerate(['Open', 'High', 'Low', 'Close'], start=1):
    plt.subplot(2, 2, panel)
    plt.plot(train.index, train[column], label=f"Train {column}", color='blue')
    plt.plot(test.index, test[column], label=f"Test {column}", color='orange')
    plt.title(f"Training vs Testing {column} Price")
    plt.xlabel("Date")
    plt.ylabel(f"{column} Price")
    plt.legend()

plt.tight_layout()
plt.show()


# RNN Model:-
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import os
import random
import tensorflow as tf

# Reproducibility: one seed for Python, NumPy and TensorFlow
seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)
for apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
    apply_seed(seed)
os.environ['TF_DETERMINISTIC_OPS'], os.environ['TF_CUDNN_DETERMINISTIC'] = '1', '1'

# Load both preprocessed splits and keep the model features (OHLC + sentiment)
selected = ['Open', 'High', 'Low', 'Close', 'sentiment_score']
train = pd.read_csv("bhel_train_data.csv", index_col="date", parse_dates=True).loc[:, selected]
test = pd.read_csv("bhel_test_data.csv", index_col="date", parse_dates=True).loc[:, selected]

# Normalize using statistics from the training split only
scaler = MinMaxScaler().fit(train)
train_scaled, test_scaled = scaler.transform(train), scaler.transform(test)

# Create sequences
def create_dataset(data, look_back=60):
    """Slice ``data`` into overlapping windows and their next-step OHLC targets.

    ``X[k]`` holds rows ``k`` .. ``k + look_back - 1`` and ``y[k]`` holds the
    first four columns of row ``k + look_back``.

    Returns:
        ``(X, y)`` as numpy arrays.
    """
    windows, targets = [], []
    for start in range(len(data) - look_back):
        stop = start + look_back
        windows.append(data[start:stop])
        targets.append(data[stop, :4])  # OHLC only, sentiment is not a target
    return np.array(windows), np.array(targets)

# Window both splits; X has shape (samples, timesteps, features)
X_train, y_train = create_dataset(train_scaled)
X_test, y_test = create_dataset(test_scaled)

# Build RNN model: two stacked 50-unit SimpleRNN layers with dropout
model = Sequential()
model.add(SimpleRNN(units=50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.2))
model.add(SimpleRNN(units=50, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(units=4))  # Output OHLC

# Compile
model.compile(optimizer='adam', loss='mean_squared_error')

# Train. Validation data comes from the tail of the training windows rather
# than from the test set: early stopping selects weights on it, so using the
# test set there would bias the metrics reported afterwards.
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
history = model.fit(X_train, y_train, epochs=100, batch_size=32,
                    validation_split=0.1,
                    callbacks=[early_stop])

# Predict on the untouched test windows
y_pred = model.predict(X_test)

# Inverse scale with dummy sentiment
def add_dummy_sentiment(data):
    """Widen a 2-D array by one trailing column of zeros.

    The zeros act as a stand-in sentiment value so OHLC-only arrays can be
    passed through the 5-column scaler.
    """
    filler = np.zeros((data.shape[0], 1))
    return np.append(data, filler, axis=1)

# Pad to the scaler's 5-column width, undo the scaling, keep the OHLC columns
y_pred_padded = add_dummy_sentiment(y_pred)
y_test_padded = add_dummy_sentiment(y_test)

y_pred_inverse = scaler.inverse_transform(y_pred_padded)[:, :4]
y_test_inverse = scaler.inverse_transform(y_test_padded)[:, :4]

# Metrics (errors are in the units of the train/test CSV values)
mse = mean_squared_error(y_test_inverse, y_pred_inverse)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test_inverse, y_pred_inverse)
r2 = r2_score(y_test_inverse, y_pred_inverse)

print(f"MSE: {mse}")
print(f"MAE: {mae}")
print(f"R² Score: {r2 * 100}%")
print(f"RMSE: {rmse}")

# Plot loss
plt.figure(figsize=(10, 6))
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title("Training and Validation Loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()
plt.show()

# Plot OHLC training vs testing, one panel per column instead of four
# copy-pasted stanzas
plt.figure(figsize=(12, 6))
for panel, column in enumerate(['Open', 'High', 'Low', 'Close'], start=1):
    plt.subplot(2, 2, panel)
    plt.plot(train.index, train[column], label=f"Train {column}", color='blue')
    plt.plot(test.index, test[column], label=f"Test {column}", color='orange')
    plt.title(f"Training vs Testing {column} Price")
    plt.xlabel("Date")
    plt.ylabel(f"{column} Price")
    plt.legend()

plt.tight_layout()
plt.show()


# GRU Model:-
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import os
import random
import tensorflow as tf

# Use a single seed for every random number generator involved
seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

# Set TensorFlow's determinism-related environment variables
for env_name in ('TF_DETERMINISTIC_OPS', 'TF_CUDNN_DETERMINISTIC'):
    os.environ[env_name] = '1'

# Read both preprocessed splits (after anomaly removal) and keep the model
# features: the four prices plus the sentiment score
split_frames = {
    split_name: pd.read_csv(f"bhel_{split_name}_data.csv", index_col="date", parse_dates=True)[
        ['Open', 'High', 'Low', 'Close', 'sentiment_score']
    ]
    for split_name in ("train", "test")
}
train, test = split_frames["train"], split_frames["test"]

# Normalize: fit on the training split, apply to both
scaler = MinMaxScaler()
train_scaled = scaler.fit_transform(train)
test_scaled = scaler.transform(test)

# Prepare the data for GRU
def create_dataset(data, look_back=60):
    """Create supervised samples from a 2-D, time-ordered array.

    Every sample is a block of ``look_back`` consecutive rows; its label is
    the first four columns (OHLC) of the row right after the block.

    Returns:
        ``(X, y)`` as numpy arrays.
    """
    target_rows = list(range(look_back, len(data)))
    X = np.array([data[row - look_back:row] for row in target_rows])
    y = np.array([data[row, :4] for row in target_rows])  # OHLC only
    return X, y

# Window both splits; create_dataset already returns X with shape
# (samples, timesteps, features), so no reshape is needed.
X_train, y_train = create_dataset(train_scaled)
X_test, y_test = create_dataset(test_scaled)

# Build the GRU model: two stacked 50-unit GRU layers with dropout
model = Sequential()
model.add(GRU(units=50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.2))
model.add(GRU(units=50, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(units=4))  # Predict Open, High, Low, Close

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Early stopping
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model. Validation data comes from the tail of the training windows
# rather than from the test set: early stopping selects weights on it, so
# using the test set there would bias the metrics reported afterwards.
history = model.fit(X_train, y_train, epochs=100, batch_size=32,
                    validation_split=0.1,
                    callbacks=[early_stop])

# Predict on the untouched test windows
y_pred = model.predict(X_test)

# Inverse transform with dummy sentiment
def add_dummy_sentiment(data):
    """Return a copy of ``data`` with an additional last column of zeros.

    This pads OHLC-only arrays to the five columns the scaler was fitted on.
    """
    zeros = np.zeros((data.shape[0], 1))
    return np.c_[data, zeros]

# Pad to the scaler's 5-column width, undo the scaling, keep the OHLC columns
y_pred_padded = add_dummy_sentiment(y_pred)
y_test_padded = add_dummy_sentiment(y_test)

y_pred_inverse = scaler.inverse_transform(y_pred_padded)[:, :4]
y_test_inverse = scaler.inverse_transform(y_test_padded)[:, :4]

# Evaluation (errors are in the units of the train/test CSV values)
mse = mean_squared_error(y_test_inverse, y_pred_inverse)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test_inverse, y_pred_inverse)
r2 = r2_score(y_test_inverse, y_pred_inverse)

print(f"MSE: {mse}")
print(f"MAE: {mae}")
# R² is a goodness-of-fit score (it can be negative), not an accuracy
print(f"R² Score: {r2 * 100}%")
print(f"RMSE: {rmse}")

# Plot training and validation loss
plt.figure(figsize=(10, 6))
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title("Training and Validation Loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()
plt.show()

# Plot training vs testing data for OHLC prices, one panel per column
plt.figure(figsize=(12, 6))
for panel, column in enumerate(['Open', 'High', 'Low', 'Close'], start=1):
    plt.subplot(2, 2, panel)
    plt.plot(train.index, train[column], label=f"Train {column}", color='blue')
    plt.plot(test.index, test[column], label=f"Test {column}", color='orange')
    plt.title(f"Training vs Testing {column} Price")
    plt.xlabel("Date")
    plt.ylabel(f"{column} Price")
    plt.legend()

plt.tight_layout()
plt.show()


# TCN Model:-
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tcn import TCN
import os
import random
import tensorflow as tf
# (A stray bare `0.004` literal that had no effect was removed from here.)

# Set random seeds for reproducibility
seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

# Set TensorFlow's determinism-related environment variables
os.environ['TF_DETERMINISTIC_OPS'] = '1'
os.environ['TF_CUDNN_DETERMINISTIC'] = '1'

# Load the preprocessed dataset (after anomaly removal)
train = pd.read_csv("bhel_train_data.csv", index_col="date", parse_dates=True)
test = pd.read_csv("bhel_test_data.csv", index_col="date", parse_dates=True)

# Include sentiment score in the features
train = train[['Open', 'High', 'Low', 'Close', 'sentiment_score']]
test = test[['Open', 'High', 'Low', 'Close', 'sentiment_score']]

# Normalize the data: fit on the training split only, apply to both splits
scaler = MinMaxScaler()
train_scaled = scaler.fit_transform(train)
test_scaled = scaler.transform(test)

# Prepare data
def create_dataset(data, look_back=60):
    """Pair each ``look_back``-row window of ``data`` with the next row's OHLC.

    The target for a window ending just before row ``t`` is ``data[t, :4]``.

    Returns:
        ``(X, y)`` as numpy arrays.
    """
    history_blocks = []
    next_ohlc = []
    t = look_back
    while t < len(data):
        history_blocks.append(data[t - look_back:t])
        next_ohlc.append(data[t, :4])  # Predict OHLC only
        t += 1
    return np.array(history_blocks), np.array(next_ohlc)

# Window both splits; X has shape (samples, timesteps, features)
X_train, y_train = create_dataset(train_scaled)
X_test, y_test = create_dataset(test_scaled)

# Build the TCN model: one TCN layer, dropout, then a 4-unit dense head
model = Sequential()
model.add(TCN(input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(units=4))  # Predict Open, High, Low, Close

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Early stopping
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model. Validation data comes from the tail of the training windows
# rather than from the test set: early stopping selects weights on it, so
# using the test set there would bias the metrics reported afterwards.
history = model.fit(X_train, y_train, epochs=100, batch_size=32,
                    validation_split=0.1,
                    callbacks=[early_stop])

# Predict on the untouched test windows
y_pred = model.predict(X_test)

# Inverse transform
def add_dummy_sentiment(data):
    """Pad a 2-D array with a trailing zero column (placeholder sentiment).

    The result has one more column than ``data``, matching the width the
    scaler expects for ``inverse_transform``.
    """
    n_rows = data.shape[0]
    placeholder = np.zeros((n_rows, 1))
    return np.hstack([data, placeholder])

# Pad to the scaler's 5-column width, undo the scaling, keep the OHLC columns
y_pred_padded = add_dummy_sentiment(y_pred)
y_test_padded = add_dummy_sentiment(y_test)

y_pred_inverse = scaler.inverse_transform(y_pred_padded)[:, :4]
y_test_inverse = scaler.inverse_transform(y_test_padded)[:, :4]

# Evaluation (errors are in the units of the train/test CSV values)
mse = mean_squared_error(y_test_inverse, y_pred_inverse)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test_inverse, y_pred_inverse)
r2 = r2_score(y_test_inverse, y_pred_inverse)

print(f"MSE: {mse}")
print(f"MAE: {mae}")
# R² is a goodness-of-fit score (it can be negative), not an accuracy
print(f"R² Score: {r2 * 100}%")
print(f"RMSE: {rmse}")

# Plot training and validation loss
plt.figure(figsize=(10, 6))
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title("Training and Validation Loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()
plt.show()

# Plot training vs testing data for OHLC prices, one panel per column
plt.figure(figsize=(12, 6))
for panel, column in enumerate(['Open', 'High', 'Low', 'Close'], start=1):
    plt.subplot(2, 2, panel)
    plt.plot(train.index, train[column], label=f"Train {column}", color='blue')
    plt.plot(test.index, test[column], label=f"Test {column}", color='orange')
    plt.title(f"Training vs Testing {column} Price")
    plt.xlabel("Date")
    plt.ylabel(f"{column} Price")
    plt.legend()

plt.tight_layout()
plt.show()


# XGBoost Model:-
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
import os
import random

# Seed Python and NumPy with one shared value for reproducibility
seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)
for seed_fn in (random.seed, np.random.seed):
    seed_fn(seed)

# Load the preprocessed splits (after anomaly removal), keeping the four
# prices plus the sentiment score as features
xgb_features = ['Open', 'High', 'Low', 'Close', 'sentiment_score']
train = pd.read_csv("bhel_train_data.csv", index_col="date", parse_dates=True)[xgb_features]
test = pd.read_csv("bhel_test_data.csv", index_col="date", parse_dates=True)[xgb_features]

# Normalize with statistics learned from the training split only
scaler = MinMaxScaler().fit(train)
train_scaled = scaler.transform(train)
test_scaled = scaler.transform(test)

# Prepare the data for XGBoost
def create_dataset(data, look_back=60):
    """Build flat feature vectors and OHLC targets for a tabular model.

    Each sample is the ``look_back`` rows before row ``end`` flattened into a
    single 1-D vector; the target is the first four columns of row ``end``.

    Returns:
        ``(X, y)`` as numpy arrays.
    """
    ends = range(look_back, len(data))
    flat_windows = [data[end - look_back:end].flatten() for end in ends]
    targets = [data[end, :4] for end in ends]  # Only predict OHLC
    return np.array(flat_windows), np.array(targets)

# Build the flattened windows for both splits
X_train, y_train = create_dataset(train_scaled)
X_test, y_test = create_dataset(test_scaled)

# One regressor per target column (Open, High, Low, Close), each fitted on the
# same features against its own column of y_train
models = [
    XGBRegressor(n_estimators=100, learning_rate=0.05, max_depth=5, random_state=seed).fit(
        X_train, y_train[:, target_idx]
    )
    for target_idx in range(4)
]
predictions = [fitted.predict(X_test) for fitted in models]
model = models[-1]  # keep `model` bound to the last fitted regressor, as before

# Combine the four 1-D prediction vectors into one (samples, 4) array
y_pred = np.stack(predictions, axis=1)

# Inverse transform with dummy sentiment_score
def add_dummy_sentiment(data):
    """Return ``data`` with one extra zero-filled column appended on the right.

    The zeros act as a placeholder for the sentiment column so that an
    OHLC-only array has one more column before it is inverse-transformed.
    """
    n_rows = data.shape[0]
    zero_column = np.zeros((n_rows, 1))
    return np.concatenate([data, zero_column], axis=1)

# Pad predictions and targets with the placeholder column, undo the scaling,
# and keep only the four price columns.
y_pred_padded = add_dummy_sentiment(y_pred)
y_test_padded = add_dummy_sentiment(y_test)
y_pred_inverse, y_test_inverse = (
    scaler.inverse_transform(padded)[:, :4]
    for padded in (y_pred_padded, y_test_padded)
)

# Evaluation on the inverse-transformed values.
mae = mean_absolute_error(y_test_inverse, y_pred_inverse)
mse = mean_squared_error(y_test_inverse, y_pred_inverse)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_inverse, y_pred_inverse)

# NOTE(review): the third line labels r2_score (coefficient of determination)
# as "Accuracy" — consider relabelling.
print(
    f"MSE: {mse}",
    f"MAE: {mae}",
    f"Accuracy (R²): {r2 * 100}%",
    f"RMSE: {rmse}",
    sep="\n",
)


# Plot training vs testing data for OHLC prices: one panel per price column
# on a 2x2 grid, train in blue and test in orange.
plt.figure(figsize=(12, 6))

for position, column in enumerate(('Open', 'High', 'Low', 'Close'), start=1):
    plt.subplot(2, 2, position)
    plt.plot(train.index, train[column], label=f"Train {column}", color='blue')
    plt.plot(test.index, test[column], label=f"Test {column}", color='orange')
    plt.title(f"Training vs Testing {column} Price")
    plt.xlabel("Date")
    plt.ylabel(f"{column} Price")
    plt.legend()

plt.tight_layout()
plt.show()


# LightGBM Model
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import os
import random

# Pin the random number generators used in this cell so repeated runs match.
seed = 42
random.seed(seed)
np.random.seed(seed)
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only affects child processes — confirm intent.
os.environ['PYTHONHASHSEED'] = str(seed)

# Read the pre-split CSVs (saved after anomaly removal) and keep only the
# model inputs: the four price columns plus the sentiment score.
train, test = (
    pd.read_csv(csv_path, index_col="date", parse_dates=True)[
        ['Open', 'High', 'Low', 'Close', 'sentiment_score']
    ]
    for csv_path in ("bhel_train_data.csv", "bhel_test_data.csv")
)

# Fit the min-max scaling on the training split only, then apply the same
# fitted scaler to both splits.
scaler = MinMaxScaler().fit(train)
train_scaled = scaler.transform(train)
test_scaled = scaler.transform(test)

# Create sequences (look_back=60)
def create_dataset(data, look_back=60, n_targets=4):
    """Build flattened sliding windows and their next-step targets.

    Each sample is the ``look_back`` rows preceding row ``i`` flattened into
    one vector; its target is the first ``n_targets`` columns of row ``i``.

    Args:
        data: 2-D array of shape ``(n_rows, n_features)``, ordered in time.
        look_back: Number of consecutive rows per input window (must be >= 1).
        n_targets: Number of leading columns to predict. The default of 4
            keeps Open/High/Low/Close and leaves out the sentiment column.

    Returns:
        Tuple ``(X, y)``. ``X`` has shape ``(n_windows, look_back * n_features)``
        and ``y`` has shape ``(n_windows, n_targets)``, where
        ``n_windows = max(n_rows - look_back, 0)``. Both stay 2-D even when
        there are no windows, so column indexing such as ``y[:, 0]`` still works.

    Raises:
        ValueError: If ``look_back`` is smaller than 1.
    """
    if look_back < 1:
        raise ValueError(f"look_back must be a positive integer, got {look_back}")

    data = np.asarray(data)
    n_rows, n_features = data.shape[0], data.shape[1]

    # Too few rows for even one window: return correctly shaped empty arrays
    # instead of 1-D empties that break later column slicing.
    if n_rows <= look_back:
        empty_X = np.empty((0, look_back * n_features), dtype=data.dtype)
        empty_y = np.empty((0, min(n_targets, n_features)), dtype=data.dtype)
        return empty_X, empty_y

    # Flatten each window row-major so tree models receive a 1-D feature vector.
    X = np.stack([data[i - look_back:i].reshape(-1) for i in range(look_back, n_rows)])
    y = data[look_back:, :n_targets].copy()  # Only predict OHLC, not sentiment
    return X, y

# Window both scaled splits using create_dataset's default look-back.
X_train, y_train = create_dataset(train_scaled)
X_test, y_test = create_dataset(test_scaled)

# Fit one LightGBM regressor per price column; each model's test predictions
# fill the matching column of y_pred.
y_pred = np.zeros_like(y_test)

for i, label in enumerate(('Open', 'High', 'Low', 'Close')):
    model = lgb.LGBMRegressor(
        n_estimators=30,
        random_state=seed,
    )
    y_pred[:, i] = model.fit(X_train, y_train[:, i]).predict(X_test)

# Inverse transform with dummy sentiment
def add_dummy_sentiment(data):
    """Return ``data`` with one extra zero-filled column appended on the right.

    The zeros act as a placeholder for the sentiment column so that an
    OHLC-only array has one more column before it is inverse-transformed.
    """
    row_count = data.shape[0]
    placeholder = np.zeros((row_count, 1))
    return np.concatenate([data, placeholder], axis=1)

# Pad predictions and targets with the placeholder column, undo the scaling,
# and keep only the four price columns.
y_pred_padded = add_dummy_sentiment(y_pred)
y_test_padded = add_dummy_sentiment(y_test)
y_pred_inverse, y_test_inverse = (
    scaler.inverse_transform(padded)[:, :4]
    for padded in (y_pred_padded, y_test_padded)
)

# Evaluation on the inverse-transformed values.
mae = mean_absolute_error(y_test_inverse, y_pred_inverse)
mse = mean_squared_error(y_test_inverse, y_pred_inverse)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_inverse, y_pred_inverse)

# NOTE(review): the third line labels r2_score (coefficient of determination)
# as "Accuracy" — consider relabelling.
print(
    f"MSE: {mse}",
    f"MAE: {mae}",
    f"Accuracy (R²): {r2 * 100}%",
    f"RMSE: {rmse}",
    sep="\n",
)

# Loss curves are not plotted: no `eval_set` was passed to `fit`, so no per-iteration history was recorded.
# To enable them, pass `eval_set=[...]` to `model.fit(...)` and read `model.evals_result_` afterwards.

# Plot training vs testing data for OHLC prices: one panel per price column
# on a 2x2 grid, train in blue and test in orange.
plt.figure(figsize=(12, 6))

for position, column in enumerate(('Open', 'High', 'Low', 'Close'), start=1):
    plt.subplot(2, 2, position)
    plt.plot(train.index, train[column], label=f"Train {column}", color='blue')
    plt.plot(test.index, test[column], label=f"Test {column}", color='orange')
    plt.title(f"Training vs Testing {column} Price")
    plt.xlabel("Date")
    plt.ylabel(f"{column} Price")
    plt.legend()

plt.tight_layout()
plt.show()
