In [1]:
# Data Handling
import numpy as np
import pandas as pd

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Feature Scaling
from sklearn.preprocessing import MinMaxScaler

# Model Building
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")


In [3]:
import yfinance as yf
import pandas as pd

# List of stocks to predict
tickers = ["AAPL", "MSFT", "TSLA", "GOOGL", "AMZN"]

# Download historical data
start_date = "2010-01-01"
end_date = "2024-01-01"

stocks_data = {}
for ticker in tickers:
    stocks_data[ticker] = yf.download(ticker, start=start_date, end=end_date)

# Convert to DataFrame
df_stocks = {ticker: data[['Close']] for ticker, data in stocks_data.items()}
df_combined = pd.concat(df_stocks, axis=1)
df_combined.columns = [f"{col[1]}_{col[0]}" for col in df_combined.columns]  # Flatten MultiIndex

# Display combined stock data
print(df_combined.head())


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


            Close_AAPL  Close_MSFT  Close_TSLA  Close_GOOGL  Close_AMZN
Date                                                                   
2010-01-04    7.643214   30.950001         NaN    15.684434      6.6950
2010-01-05    7.656429   30.959999         NaN    15.615365      6.7345
2010-01-06    7.534643   30.770000         NaN    15.221722      6.6125
2010-01-07    7.520714   30.450001         NaN    14.867367      6.5000
2010-01-08    7.570714   30.660000         NaN    15.065566      6.6760


In [4]:
import requests
from transformers import pipeline

# Load NLP sentiment analysis model
sentiment_pipeline = pipeline("sentiment-analysis")

# Function to fetch latest stock news
def get_stock_news(query):
    url = f"https://newsapi.org/v2/everything?q={query}&apiKey=b59154c7234d4dbc81b86455abc86c51"
    response = requests.get(url).json()
    return response["articles"][:5]  # Get top 5 articles

# Analyze sentiment of news headlines
def analyze_sentiment(news_articles):
    headlines = [article["title"] for article in news_articles]
    sentiment_scores = sentiment_pipeline(headlines)
    return sum([score["score"] if score["label"] == "POSITIVE" else -score["score"] for score in sentiment_scores]) / len(sentiment_scores)

# Example: Fetch news for Apple
news_articles = get_stock_news("Apple Stock")
sentiment_score = analyze_sentiment(news_articles)

print(f"Apple Sentiment Score: {sentiment_score}")


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


Apple Sentiment Score: -0.19786211252212524


In [5]:
def add_technical_indicators(data):
    data["SMA_20"] = data["Close"].rolling(window=20).mean()
    data["EMA_20"] = data["Close"].ewm(span=20, adjust=False).mean()
    data["RSI"] = 100 - (100 / (1 + (data["Close"].diff().clip(lower=0).rolling(14).mean() / 
                                    (-data["Close"].diff().clip(upper=0).rolling(14).mean()))))
    return data

# Apply feature engineering to each stock
for ticker in tickers:
    stocks_data[ticker] = add_technical_indicators(stocks_data[ticker])

df_combined = pd.concat({ticker: data for ticker, data in stocks_data.items()}, axis=1)
df_combined.head()


Unnamed: 0_level_0,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL,MSFT,...,GOOGL,AMZN,AMZN,AMZN,AMZN,AMZN,AMZN,AMZN,AMZN,AMZN
Price,Adj Close,Close,High,Low,Open,Volume,SMA_20,EMA_20,RSI,Adj Close,...,RSI,Adj Close,Close,High,Low,Open,Volume,SMA_20,EMA_20,RSI
Ticker,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,MSFT,...,Unnamed: 12_level_2,AMZN,AMZN,AMZN,AMZN,AMZN,AMZN,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Date,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2010-01-04,6.447412,7.643214,7.660714,7.585,7.6225,493729600,,7.643214,,23.30068,...,,6.695,6.695,6.8305,6.657,6.8125,151998000,,6.695,
2010-01-05,6.45856,7.656429,7.699643,7.616071,7.664286,601904800,,7.644473,,23.308207,...,,6.7345,6.7345,6.774,6.5905,6.6715,177038000,,6.698762,
2010-01-06,6.355829,7.534643,7.686786,7.526786,7.656429,552160000,,7.634013,,23.165167,...,,6.6125,6.6125,6.7365,6.5825,6.73,143576000,,6.690547,
2010-01-07,6.344078,7.520714,7.571429,7.466071,7.5625,477131200,,7.623222,,22.924246,...,,6.5,6.5,6.616,6.44,6.6005,220604000,,6.672399,
2010-01-08,6.386254,7.570714,7.571429,7.466429,7.510714,447610800,,7.618222,,23.082354,...,,6.676,6.676,6.684,6.4515,6.528,196610000,,6.672742,


In [6]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Scaling data
scaler = MinMaxScaler()
df_scaled = scaler.fit_transform(df_combined)

# Create sequences
sequence_length = 60  # Use past 60 days to predict next day
def create_sequences(data, sequence_length=60):
    X, y = [], []
    for i in range(len(data) - sequence_length):
        X.append(data[i:i+sequence_length])
        y.append(data[i+sequence_length])
    return np.array(X), np.array(y)

X, y = create_sequences(df_scaled, sequence_length)

# Split into training & testing
train_size = int(0.8 * len(X))
X_train, X_test, y_train, y_test = X[:train_size], X[train_size:], y[:train_size], y[train_size:]


In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

# Define model
model = Sequential([
    LSTM(100, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])),
    Dropout(0.3),
    LSTM(100, return_sequences=True),
    Dropout(0.3),
    LSTM(50, return_sequences=False),
    Dropout(0.3),
    Dense(1)
])

# Compile
model.compile(optimizer='adam', loss='mean_squared_error')

# Train
model.fit(X_train, y_train, epochs=50, batch_size=16, validation_data=(X_test, y_test))


Epoch 1/50
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 89ms/step - loss: nan - val_loss: nan
Epoch 2/50
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 84ms/step - loss: nan - val_loss: nan
Epoch 3/50
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 86ms/step - loss: nan - val_loss: nan
Epoch 4/50
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 84ms/step - loss: nan - val_loss: nan
Epoch 5/50
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 85ms/step - loss: nan - val_loss: nan
Epoch 6/50
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 84ms/step - loss: nan - val_loss: nan
Epoch 7/50
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 86ms/step - loss: nan - val_loss: nan
Epoch 8/50
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 84ms/step - loss: nan - val_loss: nan
Epoch 9/50
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15

<keras.src.callbacks.history.History at 0x7eaafb333ee0>

In [9]:
import numpy as np

# Check for NaNs and infinite values
print(f"NaNs in y_train: {np.isnan(y_train).sum()}")
print(f"NaNs in y_test: {np.isnan(y_test).sum()}")
print(f"Infinite values in y_train: {np.isinf(y_train).sum()}")
print(f"Infinite values in y_test: {np.isinf(y_test).sum()}")

# Check for very large or extreme values
print(f"y_train min: {np.min(y_train)}, max: {np.max(y_train)}")
print(f"y_test min: {np.min(y_test)}, max: {np.max(y_test)}")


NaNs in y_train: 591
NaNs in y_test: 0
Infinite values in y_train: 0
Infinite values in y_test: 0
y_train min: nan, max: nan
y_test min: 0.0, max: 1.0000000000000002


In [10]:
# Replace NaNs and Infs with mean value
y_train = np.nan_to_num(y_train, nan=np.nanmean(y_train), posinf=np.nanmean(y_train), neginf=np.nanmean(y_train))
y_test = np.nan_to_num(y_test, nan=np.nanmean(y_test), posinf=np.nanmean(y_test), neginf=np.nanmean(y_test))


In [11]:
y_train = y_train.reshape(-1)
y_test = y_test.reshape(-1)


In [18]:
print(f"X_train_xgb shape: {X_train_xgb.shape}")
print(f"y_train shape: {y_train.shape}")


X_train_xgb shape: (2769, 45)
y_train shape: (124605,)


In [19]:
min_length = min(len(X_train), len(y_train))
X_train_xgb = X_train[:min_length, -1, :]  # Extract only valid samples
y_train = y_train[:min_length]  # Trim target labels to match


In [20]:
min_length_test = min(len(X_test), len(y_test))
X_test_xgb = X_test[:min_length_test, -1, :]
y_test = y_test[:min_length_test]


In [21]:
print(f"Fixed X_train_xgb shape: {X_train_xgb.shape}")
print(f"Fixed y_train shape: {y_train.shape}")


Fixed X_train_xgb shape: (2769, 45)
Fixed y_train shape: (2769,)


In [22]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

# Initialize and train XGBoost
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train_xgb, y_train)  # Now both have the same number of samples

# Make predictions
y_xgb_pred = xgb_model.predict(X_test_xgb)

# Evaluate performance
mse_xgb = mean_squared_error(y_test, y_xgb_pred)
print(f"XGBoost MSE: {mse_xgb}")


XGBoost MSE: 0.2522192114929415


In [23]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Ensure data is 3D for LSTM
X_train_lstm = np.expand_dims(X_train_xgb, axis=2)
X_test_lstm = np.expand_dims(X_test_xgb, axis=2)

# Build LSTM model
model_lstm = Sequential([
    LSTM(100, return_sequences=True, input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2])),
    Dropout(0.3),
    LSTM(100, return_sequences=True),
    Dropout(0.3),
    LSTM(50, return_sequences=False),
    Dropout(0.3),
    Dense(1)
])

# Compile model
model_lstm.compile(optimizer='adam', loss='mean_squared_error')

# Early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train LSTM
model_lstm.fit(X_train_lstm, y_train, epochs=50, batch_size=16, validation_data=(X_test_lstm, y_test), callbacks=[early_stopping])


Epoch 1/50
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 69ms/step - loss: nan - val_loss: nan
Epoch 2/50
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 65ms/step - loss: nan - val_loss: nan
Epoch 3/50
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 65ms/step - loss: nan - val_loss: nan
Epoch 4/50
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 66ms/step - loss: nan - val_loss: nan
Epoch 5/50
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 67ms/step - loss: nan - val_loss: nan


<keras.src.callbacks.history.History at 0x7eaad86de020>

In [24]:
# Predict with LSTM
y_lstm_pred = model_lstm.predict(X_test_lstm)


[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 49ms/step


In [25]:
# Define weights for hybrid model
weight_xgb = 0.7
weight_lstm = 0.3

# Hybrid model prediction (weighted average)
y_hybrid_pred = (weight_xgb * y_xgb_pred) + (weight_lstm * y_lstm_pred.flatten())


In [26]:
from sklearn.metrics import mean_squared_error, r2_score

# Compute MSE and R² Score
mse_hybrid = mean_squared_error(y_test, y_hybrid_pred)
r2_hybrid = r2_score(y_test, y_hybrid_pred)

print(f"Hybrid Model MSE: {mse_hybrid}")
print(f"Hybrid Model R² Score: {r2_hybrid}")


ValueError: Input contains NaN.

In [27]:
import numpy as np

# Check NaN values in predictions and actual values
print(f"NaNs in y_hybrid_pred: {np.isnan(y_hybrid_pred).sum()}")
print(f"NaNs in y_test: {np.isnan(y_test).sum()}")


NaNs in y_hybrid_pred: 693
NaNs in y_test: 0


In [28]:
# Replace NaNs in y_hybrid_pred
y_hybrid_pred = np.nan_to_num(y_hybrid_pred, nan=np.nanmean(y_hybrid_pred))


In [29]:
y_test = np.nan_to_num(y_test, nan=np.nanmean(y_test))


In [30]:
print(f"NaNs in y_hybrid_pred (After Fix): {np.isnan(y_hybrid_pred).sum()}")
print(f"NaNs in y_test (After Fix): {np.isnan(y_test).sum()}")


NaNs in y_hybrid_pred (After Fix): 693
NaNs in y_test (After Fix): 0


In [31]:
print(f"NaNs in LSTM Predictions: {np.isnan(y_lstm_pred).sum()}")
print(f"NaNs in XGBoost Predictions: {np.isnan(y_xgb_pred).sum()}")


NaNs in LSTM Predictions: 693
NaNs in XGBoost Predictions: 0


In [35]:
y_lstm_pred = np.nan_to_num(y_lstm_pred, nan=np.nanmean(y_lstm_pred))


In [33]:
y_xgb_pred = np.nan_to_num(y_xgb_pred, nan=np.nanmean(y_xgb_pred))


In [36]:
print(f"NaNs in LSTM Predictions (After Fix): {np.isnan(y_lstm_pred).sum()}")
print(f"NaNs in XGBoost Predictions (After Fix): {np.isnan(y_xgb_pred).sum()}")


NaNs in LSTM Predictions (After Fix): 693
NaNs in XGBoost Predictions (After Fix): 0


In [38]:
import matplotlib.pyplot as plt

# Plot training loss
plt.plot(model_lstm.history.history['loss'], label='Training Loss')
plt.plot(model_lstm.history.history['val_loss'], label='Validation Loss')
plt.legend()
plt.title("LSTM Training Loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.show()


KeyError: 'loss'