In [1]:
import pandas as pd
from textblob import TextBlob
import praw
import re
import spacy
from sklearn.preprocessing import MinMaxScaler
from gpt4all import GPT4All
import csv
import yfinance as yf
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
import optuna
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.discriminant_analysis import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from tqdm import tqdm
import torch
from transformers import BertTokenizer, BertForSequenceClassification

In [None]:
df = pd.read_csv('bert.csv')
df

In [None]:
# Feature selection
features = df[['Likes', 'Num_Comments', 'Content_Sentiment_Score', 'Comment_Sentiment_Score', 'Content_Length', 'Comment_Length']]
target = df[['Stock_Price']]  # Ensure DataFrame structure to keep 2D array

# Feature normalization
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(features)

# Target variable normalization
target_scaler = MinMaxScaler()
target_scaled = target_scaler.fit_transform(target)

# Create time series samples
def create_sequences(data, seq_length):
    xs, ys = [], []
    for i in range(len(data) - seq_length):
        x = data[i:i+seq_length]
        y = data[i+seq_length, -1]
        xs.append(x)
        ys.append(y)
    return np.array(xs), np.array(ys)

seq_length = 7
X, y = create_sequences(np.hstack((X_scaled, target_scaled)), seq_length)

# Split training and test sets
train_size = int(len(X) * 0.7)
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Build single layer LSTM model
def build_lstm_model(units, learning_rate, input_shape):
    model = Sequential([
        LSTM(units, return_sequences=False, input_shape=input_shape),
        Dropout(0.3),
        Dense(100),
        Dense(1)
    ])
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mean_squared_error')
    return model

# Train LSTM model with best parameters
best_params = {'units': 150, 'learning_rate': 0.001}
final_model = build_lstm_model(best_params['units'], best_params['learning_rate'], (seq_length, X_train.shape[2]))

# Add early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train model with batch size of 8
history = final_model.fit(X_train, y_train, batch_size=8, epochs=100, validation_split=0.3, callbacks=[early_stopping])

# Predict and evaluate
predictions = final_model.predict(X_test)
predictions = target_scaler.inverse_transform(predictions)
y_test_actual = target_scaler.inverse_transform(y_test.reshape(-1, 1))

# Calculate regression metrics
mse = mean_squared_error(y_test_actual, predictions)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test_actual, predictions)
r2 = r2_score(y_test_actual, predictions)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R² Score: {r2}")

# Visualization settings
sns.set_theme(style="whitegrid")
colors = sns.color_palette("husl", 3)

# Plot actual vs predicted stock prices
plt.figure(figsize=(14, 7))
plt.plot(y_test_actual, label='Actual', color=colors[0])
plt.plot(predictions, label='Predicted', color=colors[1])
plt.xlabel('Samples')
plt.ylabel('Stock Price')
plt.title('Actual vs Predicted Stock Prices')
plt.legend()
plt.show()

# Residual analysis plot
residuals = y_test_actual - predictions
plt.figure(figsize=(14, 7))
plt.plot(residuals, label='Residuals', color=colors[2])
plt.xlabel('Samples')
plt.ylabel('Residuals')
plt.title('Residuals (Actual - Predicted)')
plt.legend()
plt.show()

# Plot loss function changes
plt.figure(figsize=(14, 7))
plt.plot(history.history['loss'], label='Train Loss', color=colors[0])
plt.plot(history.history['val_loss'], label='Validation Loss', color=colors[1])
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.show()


In [3]:
################# RESULT ######################
# Epoch 1/100
# 472/472 [==============================] - 4s 5ms/step - loss: 0.0123 - val_loss: 0.0033
# Epoch 2/100
# 472/472 [==============================] - 2s 4ms/step - loss: 0.0064 - val_loss: 0.0047
# Epoch 3/100
# 472/472 [==============================] - 2s 5ms/step - loss: 0.0051 - val_loss: 0.0029
# Epoch 4/100
# 472/472 [==============================] - 2s 5ms/step - loss: 0.0047 - val_loss: 0.0019
# Epoch 5/100
# 472/472 [==============================] - 2s 5ms/step - loss: 0.0045 - val_loss: 0.0019
# Epoch 6/100
# 472/472 [==============================] - 2s 5ms/step - loss: 0.0044 - val_loss: 0.0024
# Epoch 7/100
# 472/472 [==============================] - 2s 5ms/step - loss: 0.0041 - val_loss: 0.0019
# Epoch 8/100
# 472/472 [==============================] - 2s 5ms/step - loss: 0.0042 - val_loss: 0.0046
# Epoch 9/100
# 472/472 [==============================] - 2s 5ms/step - loss: 0.0040 - val_loss: 0.0019
# Epoch 10/100
# 472/472 [==============================] - 2s 4ms/step - loss: 0.0041 - val_loss: 0.0019
# Epoch 11/100
# 472/472 [==============================] - 2s 5ms/step - loss: 0.0038 - val_loss: 0.0019
# Epoch 12/100
# 472/472 [==============================] - 2s 5ms/step - loss: 0.0040 - val_loss: 0.0019
# Epoch 13/100
# 472/472 [==============================] - 2s 5ms/step - loss: 0.0040 - val_loss: 0.0021
# Epoch 14/100
# 472/472 [==============================] - 2s 5ms/step - loss: 0.0039 - val_loss: 0.0020
# Epoch 15/100
# 472/472 [==============================] - 2s 5ms/step - loss: 0.0039 - val_loss: 0.0019
# Epoch 16/100
# 472/472 [==============================] - 2s 5ms/step - loss: 0.0038 - val_loss: 0.0020
# Epoch 17/100
# 472/472 [==============================] - 2s 4ms/step - loss: 0.0037 - val_loss: 0.0028
# Epoch 18/100
# 472/472 [==============================] - 2s 5ms/step - loss: 0.0038 - val_loss: 0.0018
# Epoch 19/100
# 472/472 [==============================] - 2s 4ms/step - loss: 0.0037 - val_loss: 0.0021
# Epoch 20/100
# 472/472 [==============================] - 2s 5ms/step - loss: 0.0037 - val_loss: 0.0021
# Epoch 21/100
# 472/472 [==============================] - 2s 5ms/step - loss: 0.0037 - val_loss: 0.0023
# Epoch 22/100
# 472/472 [==============================] - 2s 5ms/step - loss: 0.0037 - val_loss: 0.0024
# Epoch 23/100
# 472/472 [==============================] - 2s 5ms/step - loss: 0.0035 - val_loss: 0.0019
# Epoch 24/100
# 472/472 [==============================] - 2s 4ms/step - loss: 0.0037 - val_loss: 0.0019
# Epoch 25/100
# 472/472 [==============================] - 2s 5ms/step - loss: 0.0035 - val_loss: 0.0022
# Epoch 26/100
# 472/472 [==============================] - 2s 5ms/step - loss: 0.0037 - val_loss: 0.0020
# Epoch 27/100
# 472/472 [==============================] - 2s 4ms/step - loss: 0.0036 - val_loss: 0.0019
# Epoch 28/100
# 472/472 [==============================] - 2s 5ms/step - loss: 0.0035 - val_loss: 0.0027
# 73/73 [==============================] - 0s 2ms/step
# Mean Squared Error (MSE): 69.60032426210358
# Root Mean Squared Error (RMSE): 8.34268087979539
# Mean Absolute Error (MAE): 2.5024898821474317
# R² Score: 0.9207340218809266


In [None]:
############# Here is use the TLBO ################
# Load dataset
df = pd.read_csv('bert.csv')

# Feature selection
features = df[['Likes', 'Num_Comments', 'Content_Sentiment_Score', 'Comment_Sentiment_Score', 'Content_Length', 'Comment_Length']]
target = df[['Stock_Price']]  # Ensure DataFrame structure to keep 2D array

# Feature normalization
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(features)

# Target variable normalization
target_scaler = MinMaxScaler()
target_scaled = target_scaler.fit_transform(target)

seq_length = 7

# Create time series samples
def create_sequences(data, seq_length):
    xs, ys = [], []
    for i in range(len(data) - seq_length):
        x = data[i:i+seq_length]
        y = data[i+seq_length, -1]
        xs.append(x)
        ys.append(y)
    return np.array(xs), np.array(ys)

X, y = create_sequences(np.hstack((X_scaled, target_scaled)), seq_length)
print(X.shape, y.shape)

# Split training and test sets
train_size = int(len(X) * 0.8)
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Define LSTM model and evaluation function
def build_lstm_model(units, learning_rate, input_shape):
    model = Sequential([
        LSTM(units, return_sequences=False, input_shape=input_shape),
        Dropout(0.3),
        Dense(100),
        Dense(1)
    ])
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mean_squared_error')
    return model

def evaluate_model(params):
    units = int(params['units'])
    learning_rate = params['learning_rate']
    model = build_lstm_model(units, learning_rate, (seq_length, X_train.shape[2]))
    history = model.fit(X_train, y_train, batch_size=8, epochs=50, validation_split=0.2, verbose=0)
    predictions = model.predict(X_test)
    predictions = target_scaler.inverse_transform(predictions)
    y_test_actual = target_scaler.inverse_transform(y_test.reshape(-1, 1))
    mse = mean_squared_error(y_test_actual, predictions)
    return mse

# Initialize population
def initialize_population(nPop, param_bounds):
    pop = []
    for _ in range(nPop):
        individual = {
            'units': np.random.randint(param_bounds['units'][0], param_bounds['units'][1]),
            'learning_rate': np.random.uniform(param_bounds['learning_rate'][0], param_bounds['learning_rate'][1])
        }
        pop.append(individual)
    return pop

# Teacher phase and learner phase implementations
def teacher_phase(pop, teacher, param_bounds):
    new_population = []
    for individual in pop:
        new_individual = individual.copy()
        for param in individual:
            difference = np.random.rand() * (teacher[param] - individual[param])
            new_individual[param] += difference
            new_individual[param] = np.clip(new_individual[param], param_bounds[param][0], param_bounds[param][1])
        new_population.append(new_individual)
    return new_population

def learner_phase(pop, param_bounds):
    new_population = pop.copy()
    for i, learner1 in enumerate(pop):
        learner2 = pop[np.random.choice(len(pop))]
        new_individual = learner1.copy()
        for param in learner1:
            if evaluate_model(learner1) < evaluate_model(learner2):
                new_individual[param] += np.random.rand() * (learner1[param] - learner2[param])
            else:
                new_individual[param] += np.random.rand() * (learner2[param] - learner1[param])
            new_individual[param] = np.clip(new_individual[param], param_bounds[param][0], param_bounds[param][1])
        new_population[i] = new_individual
    return new_population

# TLBO optimization process
nPop = 10  # Population size
MaxIt = 30  # Maximum number of iterations
param_bounds = {'units': (100, 150), 'learning_rate': (1e-4, 1e-2)}
pop = initialize_population(nPop, param_bounds)

# Set early stopping MSE threshold
mse_threshold = 1e-5  # Stop iteration if MSE is less than or equal to this value
best_mse = np.inf
best_params = None

for it in range(MaxIt):
    fitness = [evaluate_model(individual) for individual in pop]
    teacher = pop[np.argmin(fitness)]
    pop = teacher_phase(pop, teacher, param_bounds)
    pop = learner_phase(pop, param_bounds)
    
    # Update best parameters
    current_best_mse = min(fitness)
    if current_best_mse < best_mse:
        best_mse = current_best_mse
        best_params = pop[np.argmin(fitness)]
    
    # Output current best parameters for each iteration
    print(f'Iteration {it + 1}, Best MSE: {best_mse}, Best Params: {best_params}')
    
    # Early stopping condition: MSE less than or equal to threshold
    if best_mse <= mse_threshold:
        print(f"Stopping early at iteration {it + 1} with MSE: {best_mse}")
        break

# Output final best parameters and best MSE
print(f"Best MSE: {best_mse}, Best Params: {best_params}")

# Use best parameters to retrain LSTM model
final_model = build_lstm_model(int(best_params['units']), best_params['learning_rate'], (seq_length, X_train.shape[2]))
final_model.fit(X_train, y_train, batch_size=8, epochs=100, validation_split=0.2)

# Model evaluation and visualization

# Predict and evaluate
predictions = final_model.predict(X_test)
predictions = target_scaler.inverse_transform(predictions)
y_test_actual = target_scaler.inverse_transform(y_test.reshape(-1, 1))

# Calculate regression metrics
mse = mean_squared_error(y_test_actual, predictions)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test_actual, predictions)
r2 = r2_score(y_test_actual, predictions)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R² Score: {r2}")

# Visualization settings
sns.set_theme(style="whitegrid")
colors = sns.color_palette("husl", 3)

# Plot actual vs predicted stock prices
plt.figure(figsize=(14, 7))
plt.plot(y_test_actual, label='Actual', color=colors[0])
plt.plot(predictions, label='Predicted', color=colors[1])
plt.xlabel('Samples')
plt.ylabel('Stock Price')
plt.title('Actual vs Predicted Stock Prices')
plt.legend()
plt.show()

# Residual analysis plot
residuals = y_test_actual - predictions
plt.figure(figsize=(14, 7))
plt.plot(residuals, label='Residuals', color=colors[2])
plt.xlabel('Samples')
plt.ylabel('Residuals')
plt.title('Residuals (Actual - Predicted)')
plt.legend()
plt.show()

# Plot loss function changes
plt.figure(figsize=(14, 7))
plt.plot(history.history['loss'], label='Train Loss', color=colors[0])
plt.plot(history.history['val_loss'], label='Validation Loss', color=colors[1])
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.show()
