In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Bidirectional, Dropout, Dense, Layer
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
from keras.regularizers import l2
import tensorflow as tf



In [3]:
# Make sure the NLTK stop-word corpus is available.
# nltk.download() contacts the NLTK index even when the corpus is already
# installed, so only call it when the local lookup fails; this keeps
# "Restart Kernel -> Run All" working without network access.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Read the raw dataset from the CSV file in the working directory into `df`.
# Later cells read its 'Text' and 'Sentiment' columns.
df = pd.read_csv(filepath_or_buffer="stock_data.csv")

In [5]:
# Relabel the -1 class as 0; every other value in the column is left as-is.
# The model below ends in a sigmoid trained with binary cross-entropy.
df["Sentiment"] = df["Sentiment"].replace(to_replace=-1, value=0)

In [6]:
# Text cleaning function
# Patterns are compiled once at definition time instead of on every call.
_MENTION_RE = re.compile(r'@[A-Za-z0-9_]+')      # @user mentions
_CASHTAG_RE = re.compile(r'\$[A-Za-z]+')         # $TICKER cashtags
_URL_RE = re.compile(r'http\S+|www\.\S+')        # URLs; the dot after "www" is literal
_NON_LETTER_RE = re.compile(r'[^a-zA-Z ]')       # anything that is not a letter or a space
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def clean_text(text, stop_words=None):
    """Normalise one raw text sample for tokenisation.

    Steps, in order: lowercase, strip @mentions, strip $cashtags, strip URLs,
    drop every character that is not an ASCII letter or a space, then remove
    stop words and collapse whitespace to single spaces.

    Args:
        text: The raw text. Non-string values (for example the NaN that
            pandas produces for an empty CSV cell) are treated as empty text.
        stop_words: Optional container of lowercase words to drop. Defaults
            to NLTK's English stop-word list, which is loaded once and cached.

    Returns:
        The cleaned text as a single space-separated string (possibly empty).
    """
    if not isinstance(text, str):
        # Missing values would otherwise fail on .lower().
        return ''
    if stop_words is None:
        # Previously the stop-word list was re-read for every single word.
        stop_words = _english_stopwords()

    text = text.lower()
    text = _MENTION_RE.sub('', text)
    text = _CASHTAG_RE.sub('', text)
    text = _URL_RE.sub('', text)
    text = _NON_LETTER_RE.sub('', text)
    return ' '.join(word for word in text.split() if word not in stop_words)

In [7]:
# Run clean_text over every entry of the 'Text' column and keep the result
# in a new 'Clean_Text' column next to the original text.
df['Clean_Text'] = df['Text'].map(clean_text)

In [8]:
# Build the word index from the cleaned text.
# num_words caps the indices emitted later; words outside the cap are mapped
# to the "<OOV>" token.
# NOTE(review): the vocabulary is fitted on the full frame, before the
# train/test split performed further down - confirm this is intended.
tokenizer = Tokenizer(
    oov_token="<OOV>",
    num_words=15000,
)
tokenizer.fit_on_texts(df['Clean_Text'])

In [9]:
# Turn each cleaned text into a list of word indices, then force every list
# to exactly 32 entries: longer ones are cut at the end, shorter ones are
# padded at the end.
sequences = tokenizer.texts_to_sequences(df['Clean_Text'])
padded = pad_sequences(
    sequences,
    maxlen=32,
    truncating='post',
    padding='post',
)

In [10]:
# Features are the padded index sequences; labels are the sentiment column.
X, y = padded, df['Sentiment'].values

In [11]:
# Hold out 20% of the samples for testing. The split is stratified on the
# labels and uses a fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [12]:
# Custom attention layer
class Attention(Layer):
    """Attention pooling that collapses axis 1 of its input.

    A score is computed for every position along axis 1, the scores are
    normalised with a softmax over that axis, and the input is summed along
    axis 1 using those weights. In this notebook the layer is applied to the
    sequence output of the bidirectional LSTM.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the projection vector and the per-position bias."""
        n_steps = input_shape[1]
        n_features = input_shape[-1]
        # Projects each feature vector to a single score.
        self.W = self.add_weight(
            name='att_weight',
            shape=(n_features, 1),
            initializer='random_normal',
            trainable=True,
        )
        # One bias per position on axis 1, so the layer needs a fixed length there.
        self.b = self.add_weight(
            name='att_bias',
            shape=(n_steps, 1),
            initializer='zeros',
            trainable=True,
        )
        super().build(input_shape)

    def call(self, inputs):
        """Return the attention-weighted sum of `inputs` over axis 1."""
        K = tf.keras.backend
        scores = K.tanh(K.dot(inputs, self.W) + self.b)
        weights = K.softmax(scores, axis=1)
        weighted = inputs * weights
        return K.sum(weighted, axis=1)

In [13]:
# Model with attention
# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and
# batch shuffling are repeatable across runs (same seed as the data split).
tf.keras.utils.set_random_seed(42)

embedding_dim = 100
# Take both sizes from the objects that define them instead of repeating the
# literals, so the model cannot drift out of sync with the tokenizer cap or
# the padding length.
vocab_size = tokenizer.num_words
max_len = X_train.shape[1]

# Embedding -> Bi-LSTM (full sequence) -> dropout -> attention pooling ->
# regularised dense layer -> dropout -> single sigmoid unit.
input_layer = Input(shape=(max_len,))
# The sequence length is already fixed by the Input layer, so the Embedding
# does not need a separate input_length argument.
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(input_layer)
bi_lstm = Bidirectional(LSTM(64, return_sequences=True))(embedding_layer)
dropout = Dropout(0.6)(bi_lstm)
attention = Attention()(dropout)
dense = Dense(64, activation='relu', kernel_regularizer=l2(0.01))(attention)
dropout2 = Dropout(0.4)(dense)
output = Dense(1, activation='sigmoid')(dropout2)

model = Model(inputs=input_layer, outputs=output)

In [14]:
# Configure training: binary cross-entropy loss, Adam at a 1e-3 learning
# rate, and accuracy reported as the metric.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [15]:
# Fit for at most 10 epochs with 10% of the training data used for
# validation. Training stops once val_loss has not improved for 2 epochs,
# and the weights from the best epoch are restored.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
)
model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_split=0.1,
    callbacks=[early_stop],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


<keras.src.callbacks.History at 0x1d4cd042ee0>

In [16]:
# Score the trained model on the held-out test split and report accuracy
# to four decimal places.
loss, accuracy = model.evaluate(
    X_test,
    y_test,
)
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.7688


In [19]:
# The ".keras" extension selects the native Keras single-file archive format;
# it is neither the legacy HDF5 (.h5) format nor a TensorFlow SavedModel directory.
# NOTE(review): the model contains the custom Attention layer defined in this
# notebook, so loading the file elsewhere will presumably need
# custom_objects={"Attention": Attention} - confirm when writing the loader.
model.save("sentiment_model.keras")  # No .h5 — saved in the native Keras (.keras) format