In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.metrics import classification_report

# ## 1. Load the dataset
def load_data(train_path, valid_path):
    """Read the training and validation splits from their CSV files.

    Args:
        train_path: Location of the training CSV, handed to ``pd.read_csv``.
        valid_path: Location of the validation CSV, handed to ``pd.read_csv``.

    Returns:
        A ``(train_df, valid_df)`` tuple of DataFrames, in that order.
    """
    # The generator reads the files in argument order: training first.
    train_frame, valid_frame = (
        pd.read_csv(csv_path) for csv_path in (train_path, valid_path)
    )
    return train_frame, valid_frame

# Relative paths: both CSV splits are resolved against the working directory.
train_path, valid_path = "sent_train.csv", "sent_valid.csv"
train_df, valid_df = load_data(train_path=train_path, valid_path=valid_path)

# ## 2. Visualize the dataset
# Print each heading followed by the first rows of the matching split
# (DataFrame.head() with its default row count).
print("Training Data Sample:", train_df.head(), sep="\n")
print("\nValidation Data Sample:", valid_df.head(), sep="\n")

# ## 3. Preprocess the text data
MAX_NUM_WORDS = 5000  # Vocabulary cap handed to the Tokenizer
MAX_SEQUENCE_LENGTH = 200  # Every sequence is padded/truncated to this length
EMBEDDING_DIM = 64  # Size of each word-embedding vector

# The vocabulary is fitted on the training texts only; the validation texts
# are encoded with that same fitted tokenizer below.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(train_df["text"])


def _texts_to_padded(texts):
    """Encode texts with ``tokenizer`` and pad/truncate them at the end."""
    return pad_sequences(
        tokenizer.texts_to_sequences(texts),
        maxlen=MAX_SEQUENCE_LENGTH,
        padding="post",
        truncating="post",
    )


# Integer-encoded inputs of uniform length MAX_SEQUENCE_LENGTH
X_train = _texts_to_padded(train_df["text"])
X_valid = _texts_to_padded(valid_df["text"])

# One-hot encode the integer labels (3 classes)
y_train, y_valid = (
    to_categorical(frame["label"], num_classes=3)
    for frame in (train_df, valid_df)
)

# ## 4. Define the LSTM Model
def build_lstm_model():
    """Build and compile the stacked-LSTM sentiment classifier.

    Architecture: Embedding -> LSTM(128, full sequence) -> LSTM(64, last
    state) -> Dropout(0.5) -> Dense(32, relu) -> Dense(3, softmax).

    Reads the module-level constants ``MAX_NUM_WORDS``, ``EMBEDDING_DIM`` and
    ``MAX_SEQUENCE_LENGTH``.

    Returns:
        A compiled ``Sequential`` model using categorical cross-entropy loss,
        the Adam optimizer and an accuracy metric; it expects one-hot labels
        with 3 classes.
    """
    model = Sequential([
        # Declare the input shape explicitly instead of the deprecated
        # ``input_length`` Embedding argument, so the model is built up front
        # and ``model.summary()`` can report shapes and parameter counts.
        tf.keras.Input(shape=(MAX_SEQUENCE_LENGTH,)),
        # mask_zero=True: inputs are post-padded with 0, so without a mask the
        # LSTMs would keep updating their state over the trailing padding
        # after the real tokens. The mask makes them skip those timesteps.
        Embedding(MAX_NUM_WORDS, EMBEDDING_DIM, mask_zero=True),
        LSTM(128, return_sequences=True),  # passes the full sequence onward
        LSTM(64),  # keeps only the final state
        Dropout(0.5),
        Dense(32, activation='relu'),
        Dense(3, activation='softmax'),  # 3 output classes
    ])
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model

model = build_lstm_model()
model.summary()

# ## 5. Train the model
EPOCHS = 3  # Number of passes over the training data
BATCH_SIZE = 32  # Number of samples processed before the model is updated

# validation_data is passed so fit() also reports val_loss/val_accuracy.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_valid, y_valid),
)

# ## 6. Evaluate the Model
# Position i in CLASS_NAMES is the display name used for integer label i.
CLASS_NAMES = ["Bearish", "Bullish", "Neutral"]

# Collapse softmax outputs and one-hot targets back to integer class ids.
y_pred = model.predict(X_valid)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true = np.argmax(y_valid, axis=1)

print("\nClassification Report:")
print(
    classification_report(
        y_true,
        y_pred_classes,
        # Pin the label set so target_names always lines up with it, even if
        # a class is absent from both the targets and the predictions
        # (otherwise the report raises on the length mismatch).
        labels=list(range(len(CLASS_NAMES))),
        target_names=CLASS_NAMES,
        # A class that is never predicted has undefined precision; report it
        # as 0 explicitly instead of emitting one warning per metric.
        zero_division=0,
    )
)



Training Data Sample:
                                                text  label
0  $BYND - JPMorgan reels in expectations on Beyo...      0
1  $CCL $RCL - Nomura points to bookings weakness...      0
2  $CX - Cemex cut at Credit Suisse, J.P. Morgan ...      0
3  $ESS: BTIG Research cuts to Neutral https://t....      0
4  $FNKO - Funko slides after Piper Jaffray PT cu...      0

Validation Data Sample:
                                                text  label
0  $ALLY - Ally Financial pulls outlook https://t...      0
1  $DELL $HPE - Dell, HPE targets trimmed on comp...      0
2  $PRTY - Moody's turns negative on Party City h...      0
3                   $SAN: Deutsche Bank cuts to Hold      0
4                  $SITC: Compass Point cuts to Sell      0




Epoch 1/6
299/299 ━━━━━━━━━━━━━━━━━━━━ 49s 157ms/step - accuracy: 0.6329 - loss: 0.9265 - val_accuracy: 0.6558 - val_loss: 0.8811
Epoch 2/6
299/299 ━━━━━━━━━━━━━━━━━━━━ 46s 155ms/step - accuracy: 0.6376 - loss: 0.9100 - val_accuracy: 0.6558 - val_loss: 0.8786
Epoch 3/6
299/299 ━━━━━━━━━━━━━━━━━━━━ 47s 158ms/step - accuracy: 0.6489 - loss: 0.8936 - val_accuracy: 0.6558 - val_loss: 0.8819
Epoch 4/6
299/299 ━━━━━━━━━━━━━━━━━━━━ 47s 157ms/step - accuracy: 0.6493 - loss: 0.8932 - val_accuracy: 0.6558 - val_loss: 0.8786
Epoch 5/6
299/299 ━━━━━━━━━━━━━━━━━━━━ 46s 155ms/step - accuracy: 0.6404 - loss: 0.9018 - val_accuracy: 0.6558 - val_loss: 0.8782
Epoch 6/6
299/299 ━━━━━━━━━━━━━━━━━━━━ 47s 156ms/step - accuracy: 0.6448 - loss: 0.8957 - val_accuracy: 0.6558 - val_loss: 0.8795
75/75

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
