# Introduction
This notebook uses the functions defined in the `train_lstm.py` script to train an LSTM model on the Sentiment140 dataset from tokenized, padded text sequences.

### Setup

In [None]:
# Imports used by the cells below: sys for the import path, NumPy for the
# argmax over predictions, and pyplot for the training and confusion-matrix plots.
import sys
import numpy as np
import matplotlib.pyplot as plt
# Make train_lstm.py importable. The path is relative, so it is resolved
# against the current working directory (run the notebook from its own folder).
sys.path.append('../../src/models/')  # Add the path to the script

In [None]:
from train_lstm import (
    load_data, tokenize_and_pad, build_lstm_attention_model,
    train_lstm_attention_model, evaluate_model, save_model_and_tokenizer
)

### Load the cleaned data

In [None]:
# Read the preprocessed dataset and discard every row whose 'clean_text'
# value is missing, so only rows with usable text reach the tokenizer.
df = (
    load_data('../../data/processed/cleaned_data.csv')
    .dropna(subset=['clean_text'])
)

### Tokenize and pad sequences

In [None]:
# Tokenize and pad the text in df (per the helper's name). The helper returns
# three values: X and y, which are passed to training and evaluation below,
# and the tokenizer, which is saved with the model at the end of the notebook.
X, y, tokenizer = tokenize_and_pad(df)

### Build LSTM Model

In [None]:
# Construct the model using the helper's default arguments (none are
# overridden here).
model = build_lstm_attention_model()

In [None]:
# Inspect the model's structure before training.
# NOTE(review): presumably a Keras model, so this prints the layer/parameter
# table — confirm against build_lstm_attention_model.
model.summary()

### Train the LSTM Model

In [None]:
# Train on the full X, y. The helper hands back the model together with a
# history object; its .history dict supplies the curves plotted further down.
model, history = train_lstm_attention_model(X, y, model)

### Evaluate the Model

In [None]:
# Score the model. evaluate_model returns an accuracy value and a
# classification report; both are printed below.
# NOTE(review): X, y are the same arrays that were passed to training, so
# unless the helpers hold data out internally this is not a held-out
# estimate — confirm in train_lstm.py.
accuracy, report = evaluate_model(model, X, y)
print(f"Model Accuracy on Full Dataset: {accuracy}")
print("\nClassification Report:\n", report)

### Visualization of training history (e.g., loss and accuracy over epochs)

In [None]:
# Per-epoch training and validation curves recorded in the history object.
# NOTE: `accuracy` rebinds the name that held the evaluation accuracy from
# the evaluation cell; from here on it is the per-epoch training curve.
curves = history.history
loss, val_loss = curves['loss'], curves['val_loss']
accuracy, val_accuracy = curves['accuracy'], curves['val_accuracy']

# Epochs are numbered from 1 on the x-axis
epochs = range(1, len(loss) + 1)

# One row, two panels: loss on the left, accuracy on the right.
# Each entry: (training curve, validation curve,
#              training legend label, validation legend label,
#              panel title, y-axis label)
panels = [
    (loss, val_loss,
     'Training loss', 'Validation loss',
     'Training and validation loss', 'Loss'),
    (accuracy, val_accuracy,
     'Training accuracy', 'Validation accuracy',
     'Training and validation accuracy', 'Accuracy'),
]

plt.figure(figsize=(12, 4))
for slot, panel in enumerate(panels, start=1):
    train_curve, val_curve, train_label, val_label, title, y_label = panel
    plt.subplot(1, 2, slot)
    plt.plot(epochs, train_curve, 'bo', label=train_label)  # dots: training
    plt.plot(epochs, val_curve, 'b', label=val_label)       # line: validation
    plt.title(title)
    plt.xlabel('Epochs')
    plt.ylabel(y_label)
    plt.legend()

plt.show()

In [None]:
# Confusion matrix of the model's predictions over the full dataset.
# Both helpers come from scikit-learn; they were never imported in this
# notebook, so the cell previously failed with NameError.
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Convert predictions to class labels.
# NOTE(review): argmax over axis 1 assumes `predictions` and `y` are 2-D with
# one column per class (softmax output / one-hot labels) — confirm against
# the model's output layer and tokenize_and_pad.
predictions = model.predict(X)
y_pred = np.argmax(predictions, axis=1)
y_true = np.argmax(y, axis=1)

# Generate confusion matrix (rows: true classes, columns: predicted classes)
cm = confusion_matrix(y_true, y_pred)

# Display confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap=plt.cm.Blues)
plt.show()

### Save the model and tokenizer

In [None]:
# Persist the model and the tokenizer under ../../models/ (paths are relative
# to the working directory).
# NOTE(review): judging by the extensions the files are presumably HDF5 (.h5)
# and pickle (.pkl) — confirm in train_lstm.py.
save_model_and_tokenizer(
    model, tokenizer,
    '../../models/lstm_model.h5',
    '../../models/tokenizer_lstm.pkl'
)