# In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import LabelBinarizer
import joblib
import os

# Define paths for loading data and saving models
# Base directories (relative to the notebook's working directory).
data_path = "../data/"
model_save_path = "../models/"

# Input CSV files live directly inside the data directory.
train_file = os.path.join(data_path, "train_data.csv")
val_file = os.path.join(data_path, "val_data.csv")

# Load training and validation data
def load_data(train_path, val_path):
    """
    Read the training and validation CSV files into DataFrames.

    Args:
        train_path: Path of the training CSV file.
        val_path: Path of the validation CSV file.

    Returns:
        A ``(train_df, val_df)`` tuple of pandas DataFrames, in that order.
    """
    # The training file is read first, then the validation file.
    frames = [pd.read_csv(csv_path) for csv_path in (train_path, val_path)]
    return frames[0], frames[1]

train_df, val_df = load_data(train_file, val_file)

# Prepare text and labels.
# An empty `clean_text` cell is read back from CSV as NaN (a float), and the
# Keras tokenizer fails when it tries to lowercase/split a non-string value.
# Missing text is therefore normalised to an empty string, and every entry is
# coerced to `str` so the tokenizer only ever sees text.
X_train = train_df['clean_text'].fillna("").astype(str)
y_train = train_df['category']
X_val = val_df['clean_text'].fillna("").astype(str)
y_val = val_df['category']

# Tokenize and pad sequences
def prepare_text_sequences(X_train, X_val, max_words=10000, max_len=100):
    """
    Turn raw text into fixed-length integer sequences.

    A tokenizer is fitted on the training texts only and then used to encode
    both the training and the validation texts.

    Args:
        X_train: Training texts used to fit the tokenizer and to be encoded.
        X_val: Validation texts to be encoded with the fitted tokenizer.
        max_words: ``num_words`` passed to the tokenizer.
        max_len: Length every sequence is padded or truncated to.

    Returns:
        ``(train_padded, val_padded, tokenizer)``.
    """
    tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
    tokenizer.fit_on_texts(X_train)

    def _encode(texts):
        # Map text to integer ids, then pad/truncate at the end of each sequence.
        return pad_sequences(
            tokenizer.texts_to_sequences(texts),
            maxlen=max_len,
            padding='post',
            truncating='post',
        )

    return _encode(X_train), _encode(X_val), tokenizer

# Encode both splits; the returned tokenizer is the one fitted on the training text.
X_train_padded, X_val_padded, tokenizer = prepare_text_sequences(
    X_train=X_train,
    X_val=X_val,
)

# One-hot encode labels
def one_hot_encode_labels(y_train, y_val):
    """
    One-hot encode class labels for a softmax classifier.

    The encoder is fitted on ``y_train`` only and then applied to both splits,
    so the column order is identical for training and validation labels.

    Args:
        y_train: Training labels used to fit the encoder.
        y_val: Validation labels, transformed with the fitted encoder.

    Returns:
        ``(y_train_encoded, y_val_encoded, encoder)`` where the encoded arrays
        have one column per class and ``encoder`` is the fitted
        ``LabelBinarizer``.
    """
    encoder = LabelBinarizer()
    y_train_encoded = encoder.fit_transform(y_train)
    y_val_encoded = encoder.transform(y_val)

    # LabelBinarizer collapses a two-class problem into a single 0/1 column.
    # A one-unit softmax always outputs 1.0 and argmax over one column is
    # always 0, so training and evaluation would silently be meaningless.
    # Expand the column to [is_first_class, is_second_class] instead.
    if y_train_encoded.shape[1] == 1:
        y_train_encoded = np.hstack([1 - y_train_encoded, y_train_encoded])
        y_val_encoded = np.hstack([1 - y_val_encoded, y_val_encoded])

    return y_train_encoded, y_val_encoded, encoder

y_train_encoded, y_val_encoded, label_encoder = one_hot_encode_labels(y_train, y_val)

# Save the tokenizer and label encoder for reuse.
# joblib.dump does not create missing parent directories, so make sure the
# output directory exists before writing into it.
os.makedirs(model_save_path, exist_ok=True)
joblib.dump(tokenizer, os.path.join(model_save_path, "tokenizer.pkl"))
joblib.dump(label_encoder, os.path.join(model_save_path, "label_encoder.pkl"))
print("Tokenizer and label encoder saved.")

# Build LSTM model
def build_lstm_model(input_dim, embedding_dim=128, input_length=100, lstm_units=64,
                     num_classes=None):
    """
    Build an uncompiled bidirectional-LSTM text classifier.

    Architecture: embedding -> BiLSTM (returning sequences) -> dropout ->
    BiLSTM with half the units -> dense ReLU layer -> dropout -> softmax.

    Args:
        input_dim: Number of rows in the embedding table (vocabulary size).
        embedding_dim: Size of each embedding vector.
        input_length: Length of the input sequences.
        lstm_units: Units of the first LSTM; the second uses ``lstm_units // 2``.
        num_classes: Number of output classes (width of the softmax layer).
            When omitted, it is taken from the module-level
            ``y_train_encoded`` array, which keeps existing calls working.
            A value of 1 gives a softmax that always outputs 1.0.

    Returns:
        A ``Sequential`` model; the caller is responsible for compiling it.
    """
    if num_classes is None:
        # Backward-compatible fallback to the globally encoded training labels.
        num_classes = y_train_encoded.shape[1]

    model = Sequential([
        Embedding(input_dim=input_dim, output_dim=embedding_dim, input_length=input_length),
        Bidirectional(LSTM(lstm_units, return_sequences=True)),
        Dropout(0.3),
        Bidirectional(LSTM(lstm_units // 2)),
        Dense(32, activation='relu'),
        Dropout(0.3),
        Dense(num_classes, activation='softmax')
    ])
    return model

# Define model parameters
# `word_index` holds every word seen during fitting, but the tokenizer only
# emits ids below `num_words` (rarer words are mapped to the OOV id). Capping
# the vocabulary size avoids allocating embedding rows that can never be used.
vocab_size = len(tokenizer.word_index) + 1
if tokenizer.num_words:
    vocab_size = min(vocab_size, tokenizer.num_words)

# Read the sequence length from the padded data so it cannot drift away from
# the value used in the padding step.
max_len = X_train_padded.shape[1]

lstm_model = build_lstm_model(input_dim=vocab_size, input_length=max_len)

# Compile the model
lstm_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                   loss='categorical_crossentropy',
                   metrics=['accuracy'])

# Train the model
lstm_model.fit(X_train_padded, y_train_encoded,
               validation_data=(X_val_padded, y_val_encoded),
               epochs=5, batch_size=64)

# Save the trained model
lstm_model.save(os.path.join(model_save_path, "lstm_model.h5"))
print("LSTM model saved.")

# Evaluate the model
def evaluate_lstm_model(model, X_val, y_val, encoder):
    """
    Print accuracy, F1, precision and recall of ``model`` on validation data.

    Class ids are recovered with ``argmax`` over axis 1 for both the model's
    predictions and ``y_val``, so ``y_val`` is expected to be a 2-D array with
    one column per class. F1, precision and recall use weighted averaging.

    Args:
        model: Object exposing ``predict(X_val)``.
        X_val: Validation inputs passed to ``model.predict``.
        y_val: Encoded validation labels.
        encoder: Accepted for interface compatibility; not used here.

    Returns:
        None. The metrics are only printed.
    """
    predicted_ids = np.argmax(model.predict(X_val), axis=1)
    true_ids = np.argmax(y_val, axis=1)

    accuracy = accuracy_score(true_ids, predicted_ids)
    # The three class-averaged metrics share the same call signature.
    f1, precision, recall = (
        scorer(true_ids, predicted_ids, average='weighted')
        for scorer in (f1_score, precision_score, recall_score)
    )

    print("\nLSTM Model Evaluation Metrics:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1-score: {f1:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")

# Report validation metrics for the trained model.
evaluate_lstm_model(
    model=lstm_model,
    X_val=X_val_padded,
    y_val=y_val_encoded,
    encoder=label_encoder,
)





































