In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import joblib
import os

# Define file paths for data and model saving
data_path = "../data/"
train_file = data_path + "train_data.csv"
val_file = data_path + "val_data.csv"
model_save_path = "../models/"


In [14]:
# Load training and validation data
def load_data(train_path, val_path):
    """
    Load training and validation data from CSV files with debug statements.
    """
    try:
        # Load training and validation CSV files
        train_df = pd.read_csv(train_path)
        val_df = pd.read_csv(val_path)

        # Debug: Print shapes of dataframes
        print(f"[DEBUG] Training data loaded successfully with shape: {train_df.shape}")
        print(f"[DEBUG] Validation data loaded successfully with shape: {val_df.shape}")
    except FileNotFoundError as e:
        print(f"[ERROR] File not found: {e}")
        raise
    except Exception as e:
        print(f"[ERROR] An unexpected error occurred while loading data: {e}")
        raise

    return train_df, val_df


# Load the data into DataFrames
train_df, val_df = load_data(train_file, val_file)

# Extract text and labels from data
try:
    # Handle missing or non-string values in 'clean_text' by filling with empty strings
    train_df['clean_text'] = train_df['clean_text'].fillna('').astype(str)
    val_df['clean_text'] = val_df['clean_text'].fillna('').astype(str)

    X_train, y_train = train_df['clean_text'], train_df['category']
    X_val, y_val = val_df['clean_text'], val_df['category']

    # Debug: Print shapes of text and label data
    print(f"[DEBUG] X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
    print(f"[DEBUG] X_val shape: {X_val.shape}, y_val shape: {y_val.shape}")
except KeyError as e:
    print(f"[ERROR] Key error: {e}. Please check if the expected columns exist.")
    raise


# Tokenize and pad sequences
def prepare_text_sequences(X_train, X_val, max_words=10000, max_len=100):
    """
    Tokenize and pad text sequences for deep learning models with debug statements.
    """
    tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
    tokenizer.fit_on_texts(X_train)

    # Convert text to sequences
    train_sequences = tokenizer.texts_to_sequences(X_train)
    val_sequences = tokenizer.texts_to_sequences(X_val)

    # Debug: Print an example of tokenized sequence
    print(f"[DEBUG] Example of tokenized training sequence: {train_sequences[:1]}")

    # Pad sequences to ensure equal length
    train_padded = pad_sequences(train_sequences, maxlen=max_len, padding='post', truncating='post')
    val_padded = pad_sequences(val_sequences, maxlen=max_len, padding='post', truncating='post')

    # Debug: Print shapes of padded sequences
    print(f"[DEBUG] Padded training sequence shape: {train_padded.shape}")
    print(f"[DEBUG] Padded validation sequence shape: {val_padded.shape}")

    return train_padded, val_padded, tokenizer


# Prepare sequences for training and validation sets
X_train_padded, X_val_padded, tokenizer = prepare_text_sequences(X_train, X_val)

# Convert labels to integer type to avoid float issues
try:
    y_train = y_train.astype(int)
    y_val = y_val.astype(int)
    # Debug: Print unique values of labels to confirm correct conversion
    print(f"[DEBUG] Labels converted to integer type. y_train unique values: {y_train.unique()}")
except ValueError as e:
    print(f"[ERROR] Value error while converting labels to integers: {e}")
    raise



[DEBUG] Training data loaded successfully with shape: (104302, 6)
[DEBUG] Validation data loaded successfully with shape: (26076, 6)
[DEBUG] X_train shape: (104302,), y_train shape: (104302,)
[DEBUG] X_val shape: (26076,), y_val shape: (26076,)
[DEBUG] Example of tokenized training sequence: [[1, 23, 202, 682, 2, 4, 92, 313, 346, 187, 9, 684, 3, 183, 4735, 1313]]
[DEBUG] Padded training sequence shape: (104302, 100)
[DEBUG] Padded validation sequence shape: (26076, 100)
[DEBUG] Labels converted to integer type. y_train unique values: [ 0  1 -1]


In [15]:

# One-hot encode labels
def one_hot_encode_labels(y_train, y_val):
    """
    One-hot encode labels for multi-class classification with debug statements.
    """
    encoder = LabelBinarizer()

    try:
        y_train_encoded = encoder.fit_transform(y_train)
        y_val_encoded = encoder.transform(y_val)
        # Debug: Print an example of encoded label and shapes of encoded arrays
        print(f"[DEBUG] Example of one-hot encoded training label: {y_train_encoded[:1]}")
        print(f"[DEBUG] One-hot encoded training labels shape: {y_train_encoded.shape}")
        print(f"[DEBUG] One-hot encoded validation labels shape: {y_val_encoded.shape}")
    except Exception as e:
        print(f"[ERROR] Error occurred during one-hot encoding: {e}")
        raise

    return y_train_encoded, y_val_encoded, encoder


# Apply one-hot encoding to labels
y_train_encoded, y_val_encoded, label_encoder = one_hot_encode_labels(y_train, y_val)

# Save the tokenizer and label encoder for future use
try:
    joblib.dump(tokenizer, os.path.join(model_save_path, "tokenizer.pkl"))
    joblib.dump(label_encoder, os.path.join(model_save_path, "label_encoder.pkl"))
    print("[DEBUG] Tokenizer and label encoder saved successfully.")
except Exception as e:
    print(f"[ERROR] Error saving tokenizer or label encoder: {e}")
    raise


[DEBUG] Example of one-hot encoded training label: [[0 1 0]]
[DEBUG] One-hot encoded training labels shape: (104302, 3)
[DEBUG] One-hot encoded validation labels shape: (26076, 3)
[DEBUG] Tokenizer and label encoder saved successfully.


In [16]:


# Define a custom PyTorch Dataset
class TextDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = torch.tensor(sequences, dtype=torch.long)
        self.labels = torch.tensor(labels, dtype=torch.float)

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return self.sequences[idx], self.labels[idx]


# Create Dataset objects for training and validation
train_dataset = TextDataset(X_train_padded, y_train_encoded)
val_dataset = TextDataset(X_val_padded, y_val_encoded)

# Create DataLoader for batch processing
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)



In [17]:

# Define the LSTM model
class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, embedding_dim=128, lstm_units=64, output_dim=3):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, lstm_units, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(0.3)
        self.fc1 = nn.Linear(lstm_units * 2, 32)  # Bidirectional LSTM output is doubled
        self.fc2 = nn.Linear(32, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        lstm_out = self.dropout(lstm_out[:, -1, :])  # Take output of last time step
        fc1_out = torch.relu(self.fc1(lstm_out))
        out = self.fc2(fc1_out)
        return out


# Model parameters
vocab_size = len(tokenizer.word_index) + 1
lstm_model = LSTMClassifier(input_dim=vocab_size)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(lstm_model.parameters(), lr=0.001)



In [18]:

# Training function
def train_model(model, train_loader, criterion, optimizer, epochs=5):
    model.train()
    for epoch in range(epochs):
        epoch_loss = 0
        for sequences, labels in train_loader:
            # Forward pass
            outputs = model(sequences)
            loss = criterion(outputs, labels.argmax(1))

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        # Debug: Print loss for each epoch
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss / len(train_loader):.4f}")


# Train the LSTM model
train_model(lstm_model, train_loader, criterion, optimizer)


Epoch 1/5, Loss: 1.0607
Epoch 2/5, Loss: 1.0601
Epoch 3/5, Loss: 1.0597
Epoch 4/5, Loss: 1.0597
Epoch 5/5, Loss: 1.0596


In [19]:

# Import necessary metrics libraries
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score


# Evaluation function
def evaluate_model(model, val_loader):
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for sequences, labels in val_loader:
            outputs = model(sequences)
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())  # Ensure preds are moved to CPU
            all_labels.extend(labels.argmax(1).cpu().numpy())  # Ensure labels are moved to CPU

    # Convert to NumPy arrays for compatibility with sklearn metrics
    all_preds = np.array(all_preds)
    all_labels = np.array(all_labels)

    # Debug information to understand prediction counts
    print(f"[DEBUG] Number of predicted samples: {len(all_preds)}")
    print(f"[DEBUG] Number of actual samples: {len(all_labels)}")
    unique_labels = np.unique(all_labels)
    print(f"[DEBUG] Unique labels in ground truth: {unique_labels}")

    # Calculate evaluation metrics with zero_division handling
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='weighted', zero_division=1)
    precision = precision_score(all_labels, all_preds, average='weighted', zero_division=1)
    recall = recall_score(all_labels, all_preds, average='weighted', zero_division=1)

    # Print metrics
    print("\nLSTM Model Evaluation Metrics:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1-score: {f1:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")


# Run evaluation on validation data
evaluate_model(lstm_model, val_loader)

# Save the model
torch.save(lstm_model.state_dict(), os.path.join(model_save_path, "lstm_model.pth"))
print("LSTM model saved.")


[DEBUG] Number of predicted samples: 26076
[DEBUG] Number of actual samples: 26076
[DEBUG] Unique labels in ground truth: [0 1 2]

LSTM Model Evaluation Metrics:
Accuracy: 0.4433
F1-score: 0.2723
Precision: 0.7532
Recall: 0.4433
LSTM model saved.
