In [None]:
!pip install transformers datasets scikit-learn

# Bert Hypermeter Tuning
` not part of the final version of the project`



In [None]:
########## Hyperparameter Search Bert Model ##########
########## NOT used in the final version of the project ##########

import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import get_scheduler
from sklearn.metrics import accuracy_score, classification_report
import torch.nn as nn
import itertools

DEVICE = torch.device("cuda" if torch.cuda.is_available()  else "cpu")
print(f"Using device: {DEVICE}")


# Dataset Class
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(),
            "attention_mask": encoding["attention_mask"].squeeze(),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Load Preprocessed Data
processed_data_path = "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/preprocessed.csv"
print("Loading preprocessed data...")
data = pd.read_csv(processed_data_path)

# Verify dataset class distribution (add after loading your dataset)
print("Class distribution in the dataset:")
print(data["sentiment"].value_counts())

# Limit dataset size for testing
subset_size = 100000  # Use a subset for quick iterations
data = data.sample(subset_size, random_state=42)

# Drop rows with missing values
data.dropna(subset=["text"], inplace=True)

# Prepare tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Prepare datasets and dataloaders
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data["text"], data["sentiment"], test_size=0.2, random_state=42
)

train_dataset = SentimentDataset(train_texts.tolist(), train_labels.tolist(), tokenizer)
test_dataset = SentimentDataset(test_texts.tolist(), test_labels.tolist(), tokenizer)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

# Define DataLoaders for training and validation
val_texts, val_labels = test_texts, test_labels
val_dataset = SentimentDataset(val_texts.tolist(), val_labels.tolist(), tokenizer)
val_loader = DataLoader(val_dataset, batch_size=32)

# Initialize BERT model
print("Initializing BERT model...")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3).to(DEVICE)

# Optimizer and Scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)
num_training_steps = len(train_loader) * 10  # Set max epochs to 10
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Loss function
criterion = nn.CrossEntropyLoss()

# Training function
def train_model(model, dataloader, optimizer, scheduler, criterion):
    model.train()
    total_loss = 0
    progress_bar = tqdm(dataloader, desc="Training", leave=False)
    for batch in progress_bar:
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())
    return total_loss / len(dataloader)

# Evaluation function
def evaluate_model(model, dataloader, criterion):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    predictions_list = []
    true_labels_list = []

    progress_bar = tqdm(dataloader, desc="Evaluating", leave=False)
    with torch.no_grad():
        for batch in progress_bar:
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            labels = batch["labels"].to(DEVICE)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            predictions = torch.argmax(outputs.logits, dim=-1)
            predictions_list.extend(predictions.cpu().numpy())
            true_labels_list.extend(labels.cpu().numpy())

            correct += (predictions == labels).sum().item()
            total += labels.size(0)
            progress_bar.set_postfix(loss=loss.item(), accuracy=correct / total)

    accuracy = correct / total
    return total_loss / len(dataloader), accuracy, predictions_list, true_labels_list

# Define hyperparameter search space
search_space = {
    "learning_rate": [2e-5, 3e-5, 5e-5],
    "batch_size": [16, 32],
    "num_epochs": [2, 3]
}

def hyperparameter_tuning(search_space, train_loader, val_loader):
    """
    Perform hyperparameter tuning to find the best configuration.
    Args:
        search_space (dict): Dictionary containing hyperparameter lists.
        train_loader (DataLoader): DataLoader for training data.
        val_loader (DataLoader): DataLoader for validation data.

    Returns:
        dict: Best hyperparameters and associated accuracy.
    """
    best_config = None
    best_accuracy = 0.0

    # Generate all combinations of hyperparameters
    combinations = list(itertools.product(
        search_space["learning_rate"],
        search_space["batch_size"],
        search_space["num_epochs"]
    ))

    for lr, batch_size, num_epochs in tqdm(combinations, desc="Tuning Hyperparameters"):
        print(f"\nTesting configuration: LR={lr}, Batch Size={batch_size}, Epochs={num_epochs}")

        # Update DataLoader with current batch size
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size)

        # Initialize model, optimizer, and scheduler
        model = BertForSequenceClassification.from_pretrained(
            "bert-base-uncased",
            num_labels=3
        ).to(DEVICE)
        optimizer = AdamW(model.parameters(), lr=lr)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)
        criterion = nn.CrossEntropyLoss()

        # Train the model
        for epoch in range(num_epochs):
            train_loss = train_model(model, train_loader, optimizer, scheduler, criterion)
            print(f"Epoch {epoch + 1}: Train Loss = {train_loss:.4f}")

        # Validate the model
        _, val_accuracy, _, _ = evaluate_model(model, val_loader, criterion)
        print(f"Validation Accuracy: {val_accuracy:.4f}")

        # Update the best configuration if the current one is better
        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            best_config = {
                "learning_rate": lr,
                "batch_size": batch_size,
                "num_epochs": num_epochs
            }
            # Save the best model
            torch.save(model.state_dict(), "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/models/tryouts/bert_model_best.pt")

    print(f"\nBest Configuration: {best_config}")
    print(f"Best Validation Accuracy: {best_accuracy:.4f}")
    return best_config

print("Starting hyperparameter tuning...")
best_hyperparameters = hyperparameter_tuning(search_space, train_loader, val_loader)
print(f"Best Hyperparameters: {best_hyperparameters}")

# Evaluate Model
print("Evaluating model...")
model.load_state_dict(torch.load("/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/models/tryouts/bert_model_best.pt"))
model.eval()

# Get predictions and true labels
val_loss, val_accuracy, predictions, true_labels = evaluate_model(model, test_loader, criterion)

# Print Accuracy Score and Classification Report
accuracy = accuracy_score(true_labels, predictions)
print(f"Accuracy: {accuracy:.4f}")

# Adjust dynamically based on unique labels
unique_labels = sorted(list(set(true_labels + predictions)))

# Define the target names dynamically
target_names_full = ["Negative", "Neutral", "Positive"]
target_names = [target_names_full[i] for i in unique_labels]

# Print the classification report
print("\nClassification Report:")
print(classification_report(true_labels, predictions, target_names=target_names))

# Final BERT Model and Evaluation
` used in the final version of the project`

In [None]:
########## Bert Model Training Script ##########

import os
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import get_scheduler
from sklearn.metrics import accuracy_score, classification_report
import torch.nn as nn
from torch.cuda.amp import autocast, GradScaler

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

# Dataset Class
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(),
            "attention_mask": encoding["attention_mask"].squeeze(),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

model_save_path = "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/models/bert_model.pt"
output_save_path = "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/results/metrics"
os.makedirs(output_save_path, exist_ok=True)

# Load Preprocessed Data
processed_data_path = "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/preprocessed.csv"
print("Loading preprocessed data...")
data = pd.read_csv(processed_data_path)

# Verify dataset class distribution
print("Class distribution in the dataset:")
print(data["sentiment"].value_counts())

# Limit dataset size for testing
subset_size = 100000
data = data.sample(subset_size, random_state=42)

# Drop rows with missing values
data.dropna(subset=["text"], inplace=True)

# Prepare tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Prepare datasets and dataloaders
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data["text"], data["sentiment"], test_size=0.2, random_state=42
)

train_dataset = SentimentDataset(train_texts.tolist(), train_labels.tolist(), tokenizer)
test_dataset = SentimentDataset(test_texts.tolist(), test_labels.tolist(), tokenizer)

train_loader = DataLoader(
    train_dataset, batch_size=16, shuffle=True, num_workers=4, pin_memory=True
)
test_loader = DataLoader(
    test_dataset, batch_size=16, num_workers=4, pin_memory=True
)

print("Initializing BERT model...")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3).to(DEVICE)

# Optimizer and Scheduler
optimizer = AdamW(model.parameters(), lr=3e-5, weight_decay=0.01)
num_training_steps = len(train_loader) * 3  # Adjust for 3 epochs
warmup_steps = int(0.1 * num_training_steps)
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_training_steps)

# Loss function
criterion = nn.CrossEntropyLoss()

# Mixed Precision Training
scaler = GradScaler()

# Gradient Accumulation Steps
accumulation_steps = 2  # Simulates batch size 32

def train_model(model, dataloader, optimizer, scheduler, criterion):
    model.train()
    total_loss = 0
    progress_bar = tqdm(dataloader, desc="Training", leave=False)
    optimizer.zero_grad()

    for i, batch in enumerate(progress_bar):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)

        with autocast():
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss / accumulation_steps  # Divide loss for accumulation

        scaler.scale(loss).backward()

        if (i + 1) % accumulation_steps == 0 or (i + 1) == len(dataloader):
            scaler.unscale_(optimizer)
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            scheduler.step()

        total_loss += loss.item() * accumulation_steps  # Scale back for correct total
        progress_bar.set_postfix(loss=loss.item() * accumulation_steps)
    return total_loss / len(dataloader)

def evaluate_model(model, dataloader, criterion):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    predictions_list = []
    true_labels_list = []

    progress_bar = tqdm(dataloader, desc="Evaluating", leave=False)
    with torch.no_grad():
        for batch in progress_bar:
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            labels = batch["labels"].to(DEVICE)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            predictions = torch.argmax(outputs.logits, dim=-1)
            predictions_list.extend(predictions.cpu().numpy())
            true_labels_list.extend(labels.cpu().numpy())

            correct += (predictions == labels).sum().item()
            total += labels.size(0)
            progress_bar.set_postfix(loss=loss.item(), accuracy=correct / total)

    accuracy = correct / total
    return total_loss / len(dataloader), accuracy, predictions_list, true_labels_list

# Training loop with checkpoint saving
train_losses = []
val_losses = []
val_accuracies = []
best_accuracy = 0

print("Training model...")
for epoch in range(3):  # Train for 3 epochs
    print(f"Epoch {epoch + 1}")
    train_loss = train_model(model, train_loader, optimizer, scheduler, criterion)
    val_loss, val_accuracy, predictions, true_labels = evaluate_model(model, test_loader, criterion)

    train_losses.append(train_loss)
    val_losses.append(val_loss)
    val_accuracies.append(val_accuracy)

    print(f"Epoch {epoch + 1}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}, Val Accuracy = {val_accuracy:.4f}")

    if val_accuracy > best_accuracy:
        best_accuracy = val_accuracy
        torch.save(model.state_dict(), model_save_path)
        print(f"Model saved at epoch {epoch + 1}")

print("Evaluating model...")
model.load_state_dict(torch.load(model_save_path))
model.eval()

val_loss, val_accuracy, predictions, true_labels = evaluate_model(model, test_loader, criterion)

print("Saving outputs for visualization...")
outputs = {
    "train_losses": train_losses,
    "val_losses": val_losses,
    "val_accuracies": val_accuracies,
    "predictions": predictions,
    "true_labels": true_labels
}
torch.save(outputs, os.path.join(output_save_path, "bert_visualization_outputs.pth"))

accuracy = accuracy_score(true_labels, predictions)
print(f"Final Accuracy: {accuracy:.4f}")

target_names = ["Negative", "Neutral", "Positive"]
print("\nClassification Report:")
print(classification_report(true_labels, predictions, target_names=target_names))




Using device: cuda
Loading preprocessed data...
Class distribution in the dataset:
sentiment
0    800000
2    800000
Name: count, dtype: int64


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



Initializing BERT model...


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()


Training model...
Epoch 1


  with autocast():


Epoch 1: Train Loss = 0.4756, Val Loss = 0.3880, Val Accuracy = 0.8265
Model saved at epoch 1
Epoch 2




Epoch 2: Train Loss = 0.3210, Val Loss = 0.4050, Val Accuracy = 0.8319
Model saved at epoch 2
Epoch 3


  model.load_state_dict(torch.load(model_save_path))


Epoch 3: Train Loss = 0.1968, Val Loss = 0.4544, Val Accuracy = 0.8305
Evaluating model...




Saving outputs for visualization...
Final Accuracy: 0.8319

Classification Report:


ValueError: Number of classes, 2, does not match size of target_names, 3. Try specifying the labels parameter

In [None]:
########## Evaluation Bert Model ##########

import os
import torch
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification
import torch.nn as nn

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Set random seed for reproducibility
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
if torch.backends.cudnn.is_available():
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(),
            "attention_mask": encoding["attention_mask"].squeeze(),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

def evaluate_model(model, dataloader, criterion):
    model.eval()
    total_loss = 0.0
    true_labels, predictions = [], []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            labels = batch["labels"].to(DEVICE)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
            predictions.extend(preds)
            true_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(true_labels, predictions)
    return total_loss / len(dataloader), accuracy, true_labels, predictions

if __name__ == "__main__":
    model_save_path = "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/models/bert_model.pt"
    processed_data_path = "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/preprocessed.csv"

    # Load preprocessed data
    print("Loading preprocessed data...")
    data = pd.read_csv(processed_data_path)
    data.dropna(subset=["text"], inplace=True)

    # Prepare tokenizer and datasets
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    test_texts, test_labels = data["text"].tolist(), data["sentiment"].tolist()
    test_dataset = SentimentDataset(test_texts, test_labels, tokenizer)
    test_loader = DataLoader(test_dataset, batch_size=16)

    # Load the pre-trained model
    print("Loading model...")
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3).to(DEVICE)
    model.load_state_dict(torch.load(model_save_path))

    # Evaluate the model
    print("Evaluating model...")
    criterion = nn.CrossEntropyLoss()
    _, test_accuracy, true_labels, predictions = evaluate_model(model, test_loader, criterion)

    unique_classes = sorted(set(true_labels))
    num_classes = len(unique_classes)

    # Generate target names based on the number of classes
    default_target_names = ["Class " + str(i) for i in range(num_classes)]
    custom_target_names = ["Negative", "Neutral", "Positive"][:num_classes]  # Adjust to match detected classes

    print("\nAccuracy:", accuracy_score(true_labels, predictions))
    print("\nClassification Report:")
    print(classification_report(true_labels, predictions, target_names=custom_target_names))

    print("\nConfusion Matrix:")
    print(confusion_matrix(true_labels, predictions, labels=unique_classes))


# LSTM Hypermeter Tuning
##### `hyperparameters tuning was applied to find the best hyperparameters to get the best accuracy, it is not used in the final version of the project`

In [None]:
import os
import pickle
import random
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pandas as pd

DEVICE = torch.device("cuda" if torch.cuda.is_available()  else "cpu")
print(f"Using device: {DEVICE}")

seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
if torch.backends.cudnn.is_available():
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=100):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens = self.tokenizer(self.texts[idx])
        tokens = tokens[:self.max_length]  # Truncate if too long
        tokens = [0] * (self.max_length - len(tokens)) + tokens  # Pad if too short

        tokens_tensor = torch.tensor(tokens, dtype=torch.long)
        label_tensor = torch.tensor(self.labels[idx], dtype=torch.long)

        return tokens_tensor, label_tensor

class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, n_layers=1):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        hidden_state = lstm_out[:, -1, :]
        output = self.fc(hidden_state)
        return output

def train_model(model, dataloader, optimizer, criterion):
    model.train()
    total_loss = 0.0
    for inputs, labels in dataloader:
        inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate_model(model, dataloader):
    model.eval()
    true_labels, predictions = [], []
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
            outputs = model(inputs)
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            predictions.extend(preds)
            true_labels.extend(labels.cpu().numpy())
    return true_labels, predictions

if __name__ == "__main__":

    vectorizer_path = "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/vectorizer.pkl"
    processed_data_path = "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/preprocessed.csv"
    model_save_path = "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/models/tryouts/lstm_model_base.pt"

    print("Loading vectorizer...")
    with open(vectorizer_path, "rb") as f:
        vectorizer = pickle.load(f)

    print("Loading preprocessed data...")
    data = pd.read_csv(processed_data_path)
    data.dropna(subset=["text"], inplace=True)

    tokenizer_fn = vectorizer.build_analyzer()

    def tokenizer(text):
        tokens = tokenizer_fn(text)
        return [vectorizer.vocabulary_.get(token, 0) for token in tokens]

    print("Splitting data into train and test sets...")
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        data["text"], data["sentiment"], test_size=0.2, random_state=42
    )

    train_dataset = SentimentDataset(train_texts.tolist(), train_labels.tolist(), tokenizer)
    test_dataset = SentimentDataset(test_texts.tolist(), test_labels.tolist(), tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=64)

    vocab_size = len(vectorizer.vocabulary_)
    embed_dim = 100
    hidden_dim = 256
    output_dim = 3

    print("Initializing model...")
    model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, output_dim).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()

    print(f"Using device: {DEVICE}")

    print("Training model...")
    for epoch in range(5):
        train_loss = train_model(model, train_loader, optimizer, criterion)
        print(f"Epoch {epoch + 1}: Loss = {train_loss:.4f}")

    print("Saving model...")
    os.makedirs("models", exist_ok=True)
    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved to {model_save_path}")

    print("Evaluating model...")
    model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, output_dim).to(DEVICE)
    model.load_state_dict(torch.load(model_save_path))
    model.eval()  # Explicitly set evaluation mode
    true_labels, predictions = evaluate_model(model, test_loader)

    print("\nAccuracy:", accuracy_score(true_labels, predictions))
    print("\nClassification Report:")
    print(classification_report(true_labels, predictions))

    print("\nConfusion Matrix:")
    print(confusion_matrix(true_labels, predictions))

Using device: cuda
Loading vectorizer...


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Loading preprocessed data...
Splitting data into train and test sets...
Initializing model...
Using device: cuda
Training model...
Epoch 1: Loss = 0.4267
Epoch 2: Loss = 0.3901
Epoch 3: Loss = 0.3764
Epoch 4: Loss = 0.3658
Epoch 5: Loss = 0.3565
Saving model...
Model saved to /content/drive/MyDrive/Colab Notebooks/SentimentAnalysis/wout_metadata/lstm_model.pt
Evaluating model...


  model.load_state_dict(torch.load(model_save_path))



Accuracy: 0.8194986316639843

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.82      0.82    159215
           2       0.82      0.82      0.82    160151

    accuracy                           0.82    319366
   macro avg       0.82      0.82      0.82    319366
weighted avg       0.82      0.82      0.82    319366


Confusion Matrix:
[[130013  29202]
 [ 28444 131707]]


In [None]:
import os
import pickle
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm import tqdm
import itertools
from sklearn.metrics import accuracy_score, classification_report

DEVICE = torch.device("cuda" if torch.cuda.is_available()  else "cpu")
print(DEVICE)

class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=100):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens = self.tokenizer(self.texts[idx])
        tokens = tokens[:self.max_length]
        tokens = [0] * (self.max_length - len(tokens)) + tokens

        tokens_tensor = torch.tensor(tokens, dtype=torch.long)
        label_tensor = torch.tensor(self.labels[idx], dtype=torch.long)

        return tokens_tensor, label_tensor

class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, n_layers=1, dropout=0.2):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, n_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        hidden_state = lstm_out[:, -1, :]
        output = self.fc(hidden_state)
        return output

def train_model(model, dataloader, optimizer, criterion):
    model.train()
    total_loss = 0.0
    for inputs, labels in tqdm(dataloader, desc="Training", leave=False):
        inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate_model(model, dataloader, criterion):
    model.eval()
    total_loss = 0.0
    true_labels = []
    predictions = []

    with torch.no_grad():
        for inputs, labels in tqdm(dataloader, desc="Evaluating", leave=False):
            inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            preds = torch.argmax(outputs, dim=1).cpu().tolist()
            true_labels.extend(labels.cpu().tolist())
            predictions.extend(preds)

    accuracy = accuracy_score(true_labels, predictions)
    return total_loss / len(dataloader), accuracy, true_labels, predictions

if __name__ == "__main__":
    # Load preprocessed data and vectorizer
    vectorizer_path = "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/vectorizer.pkl"
    processed_data_path = "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/preprocessed.csv"

    with open(vectorizer_path, "rb") as f:
        vectorizer = pickle.load(f)

    data = pd.read_csv(processed_data_path)
    data.dropna(subset=["text"], inplace=True)

    tokenizer_fn = vectorizer.build_analyzer()

    def tokenizer(text):
        tokens = tokenizer_fn(text)
        return [vectorizer.vocabulary_.get(token, 0) for token in tokens]

    train_texts, test_texts, train_labels, test_labels = train_test_split(
        data["text"], data["sentiment"], test_size=0.2, random_state=42
    )

    train_dataset = SentimentDataset(train_texts.tolist(), train_labels.tolist(), tokenizer)
    test_dataset = SentimentDataset(test_texts.tolist(), test_labels.tolist(), tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=32)

    # Hyperparameter grid
    param_grid = {
        "embed_dim": [50, 100],
        "hidden_dim": [128, 256],
        "n_layers": [1, 2],
        "dropout": [0.2, 0.5],
        "batch_size": [32, 64],
        "learning_rate": [0.001, 0.0005]
    }

    best_accuracy = 0
    best_params = {}

    for params in itertools.product(*param_grid.values()):
        embed_dim, hidden_dim, n_layers, dropout, batch_size, learning_rate = params

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=batch_size)

        vocab_size = len(vectorizer.vocabulary_)
        output_dim = 3

        model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, output_dim, n_layers, dropout).to(DEVICE)
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
        criterion = nn.CrossEntropyLoss()

        print(f"Training with params: embed_dim={embed_dim}, hidden_dim={hidden_dim}, n_layers={n_layers}, dropout={dropout}, batch_size={batch_size}, learning_rate={learning_rate}")

        for epoch in range(3):
            train_loss = train_model(model, train_loader, optimizer, criterion)
            val_loss, val_accuracy, _, _ = evaluate_model(model, test_loader, criterion)
            print(f"Epoch {epoch + 1}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}, Val Accuracy = {val_accuracy:.4f}")

            if val_accuracy > best_accuracy:
                best_accuracy = val_accuracy
                best_params = {
                    "embed_dim": embed_dim,
                    "hidden_dim": hidden_dim,
                    "n_layers": n_layers,
                    "dropout": dropout,
                    "batch_size": batch_size,
                    "learning_rate": learning_rate
                }
                torch.save(model.state_dict(), "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/models/tryouts/best_lstm_model.pt")

    print("Best Hyperparameters:", best_params)
    print(f"Best Accuracy: {best_accuracy:.4f}")


cuda


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Training with params: embed_dim=50, hidden_dim=128, n_layers=1, dropout=0.2, batch_size=32, learning_rate=0.001




Epoch 1: Train Loss = 0.4326, Val Loss = 0.4074, Val Accuracy = 0.8135




Epoch 2: Train Loss = 0.3934, Val Loss = 0.3980, Val Accuracy = 0.8184




Epoch 3: Train Loss = 0.3792, Val Loss = 0.3964, Val Accuracy = 0.8203
Training with params: embed_dim=50, hidden_dim=128, n_layers=1, dropout=0.2, batch_size=32, learning_rate=0.0005




Epoch 1: Train Loss = 0.4462, Val Loss = 0.4119, Val Accuracy = 0.8104




Epoch 2: Train Loss = 0.3982, Val Loss = 0.3997, Val Accuracy = 0.8171




Epoch 3: Train Loss = 0.3825, Val Loss = 0.3962, Val Accuracy = 0.8195
Training with params: embed_dim=50, hidden_dim=128, n_layers=1, dropout=0.2, batch_size=64, learning_rate=0.001




Epoch 1: Train Loss = 0.4386, Val Loss = 0.4089, Val Accuracy = 0.8127




Epoch 2: Train Loss = 0.3961, Val Loss = 0.4031, Val Accuracy = 0.8150




Epoch 3: Train Loss = 0.3808, Val Loss = 0.3960, Val Accuracy = 0.8199
Training with params: embed_dim=50, hidden_dim=128, n_layers=1, dropout=0.2, batch_size=64, learning_rate=0.0005




Epoch 1: Train Loss = 0.4578, Val Loss = 0.4198, Val Accuracy = 0.8058




Epoch 2: Train Loss = 0.4059, Val Loss = 0.4057, Val Accuracy = 0.8137




Epoch 3: Train Loss = 0.3904, Val Loss = 0.3980, Val Accuracy = 0.8180
Training with params: embed_dim=50, hidden_dim=128, n_layers=1, dropout=0.5, batch_size=32, learning_rate=0.001




Epoch 1: Train Loss = 0.4316, Val Loss = 0.4081, Val Accuracy = 0.8133




Epoch 2: Train Loss = 0.3922, Val Loss = 0.3967, Val Accuracy = 0.8188




Epoch 3: Train Loss = 0.3780, Val Loss = 0.3956, Val Accuracy = 0.8197
Training with params: embed_dim=50, hidden_dim=128, n_layers=1, dropout=0.5, batch_size=32, learning_rate=0.0005




Epoch 1: Train Loss = 0.4449, Val Loss = 0.4114, Val Accuracy = 0.8103




Epoch 2: Train Loss = 0.3978, Val Loss = 0.3987, Val Accuracy = 0.8176




Epoch 3: Train Loss = 0.3826, Val Loss = 0.3946, Val Accuracy = 0.8202
Training with params: embed_dim=50, hidden_dim=128, n_layers=1, dropout=0.5, batch_size=64, learning_rate=0.001




Epoch 1: Train Loss = 0.4386, Val Loss = 0.4108, Val Accuracy = 0.8108




Epoch 2: Train Loss = 0.3955, Val Loss = 0.3990, Val Accuracy = 0.8177




Epoch 3: Train Loss = 0.3800, Val Loss = 0.3955, Val Accuracy = 0.8195
Training with params: embed_dim=50, hidden_dim=128, n_layers=1, dropout=0.5, batch_size=64, learning_rate=0.0005




Epoch 1: Train Loss = 0.4573, Val Loss = 0.4177, Val Accuracy = 0.8073




Epoch 2: Train Loss = 0.4046, Val Loss = 0.4041, Val Accuracy = 0.8147




Epoch 3: Train Loss = 0.3895, Val Loss = 0.3977, Val Accuracy = 0.8184
Training with params: embed_dim=50, hidden_dim=128, n_layers=2, dropout=0.2, batch_size=32, learning_rate=0.001




Epoch 1: Train Loss = 0.4320, Val Loss = 0.4056, Val Accuracy = 0.8144




Epoch 2: Train Loss = 0.3922, Val Loss = 0.3962, Val Accuracy = 0.8197




Epoch 3: Train Loss = 0.3782, Val Loss = 0.3920, Val Accuracy = 0.8223
Training with params: embed_dim=50, hidden_dim=128, n_layers=2, dropout=0.2, batch_size=32, learning_rate=0.0005




Epoch 1: Train Loss = 0.4445, Val Loss = 0.4112, Val Accuracy = 0.8105




Epoch 2: Train Loss = 0.3979, Val Loss = 0.3994, Val Accuracy = 0.8177




Epoch 3: Train Loss = 0.3824, Val Loss = 0.3931, Val Accuracy = 0.8209
Training with params: embed_dim=50, hidden_dim=128, n_layers=2, dropout=0.2, batch_size=64, learning_rate=0.001




Epoch 1: Train Loss = 0.4387, Val Loss = 0.4070, Val Accuracy = 0.8131




Epoch 2: Train Loss = 0.3947, Val Loss = 0.3944, Val Accuracy = 0.8209




Epoch 3: Train Loss = 0.3790, Val Loss = 0.3911, Val Accuracy = 0.8220
Training with params: embed_dim=50, hidden_dim=128, n_layers=2, dropout=0.2, batch_size=64, learning_rate=0.0005




Epoch 1: Train Loss = 0.4540, Val Loss = 0.4173, Val Accuracy = 0.8076




Epoch 2: Train Loss = 0.4038, Val Loss = 0.4016, Val Accuracy = 0.8160




Epoch 3: Train Loss = 0.3886, Val Loss = 0.3958, Val Accuracy = 0.8185
Training with params: embed_dim=50, hidden_dim=128, n_layers=2, dropout=0.5, batch_size=32, learning_rate=0.001




Epoch 1: Train Loss = 0.4353, Val Loss = 0.4069, Val Accuracy = 0.8136




Epoch 2: Train Loss = 0.3982, Val Loss = 0.3964, Val Accuracy = 0.8193




Epoch 3: Train Loss = 0.3858, Val Loss = 0.3930, Val Accuracy = 0.8210
Training with params: embed_dim=50, hidden_dim=128, n_layers=2, dropout=0.5, batch_size=32, learning_rate=0.0005




Epoch 1: Train Loss = 0.4477, Val Loss = 0.4112, Val Accuracy = 0.8114




Epoch 2: Train Loss = 0.4025, Val Loss = 0.4001, Val Accuracy = 0.8168




Epoch 3: Train Loss = 0.3889, Val Loss = 0.3950, Val Accuracy = 0.8198
Training with params: embed_dim=50, hidden_dim=128, n_layers=2, dropout=0.5, batch_size=64, learning_rate=0.001




Epoch 1: Train Loss = 0.4403, Val Loss = 0.4076, Val Accuracy = 0.8135




Epoch 2: Train Loss = 0.3995, Val Loss = 0.3972, Val Accuracy = 0.8193




Epoch 3: Train Loss = 0.3863, Val Loss = 0.3940, Val Accuracy = 0.8213
Training with params: embed_dim=50, hidden_dim=128, n_layers=2, dropout=0.5, batch_size=64, learning_rate=0.0005




Epoch 1: Train Loss = 0.4594, Val Loss = 0.4175, Val Accuracy = 0.8075




Epoch 2: Train Loss = 0.4081, Val Loss = 0.4053, Val Accuracy = 0.8145




Epoch 3: Train Loss = 0.3942, Val Loss = 0.3968, Val Accuracy = 0.8189
Training with params: embed_dim=50, hidden_dim=256, n_layers=1, dropout=0.2, batch_size=32, learning_rate=0.001




Epoch 1: Train Loss = 0.4300, Val Loss = 0.4030, Val Accuracy = 0.8159




Epoch 2: Train Loss = 0.3889, Val Loss = 0.3944, Val Accuracy = 0.8201




Epoch 3: Train Loss = 0.3721, Val Loss = 0.3956, Val Accuracy = 0.8211
Training with params: embed_dim=50, hidden_dim=256, n_layers=1, dropout=0.2, batch_size=32, learning_rate=0.0005




Epoch 1: Train Loss = 0.4410, Val Loss = 0.4070, Val Accuracy = 0.8131




Epoch 2: Train Loss = 0.3918, Val Loss = 0.3949, Val Accuracy = 0.8200




Epoch 3: Train Loss = 0.3720, Val Loss = 0.3933, Val Accuracy = 0.8219
Training with params: embed_dim=50, hidden_dim=256, n_layers=1, dropout=0.2, batch_size=64, learning_rate=0.001




Epoch 1: Train Loss = 0.4351, Val Loss = 0.4070, Val Accuracy = 0.8140




Epoch 2: Train Loss = 0.3896, Val Loss = 0.3945, Val Accuracy = 0.8202




Epoch 3: Train Loss = 0.3698, Val Loss = 0.3937, Val Accuracy = 0.8203
Training with params: embed_dim=50, hidden_dim=256, n_layers=1, dropout=0.2, batch_size=64, learning_rate=0.0005




Epoch 1: Train Loss = 0.4513, Val Loss = 0.4120, Val Accuracy = 0.8102




Epoch 2: Train Loss = 0.3973, Val Loss = 0.3979, Val Accuracy = 0.8178




Epoch 3: Train Loss = 0.3782, Val Loss = 0.3934, Val Accuracy = 0.8211
Training with params: embed_dim=50, hidden_dim=256, n_layers=1, dropout=0.5, batch_size=32, learning_rate=0.001




Epoch 1: Train Loss = 0.4295, Val Loss = 0.4021, Val Accuracy = 0.8166




Epoch 2: Train Loss = 0.3890, Val Loss = 0.3948, Val Accuracy = 0.8201




Epoch 3: Train Loss = 0.3717, Val Loss = 0.3950, Val Accuracy = 0.8206
Training with params: embed_dim=50, hidden_dim=256, n_layers=1, dropout=0.5, batch_size=32, learning_rate=0.0005


Training:  46%|████▌     | 18398/39921 [02:02<02:17, 156.50it/s]

In [None]:
import os
import pickle
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm import tqdm
import itertools
from sklearn.metrics import accuracy_score, classification_report

DEVICE = torch.device("cuda" if torch.cuda.is_available()  else "cpu")
print(DEVICE)

class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=100):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens = self.tokenizer(self.texts[idx])
        tokens = tokens[:self.max_length]
        tokens = [0] * (self.max_length - len(tokens)) + tokens

        tokens_tensor = torch.tensor(tokens, dtype=torch.long)
        label_tensor = torch.tensor(self.labels[idx], dtype=torch.long)

        return tokens_tensor, label_tensor

class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, n_layers=1, dropout=0.2):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, n_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        hidden_state = lstm_out[:, -1, :]
        output = self.fc(hidden_state)
        return output

def train_model(model, dataloader, optimizer, criterion):
    model.train()
    total_loss = 0.0
    for inputs, labels in tqdm(dataloader, desc="Training", leave=False):
        inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate_model(model, dataloader, criterion):
    model.eval()
    total_loss = 0.0
    true_labels = []
    predictions = []

    with torch.no_grad():
        for inputs, labels in tqdm(dataloader, desc="Evaluating", leave=False):
            inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            preds = torch.argmax(outputs, dim=1).cpu().tolist()
            true_labels.extend(labels.cpu().tolist())
            predictions.extend(preds)

    accuracy = accuracy_score(true_labels, predictions)
    return total_loss / len(dataloader), accuracy, true_labels, predictions

if __name__ == "__main__":
    # Load preprocessed data and vectorizer
    vectorizer_path = "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/vectorizer.pkl"
    processed_data_path = "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/preprocessed.csv"

    with open(vectorizer_path, "rb") as f:
        vectorizer = pickle.load(f)

    data = pd.read_csv(processed_data_path)
    data.dropna(subset=["text"], inplace=True)

    tokenizer_fn = vectorizer.build_analyzer()

    def tokenizer(text):
        tokens = tokenizer_fn(text)
        return [vectorizer.vocabulary_.get(token, 0) for token in tokens]

    train_texts, test_texts, train_labels, test_labels = train_test_split(
        data["text"], data["sentiment"], test_size=0.2, random_state=42
    )

    train_dataset = SentimentDataset(train_texts.tolist(), train_labels.tolist(), tokenizer)
    test_dataset = SentimentDataset(test_texts.tolist(), test_labels.tolist(), tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=32)

    # Hyperparameter grid
    param_grid = {
        "embed_dim": [50, 100],
        "hidden_dim": [256],
        "n_layers": [1, 2],
        "dropout": [0.2, 0.5],
        "batch_size": [32, 64],
        "learning_rate": [0.001, 0.0005]
    }

    best_accuracy = 0
    best_params = {}

    for params in itertools.product(*param_grid.values()):
        embed_dim, hidden_dim, n_layers, dropout, batch_size, learning_rate = params

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=batch_size)

        vocab_size = len(vectorizer.vocabulary_)
        output_dim = 3

        model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, output_dim, n_layers, dropout).to(DEVICE)
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
        criterion = nn.CrossEntropyLoss()

        print(f"Training with params: embed_dim={embed_dim}, hidden_dim={hidden_dim}, n_layers={n_layers}, dropout={dropout}, batch_size={batch_size}, learning_rate={learning_rate}")

        for epoch in range(3):
            train_loss = train_model(model, train_loader, optimizer, criterion)
            val_loss, val_accuracy, _, _ = evaluate_model(model, test_loader, criterion)
            print(f"Epoch {epoch + 1}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}, Val Accuracy = {val_accuracy:.4f}")

            if val_accuracy > best_accuracy:
                best_accuracy = val_accuracy
                best_params = {
                    "embed_dim": embed_dim,
                    "hidden_dim": hidden_dim,
                    "n_layers": n_layers,
                    "dropout": dropout,
                    "batch_size": batch_size,
                    "learning_rate": learning_rate
                }
                torch.save(model.state_dict(), "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/models/tryouts/best_lstm_model2.pt")

    print("Best Hyperparameters:", best_params)
    print(f"Best Accuracy: {best_accuracy:.4f}")


cuda


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Training with params: embed_dim=50, hidden_dim=256, n_layers=1, dropout=0.2, batch_size=32, learning_rate=0.001




Epoch 1: Train Loss = 0.4299, Val Loss = 0.4038, Val Accuracy = 0.8147




Epoch 2: Train Loss = 0.3890, Val Loss = 0.3961, Val Accuracy = 0.8197




Epoch 3: Train Loss = 0.3722, Val Loss = 0.3945, Val Accuracy = 0.8209
Training with params: embed_dim=50, hidden_dim=256, n_layers=1, dropout=0.2, batch_size=32, learning_rate=0.0005




Epoch 1: Train Loss = 0.4408, Val Loss = 0.4070, Val Accuracy = 0.8133




Epoch 2: Train Loss = 0.3915, Val Loss = 0.3962, Val Accuracy = 0.8204




Epoch 3: Train Loss = 0.3714, Val Loss = 0.3935, Val Accuracy = 0.8212
Training with params: embed_dim=50, hidden_dim=256, n_layers=1, dropout=0.2, batch_size=64, learning_rate=0.001




Epoch 1: Train Loss = 0.4361, Val Loss = 0.4098, Val Accuracy = 0.8120




Epoch 2: Train Loss = 0.3904, Val Loss = 0.3959, Val Accuracy = 0.8196




Epoch 3: Train Loss = 0.3710, Val Loss = 0.3976, Val Accuracy = 0.8194
Training with params: embed_dim=50, hidden_dim=256, n_layers=1, dropout=0.2, batch_size=64, learning_rate=0.0005




Epoch 1: Train Loss = 0.4523, Val Loss = 0.4133, Val Accuracy = 0.8097




Epoch 2: Train Loss = 0.3988, Val Loss = 0.3993, Val Accuracy = 0.8175




Epoch 3: Train Loss = 0.3796, Val Loss = 0.3939, Val Accuracy = 0.8202
Training with params: embed_dim=50, hidden_dim=256, n_layers=1, dropout=0.5, batch_size=32, learning_rate=0.001




Epoch 1: Train Loss = 0.4303, Val Loss = 0.4033, Val Accuracy = 0.8153




Epoch 2: Train Loss = 0.3889, Val Loss = 0.3953, Val Accuracy = 0.8202




Epoch 3: Train Loss = 0.3719, Val Loss = 0.3955, Val Accuracy = 0.8206
Training with params: embed_dim=50, hidden_dim=256, n_layers=1, dropout=0.5, batch_size=32, learning_rate=0.0005




Epoch 1: Train Loss = 0.4425, Val Loss = 0.4096, Val Accuracy = 0.8127




Epoch 2: Train Loss = 0.3919, Val Loss = 0.3962, Val Accuracy = 0.8190




Epoch 3: Train Loss = 0.3717, Val Loss = 0.3943, Val Accuracy = 0.8213
Training with params: embed_dim=50, hidden_dim=256, n_layers=1, dropout=0.5, batch_size=64, learning_rate=0.001




Epoch 1: Train Loss = 0.4359, Val Loss = 0.4054, Val Accuracy = 0.8139




Epoch 2: Train Loss = 0.3902, Val Loss = 0.3951, Val Accuracy = 0.8205




Epoch 3: Train Loss = 0.3711, Val Loss = 0.3934, Val Accuracy = 0.8213
Training with params: embed_dim=50, hidden_dim=256, n_layers=1, dropout=0.5, batch_size=64, learning_rate=0.0005




Epoch 1: Train Loss = 0.4532, Val Loss = 0.4131, Val Accuracy = 0.8099


Training:  36%|███▋      | 7239/19961 [01:11<02:03, 102.71it/s]

In [None]:
import os
import pickle
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm import tqdm
import itertools
from sklearn.metrics import accuracy_score, classification_report

DEVICE = torch.device("cuda" if torch.cuda.is_available()  else "cpu")
print(DEVICE)

class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=100):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens = self.tokenizer(self.texts[idx])
        tokens = tokens[:self.max_length]
        tokens = [0] * (self.max_length - len(tokens)) + tokens

        tokens_tensor = torch.tensor(tokens, dtype=torch.long)
        label_tensor = torch.tensor(self.labels[idx], dtype=torch.long)

        return tokens_tensor, label_tensor

class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, n_layers=1, dropout=0.2):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, n_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        hidden_state = lstm_out[:, -1, :]
        output = self.fc(hidden_state)
        return output

def train_model(model, dataloader, optimizer, criterion):
    model.train()
    total_loss = 0.0
    for inputs, labels in tqdm(dataloader, desc="Training", leave=False):
        inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate_model(model, dataloader, criterion):
    model.eval()
    total_loss = 0.0
    true_labels = []
    predictions = []

    with torch.no_grad():
        for inputs, labels in tqdm(dataloader, desc="Evaluating", leave=False):
            inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            preds = torch.argmax(outputs, dim=1).cpu().tolist()
            true_labels.extend(labels.cpu().tolist())
            predictions.extend(preds)

    accuracy = accuracy_score(true_labels, predictions)
    return total_loss / len(dataloader), accuracy, true_labels, predictions

if __name__ == "__main__":
    # Load preprocessed data and vectorizer
    vectorizer_path = "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/vectorizer.pkl"
    processed_data_path = "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/preprocessed.csv"

    with open(vectorizer_path, "rb") as f:
        vectorizer = pickle.load(f)

    data = pd.read_csv(processed_data_path)
    data.dropna(subset=["text"], inplace=True)

    tokenizer_fn = vectorizer.build_analyzer()

    def tokenizer(text):
        tokens = tokenizer_fn(text)
        return [vectorizer.vocabulary_.get(token, 0) for token in tokens]

    train_texts, test_texts, train_labels, test_labels = train_test_split(
        data["text"], data["sentiment"], test_size=0.2, random_state=42
    )

    train_dataset = SentimentDataset(train_texts.tolist(), train_labels.tolist(), tokenizer)
    test_dataset = SentimentDataset(test_texts.tolist(), test_labels.tolist(), tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=32)

    # Hyperparameter grid
    param_grid = {
        "embed_dim": [50, 100],
        "hidden_dim": [256],
        "n_layers": [2],
        "dropout": [0.2, 0.5],
        "batch_size": [32, 64],
        "learning_rate": [0.001, 0.0005]
    }

    best_accuracy = 0
    best_params = {}

    for params in itertools.product(*param_grid.values()):
        embed_dim, hidden_dim, n_layers, dropout, batch_size, learning_rate = params

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=batch_size)

        vocab_size = len(vectorizer.vocabulary_)
        output_dim = 3

        model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, output_dim, n_layers, dropout).to(DEVICE)
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
        criterion = nn.CrossEntropyLoss()

        print(f"Training with params: embed_dim={embed_dim}, hidden_dim={hidden_dim}, n_layers={n_layers}, dropout={dropout}, batch_size={batch_size}, learning_rate={learning_rate}")

        for epoch in range(1):
            train_loss = train_model(model, train_loader, optimizer, criterion)
            val_loss, val_accuracy, _, _ = evaluate_model(model, test_loader, criterion)
            print(f"Epoch {epoch + 1}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}, Val Accuracy = {val_accuracy:.4f}")

            if val_accuracy > best_accuracy:
                best_accuracy = val_accuracy
                best_params = {
                    "embed_dim": embed_dim,
                    "hidden_dim": hidden_dim,
                    "n_layers": n_layers,
                    "dropout": dropout,
                    "batch_size": batch_size,
                    "learning_rate": learning_rate
                }
                torch.save(model.state_dict(), "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/models/tryouts/best_lstm_model_n_layers_2.pt")

    print("Best Hyperparameters:", best_params)
    print(f"Best Accuracy: {best_accuracy:.4f}")


cuda


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Training with params: embed_dim=50, hidden_dim=256, n_layers=2, dropout=0.2, batch_size=32, learning_rate=0.001




Epoch 1: Train Loss = 0.4302, Val Loss = 0.4043, Val Accuracy = 0.8161
Training with params: embed_dim=50, hidden_dim=256, n_layers=2, dropout=0.2, batch_size=32, learning_rate=0.0005




Epoch 1: Train Loss = 0.4414, Val Loss = 0.4049, Val Accuracy = 0.8145
Training with params: embed_dim=50, hidden_dim=256, n_layers=2, dropout=0.2, batch_size=64, learning_rate=0.001




Epoch 1: Train Loss = 0.4342, Val Loss = 0.4037, Val Accuracy = 0.8155
Training with params: embed_dim=50, hidden_dim=256, n_layers=2, dropout=0.2, batch_size=64, learning_rate=0.0005




Epoch 1: Train Loss = 0.4501, Val Loss = 0.4112, Val Accuracy = 0.8112
Training with params: embed_dim=50, hidden_dim=256, n_layers=2, dropout=0.5, batch_size=32, learning_rate=0.001




Epoch 1: Train Loss = 0.4322, Val Loss = 0.4047, Val Accuracy = 0.8155
Training with params: embed_dim=50, hidden_dim=256, n_layers=2, dropout=0.5, batch_size=32, learning_rate=0.0005




Epoch 1: Train Loss = 0.4443, Val Loss = 0.4074, Val Accuracy = 0.8125
Training with params: embed_dim=50, hidden_dim=256, n_layers=2, dropout=0.5, batch_size=64, learning_rate=0.001




Epoch 1: Train Loss = 0.4373, Val Loss = 0.4041, Val Accuracy = 0.8152
Training with params: embed_dim=50, hidden_dim=256, n_layers=2, dropout=0.5, batch_size=64, learning_rate=0.0005




Epoch 1: Train Loss = 0.4536, Val Loss = 0.4152, Val Accuracy = 0.8084
Training with params: embed_dim=100, hidden_dim=256, n_layers=2, dropout=0.2, batch_size=32, learning_rate=0.001




Epoch 1: Train Loss = 0.4231, Val Loss = 0.3981, Val Accuracy = 0.8185
Training with params: embed_dim=100, hidden_dim=256, n_layers=2, dropout=0.2, batch_size=32, learning_rate=0.0005




Epoch 1: Train Loss = 0.4302, Val Loss = 0.3994, Val Accuracy = 0.8171
Training with params: embed_dim=100, hidden_dim=256, n_layers=2, dropout=0.2, batch_size=64, learning_rate=0.001




Epoch 1: Train Loss = 0.4271, Val Loss = 0.4005, Val Accuracy = 0.8181
Training with params: embed_dim=100, hidden_dim=256, n_layers=2, dropout=0.2, batch_size=64, learning_rate=0.0005


Training:  67%|██████▋   | 13322/19961 [04:29<02:13, 49.63it/s]

In [None]:
import os
import pickle
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm import tqdm
import itertools
from sklearn.metrics import accuracy_score, classification_report

DEVICE = torch.device("cuda" if torch.cuda.is_available()  else "cpu")
print(DEVICE)

class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=100):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens = self.tokenizer(self.texts[idx])
        tokens = tokens[:self.max_length]
        tokens = [0] * (self.max_length - len(tokens)) + tokens

        tokens_tensor = torch.tensor(tokens, dtype=torch.long)
        label_tensor = torch.tensor(self.labels[idx], dtype=torch.long)

        return tokens_tensor, label_tensor

class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, n_layers=1, dropout=0.2):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, n_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        hidden_state = lstm_out[:, -1, :]
        output = self.fc(hidden_state)
        return output

def train_model(model, dataloader, optimizer, criterion):
    model.train()
    total_loss = 0.0
    for inputs, labels in tqdm(dataloader, desc="Training", leave=False):
        inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate_model(model, dataloader, criterion):
    model.eval()
    total_loss = 0.0
    true_labels = []
    predictions = []

    with torch.no_grad():
        for inputs, labels in tqdm(dataloader, desc="Evaluating", leave=False):
            inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            preds = torch.argmax(outputs, dim=1).cpu().tolist()
            true_labels.extend(labels.cpu().tolist())
            predictions.extend(preds)

    accuracy = accuracy_score(true_labels, predictions)
    return total_loss / len(dataloader), accuracy, true_labels, predictions

if __name__ == "__main__":
    # Load preprocessed data and vectorizer
    vectorizer_path = "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/vectorizer.pkl"
    processed_data_path = "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/preprocessed.csv"

    with open(vectorizer_path, "rb") as f:
        vectorizer = pickle.load(f)

    data = pd.read_csv(processed_data_path)
    data.dropna(subset=["text"], inplace=True)

    tokenizer_fn = vectorizer.build_analyzer()

    def tokenizer(text):
        tokens = tokenizer_fn(text)
        return [vectorizer.vocabulary_.get(token, 0) for token in tokens]

    train_texts, test_texts, train_labels, test_labels = train_test_split(
        data["text"], data["sentiment"], test_size=0.2, random_state=42
    )

    train_dataset = SentimentDataset(train_texts.tolist(), train_labels.tolist(), tokenizer)
    test_dataset = SentimentDataset(test_texts.tolist(), test_labels.tolist(), tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=32)

    # Hyperparameter grid
    param_grid = {
        "embed_dim": [100],
        "hidden_dim": [256],
        "n_layers": [2],
        "dropout": [0.2, 0.5],
        "batch_size": [32, 64],
        "learning_rate": [0.001, 0.0005]
    }

    best_accuracy = 0
    best_params = {}

    for params in itertools.product(*param_grid.values()):
        embed_dim, hidden_dim, n_layers, dropout, batch_size, learning_rate = params

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=batch_size)

        vocab_size = len(vectorizer.vocabulary_)
        output_dim = 3

        model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, output_dim, n_layers, dropout).to(DEVICE)
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
        criterion = nn.CrossEntropyLoss()

        print(f"Training with params: embed_dim={embed_dim}, hidden_dim={hidden_dim}, n_layers={n_layers}, dropout={dropout}, batch_size={batch_size}, learning_rate={learning_rate}")

        for epoch in range(1):
            train_loss = train_model(model, train_loader, optimizer, criterion)
            val_loss, val_accuracy, _, _ = evaluate_model(model, test_loader, criterion)
            print(f"Epoch {epoch + 1}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}, Val Accuracy = {val_accuracy:.4f}")

            if val_accuracy > best_accuracy:
                best_accuracy = val_accuracy
                best_params = {
                    "embed_dim": embed_dim,
                    "hidden_dim": hidden_dim,
                    "n_layers": n_layers,
                    "dropout": dropout,
                    "batch_size": batch_size,
                    "learning_rate": learning_rate
                }
                torch.save(model.state_dict(), "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/models/tryouts/best_lstm_model_embed_dim_100.pt")

    print("Best Hyperparameters:", best_params)
    print(f"Best Accuracy: {best_accuracy:.4f}")


cuda


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Training with params: embed_dim=100, hidden_dim=256, n_layers=2, dropout=0.2, batch_size=32, learning_rate=0.001




Epoch 1: Train Loss = 0.4231, Val Loss = 0.4003, Val Accuracy = 0.8173
Training with params: embed_dim=100, hidden_dim=256, n_layers=2, dropout=0.2, batch_size=32, learning_rate=0.0005




Epoch 1: Train Loss = 0.4302, Val Loss = 0.3990, Val Accuracy = 0.8179
Training with params: embed_dim=100, hidden_dim=256, n_layers=2, dropout=0.2, batch_size=64, learning_rate=0.001




Epoch 1: Train Loss = 0.4267, Val Loss = 0.3976, Val Accuracy = 0.8187
Training with params: embed_dim=100, hidden_dim=256, n_layers=2, dropout=0.2, batch_size=64, learning_rate=0.0005




Epoch 1: Train Loss = 0.4375, Val Loss = 0.4030, Val Accuracy = 0.8145
Training with params: embed_dim=100, hidden_dim=256, n_layers=2, dropout=0.5, batch_size=32, learning_rate=0.001




Epoch 1: Train Loss = 0.4266, Val Loss = 0.4009, Val Accuracy = 0.8170
Training with params: embed_dim=100, hidden_dim=256, n_layers=2, dropout=0.5, batch_size=32, learning_rate=0.0005




Epoch 1: Train Loss = 0.4337, Val Loss = 0.4025, Val Accuracy = 0.8173
Training with params: embed_dim=100, hidden_dim=256, n_layers=2, dropout=0.5, batch_size=64, learning_rate=0.001




Epoch 1: Train Loss = 0.4298, Val Loss = 0.4002, Val Accuracy = 0.8174
Training with params: embed_dim=100, hidden_dim=256, n_layers=2, dropout=0.5, batch_size=64, learning_rate=0.0005




Epoch 1: Train Loss = 0.4406, Val Loss = 0.4048, Val Accuracy = 0.8142
Best Hyperparameters: {'embed_dim': 100, 'hidden_dim': 256, 'n_layers': 2, 'dropout': 0.2, 'batch_size': 64, 'learning_rate': 0.001}
Best Accuracy: 0.8187


In [None]:
3import os
import pickle
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm import tqdm
import itertools
from sklearn.metrics import accuracy_score, classification_report

DEVICE = torch.device("cuda" if torch.cuda.is_available()  else "cpu")
print(DEVICE)

class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=100):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens = self.tokenizer(self.texts[idx])
        tokens = tokens[:self.max_length]
        tokens = [0] * (self.max_length - len(tokens)) + tokens

        tokens_tensor = torch.tensor(tokens, dtype=torch.long)
        label_tensor = torch.tensor(self.labels[idx], dtype=torch.long)

        return tokens_tensor, label_tensor

class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, n_layers=1, dropout=0.2):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, n_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        hidden_state = lstm_out[:, -1, :]
        output = self.fc(hidden_state)
        return output

def train_model(model, dataloader, optimizer, criterion):
    model.train()
    total_loss = 0.0
    for inputs, labels in tqdm(dataloader, desc="Training", leave=False):
        inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate_model(model, dataloader, criterion):
    model.eval()
    total_loss = 0.0
    true_labels = []
    predictions = []

    with torch.no_grad():
        for inputs, labels in tqdm(dataloader, desc="Evaluating", leave=False):
            inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            preds = torch.argmax(outputs, dim=1).cpu().tolist()
            true_labels.extend(labels.cpu().tolist())
            predictions.extend(preds)

    accuracy = accuracy_score(true_labels, predictions)
    return total_loss / len(dataloader), accuracy, true_labels, predictions

if __name__ == "__main__":
    # Load preprocessed data and vectorizer
    vectorizer_path = "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/vectorizer.pkl"
    processed_data_path = "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/preprocessed.csv"

    with open(vectorizer_path, "rb") as f:
        vectorizer = pickle.load(f)

    data = pd.read_csv(processed_data_path)
    data.dropna(subset=["text"], inplace=True)

    tokenizer_fn = vectorizer.build_analyzer()

    def tokenizer(text):
        tokens = tokenizer_fn(text)
        return [vectorizer.vocabulary_.get(token, 0) for token in tokens]

    train_texts, test_texts, train_labels, test_labels = train_test_split(
        data["text"], data["sentiment"], test_size=0.2, random_state=42
    )

    train_dataset = SentimentDataset(train_texts.tolist(), train_labels.tolist(), tokenizer)
    test_dataset = SentimentDataset(test_texts.tolist(), test_labels.tolist(), tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=32)

    # Hyperparameter grid
    param_grid = {
        "embed_dim": [100],
        "hidden_dim": [256],
        "n_layers": [1, 2],
        "dropout": [0.2, 0.5],
        "batch_size": [64],
        "learning_rate": [0.001]
    }

    best_accuracy = 0
    best_params = {}

    for params in itertools.product(*param_grid.values()):
        embed_dim, hidden_dim, n_layers, dropout, batch_size, learning_rate = params

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=batch_size)

        vocab_size = len(vectorizer.vocabulary_)
        output_dim = 3

        model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, output_dim, n_layers, dropout).to(DEVICE)
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
        criterion = nn.CrossEntropyLoss()

        print(f"Training with params: embed_dim={embed_dim}, hidden_dim={hidden_dim}, n_layers={n_layers}, dropout={dropout}, batch_size={batch_size}, learning_rate={learning_rate}")

        for epoch in range(1):
            train_loss = train_model(model, train_loader, optimizer, criterion)
            val_loss, val_accuracy, _, _ = evaluate_model(model, test_loader, criterion)
            print(f"Epoch {epoch + 1}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}, Val Accuracy = {val_accuracy:.4f}")

            if val_accuracy > best_accuracy:
                best_accuracy = val_accuracy
                best_params = {
                    "embed_dim": embed_dim,
                    "hidden_dim": hidden_dim,
                    "n_layers": n_layers,
                    "dropout": dropout,
                    "batch_size": batch_size,
                    "learning_rate": learning_rate
                }
                torch.save(model.state_dict(), "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/models/tryouts/best_lstm_model_n_layers.pt")

    print("Best Hyperparameters:", best_params)
    print(f"Best Accuracy: {best_accuracy:.4f}")


cuda


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Training with params: embed_dim=100, hidden_dim=256, n_layers=1, dropout=0.2, batch_size=64, learning_rate=0.001




Epoch 1: Train Loss = 0.4266, Val Loss = 0.4000, Val Accuracy = 0.8179
Training with params: embed_dim=100, hidden_dim=256, n_layers=1, dropout=0.5, batch_size=64, learning_rate=0.001




Epoch 1: Train Loss = 0.4276, Val Loss = 0.4010, Val Accuracy = 0.8167
Training with params: embed_dim=100, hidden_dim=256, n_layers=2, dropout=0.2, batch_size=64, learning_rate=0.001




Epoch 1: Train Loss = 0.4261, Val Loss = 0.3973, Val Accuracy = 0.8189
Training with params: embed_dim=100, hidden_dim=256, n_layers=2, dropout=0.5, batch_size=64, learning_rate=0.001




Epoch 1: Train Loss = 0.4292, Val Loss = 0.3991, Val Accuracy = 0.8176
Best Hyperparameters: {'embed_dim': 100, 'hidden_dim': 256, 'n_layers': 2, 'dropout': 0.2, 'batch_size': 64, 'learning_rate': 0.001}
Best Accuracy: 0.8189


# Hypermeter Try-out LSTM Models
##### `different hyperparameters have been tried to find the best accuracy, they are not used in the final version of the project`

In [None]:
import os
import pickle
import random
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pandas as pd

DEVICE = torch.device("cuda" if torch.cuda.is_available()  else "cpu")

seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
if torch.backends.cudnn.is_available():
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=100):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens = self.tokenizer(self.texts[idx])
        tokens = tokens[:self.max_length]  # Truncate if too long
        tokens = [0] * (self.max_length - len(tokens)) + tokens  # Pad if too short

        tokens_tensor = torch.tensor(tokens, dtype=torch.long)
        label_tensor = torch.tensor(self.labels[idx], dtype=torch.long)

        return tokens_tensor, label_tensor

class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, n_layers=2):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        hidden_state = lstm_out[:, -1, :]
        output = self.fc(hidden_state)
        return output

def train_model(model, dataloader, optimizer, criterion):
    model.train()
    total_loss = 0.0
    for inputs, labels in dataloader:
        inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate_model(model, dataloader, criterion):
    model.eval()
    total_loss = 0.0
    true_labels, predictions = [], []
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            predictions.extend(preds)
            true_labels.extend(labels.cpu().numpy())
    #return true_labels, predictions
    accuracy = accuracy_score(true_labels, predictions)
    return total_loss / len(dataloader), accuracy, true_labels, predictions

if __name__ == "__main__":
    vectorizer_path = "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/vectorizer.pkl"
    processed_data_path = "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/preprocessed.csv"
    model_save_path = "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/models/tryouts/lstm_model_10_epochs.pt"

    print("Loading vectorizer...")
    with open(vectorizer_path, "rb") as f:
        vectorizer = pickle.load(f)

    print("Loading preprocessed data...")
    data = pd.read_csv(processed_data_path)
    data.dropna(subset=["text"], inplace=True)

    tokenizer_fn = vectorizer.build_analyzer()

    def tokenizer(text):
        tokens = tokenizer_fn(text)
        return [vectorizer.vocabulary_.get(token, 0) for token in tokens]

    print("Splitting data into train and test sets...")
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        data["text"], data["sentiment"], test_size=0.3, random_state=42
    )

    train_dataset = SentimentDataset(train_texts.tolist(), train_labels.tolist(), tokenizer)
    test_dataset = SentimentDataset(test_texts.tolist(), test_labels.tolist(), tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=64)

    vocab_size = len(vectorizer.vocabulary_)
    embed_dim = 100
    hidden_dim = 256
    output_dim = 3

    print("Initializing model...")
    model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, output_dim).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()

    print(f"Using device: {DEVICE}")

    print("Training model...")
    for epoch in range(10):
        train_loss = train_model(model, train_loader, optimizer, criterion)
        val_loss, val_accuracy, _, _ = evaluate_model(model, test_loader, criterion)
        print(f"Epoch {epoch + 1}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}, Val Accuracy = {val_accuracy:.4f}")

    print("Saving model...")
    os.makedirs("models", exist_ok=True)
    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved to {model_save_path}")

    print("Evaluating model...")
    model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, output_dim).to(DEVICE)
    model.load_state_dict(torch.load(model_save_path))
    model.eval()  # Explicitly set evaluation mode
    true_labels, predictions = evaluate_model(model, test_loader)

    print("\nAccuracy:", accuracy_score(true_labels, predictions))
    print("\nClassification Report:")
    print(classification_report(true_labels, predictions))

    print("\nConfusion Matrix:")
    print(confusion_matrix(true_labels, predictions))

Loading vectorizer...


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Loading preprocessed data...
Splitting data into train and test sets...
Initializing model...
Using device: cuda
Training model...
Epoch 1: Train Loss = 0.4283, Val Loss = 0.3997, Val Accuracy = 0.8177
Epoch 2: Train Loss = 0.3812, Val Loss = 0.3898, Val Accuracy = 0.8229
Epoch 3: Train Loss = 0.3574, Val Loss = 0.3902, Val Accuracy = 0.8228
Epoch 4: Train Loss = 0.3357, Val Loss = 0.3977, Val Accuracy = 0.8215
Epoch 5: Train Loss = 0.3159, Val Loss = 0.4075, Val Accuracy = 0.8196
Epoch 6: Train Loss = 0.2993, Val Loss = 0.4137, Val Accuracy = 0.8163
Epoch 7: Train Loss = 0.2863, Val Loss = 0.4307, Val Accuracy = 0.8153
Epoch 8: Train Loss = 0.2765, Val Loss = 0.4386, Val Accuracy = 0.8132
Epoch 9: Train Loss = 0.2689, Val Loss = 0.4509, Val Accuracy = 0.8126
Epoch 10: Train Loss = 0.2641, Val Loss = 0.4515, Val Accuracy = 0.8112
Saving model...
Model saved to /content/drive/MyDrive/Colab Notebooks/SentimentAnalysis/wout_metadata/lstm_model_10_epochs.pt
Evaluating model...


  model.load_state_dict(torch.load(model_save_path))


TypeError: evaluate_model() missing 1 required positional argument: 'criterion'

In [None]:
import os
import pickle
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pandas as pd

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Set random seed for reproducibility
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
if torch.backends.cudnn.is_available():
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=100):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens = self.tokenizer(self.texts[idx])
        tokens = tokens[:self.max_length]
        tokens = [0] * (self.max_length - len(tokens)) + tokens

        tokens_tensor = torch.tensor(tokens, dtype=torch.long)
        label_tensor = torch.tensor(self.labels[idx], dtype=torch.long)

        return tokens_tensor, label_tensor


class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, n_layers=2):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        hidden_state = lstm_out[:, -1, :]
        output = self.fc(hidden_state)
        return output


def evaluate_model(model, dataloader, criterion):
    model.eval()
    total_loss = 0.0
    true_labels, predictions = [], []
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            predictions.extend(preds)
            true_labels.extend(labels.cpu().numpy())
    accuracy = accuracy_score(true_labels, predictions)
    return total_loss / len(dataloader), accuracy, true_labels, predictions


if __name__ == "__main__":
    vectorizer_path = "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/vectorizer.pkl"
    processed_data_path = "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/preprocessed.csv"
    model_save_path = "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/models/tryouts/lstm_model_10_epochs.pt"

    print("Loading vectorizer...")
    with open(vectorizer_path, "rb") as f:
        vectorizer = pickle.load(f)

    print("Loading preprocessed data...")
    data = pd.read_csv(processed_data_path)
    data.dropna(subset=["text"], inplace=True)

    tokenizer_fn = vectorizer.build_analyzer()

    def tokenizer(text):
        tokens = tokenizer_fn(text)
        return [vectorizer.vocabulary_.get(token, 0) for token in tokens]

    print("Preparing test dataset...")
    test_texts, test_labels = data["text"].tolist(), data["sentiment"].tolist()
    test_dataset = SentimentDataset(test_texts, test_labels, tokenizer)
    test_loader = DataLoader(test_dataset, batch_size=64)

    vocab_size = len(vectorizer.vocabulary_)
    embed_dim = 100
    hidden_dim = 256
    output_dim = 3

    print("Loading model...")
    model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, output_dim).to(DEVICE)
    model.load_state_dict(torch.load(model_save_path))

    print("Evaluating model...")
    criterion = nn.CrossEntropyLoss()
    _, test_accuracy, true_labels, predictions = evaluate_model(model, test_loader, criterion)

    unique_classes = sorted(set(true_labels))
    num_classes = len(unique_classes)

    # Generate target names based on the number of classes
    default_target_names = ["Class " + str(i) for i in range(num_classes)]
    custom_target_names = ["Negative", "Neutral", "Positive"][:num_classes]  # Adjust to match detected classes

    print("\nTest Accuracy:", test_accuracy)
    print("\nAccuracy:", accuracy_score(true_labels, predictions))
    print("\nClassification Report:")
    print(classification_report(true_labels, predictions, target_names=custom_target_names))

    print("\nConfusion Matrix:")
    print(confusion_matrix(true_labels, predictions, labels=unique_classes))


Loading vectorizer...


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Loading preprocessed data...
Preparing test dataset...
Loading model...


  model.load_state_dict(torch.load(model_save_path))


Evaluating model...

Test Accuracy: 0.8766706537327079

Accuracy: 0.8766706537327079

Classification Report:
              precision    recall  f1-score   support

    Negative       0.87      0.88      0.88    798396
     Neutral       0.88      0.87      0.88    798434

    accuracy                           0.88   1596830
   macro avg       0.88      0.88      0.88   1596830
weighted avg       0.88      0.88      0.88   1596830


Confusion Matrix:
[[703287  95109]
 [101827 696607]]


In [None]:
import os
import pickle
import random
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pandas as pd

DEVICE = torch.device("cuda" if torch.cuda.is_available()  else "cpu")

seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
if torch.backends.cudnn.is_available():
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=100):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens = self.tokenizer(self.texts[idx])
        tokens = tokens[:self.max_length]  # Truncate if too long
        tokens = [0] * (self.max_length - len(tokens)) + tokens  # Pad if too short

        tokens_tensor = torch.tensor(tokens, dtype=torch.long)
        label_tensor = torch.tensor(self.labels[idx], dtype=torch.long)

        return tokens_tensor, label_tensor

class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, n_layers=1):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        hidden_state = lstm_out[:, -1, :]
        output = self.fc(hidden_state)
        return output

def train_model(model, dataloader, optimizer, criterion):
    model.train()
    total_loss = 0.0
    for inputs, labels in dataloader:
        inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate_model(model, dataloader, criterion):
    model.eval()
    total_loss = 0.0
    true_labels, predictions = [], []
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            predictions.extend(preds)
            true_labels.extend(labels.cpu().numpy())
    #return true_labels, predictions
    accuracy = accuracy_score(true_labels, predictions)
    return total_loss / len(dataloader), accuracy, true_labels, predictions

if __name__ == "__main__":
    vectorizer_path = "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/vectorizer.pkl"
    processed_data_path = "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/preprocessed.csv"
    model_save_path = "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/models/tryouts/lstm_model_7_epochs.pt"

    print("Loading vectorizer...")
    with open(vectorizer_path, "rb") as f:
        vectorizer = pickle.load(f)

    print("Loading preprocessed data...")
    data = pd.read_csv(processed_data_path)
    data.dropna(subset=["text"], inplace=True)

    tokenizer_fn = vectorizer.build_analyzer()

    def tokenizer(text):
        tokens = tokenizer_fn(text)
        return [vectorizer.vocabulary_.get(token, 0) for token in tokens]

    print("Splitting data into train and test sets...")
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        data["text"], data["sentiment"], test_size=0.4, random_state=42
    )

    train_dataset = SentimentDataset(train_texts.tolist(), train_labels.tolist(), tokenizer)
    test_dataset = SentimentDataset(test_texts.tolist(), test_labels.tolist(), tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=32)

    vocab_size = len(vectorizer.vocabulary_)
    embed_dim = 100
    hidden_dim = 128
    output_dim = 3

    print("Initializing model...")
    model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, output_dim).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
    criterion = nn.CrossEntropyLoss()

    print(f"Using device: {DEVICE}")

    print("Training model...")
    for epoch in range(7):
        train_loss = train_model(model, train_loader, optimizer, criterion)
        val_loss, val_accuracy, _, _ = evaluate_model(model, test_loader, criterion)
        print(f"Epoch {epoch + 1}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}, Val Accuracy = {val_accuracy:.4f}")

    print("Saving model...")
    os.makedirs("models", exist_ok=True)
    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved to {model_save_path}")

    print("Evaluating model...")
    criterion = nn.CrossEntropyLoss()
    _, test_accuracy, true_labels, predictions = evaluate_model(model, test_loader, criterion)

    unique_classes = sorted(set(true_labels))
    num_classes = len(unique_classes)

    # Generate target names based on the number of classes
    default_target_names = ["Class " + str(i) for i in range(num_classes)]
    custom_target_names = ["Negative", "Neutral", "Positive"][:num_classes]  # Adjust to match detected classes

    print("\nTest Accuracy:", test_accuracy)
    print("\nAccuracy:", accuracy_score(true_labels, predictions))
    print("\nClassification Report:")
    print(classification_report(true_labels, predictions, target_names=custom_target_names))

    print("\nConfusion Matrix:")
    print(confusion_matrix(true_labels, predictions))

Loading vectorizer...


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Loading preprocessed data...
Splitting data into train and test sets...
Initializing model...
Using device: cuda
Training model...
Epoch 1: Train Loss = 0.4326, Val Loss = 0.4095, Val Accuracy = 0.8120
Epoch 2: Train Loss = 0.3992, Val Loss = 0.4000, Val Accuracy = 0.8177
Epoch 3: Train Loss = 0.3895, Val Loss = 0.3948, Val Accuracy = 0.8197
Epoch 4: Train Loss = 0.3840, Val Loss = 0.3924, Val Accuracy = 0.8214
Epoch 5: Train Loss = 0.3799, Val Loss = 0.3921, Val Accuracy = 0.8218
Epoch 6: Train Loss = 0.3765, Val Loss = 0.3901, Val Accuracy = 0.8233
Epoch 7: Train Loss = 0.3740, Val Loss = 0.3907, Val Accuracy = 0.8223
Saving model...
Model saved to /content/drive/MyDrive/Colab Notebooks/SentimentAnalysis/wout_metadata/lstm_model_7_epochs.pt
Evaluating model...

Test Accuracy: 0.8223057557786364

Accuracy: 0.8223057557786364

Classification Report:
              precision    recall  f1-score   support

    Negative       0.82      0.83      0.82    319015
     Neutral       0.83      

In [None]:
import os
import pickle
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pandas as pd

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Set random seed for reproducibility
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
if torch.backends.cudnn.is_available():
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=100):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens = self.tokenizer(self.texts[idx])
        tokens = tokens[:self.max_length]
        tokens = [0] * (self.max_length - len(tokens)) + tokens

        tokens_tensor = torch.tensor(tokens, dtype=torch.long)
        label_tensor = torch.tensor(self.labels[idx], dtype=torch.long)

        return tokens_tensor, label_tensor


class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, n_layers=1):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        hidden_state = lstm_out[:, -1, :]
        output = self.fc(hidden_state)
        return output


def evaluate_model(model, dataloader, criterion):
    model.eval()
    total_loss = 0.0
    true_labels, predictions = [], []
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            predictions.extend(preds)
            true_labels.extend(labels.cpu().numpy())
    accuracy = accuracy_score(true_labels, predictions)
    return total_loss / len(dataloader), accuracy, true_labels, predictions


if __name__ == "__main__":
    vectorizer_path = "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/vectorizer.pkl"
    processed_data_path = "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/preprocessed.csv"
    model_save_path = "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/models/tryouts/lstm_model_7_epochs.pt"

    print("Loading vectorizer...")
    with open(vectorizer_path, "rb") as f:
        vectorizer = pickle.load(f)

    print("Loading preprocessed data...")
    data = pd.read_csv(processed_data_path)
    data.dropna(subset=["text"], inplace=True)

    tokenizer_fn = vectorizer.build_analyzer()

    def tokenizer(text):
        tokens = tokenizer_fn(text)
        return [vectorizer.vocabulary_.get(token, 0) for token in tokens]

    print("Preparing test dataset...")
    test_texts, test_labels = data["text"].tolist(), data["sentiment"].tolist()
    test_dataset = SentimentDataset(test_texts, test_labels, tokenizer)
    test_loader = DataLoader(test_dataset, batch_size=32)

    vocab_size = len(vectorizer.vocabulary_)
    embed_dim = 100
    hidden_dim = 128
    output_dim = 3

    print("Loading model...")
    model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, output_dim).to(DEVICE)
    model.load_state_dict(torch.load(model_save_path))

    print("Evaluating model...")
    criterion = nn.CrossEntropyLoss()
    _, test_accuracy, true_labels, predictions = evaluate_model(model, test_loader, criterion)

    unique_classes = sorted(set(true_labels))
    num_classes = len(unique_classes)

    # Generate target names based on the number of classes
    default_target_names = ["Class " + str(i) for i in range(num_classes)]
    custom_target_names = ["Negative", "Neutral", "Positive"][:num_classes]  # Adjust to match detected classes

    print("\nAccuracy:", accuracy_score(true_labels, predictions))
    print("\nClassification Report:")
    print(classification_report(true_labels, predictions, target_names=custom_target_names))

    print("\nConfusion Matrix:")
    print(confusion_matrix(true_labels, predictions, labels=unique_classes))


Loading vectorizer...
Loading preprocessed data...


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Preparing test dataset...
Loading model...
Evaluating model...


  model.load_state_dict(torch.load(model_save_path))



Accuracy: 0.8337581333016038

Classification Report:
              precision    recall  f1-score   support

    Negative       0.83      0.84      0.83    798396
     Neutral       0.84      0.83      0.83    798434

    accuracy                           0.83   1596830
   macro avg       0.83      0.83      0.83   1596830
weighted avg       0.83      0.83      0.83   1596830


Confusion Matrix:
[[671233 127163]
 [138297 660137]]


In [None]:
import os
import pickle
import random
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pandas as pd

DEVICE = torch.device("cuda" if torch.cuda.is_available()  else "cpu")

seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
if torch.backends.cudnn.is_available():
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=100):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens = self.tokenizer(self.texts[idx])
        tokens = tokens[:self.max_length]  # Truncate if too long
        tokens = [0] * (self.max_length - len(tokens)) + tokens  # Pad if too short

        tokens_tensor = torch.tensor(tokens, dtype=torch.long)
        label_tensor = torch.tensor(self.labels[idx], dtype=torch.long)

        return tokens_tensor, label_tensor

class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, n_layers=1):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        hidden_state = lstm_out[:, -1, :]
        output = self.fc(hidden_state)
        return output

def train_model(model, dataloader, optimizer, criterion):
    model.train()
    total_loss = 0.0
    for inputs, labels in dataloader:
        inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate_model(model, dataloader, criterion):
    model.eval()
    total_loss = 0.0
    true_labels, predictions = [], []
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            predictions.extend(preds)
            true_labels.extend(labels.cpu().numpy())
    #return true_labels, predictions
    accuracy = accuracy_score(true_labels, predictions)
    return total_loss / len(dataloader), accuracy, true_labels, predictions

if __name__ == "__main__":
    vectorizer_path = "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/vectorizer.pkl"
    processed_data_path = "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/preprocessed.csv"
    model_save_path = "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/models/tryouts/lstm_model_7_epochs_batch_64.pt"

    print("Loading vectorizer...")
    with open(vectorizer_path, "rb") as f:
        vectorizer = pickle.load(f)

    print("Loading preprocessed data...")
    data = pd.read_csv(processed_data_path)
    data.dropna(subset=["text"], inplace=True)

    tokenizer_fn = vectorizer.build_analyzer()

    def tokenizer(text):
        tokens = tokenizer_fn(text)
        return [vectorizer.vocabulary_.get(token, 0) for token in tokens]

    print("Splitting data into train and test sets...")
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        data["text"], data["sentiment"], test_size=0.2, random_state=42
    )

    train_dataset = SentimentDataset(train_texts.tolist(), train_labels.tolist(), tokenizer)
    test_dataset = SentimentDataset(test_texts.tolist(), test_labels.tolist(), tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=64)

    vocab_size = len(vectorizer.vocabulary_)
    embed_dim = 100
    hidden_dim = 128
    output_dim = 3

    print("Initializing model...")
    model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, output_dim).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()

    print(f"Using device: {DEVICE}")

    print("Training model...")
    for epoch in range(7):
        train_loss = train_model(model, train_loader, optimizer, criterion)
        val_loss, val_accuracy, _, _ = evaluate_model(model, test_loader, criterion)
        print(f"Epoch {epoch + 1}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}, Val Accuracy = {val_accuracy:.4f}")

    print("Saving model...")
    os.makedirs("models", exist_ok=True)
    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved to {model_save_path}")

    print("Evaluating model...")
    criterion = nn.CrossEntropyLoss()
    _, test_accuracy, true_labels, predictions = evaluate_model(model, test_loader, criterion)

    unique_classes = sorted(set(true_labels))
    num_classes = len(unique_classes)

    # Generate target names based on the number of classes
    default_target_names = ["Class " + str(i) for i in range(num_classes)]
    custom_target_names = ["Negative", "Neutral", "Positive"][:num_classes]  # Adjust to match detected classes

    print("\nTest Accuracy:", test_accuracy)
    print("\nAccuracy:", accuracy_score(true_labels, predictions))
    print("\nClassification Report:")
    print(classification_report(true_labels, predictions, target_names=custom_target_names))

    print("\nConfusion Matrix:")
    print(confusion_matrix(true_labels, predictions))

Loading vectorizer...
Loading preprocessed data...


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Splitting data into train and test sets...
Initializing model...
Using device: cuda
Training model...
Epoch 1: Train Loss = 0.4308, Val Loss = 0.4056, Val Accuracy = 0.8141
Epoch 2: Train Loss = 0.3908, Val Loss = 0.3976, Val Accuracy = 0.8187
Epoch 3: Train Loss = 0.3745, Val Loss = 0.3937, Val Accuracy = 0.8211
Epoch 4: Train Loss = 0.3609, Val Loss = 0.3965, Val Accuracy = 0.8202
Epoch 5: Train Loss = 0.3485, Val Loss = 0.3993, Val Accuracy = 0.8195
Epoch 6: Train Loss = 0.3373, Val Loss = 0.4063, Val Accuracy = 0.8179
Epoch 7: Train Loss = 0.3267, Val Loss = 0.4121, Val Accuracy = 0.8173
Saving model...
Model saved to /content/drive/MyDrive/Colab Notebooks/SentimentAnalysis/wout_metadata/lstm_model_7_epochs_batch_64.pt
Evaluating model...

Test Accuracy: 0.8173099202795538

Accuracy: 0.8173099202795538

Classification Report:
              precision    recall  f1-score   support

    Negative       0.81      0.82      0.82    159215
     Neutral       0.82      0.81      0.82    16

# Final LSTM Model and Evaluation
` used in the final version of the project`

In [None]:
import os
import pickle
import random
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pandas as pd

DEVICE = torch.device("cuda" if torch.cuda.is_available()  else "cpu")

seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
if torch.backends.cudnn.is_available():
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=100):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens = self.tokenizer(self.texts[idx])
        tokens = tokens[:self.max_length]  # Truncate if too long
        tokens = [0] * (self.max_length - len(tokens)) + tokens  # Pad if too short

        tokens_tensor = torch.tensor(tokens, dtype=torch.long)
        label_tensor = torch.tensor(self.labels[idx], dtype=torch.long)

        return tokens_tensor, label_tensor

class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, n_layers=2, dropout=0.5):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        hidden_state = lstm_out[:, -1, :]
        output = self.fc(hidden_state)
        return output

def train_model(model, dataloader, optimizer, criterion):
    model.train()
    total_loss = 0.0
    for inputs, labels in dataloader:
        inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate_model(model, dataloader, criterion):
    model.eval()
    total_loss = 0.0
    true_labels, predictions = [], []
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            predictions.extend(preds)
            true_labels.extend(labels.cpu().numpy())
    #return true_labels, predictions
    accuracy = accuracy_score(true_labels, predictions)
    return total_loss / len(dataloader), accuracy, true_labels, predictions

if __name__ == "__main__":
    vectorizer_path = "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/vectorizer.pkl"
    processed_data_path = "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/preprocessed.csv"
    model_save_path = "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/models/lstm_model.pt"
    metric_save_path = "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/results/metrics/"

    print("Loading vectorizer...")
    with open(vectorizer_path, "rb") as f:
        vectorizer = pickle.load(f)

    print("Loading preprocessed data...")
    data = pd.read_csv(processed_data_path)
    data.dropna(subset=["text"], inplace=True)

    tokenizer_fn = vectorizer.build_analyzer()

    def tokenizer(text):
        tokens = tokenizer_fn(text)
        return [vectorizer.vocabulary_.get(token, 0) for token in tokens]

    print("Splitting data into train and test sets...")
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        data["text"], data["sentiment"], test_size=0.2, random_state=42
    )

    train_dataset = SentimentDataset(train_texts.tolist(), train_labels.tolist(), tokenizer)
    test_dataset = SentimentDataset(test_texts.tolist(), test_labels.tolist(), tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=64)

    vocab_size = len(vectorizer.vocabulary_)
    embed_dim = 100
    hidden_dim = 256
    output_dim = 3

    print("Initializing model...")
    model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, output_dim).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()

    print(f"Using device: {DEVICE}")

    print("Training model...")

    train_losses = []
    val_losses = []
    val_accuracies = []

    for epoch in range(10):
        train_loss = train_model(model, train_loader, optimizer, criterion)
        val_loss, val_accuracy, _, _ = evaluate_model(model, test_loader, criterion)

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        val_accuracies.append(val_accuracy)

        print(f"Epoch {epoch + 1}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}, Val Accuracy = {val_accuracy:.4f}")

    print("Evaluating model...")
    _, test_accuracy, true_labels, predictions = evaluate_model(model, test_loader, criterion)

    print("Saving model and metrics...")
    #os.makedirs("models", exist_ok=True)
    os.makedirs(metric_save_path, exist_ok=True)
    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved to {model_save_path}")
    torch.save({'train_losses': train_losses,'val_losses': val_losses,'val_accuracies': val_accuracies}, metric_save_path + "lstm_training_history2.pth")
    torch.save(true_labels, metric_save_path + "lstm_true_labels2.pth")
    torch.save(predictions, metric_save_path + "lstm_predictions2.pth")
    print(f"Metrics saved to {metric_save_path}")

    unique_classes = sorted(set(true_labels))
    num_classes = len(unique_classes)

    # Generate target names based on the number of classes
    default_target_names = ["Class " + str(i) for i in range(num_classes)]
    custom_target_names = ["Negative", "Neutral", "Positive"][:num_classes]  # Adjust to match detected classes

    print("\nAccuracy:", accuracy_score(true_labels, predictions))
    print("\nClassification Report:")
    print(classification_report(true_labels, predictions, target_names=custom_target_names))

    print("\nConfusion Matrix:")
    print(confusion_matrix(true_labels, predictions))

Loading vectorizer...


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Loading preprocessed data...
Splitting data into train and test sets...
Initializing model...
Using device: cuda
Training model...
Epoch 1: Train Loss = 0.4250, Val Loss = 0.3979, Val Accuracy = 0.8182
Epoch 2: Train Loss = 0.3800, Val Loss = 0.3914, Val Accuracy = 0.8222
Epoch 3: Train Loss = 0.3584, Val Loss = 0.3921, Val Accuracy = 0.8228
Epoch 4: Train Loss = 0.3394, Val Loss = 0.3941, Val Accuracy = 0.8226
Epoch 5: Train Loss = 0.3228, Val Loss = 0.4022, Val Accuracy = 0.8202
Epoch 6: Train Loss = 0.3088, Val Loss = 0.4150, Val Accuracy = 0.8190
Epoch 7: Train Loss = 0.2986, Val Loss = 0.4164, Val Accuracy = 0.8173
Epoch 8: Train Loss = 0.2904, Val Loss = 0.4266, Val Accuracy = 0.8158
Epoch 9: Train Loss = 0.2846, Val Loss = 0.4348, Val Accuracy = 0.8151
Epoch 10: Train Loss = 0.2809, Val Loss = 0.4341, Val Accuracy = 0.8125
Evaluating model...
Saving model and metrics...
Model saved to /content/drive/MyDrive/Colab Notebooks/SentimentAnalysis/wout_metadata/lstm_model_last.pt
Metri

In [None]:
import os
import pickle
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pandas as pd

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Set random seed for reproducibility
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
if torch.backends.cudnn.is_available():
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=100):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens = self.tokenizer(self.texts[idx])
        tokens = tokens[:self.max_length]
        tokens = [0] * (self.max_length - len(tokens)) + tokens

        tokens_tensor = torch.tensor(tokens, dtype=torch.long)
        label_tensor = torch.tensor(self.labels[idx], dtype=torch.long)

        return tokens_tensor, label_tensor


class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, n_layers=2, dropout=0.5):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        hidden_state = lstm_out[:, -1, :]
        output = self.fc(hidden_state)
        return output


def evaluate_model(model, dataloader, criterion):
    model.eval()
    total_loss = 0.0
    true_labels, predictions = [], []
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            predictions.extend(preds)
            true_labels.extend(labels.cpu().numpy())
    accuracy = accuracy_score(true_labels, predictions)
    return total_loss / len(dataloader), accuracy, true_labels, predictions


if __name__ == "__main__":
    vectorizer_path = "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/vectorizer.pkl"
    processed_data_path = "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/preprocessed.csv"
    model_save_path = "/content/drive/MyDrive/Akademik/btu/SentimentAnalysis/Colab/models/lstm_model.pt"

    print("Loading vectorizer...")
    with open(vectorizer_path, "rb") as f:
        vectorizer = pickle.load(f)

    print("Loading preprocessed data...")
    data = pd.read_csv(processed_data_path)
    data.dropna(subset=["text"], inplace=True)

    tokenizer_fn = vectorizer.build_analyzer()

    def tokenizer(text):
        tokens = tokenizer_fn(text)
        return [vectorizer.vocabulary_.get(token, 0) for token in tokens]

    print("Preparing test dataset...")
    test_texts, test_labels = data["text"].tolist(), data["sentiment"].tolist()
    test_dataset = SentimentDataset(test_texts, test_labels, tokenizer)
    test_loader = DataLoader(test_dataset, batch_size=64)

    vocab_size = len(vectorizer.vocabulary_)
    embed_dim = 100
    hidden_dim = 256
    output_dim = 3

    print("Loading model...")
    model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, output_dim).to(DEVICE)
    model.load_state_dict(torch.load(model_save_path))

    print("Evaluating model...")
    criterion = nn.CrossEntropyLoss()
    _, test_accuracy, true_labels, predictions = evaluate_model(model, test_loader, criterion)

    unique_classes = sorted(set(true_labels))
    num_classes = len(unique_classes)

    # Generate target names based on the number of classes
    default_target_names = ["Class " + str(i) for i in range(num_classes)]
    custom_target_names = ["Negative", "Neutral", "Positive"][:num_classes]  # Adjust to match detected classes

    print("\nAccuracy:", accuracy_score(true_labels, predictions))
    print("\nClassification Report:")
    print(classification_report(true_labels, predictions, target_names=custom_target_names))

    print("\nConfusion Matrix:")
    print(confusion_matrix(true_labels, predictions, labels=unique_classes))


Loading vectorizer...


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Loading preprocessed data...
Preparing test dataset...
Loading model...
Evaluating model...


  model.load_state_dict(torch.load(model_save_path))



Accuracy: 0.8780677968224545

Classification Report:
              precision    recall  f1-score   support

    Negative       0.88      0.88      0.88    798396
     Neutral       0.88      0.88      0.88    798434

    accuracy                           0.88   1596830
   macro avg       0.88      0.88      0.88   1596830
weighted avg       0.88      0.88      0.88   1596830


Confusion Matrix:
[[699583  98813]
 [ 95892 702542]]
