In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import re
import os
from urllib.request import urlretrieve

import mlflow.data
from datetime import datetime
import kagglehub
import shutil
import os
import text_preprocessing as tpp

import embeddings as embd


In [31]:
import warnings
# Install a blanket 'ignore' filter: every warning category from every module
# is suppressed for the rest of the session, keeping cell outputs uncluttered.
# NOTE(review): this also hides warnings that may point at real problems
# (deprecations, numerical issues) - consider narrowing the filter to the
# specific categories or modules that are known to be noisy.
warnings.filterwarnings('ignore')


In [32]:
# Seed the global NumPy and PyTorch random generators with one shared value
# so that repeated runs of the notebook are comparable.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

In [33]:
# Download the latest version of the Kaggle dataset and copy its CSV splits
# into the current working directory.
path = kagglehub.dataset_download("abhi8923shriv/sentiment-analysis-dataset")
for csv_name in ('test.csv', 'train.csv'):
    # os.path.join keeps the paths portable instead of hard-coding '/' separators
    shutil.copy(os.path.join(path, csv_name), os.path.join(os.getcwd(), csv_name))


'/home/nicolae/caralislabs/ml-stuff/my_blog/sentiment_analysis/train.csv'

In [34]:
class TextDataset(Dataset):
    """Map-style dataset that encodes raw texts as fixed-length index tensors.

    Every item is a ``(token_indices, label)`` pair of ``torch.long`` tensors;
    ``token_indices`` always holds exactly ``max_length`` entries.
    """

    def __init__(self, texts, labels, vocab_to_idx, max_length=128):
        """Keep references to the samples and the vocabulary used to encode them.

        Args:
            texts: Indexable collection of raw text strings.
            labels: Indexable collection of integer labels aligned with ``texts``.
            vocab_to_idx: Token -> index mapping; ``'<UNK>'`` is looked up for
                every token and ``'<PAD>'`` whenever padding is required.
            max_length: Length every encoded sequence is padded or truncated to.
        """
        self.texts, self.labels = texts, labels
        self.vocab_to_idx = vocab_to_idx
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return sample ``idx`` as ``(indices, label)`` long tensors."""
        raw_text, target = self.texts[idx], self.labels[idx]
        lookup = self.vocab_to_idx

        # Out-of-vocabulary tokens fall back to the <UNK> index
        encoded = [lookup.get(tok, lookup['<UNK>']) for tok in self.preprocess_text(raw_text)]

        # Right-pad short sequences with <PAD>; cut long ones at max_length
        shortfall = self.max_length - len(encoded)
        if shortfall > 0:
            encoded += [lookup['<PAD>']] * shortfall
        else:
            encoded = encoded[:self.max_length]

        return (
            torch.tensor(encoded, dtype=torch.long),
            torch.tensor(target, dtype=torch.long),
        )

    def preprocess_text(self, text):
        """Lower-case ``text``, drop every non-letter/non-whitespace character, and split on whitespace."""
        letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())
        return letters_only.split()

In [35]:
class SentimentLSTM(nn.Module):
    """Bidirectional LSTM classifier over sequences of token indices.

    Pipeline: embedding -> stacked bidirectional LSTM -> dropout -> linear
    layer applied to the concatenated final hidden states of the top layer.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers=2, dropout=0.3,
                 pretrained_embeddings=None, freeze_embeddings=False):
        """Build the network.

        Args:
            vocab_size: Number of rows of the embedding table.
            embedding_dim: Size of each embedding vector.
            hidden_dim: Hidden size of the LSTM, per direction.
            output_dim: Number of output classes (logits).
            n_layers: Number of stacked LSTM layers.
            dropout: Dropout probability used between LSTM layers (only when
                ``n_layers > 1``) and before the final linear layer.
            pretrained_embeddings: Optional ``(vocab_size, embedding_dim)``
                NumPy array or tensor copied into the embedding table.
            freeze_embeddings: When pre-trained embeddings are given, whether
                to exclude the embedding weights from gradient updates.

        Raises:
            ValueError: If ``pretrained_embeddings`` does not have shape
                ``(vocab_size, embedding_dim)``.
        """
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        if pretrained_embeddings is not None:
            print("Loading pre-trained embeddings...")
            # as_tensor accepts NumPy arrays as well as tensors
            weights = torch.as_tensor(pretrained_embeddings)
            expected_shape = (vocab_size, embedding_dim)
            if tuple(weights.shape) != expected_shape:
                # Without this check a smaller matrix could be silently
                # broadcast by copy_ instead of being rejected.
                raise ValueError(
                    f"pretrained_embeddings has shape {tuple(weights.shape)}, "
                    f"expected {expected_shape}"
                )
            with torch.no_grad():
                self.embedding.weight.copy_(weights)
            self.embedding.weight.requires_grad = not freeze_embeddings
            if freeze_embeddings:
                print("Embedding weights frozen")
            else:
                print("Embedding weights will be fine-tuned")

        # nn.LSTM applies dropout only between stacked layers; passing a
        # non-zero value with a single layer has no effect and emits a warning.
        lstm_dropout = dropout if n_layers > 1 else 0.0
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                            dropout=lstm_dropout, batch_first=True, bidirectional=True)

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # *2 for bidirectional

    def forward(self, x):
        """Return class logits of shape ``[batch, output_dim]`` for index tensor ``x`` (``[batch, seq_len]``)."""
        embedded = self.embedding(x)
        _, (hidden, _) = self.lstm(embedded)

        # hidden: [num_layers * num_directions, batch, hidden_dim]
        # The last two entries are the top layer's forward and backward states
        hidden_cat = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)  # [batch, hidden_dim*2]

        return self.fc(self.dropout(hidden_cat))

In [36]:
def main():
    """Train the sentiment classifier end to end and track the run with MLflow.

    Steps: resolve the MLflow experiment, load and clean ``train.csv`` and
    ``test.csv`` from the working directory, encode the labels, build the
    vocabulary, optionally load GloVe vectors, train for ``CONFIG['epochs']``
    epochs while logging per-epoch metrics, write a checkpoint to
    ``best_sentiment_model.pth`` (and log the model) whenever test accuracy
    improves, and finally print a classification report and confusion matrix.
    """
    run_name = f"tracking run at: {datetime.now()}"

    # Some searchable tags
    experiment_tags = {
        "project_name": "sentiment_analysis",
    }

    experiment_id = get_or_create_experiment(
        experiment_name="Sentiment Analysis with pytorch model",
        experiment_tags=experiment_tags,
    )

    # The experiment must be selected BEFORE the run is opened: a run is bound
    # to an experiment when it starts, so selecting it afterwards would leave
    # this run in whatever experiment happened to be active at that moment.
    mlflow.set_experiment(experiment_id=experiment_id)

    with mlflow.start_run(run_name=run_name):

        # Configuration
        CONFIG = {
            'embedding_dim': 100,
            'hidden_dim': 256,
            'n_layers': 2,
            'dropout': 0.3,
            'learning_rate': 0.01,
            'batch_size': 32,
            'epochs': 10,
            'max_length': 128,
            'min_freq': 2,
            'use_pretrained_embeddings': True,  # Set to False to train from scratch
            'freeze_embeddings': True,  # Set to True to freeze pre-trained embeddings
            'glove_path': None,  # Set to your GloVe file path, or None to auto-download
        }

        # Device configuration
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {device}")

        # Load datasets (replace with your actual file paths)
        print("Loading datasets...")
        train_df = pd.read_csv('train.csv', encoding='latin1')
        test_df = pd.read_csv('test.csv', encoding='latin1')
        # Keep only the columns used below and drop rows with missing values
        train_df = train_df[['text', 'sentiment']].dropna()
        test_df = test_df[['text', 'sentiment']].dropna()

        print(f"Training data: {len(train_df)} samples")
        print(f"Test data: {len(test_df)} samples")
        print("Sentiment distribution in training data:")
        print(train_df['sentiment'].value_counts())

        # Prepare label encoder (fitted on the labels of both splits)
        label_encoder = LabelEncoder()
        all_sentiments = pd.concat([train_df['sentiment'], test_df['sentiment']])
        label_encoder.fit(all_sentiments)

        # Encode labels
        train_labels = label_encoder.transform(train_df['sentiment'])
        test_labels = label_encoder.transform(test_df['sentiment'])

        print(f"Label mapping: {dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))}")

        # Build vocabulary from the training texts only
        print("Building vocabulary...")
        vocab_to_idx = tpp.build_vocabulary(train_df['text'], min_freq=CONFIG['min_freq'])

        mlflow.log_dict(vocab_to_idx, "vocab_to_idx")

        vocab_size = len(vocab_to_idx)
        print(f"Vocabulary size: {vocab_size}")

        # Load pre-trained embeddings if specified
        pretrained_embeddings = None
        if CONFIG['use_pretrained_embeddings']:
            # Handle GloVe embeddings
            glove_path = CONFIG['glove_path']
            if glove_path is None:
                # Auto-download GloVe embeddings
                glove_path = embd.download_glove_embeddings(CONFIG['embedding_dim'])

            # Load GloVe embeddings
            glove_embeddings = embd.load_glove_embeddings(glove_path, CONFIG['embedding_dim'])
            pretrained_embeddings = embd.create_embedding_matrix(vocab_to_idx, glove_embeddings, CONFIG['embedding_dim'])

        # Create datasets and dataloaders
        train_dataset = TextDataset(train_df['text'].values, train_labels, vocab_to_idx, CONFIG['max_length'])
        test_dataset = TextDataset(test_df['text'].values, test_labels, vocab_to_idx, CONFIG['max_length'])

        train_loader = DataLoader(train_dataset, batch_size=CONFIG['batch_size'], shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=CONFIG['batch_size'], shuffle=False)

        # Initialize model
        output_dim = len(label_encoder.classes_)
        model = SentimentLSTM(
            vocab_size=vocab_size,
            embedding_dim=CONFIG['embedding_dim'],
            hidden_dim=CONFIG['hidden_dim'],
            output_dim=output_dim,
            n_layers=CONFIG['n_layers'],
            dropout=CONFIG['dropout'],
            pretrained_embeddings=pretrained_embeddings,
            freeze_embeddings=CONFIG['freeze_embeddings']
        ).to(device)

        print(f"Model initialized with {sum(p.numel() for p in model.parameters())} parameters")

        # Loss and optimizer
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=CONFIG['learning_rate'])

        # Training loop
        print("\nStarting training...")
        best_accuracy = 0
        best_model_path = "best_sentiment_model.pth"

        # Log configuration
        mlflow.log_params(CONFIG)

        # Log datasets. Distinct names are used so the TextDataset objects
        # created above are not shadowed by the MLflow dataset wrappers.
        test_input = mlflow.data.from_pandas(df=test_df, name="test_dataset")
        train_input = mlflow.data.from_pandas(df=train_df, name="train_dataset")
        mlflow.log_input(test_input, context="test_dataset")
        mlflow.log_input(train_input, context="train_dataset")

        for epoch in range(CONFIG['epochs']):
            train_loss, train_accuracy = train_model(model, train_loader, criterion, optimizer, device)
            test_loss, test_accuracy, _, _ = evaluate_model(model, test_loader, criterion, device)

            # Log metrics
            mlflow.log_metric("train_loss", train_loss, step=epoch)
            mlflow.log_metric("train_accuracy", train_accuracy, step=epoch)
            mlflow.log_metric("test_loss", test_loss, step=epoch)
            mlflow.log_metric("test_accuracy", test_accuracy, step=epoch)

            print(f"Epoch [{epoch+1}/{CONFIG['epochs']}]")
            print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.4f}")
            print(f"Test Loss: {test_loss:.4f}, Test Acc: {test_accuracy:.4f}")
            print("-" * 50)

            # Save best model
            if test_accuracy > best_accuracy:
                best_accuracy = test_accuracy
                torch.save({
                    'model_state_dict': model.state_dict(),
                    'vocab_to_idx': vocab_to_idx,
                    'label_classes': label_encoder.classes_.tolist(),  # Save classes instead of encoder
                    'config': CONFIG
                }, best_model_path)

                # Log model and best accuracy; the step records the epoch at
                # which each improvement happened.
                mlflow.log_metric("best_accuracy", best_accuracy, step=epoch)
                log_model(model=model, vocab_to_index=vocab_to_idx,
                          vocab_size=vocab_size, max_length=CONFIG['max_length'], device=device)

        # Final evaluation, run once and reused for both logging and reporting.
        # NOTE(review): this evaluates the weights after the last epoch, which
        # are not necessarily those stored in the best checkpoint.
        print("\nFinal Evaluation:")
        test_loss, test_accuracy, predictions, true_labels = evaluate_model(model, test_loader, criterion, device)
        mlflow.log_metric("final_test_accuracy", test_accuracy)

        print(f"Test Accuracy: {test_accuracy:.4f}")
        print("\nClassification Report:")
        report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)
        print(report)

        print("\nConfusion Matrix:")
        print(confusion_matrix(true_labels, predictions))

        print(f"\nBest model saved as 'best_sentiment_model.pth' with accuracy: {best_accuracy:.4f}")


In [37]:
def log_model(model, vocab_to_index, vocab_size, max_length, device):
    """Log ``model`` through ``mlflow.pytorch.log_model`` under the artifact path ``"model"``.

    A sample input/output pair obtained from ``generate_sample_input_output``
    supplies both the input example and the inferred model signature.

    Args:
        model: The PyTorch module to log.
        vocab_to_index: Vocabulary mapping, forwarded to the sample generator.
        vocab_size: Forwarded to the sample generator.
        max_length: Forwarded to the sample generator.
        device: Device the sample forward pass runs on.
    """
    from mlflow.models.signature import infer_signature

    example_in, example_out = generate_sample_input_output(
        model, vocab_to_index, vocab_size, max_length, device
    )

    # The signature records the input/output schema inferred from the example
    mlflow.pytorch.log_model(
        model,
        artifact_path="model",
        input_example=example_in,
        signature=infer_signature(example_in, example_out),
    )

def generate_sample_input_output(model, vocab_to_index, vocab_size, max_length, device):
    """Run ``model`` on one fixed example sentence and return the input/output pair.

    The forward pass runs in evaluation mode and without gradient tracking, so
    the sample output is not randomised by dropout and no autograd graph is
    built. The model's previous train/eval mode is restored afterwards.

    Args:
        model: Module to call on the encoded sample.
        vocab_to_index: Vocabulary mapping passed to ``tpp.text_to_input``.
        vocab_size: Unused; kept for interface compatibility.
        max_length: Unused; kept for interface compatibility.
        device: Device the sample tensor is moved to before the forward pass.

    Returns:
        ``(sample_input, sample_output)`` as NumPy arrays on the CPU.
    """
    # Encode the sample sentence
    # NOTE(review): presumably text_to_input returns a batch of one padded
    # index sequence - confirm against text_preprocessing.
    sample_input = np.array(tpp.text_to_input("This is the best movie ever", vocab_to_index))
    sample_input_tensor = torch.from_numpy(sample_input).to(device)

    # Run the model to get the sample output, then put its mode back
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            sample_output = model(sample_input_tensor)
    finally:
        model.train(was_training)

    # Convert both sides to NumPy for the signature and input example
    sample_input = sample_input_tensor.cpu().numpy()
    sample_output = sample_output.detach().cpu().numpy()
    return sample_input, sample_output

In [38]:
def get_or_create_experiment(experiment_name: str, experiment_tags: dict) -> str:
    """Return the ID of the MLflow experiment named ``experiment_name``, creating it when absent.

    Args:
        experiment_name: Name to look up via ``mlflow.get_experiment_by_name``.
        experiment_tags: Tags passed to ``mlflow.create_experiment``; only used
            when a new experiment has to be created.

    Returns:
        The experiment ID of the existing or newly created experiment.
    """
    existing = mlflow.get_experiment_by_name(experiment_name)

    if existing is None:
        # Nothing registered under this name yet
        new_id = mlflow.create_experiment(name=experiment_name, tags=experiment_tags)
        print(f"Created new experiment: {experiment_name} (ID: {new_id})")
        return new_id

    print(f"Using existing experiment: {experiment_name} (ID: {existing.experiment_id})")
    return existing.experiment_id

In [39]:
def train_model(model, train_loader, criterion, optimizer, device):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Module to train; switched to training mode.
        train_loader: Iterable of ``(inputs, labels)`` batches that supports ``len()``.
        criterion: Loss function called as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepping the model parameters.
        device: Device every batch is moved to.

    Returns:
        ``(mean_batch_loss, accuracy)`` - the per-batch losses averaged over the
        number of batches, and the fraction of correctly classified samples.
    """
    model.train()
    loss_sum, n_correct, n_seen = 0, 0, 0

    for inputs, targets in train_loader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Standard step: clear gradients, forward, backward, update
        optimizer.zero_grad()
        logits = model(inputs)
        batch_loss = criterion(logits, targets)
        batch_loss.backward()
        optimizer.step()

        loss_sum += batch_loss.item()
        n_seen += targets.size(0)
        # The predicted class is the index of the largest logit
        n_correct += (logits.argmax(dim=1) == targets).sum().item()

    return loss_sum / len(train_loader), n_correct / n_seen

In [40]:
def evaluate_model(model, test_loader, criterion, device):
    """Score ``model`` on ``test_loader`` without updating it.

    Args:
        model: Module to evaluate; switched to evaluation mode.
        test_loader: Iterable of ``(inputs, labels)`` batches that supports ``len()``.
        criterion: Loss function called as ``criterion(outputs, labels)``.
        device: Device every batch is moved to.

    Returns:
        ``(mean_batch_loss, accuracy, predictions, labels)`` where the last two
        are flat lists with one entry per evaluated sample.
    """
    model.eval()
    loss_sum = 0
    predictions, targets_seen = [], []

    # Gradients are not needed for evaluation
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)

            logits = model(inputs)
            loss_sum += criterion(logits, targets).item()

            # Collect per-sample results on the CPU for the metric functions
            predictions.extend(logits.argmax(dim=1).cpu().numpy())
            targets_seen.extend(targets.cpu().numpy())

    accuracy = accuracy_score(targets_seen, predictions)
    return loss_sum / len(test_loader), accuracy, predictions, targets_seen


In [41]:
def predict_sentiment(text, model_path='best_sentiment_model.pth'):
    """Predict the sentiment of a single text with a saved checkpoint.

    Args:
        text: Raw input string.
        model_path: Path of a checkpoint holding ``model_state_dict``,
            ``vocab_to_idx``, ``config`` and either ``label_classes`` (current
            format) or ``label_encoder`` (legacy format).

    Returns:
        ``(sentiment, confidence)`` - the predicted class label and its
        softmax probability.

    Raises:
        FileNotFoundError: If ``model_path`` does not exist.
    """
    import pickle

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Try the restricted loader first: checkpoints in the current format hold
    # only tensors and plain Python containers, which it accepts.
    try:
        checkpoint = torch.load(model_path, map_location=device, weights_only=True)
    except pickle.UnpicklingError as e:
        print(f"Error loading with weights_only=True: {e}")
        # SECURITY: weights_only=False unpickles arbitrary objects and can run
        # code embedded in the file. It is kept only for legacy checkpoints
        # that store a pickled LabelEncoder - use it on trusted files only.
        checkpoint = torch.load(model_path, map_location=device, weights_only=False)

    vocab_to_idx = checkpoint['vocab_to_idx']
    config = checkpoint['config']

    # Handle both old and new save formats
    if 'label_encoder' in checkpoint:
        # Old format with sklearn LabelEncoder
        label_classes = checkpoint['label_encoder'].classes_
    else:
        # New format with classes list
        label_classes = checkpoint['label_classes']

    # Rebuild the architecture from the saved hyper-parameters, then load weights
    model = SentimentLSTM(
        vocab_size=len(vocab_to_idx),
        embedding_dim=config['embedding_dim'],
        hidden_dim=config['hidden_dim'],
        output_dim=len(label_classes),
        n_layers=config['n_layers'],
        dropout=config['dropout']
    ).to(device)

    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    # Preprocess text: lower-case, keep only letters and whitespace, split
    tokens = re.sub(r'[^a-zA-Z\s]', '', text.lower()).split()
    indices = [vocab_to_idx.get(token, vocab_to_idx['<UNK>']) for token in tokens]

    # Pad or truncate to the sequence length stored in the checkpoint config
    max_length = config['max_length']
    if len(indices) < max_length:
        indices.extend([vocab_to_idx['<PAD>']] * (max_length - len(indices)))
    else:
        indices = indices[:max_length]

    # Predict on a batch of one
    with torch.no_grad():
        input_tensor = torch.tensor([indices], dtype=torch.long).to(device)
        output = model(input_tensor)
        probabilities = torch.softmax(output, dim=1)
        predicted_class = torch.argmax(output, dim=1).item()

    sentiment = label_classes[predicted_class]
    confidence = probabilities[0][predicted_class].item()

    return sentiment, confidence


In [42]:
# Entry point: run the full training pipeline, then demo predict_sentiment on
# a few hand-written sentences.
if __name__ == "__main__":
    main()
    
    # Example usage of prediction function
    print("\nExample predictions:")
    example_texts = [
        "I absolutely love this product!",
        "This is the worst thing ever.",
        "It's okay, nothing special."
    ]
    
    # predict_sentiment is called with its default model_path
    # ('best_sentiment_model.pth'), the file main() writes when test accuracy
    # improves; a missing checkpoint is reported instead of raising.
    try:
        for text in example_texts:
            sentiment, confidence = predict_sentiment(text)
            print(f"Text: '{text}'")
            print(f"Predicted sentiment: {sentiment} (confidence: {confidence:.4f})")
            print()
    except FileNotFoundError:
        print("Model not found. Please run training first.")


Using existing experiment: Sentiment Analysis with pytorch model (ID: 511091070605373857)
Using device: cuda
Loading datasets...
Training data: 27480 samples
Test data: 3534 samples
Sentiment distribution in training data:
sentiment
neutral     11117
positive     8582
negative     7781
Name: count, dtype: int64
Label mapping: {'negative': np.int64(0), 'neutral': np.int64(1), 'positive': np.int64(2)}
Building vocabulary...
Vocabulary size: 10096
Loading GloVe embeddings from glove.6B.100d.txt...
Loaded 400000 word vectors
Found pre-trained vectors for 9214/10096 words (91.3%)
Loading pre-trained embeddings...
Embedding weights frozen
Model initialized with 3321283 parameters

Starting training...
Epoch [1/10]
Train Loss: 0.8632, Train Acc: 0.6092
Test Loss: 0.7258, Test Acc: 0.6941
--------------------------------------------------
Using device: cuda
Epoch [2/10]
Train Loss: 0.7369, Train Acc: 0.6918
Test Loss: 0.6986, Test Acc: 0.7100
--------------------------------------------------
