### Libraries

In [1]:
import optuna
import torch
import torch.nn.functional as F
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, Dataset
from transformers import (
    DistilBertTokenizer, 
    DistilBertForSequenceClassification, 
    AdamW,
    get_scheduler
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import nltk
from nltk.corpus import stopwords
import re
from tqdm.auto import tqdm
import logging
import warnings
import pickle

### Text Preprocessing

In [2]:
class TextPreprocessor:
    def __init__(self):
        nltk.download('stopwords', quiet=True)
        self.stop_words = set(stopwords.words('english'))
        
    def clean_text(self, text):
        """Clean and preprocess text data."""
        text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
        text = text.lower() #convert to lower case
        text = re.sub(r'http\S+', '', text)  # Remove URLs
        text = re.sub(r'@\w+', '', text)  # Remove mentions
        text = re.sub(r'#(\w+)', r'\1', text)  # Remove hashtags but keep the word
        text = re.sub(r'[^a-z0-9\s]', ' ', text)  # Keep only alphanumeric
        text = ' '.join(word for word in text.split() if word not in self.stop_words)
        return text.strip()

### Dataset for PyTorch

In [3]:
class IMDbDataset(Dataset):
    def __init__(self, reviews, sentiments, tokenizer, max_length):
        self.reviews = reviews
        self.sentiments = sentiments
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        review = str(self.reviews[idx])
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.sentiments[idx], dtype=torch.long)
        }

### Sentiment Analyzer Implementation

In [4]:
class SentimentAnalyzer:
    def __init__(self, model_path=None):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # uses GPU if available
        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        self.max_length = 512
        self.preprocessor = TextPreprocessor()
        
        if model_path:
            self.model = self.load_model(model_path)
        else:
            self.model = None

    def prepare_data(self, df, test_size=0.2):
        """Prepare and split data for training."""
        # Clean the reviews
        df['review'] = df['review'].apply(self.preprocessor.clean_text)
        
        # Convert sentiments to numerical values
        df['sentiment'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
        
        # Split the data
        train_texts, val_texts, train_labels, val_labels = train_test_split(
            df['review'].values, 
            df['sentiment'].values, 
            test_size=test_size, 
            random_state=42
        )
        
        # Create datasets
        train_dataset = IMDbDataset(train_texts, train_labels, self.tokenizer, self.max_length)
        val_dataset = IMDbDataset(val_texts, val_labels, self.tokenizer, self.max_length)
        
        return train_dataset, val_dataset

    def train_model(self, trial, train_dataset, val_dataset):
        """Train model with hyperparameter optimization."""
        # Hyperparameters
        batch_size = trial.suggest_categorical('batch_size', [8, 16, 32])
        learning_rate = trial.suggest_float('learning_rate', 1e-5, 5e-5, log=True)
        num_epochs = 3
        
        # Create dataloaders
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size)
        
        # Initialize model
        model = DistilBertForSequenceClassification.from_pretrained(
            'distilbert-base-uncased', 
            num_labels=2
        ).to(self.device)
        
        # Optimizer and scheduler
        optimizer = AdamW(model.parameters(), lr=learning_rate)
        num_training_steps = num_epochs * len(train_loader)
        scheduler = get_scheduler(
            "linear",
            optimizer=optimizer,
            num_warmup_steps=0,
            num_training_steps=num_training_steps
        )
        
        best_val_acc = 0
        
        for epoch in range(num_epochs):
            # Training
            model.train()
            total_loss = 0
            train_preds, train_labels = [], []
            
            for batch in tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}'):
                batch = {k: v.to(self.device) for k, v in batch.items()}
                outputs = model(**batch)
                loss = outputs.loss
                
                loss.backward()
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                
                total_loss += loss.item()
                
                preds = torch.argmax(outputs.logits, dim=1)
                train_preds.extend(preds.cpu().numpy())
                train_labels.extend(batch['labels'].cpu().numpy())
            
            # Validation
            model.eval()
            val_preds, val_labels = [], []
            
            with torch.no_grad():
                for batch in val_loader:
                    batch = {k: v.to(self.device) for k, v in batch.items()}
                    outputs = model(**batch)
                    
                    preds = torch.argmax(outputs.logits, dim=1)
                    val_preds.extend(preds.cpu().numpy())
                    val_labels.extend(batch['labels'].cpu().numpy())
            
            # Calculate metrics
            val_acc = accuracy_score(val_labels, val_preds)
            precision, recall, f1, _ = precision_recall_fscore_support(
                val_labels, 
                val_preds, 
                average='binary'
            )
            
            print(f'\nEpoch {epoch+1} metrics:')
            print(f'Validation Accuracy: {val_acc:.4f}')
            print(f'Precision: {precision:.4f}')
            print(f'Recall: {recall:.4f}')
            print(f'F1 Score: {f1:.4f}')
            
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                self.model = model
            
            trial.report(val_acc, epoch)
            
            if trial.should_prune():
                raise optuna.exceptions.TrialPruned()
        
        return best_val_acc

    def predict(self, text):
        """Predict sentiment for a given text."""
        if self.model is None:
            raise ValueError("Model hasn't been trained or loaded yet!")
            
        # Preprocess the text
        cleaned_text = self.preprocessor.clean_text(text)
        
        # Tokenize
        encoding = self.tokenizer.encode_plus(
            cleaned_text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        
        input_ids = encoding['input_ids'].to(self.device)
        attention_mask = encoding['attention_mask'].to(self.device)
        
        # Get prediction
        self.model.eval()
        with torch.no_grad():
            outputs = self.model(input_ids, attention_mask=attention_mask)
            probs = F.softmax(outputs.logits, dim=1)
            
        negative_score, positive_score = probs[0].cpu().numpy()
        
        return {
            'negative': float(negative_score),
            'positive': float(positive_score),
            'sentiment': 'positive' if positive_score > negative_score else 'negative',
            'confidence': float(max(negative_score, positive_score))
        }

    def save_model(self, path):
        """Save the trained model and tokenizer."""
        if self.model is None:
            raise ValueError("No model to save!")
        
        self.model.save_pretrained(path)
        self.tokenizer.save_pretrained(path)
        
    def load_model(self, path):
        """Load a trained model."""
        return DistilBertForSequenceClassification.from_pretrained(path).to(self.device)

### Main Function

In [5]:
def main():
    # Initialize the analyzer
    analyzer = SentimentAnalyzer()
    
    # Load and prepare data
    df = pd.read_csv('IMDB_dataset.csv')
    train_dataset, val_dataset = analyzer.prepare_data(df)
    
    # Optimize hyperparameters
    study = optuna.create_study(direction='maximize')
    objective = lambda trial: analyzer.train_model(trial, train_dataset, val_dataset)
    study.optimize(objective, n_trials=3)
    
    print(f"\nBest trial:")
    print(f"Value: {study.best_trial.value:.4f}")
    print(f"Params: {study.best_trial.params}")
    
    # Save the best model
    analyzer.save_model('best_sentiment_model')
    
    # Test the model
    test_reviews = [
        "This movie was absolutely fantastic! I loved every minute of it.",
        "What a terrible waste of time. I couldn't even finish watching it.",
        "It was okay, nothing special but not bad either."
    ]
    
    for review in test_reviews:
        result = analyzer.predict(review)
        print(f"\nReview: {review}")
        print(f"Prediction: {result}")

if __name__ == "__main__":
    main()

[I 2024-11-20 15:47:49,598] A new study created in memory with name: no-name-67a48029-57e3-46cb-9da5-38fd9a554f09
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3:   0%|          | 0/2500 [00:00<?, ?it/s]


Epoch 1 metrics:
Validation Accuracy: 0.9146
Precision: 0.9331
Recall: 0.8946
F1 Score: 0.9135


Epoch 2/3:   0%|          | 0/2500 [00:00<?, ?it/s]


Epoch 2 metrics:
Validation Accuracy: 0.9169
Precision: 0.9211
Recall: 0.9133
F1 Score: 0.9172


Epoch 3/3:   0%|          | 0/2500 [00:00<?, ?it/s]

[I 2024-11-21 17:04:32,177] Trial 0 finished with value: 0.9169 and parameters: {'batch_size': 16, 'learning_rate': 1.4485703536841932e-05}. Best is trial 0 with value: 0.9169.



Epoch 3 metrics:
Validation Accuracy: 0.9151
Precision: 0.9162
Recall: 0.9153
F1 Score: 0.9157


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3:   0%|          | 0/5000 [00:00<?, ?it/s]


Epoch 1 metrics:
Validation Accuracy: 0.9100
Precision: 0.8892
Recall: 0.9383
F1 Score: 0.9131


Epoch 2/3:   0%|          | 0/5000 [00:00<?, ?it/s]


Epoch 2 metrics:
Validation Accuracy: 0.9180
Precision: 0.9059
Recall: 0.9343
F1 Score: 0.9199


Epoch 3/3:   0%|          | 0/5000 [00:00<?, ?it/s]

[I 2024-11-22 09:45:43,709] Trial 1 finished with value: 0.918 and parameters: {'batch_size': 8, 'learning_rate': 1.8502624391120073e-05}. Best is trial 1 with value: 0.918.



Epoch 3 metrics:
Validation Accuracy: 0.9179
Precision: 0.9225
Recall: 0.9139
F1 Score: 0.9182


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3:   0%|          | 0/2500 [00:00<?, ?it/s]


Epoch 1 metrics:
Validation Accuracy: 0.9029
Precision: 0.9341
Recall: 0.8686
F1 Score: 0.9002


Epoch 2/3:   0%|          | 0/2500 [00:00<?, ?it/s]


Epoch 2 metrics:
Validation Accuracy: 0.9143
Precision: 0.9274
Recall: 0.9004
F1 Score: 0.9137


Epoch 3/3:   0%|          | 0/2500 [00:00<?, ?it/s]

[I 2024-11-23 13:11:08,534] Trial 2 finished with value: 0.9149 and parameters: {'batch_size': 16, 'learning_rate': 1.5156692683609128e-05}. Best is trial 1 with value: 0.918.



Epoch 3 metrics:
Validation Accuracy: 0.9149
Precision: 0.9130
Recall: 0.9186
F1 Score: 0.9158

Best trial:
Value: 0.9180
Params: {'batch_size': 8, 'learning_rate': 1.8502624391120073e-05}

Review: This movie was absolutely fantastic! I loved every minute of it.
Prediction: {'negative': 0.004245401360094547, 'positive': 0.995754599571228, 'sentiment': 'positive', 'confidence': 0.995754599571228}

Review: What a terrible waste of time. I couldn't even finish watching it.
Prediction: {'negative': 0.9982640147209167, 'positive': 0.0017359622288495302, 'sentiment': 'negative', 'confidence': 0.9982640147209167}

Review: It was okay, nothing special but not bad either.
Prediction: {'negative': 0.802784264087677, 'positive': 0.19721579551696777, 'sentiment': 'negative', 'confidence': 0.802784264087677}
