In [2]:
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm

# Load the pre-trained model and tokenizer
model_name = "bhadresh-savani/bert-base-uncased-emotion"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Create the emotion classifier pipeline
classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    return_all_scores=True,
    device=0 if torch.cuda.is_available() else -1  # Use GPU if available
)

def get_predictions_batch(texts, threshold=0.5, batch_size=16):
    """Get predictions ensuring 5 labels"""
    predictions = []
    
    for i in tqdm(range(0, len(texts), batch_size), desc="Predicting"):
        batch_texts = [" ".join(tokens) if isinstance(tokens, list) else str(tokens) 
                      for tokens in texts[i:i + batch_size]]
        
        inputs = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        ).to(device)
        
        with torch.no_grad():
            outputs = model(**inputs)
            probs = torch.sigmoid(outputs.logits)
            batch_preds = (probs > threshold).int().cpu().numpy()
            
            # Ensure 5 labels
            if batch_preds.shape[1] != 5:
                batch_preds = batch_preds[:, :5]
            
            predictions.extend(batch_preds)
    
    return np.array(predictions)

def format_predictions(predictions, emotion_mapping):
    """
    Convert raw predictions to multi-hot format
    Each text can have multiple emotions (multi-label classification)
    """
    formatted_preds = []
    for pred in predictions:
        # Initialize zeros for all emotions
        emotion_scores = {e: 0 for e in emotions}
        
        # Update scores for predicted emotions
        for p in pred:
            mapped_emotion = emotion_mapping.get(p['label'])
            if mapped_emotion:
                emotion_scores[mapped_emotion] = p['score']
        
        # Convert to list in the correct order
        row = [emotion_scores[e] for e in emotions]
        formatted_preds.append(row)
    
    return np.array(formatted_preds)


# Load data
train = pd.read_csv('../public_data_test/track_a/train/eng.csv')
val = pd.read_csv('../public_data_test/track_a/dev/eng.csv')
test = pd.read_csv('../public_data_test/track_a/test/eng.csv')

# Extract text from datasets
val_text = val['text'].tolist()
test_text = test['text'].tolist()

# Define emotions and mapping (model's labels to our labels)
emotions = ['joy', 'sadness', 'surprise', 'fear', 'anger']
emotion_mapping = {
    'joy': 'joy',
    'sadness': 'sadness',
    'surprise': 'surprise',
    'fear': 'fear',
    'anger': 'anger',
    'love': None  # We'll ignore this emotion as it's not in our target set
}

Device set to use cpu


In [3]:
# # Preprocessing Config
# # config = {
# #     'sep_pn': True,      # Separate punctuation
# #     'rm_pn': False,      # Remove punctuation
# #     'apply_lemmatization': True,
# #     'apply_stemming': True,
# #     'add_bigrams': True,
# #     'rm_sw': False       # Remove stopwords
# # }

# def pre_process(text, config):
#     """Preprocess text with multiple options"""
#     def separate_punctuation(text):
#         # Fixed quotation marks in regex patterns
#         text = re.sub(r"(\w)([.,;:!?'\"\)])", r"\1 \2", text)
#         text = re.sub(r"([.,;:!?'\"\(\)])(\w)", r"\1 \2", text)
#         return text

#     def remove_punctuation(text):
#         # Fixed quotation marks in regex pattern
#         text = re.sub(r"[.,;:!?'\"\(\)]", "", text)
#         return text

#     # Apply preprocessing steps based on config
#     if config['sep_pn'] and not config['rm_pn']:
#         text = separate_punctuation(text)
#     if config['rm_pn'] and not config['sep_pn']:
#         text = remove_punctuation(text)

#     # Tokenize
#     doc = nlp(text.lower())
#     tokens = [token.text for token in doc]

#     # Apply stemming and lemmatization
#     if config['apply_stemming']:
#         tokens = [stemmer.stem(token) for token in tokens]
#     if config['apply_lemmatization']:
#         tokens = [lemmatizer.lemmatize(token) for token in tokens]

#     # Generate bigrams if configured
#     if config['add_bigrams']:
#         bigrams = [" ".join(gram) for gram in ngrams(tokens, 2)]
#         tokens.extend(bigrams)

#     # Remove stopwords if configured
#     if config['rm_sw']:
#         stop_words = set(stopwords.words('english'))
#         tokens = [word for word in tokens if word.lower() not in stop_words]

#     return " ".join(tokens)

# print("Preprocessing texts...")
# val_text = [pre_process(text, config) for text in tqdm(val['text'], desc="Preprocessing validation")]
# test_text = [pre_process(text, config) for text in tqdm(test['text'], desc="Preprocessing test")]

In [4]:
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import jaccard_score, recall_score, precision_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.util import ngrams
import spacy
import re
import datetime

# Initialize NLP tools
nlp = spacy.load("en_core_web_sm")
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

In [5]:
!pip install emoji
import emoji
from nltk.tokenize import word_tokenize
import string

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

def clean_text(text):
    """Basic text cleaning"""
    # Convert to lowercase
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)
    # Remove phone numbers
    text = re.sub(r'\+?[\d\-\(\) ]{8,}', '', text)
    return text.strip()

def handle_emojis(text):
    """Convert emojis to text descriptions"""
    return emoji.demojize(text, delimiters=(" ", " "))

def expand_contractions(text):
    """Expand common contractions"""
    contractions = {
        "ain't": "am not",
        "aren't": "are not",
        "can't": "cannot",
        "couldn't": "could not",
        "didn't": "did not",
        "doesn't": "does not",
        "don't": "do not",
        "hadn't": "had not",
        "hasn't": "has not",
        "haven't": "have not",
        "he'd": "he would",
        "he'll": "he will",
        "he's": "he is",
        "i'd": "i would",
        "i'll": "i will",
        "i'm": "i am",
        "i've": "i have",
        "isn't": "is not",
        "it's": "it is",
        "let's": "let us",
        "shouldn't": "should not",
        "that's": "that is",
        "there's": "there is",
        "they'd": "they would",
        "they'll": "they will",
        "they're": "they are",
        "they've": "they have",
        "wasn't": "was not",
        "we'd": "we would",
        "we're": "we are",
        "weren't": "were not",
        "what's": "what is",
        "where's": "where is",
        "who's": "who is",
        "won't": "will not",
        "wouldn't": "would not",
        "you'd": "you would",
        "you'll": "you will",
        "you're": "you are",
        "you've": "you have"
    }
    for contraction, expansion in contractions.items():
        text = text.replace(contraction, expansion)
    return text

def normalize_elongated_words(text):
    """Normalize words with repeated characters"""
    return re.sub(r'(.)\1{2,}', r'\1\1', text)

def pre_process(text, config):
    """Preprocessing function with careful string handling"""
    MAX_LENGTH = 510  # 512 - 2 for [CLS] and [SEP]
    
    try:
        # Ensure text is a string and handle initial cleaning
        text = str(text)
        text = clean_text(text)
        
        # Basic preprocessing
        if config.get('handle_emojis', False):
            text = handle_emojis(text)
        if config.get('expand_contractions', False):
            text = expand_contractions(text)
        if config.get('normalize_elongated', False):
            text = normalize_elongated_words(text)
        
        # Process with spaCy
        doc = nlp(text)
        
        # Get base tokens with stopword removal if configured
        if config.get('rm_sw', False):
            stop_words = set(stopwords.words('english'))
            # Handle stopwords with explicit string conversion
            base_tokens = [str(token.text) for token in doc 
                         if str(token.text).lower() not in stop_words]
        else:
            base_tokens = [str(token.text) for token in doc]
        
        # Check length and truncate if needed
        base_text = " ".join(base_tokens)
        if len(tokenizer.tokenize(base_text)) > MAX_LENGTH:
            return tokenizer.tokenize(base_text)[:MAX_LENGTH]
        
        # Apply lemmatization/stemming if configured
        if config.get('apply_lemmatization', False):
            base_tokens = [str(token.lemma_) for token in doc]
        elif config.get('apply_stemming', False):
            stemmer = PorterStemmer()
            base_tokens = [stemmer.stem(str(token)) for token in base_tokens]
        
        # Initialize features
        features = []
        current_length = len(tokenizer.tokenize(" ".join(base_tokens)))
        remaining_space = MAX_LENGTH - current_length
        
        # Add features if space permits
        if remaining_space > 0:
            # Add trigrams
            if config.get('add_trigrams', False) and len(base_tokens) >= 3:
                for i in range(len(base_tokens) - 2):
                    trigram = " ".join(base_tokens[i:i+3])
                    if len(tokenizer.tokenize(trigram)) <= remaining_space:
                        features.append(trigram)
                        remaining_space -= len(tokenizer.tokenize(trigram))
                    else:
                        break
            
            # Add POS tags
            if config.get('add_pos_tags', False):
                for token in doc:
                    pos_tag = f"{str(token.text)}_{token.pos_}"
                    if len(tokenizer.tokenize(pos_tag)) <= remaining_space:
                        features.append(pos_tag)
                        remaining_space -= len(tokenizer.tokenize(pos_tag))
                    else:
                        break
            
            # Add dependency tags
            if config.get('add_dep_tags', False):
                for token in doc:
                    dep_tag = f"{str(token.text)}_{token.dep_}"
                    if len(tokenizer.tokenize(dep_tag)) <= remaining_space:
                        features.append(dep_tag)
                        remaining_space -= len(tokenizer.tokenize(dep_tag))
                    else:
                        break
        
        # Combine and verify final length
        all_tokens = base_tokens + features
        final_text = " ".join(all_tokens)
        
        if len(tokenizer.tokenize(final_text)) > MAX_LENGTH:
            return tokenizer.tokenize(final_text)[:MAX_LENGTH]
        
        return all_tokens
        
    except Exception as e:
        print(f"Error in preprocessing: {str(e)}")
        # Fallback to basic tokenization
        return tokenizer.tokenize(str(text))[:MAX_LENGTH]

# Test the function
test_config = {
    'sep_pn': False,
    'rm_pn': False,
    'apply_lemmatization': False,
    'apply_stemming': False,
    'add_bigrams': False,
    'add_trigrams': True,
    'rm_sw': True,
    'handle_emojis': True,
    'expand_contractions': True,
    'normalize_elongated': True,
    'add_pos_tags': True,
    'add_dep_tags': True,
    'use_tfidf': True,
    'tfidf_max_features': 1000,
    'tfidf_ngram_range': 1,
    'tfidf_min_df': 2,
    'tfidf_max_df': 0.95
}

# Verify function works
print("Testing preprocessing function...")
sample_text = val['text'].iloc[0]
processed_tokens = pre_process(sample_text, test_config)
token_length = len(tokenizer.tokenize(" ".join(processed_tokens)))
print(f"Token length: {token_length}")
print(f"Within limit: {token_length <= 510}")

[33mDEPRECATION: Loading egg at /Users/angwang/miniforge3/lib/python3.12/site-packages/huggingface_hub-0.27.1-py3.8.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Using device: cpu
Testing preprocessing function...
Token length: 138
Within limit: True


In [6]:
# Get predictions
print("Getting validation predictions...")
val_predictions = get_predictions_batch(val_text, threshold=0.5)

print("Getting test predictions...")
test_predictions = get_predictions_batch(test_text, threshold=0.5)

# Convert probabilities to binary predictions (you can adjust the threshold)
threshold = 0.5
val_binary_preds = (val_predictions > threshold).astype(int)
test_binary_preds = (test_predictions > threshold).astype(int)

# Save predictions
def save_predictions(predictions, ids, filename):
    """Save multi-hot predictions to CSV"""
    df_predictions = pd.DataFrame(predictions, columns=emotions)
    df_predictions['id'] = ids
    df_predictions = df_predictions[['id'] + emotions]
    df_predictions.to_csv(filename, index=False)
    print(f"Saved predictions to {filename}")

# Save validation and test predictions
from datetime import datetime
# Save validation and test predictions
timestamp = datetime.now().strftime('%Y-%m-%d_%H_%M_%S')

save_predictions(
    val_predictions,
    val['id'],
    f'../results/val_predictions_{timestamp}.csv'
)
save_predictions(
    test_predictions,
    test['id'],
    f'../results/test_predictions_{timestamp}.csv'
)

Getting validation predictions...


Predicting:   0%|          | 0/8 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Predicting: 100%|██████████| 8/8 [00:03<00:00,  2.47it/s]


Getting test predictions...


Predicting: 100%|██████████| 173/173 [01:32<00:00,  1.88it/s]

Saved predictions to ../results/val_predictions_2025-01-30_12_27_50.csv
Saved predictions to ../results/test_predictions_2025-01-30_12_27_50.csv





In [7]:
from sklearn.metrics import jaccard_score, recall_score, precision_score, f1_score

def evaluate(y_true, y_pred):
    # Calculate Jaccard score
    jaccard = jaccard_score(y_true, y_pred, average='samples')
    print(f'Multilabel accuracy (Jaccard score): {round(jaccard, 4)}')
    
    """Evaluate with micro and macro metrics for multi-label classification"""
    for average in ['micro', 'macro']:
        recall = recall_score(y_true, y_pred, average=average, zero_division=0)
        precision = precision_score(y_true, y_pred, average=average, zero_division=0)
        f1 = f1_score(y_true, y_pred, average=average, zero_division=0)
    
        print(f'{average.upper()} recall: {round(recall, 4)}, '
              f'precision: {round(precision, 4)}, '
              f'f1: {round(f1, 4)}')

def evaluate_per_class(y_true, y_pred):
    """Evaluate metrics for each emotion separately"""
    for i, emotion in enumerate(emotions):
        print(f'*** {emotion} ***')
    
        recall = recall_score(y_true[:,i], y_pred[:,i], zero_division=0)
        precision = precision_score(y_true[:,i], y_pred[:,i], zero_division=0)
        f1 = f1_score(y_true[:,i], y_pred[:,i], zero_division=0)
        
        print(f'recall: {round(recall, 4)}, '
              f'precision: {round(precision, 4)}, '
              f'f1: {round(f1, 4)}\n')

# After getting predictions, add this evaluation code:
# Evaluate predictions
# print("\nEvaluating validation predictions...")
# val_true = val[emotions].values
# print("Overall Metrics:")
# evaluate(val_true, val_predictions)
# print("\nPer-class Metrics:")
# evaluate_per_class(val_true, val_predictions)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

def create_tfidf_features(train_texts, val_texts, test_texts, config):
    """Create TF-IDF features for texts"""
    # Initialize TF-IDF vectorizer with parameters from config
    tfidf = TfidfVectorizer(
        max_features=config.get('tfidf_max_features', 1000),
        ngram_range=(1, config.get('tfidf_ngram_range', 1)),
        min_df=config.get('tfidf_min_df', 2),
        max_df=config.get('tfidf_max_df', 0.95)
    )
    
    # Fit on training data
    tfidf.fit(train_texts)
    
    # Transform all datasets
    train_tfidf = tfidf.transform(train_texts)
    val_tfidf = tfidf.transform(val_texts)
    test_tfidf = tfidf.transform(test_texts)
    
    return train_tfidf, val_tfidf, test_tfidf, tfidf

In [9]:
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup
import torch.nn.functional as F

class EmotionDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        """Initialize dataset with texts and labels"""
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, idx):
        """Get a single item"""
        text = str(self.texts[idx])  # Ensure text is string
        label = self.labels[idx]     # Get corresponding label
        
        # Tokenize text
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        # Remove batch dimension and create item dictionary
        item = {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.float)
        }
        
        return item

    def __len__(self):
        """Get length of dataset"""
        return len(self.texts)

def collate_batch(batch):
    """Custom collate function to ensure proper batching"""
    input_ids = torch.stack([item['input_ids'] for item in batch])
    attention_mask = torch.stack([item['attention_mask'] for item in batch])
    labels = torch.stack([item['labels'] for item in batch])
    
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }

from transformers import AutoConfig

def fine_tune_model(model, train_texts, train_labels, val_texts, val_labels, config):
    """Fine-tune BERT model with properly initialized weights"""
    BATCH_SIZE = 16
    
    # Update model config for our 5 emotions
    model_config = AutoConfig.from_pretrained(model_name)
    model_config.problem_type = "multi_label_classification"
    model_config.num_labels = 5  # Our 5 emotions
    
    # Load pre-trained model
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        config=model_config,
        ignore_mismatched_sizes=True
    )
    
    # Initialize the new classifier weights properly
    torch.nn.init.xavier_uniform_(model.classifier.weight)
    torch.nn.init.zeros_(model.classifier.bias)
    
    print("\nModel configuration:")
    print(f"Classifier weight shape: {model.classifier.weight.shape}")
    print(f"Classifier bias shape: {model.classifier.bias.shape}")
    print(f"Number of labels: {model_config.num_labels}")
    print(f"Emotion labels: {emotions}")
    
    # Create datasets
    train_dataset = EmotionDataset(train_texts, train_labels, tokenizer)
    val_dataset = EmotionDataset(val_texts, val_labels, tokenizer)
    
    # Create dataloaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        collate_fn=collate_batch,
        drop_last=True
    )
    
    val_loader = DataLoader(
        val_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        collate_fn=collate_batch,
        drop_last=True
    )
    
    # Optimizer
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=config.get('learning_rate', 2e-5),
        eps=config.get('adam_epsilon', 1e-8)
    )
    
    # Loss function
    criterion = torch.nn.BCEWithLogitsLoss()
    
    # Training setup
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    
    best_val_loss = float('inf')
    best_model_state = None
    
    # Training loop
    for epoch in range(config.get('num_epochs', 3)):
        model.train()
        total_train_loss = 0
        
        for batch_idx, batch in enumerate(tqdm(train_loader, desc=f'Epoch {epoch + 1}')):
            # Move batch to device
            batch = {k: v.to(device) for k, v in batch.items()}
            labels = batch.pop('labels')
            
            # Clear gradients
            optimizer.zero_grad()
            
            try:
                # Forward pass
                outputs = model(**batch)
                logits = outputs.logits
                
                # Calculate loss
                loss = criterion(logits, labels)
                
                # Backward pass
                loss.backward()
                
                # Update weights
                optimizer.step()
                
                total_train_loss += loss.item()
                
            except Exception as e:
                print(f"Error in batch {batch_idx}: {str(e)}")
                print(f"Shapes - Input IDs: {batch['input_ids'].shape}, "
                      f"Logits: {logits.shape if 'logits' in locals() else 'N/A'}, "
                      f"Labels: {labels.shape}")
                continue
        
        # Validation
        model.eval()
        total_val_loss = 0
        
        with torch.no_grad():
            for batch in val_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                labels = batch.pop('labels')
                
                try:
                    outputs = model(**batch)
                    logits = outputs.logits
                    loss = criterion(logits, labels)
                    total_val_loss += loss.item()
                except Exception as e:
                    print(f"Error in validation: {str(e)}")
                    continue
        
        # Print epoch results
        avg_train_loss = total_train_loss / len(train_loader)
        avg_val_loss = total_val_loss / len(val_loader)
        
        print(f'\nEpoch {epoch + 1}:')
        print(f'Average training loss: {avg_train_loss:.4f}')
        print(f'Average validation loss: {avg_val_loss:.4f}')
        
        # Save best model
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            best_model_state = model.state_dict().copy()
    
    # Restore best model
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
    
    return model

In [10]:
# Save Predictions 
def save_predictions(predictions, ids, filename):
    """Save multi-hot predictions to CSV"""
    # Create DataFrame with predictions
    df_predictions = pd.DataFrame(predictions, columns=emotions)
    
    # Add ID column
    df_predictions['id'] = ids
    
    # Reorder columns to put ID first
    df_predictions = df_predictions[['id'] + emotions]
    
    # Save to CSV
    df_predictions.to_csv(filename, index=False)
    print(f"Saved predictions to {filename}")

In [11]:
import itertools
from sklearn.metrics import jaccard_score
from datetime import datetime
!pip install emoji

def evaluate_preprocessing_config(config, val_texts, val_labels):
    """Quickly evaluate a preprocessing configuration without fine-tuning"""
    try:
        # Preprocess validation texts
        processed_texts = [pre_process(text, config) for text in tqdm(val_texts, desc="Preprocessing")]
        
        # Get predictions using base model (without fine-tuning)
        predictions = get_predictions_batch(processed_texts, threshold=0.5)
        
        # Calculate score
        score = jaccard_score(val_labels, predictions, average='samples')
        
        return score
    except Exception as e:
        print(f"Error with config {config}: {str(e)}")
        return 0.0


def grid_search_preprocessing():
    # Define all possible configurations
    config_options = {
        'sep_pn': [False],
        'rm_pn': [False],
        'apply_lemmatization': [False],
        'apply_stemming': [False],
        'add_bigrams': [False],
        'add_trigrams': [True, False],
        'rm_sw': [False],
        'handle_emojis': [True],
        'expand_contractions': [True],
        'normalize_elongated': [True],
        'add_pos_tags': [True],
        'add_dep_tags': [True],
        'use_tfidf': [True],
        'tfidf_max_features': [200],
        'tfidf_ngram_range': [1, 2],
        'tfidf_min_df': [2],
        'tfidf_max_df': [0.9]
    }

    # Generate all possible combinations
    keys = config_options.keys()
    combinations = [dict(zip(keys, v)) for v in itertools.product(*config_options.values())]
    
    # Remove invalid combinations
    valid_combinations = [config for config in combinations 
                        if not (config['sep_pn'] and config['rm_pn'])]
    
    best_scores = {
        'jaccard': 0,
        'micro_f1': 0,
        'macro_f1': 0,
        'emotion_f1s': {emotion: 0 for emotion in emotions}
    }
    best_configs = {
        'jaccard': None,
        'micro_f1': None,
        'macro_f1': None,
        'emotion_f1s': {emotion: None for emotion in emotions}
    }
    results = []
    
    print(f"Testing {len(valid_combinations)} configurations...")
    
    # Ensure labels are correct format
    train_labels = train[emotions].values
    val_labels = val[emotions].values
    test_labels = test[emotions].values
    
    # Verify label dimensions
    assert train_labels.shape[1] == 5, f"Train labels shape incorrect: {train_labels.shape}"
    assert val_labels.shape[1] == 5, f"Validation labels shape incorrect: {val_labels.shape}"
    assert test_labels.shape[1] == 5, f"Test labels shape incorrect: {test_labels.shape}"
    
    for config in tqdm(valid_combinations, desc="Grid Search"):
        try:
            # Process texts
            train_processed = [pre_process(str(text), config) for text in train['text']]
            val_processed = [pre_process(str(text), config) for text in val['text']]
            test_processed = [pre_process(str(text), config) for text in test['text']]

            # Convert tokens to strings for TF-IDF
            train_texts = [" ".join(tokens) for tokens in train_processed]
            val_texts = [" ".join(tokens) for tokens in val_processed]
            test_texts = [" ".join(tokens) for tokens in test_processed]
            
            # Create TF-IDF features
            train_tfidf, val_tfidf, test_tfidf, _ = create_tfidf_features(
                train_texts, val_texts, test_texts, config
            )
            
            # Get BERT predictions
            val_predictions = get_predictions_batch(val_processed, threshold=0.5)
            
            # Ensure predictions have 5 labels
            if val_predictions.shape[1] != 5:
                print(f"Warning: Predictions shape incorrect: {val_predictions.shape}")
                continue
            
            # Combine predictions with TF-IDF
            val_combined = hstack([val_predictions, val_tfidf]).toarray()
            
            # Convert to binary predictions
            val_final_predictions = (val_combined > 0.5).astype(int)
            
            # Ensure final predictions have 5 labels
            if val_final_predictions.shape[1] != 5:
                val_final_predictions = val_final_predictions[:, :5]
            
            # Calculate all metrics
            jaccard = jaccard_score(val[emotions].values, val_final_predictions, average='samples')
            micro_f1 = f1_score(val[emotions].values, val_final_predictions, average='micro')
            macro_f1 = f1_score(val[emotions].values, val_final_predictions, average='macro')
            
            # Calculate per-emotion F1 scores
            emotion_f1s = {}
            for i, emotion in enumerate(emotions):
                emotion_f1s[emotion] = f1_score(
                    val[emotions].values[:, i], 
                    val_final_predictions[:, i]
                )
            
            # Store all results
            result = {
                'config': config,
                'jaccard': jaccard,
                'micro_f1': micro_f1,
                'macro_f1': macro_f1,
                'emotion_f1s': emotion_f1s
            }
            results.append(result)
            
            # Update best scores - focusing on macro F1
            if macro_f1 > best_scores['macro_f1']:
                best_scores['macro_f1'] = macro_f1
                best_configs['macro_f1'] = config
                print(f"\nNew best Macro F1: {macro_f1:.4f}")
                print("Configuration:")
                for key, value in config.items():
                    print(f"  {key}: {value}")
                print("\nDetailed Metrics:")
                evaluate(val[emotions].values, val_final_predictions)
                evaluate_per_class(val[emotions].values, val_final_predictions)
            
            # Still track other metrics but don't print them
            if jaccard > best_scores['jaccard']:
                best_scores['jaccard'] = jaccard
                best_configs['jaccard'] = config
            
            if micro_f1 > best_scores['micro_f1']:
                best_scores['micro_f1'] = micro_f1
                best_configs['micro_f1'] = config
            
            for emotion in emotions:
                if emotion_f1s[emotion] > best_scores['emotion_f1s'][emotion]:
                    best_scores['emotion_f1s'][emotion] = emotion_f1s[emotion]
                    best_configs['emotion_f1s'][emotion] = config
            
        except Exception as e:
            print(f"Error with config {config}: {str(e)}")
            continue
    
    print("\nFinal Best Configuration (Macro F1):")
    print(f"Best Macro F1: {best_scores['macro_f1']:.4f}")
    print("Configuration:")
    for key, value in best_configs['macro_f1'].items():
        print(f"  {key}: {value}")
    
    return best_configs, best_scores, results
# Run grid search
print("Starting preprocessing grid search...")
best_configs, best_scores, all_results = grid_search_preprocessing()

print("\nBest Configurations Found:")
print(f"Best Jaccard Score: {best_scores['jaccard']:.4f}")
print("Configuration:")
for key, value in best_configs['jaccard'].items():
    print(f"  {key}: {value}")

print(f"\nBest Micro F1: {best_scores['micro_f1']:.4f}")
print("Configuration:")
for key, value in best_configs['micro_f1'].items():
    print(f"  {key}: {value}")

print(f"\nBest Macro F1: {best_scores['macro_f1']:.4f}")
print("Configuration:")
for key, value in best_configs['macro_f1'].items():
    print(f"  {key}: {value}")

print("\nBest Per-Emotion F1 Scores:")
for emotion in emotions:
    print(f"{emotion}: {best_scores['emotion_f1s'][emotion]:.4f}")
    print(f"Config: {best_configs['emotion_f1s'][emotion]}")

# Save results to CSV for later analysis
results_df = pd.DataFrame(all_results)

timestamp = datetime.now().strftime('%Y-%m-%d_%H_%M_%S')
results_df.to_csv(f'../results/preprocessing_grid_search_{timestamp}.csv', index=False)
print(f"\nDetailed results saved to: ../results/preprocessing_grid_search_{timestamp}.csv")

# ... existing code ...

# After grid search is complete, use the macro F1 configuration
config = best_configs['macro_f1']  # Use macro F1 config instead of Jaccard
print("\nGenerating final predictions with best Macro F1 configuration...")

# Preprocess texts with macro F1 config
print("Preprocessing texts...")
val_text = [pre_process(text, config) for text in tqdm(val['text'], desc="Preprocessing validation")]
test_text = [pre_process(text, config) for text in tqdm(test['text'], desc="Preprocessing test")]

# Get predictions
print("Getting validation predictions...")
val_predictions = get_predictions_batch(val_text, threshold=0.5)

print("Getting test predictions...")
test_predictions = get_predictions_batch(test_text, threshold=0.5)

# Evaluate predictions
print("\nFinal Evaluation with Best Macro F1 Configuration:")
val_true = val[emotions].values
print("Overall Metrics:")
evaluate(val_true, val_predictions)
print("\nPer-class Metrics:")
evaluate_per_class(val_true, val_predictions)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[33mDEPRECATION: Loading egg at /Users/angwang/miniforge3/lib/python3.12/site-packages/huggingface_hub-0.27.1-py3.8.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Starting preprocessing grid search...
Testing 4 configurations...


Predicting: 100%|██████████| 8/8 [00:33<00:00,  4.15s/it]
Grid Search:  25%|██▌       | 1/4 [01:19<03:57, 79.02s/it]


New best Macro F1: 0.2704
Configuration:
  sep_pn: False
  rm_pn: False
  apply_lemmatization: False
  apply_stemming: False
  add_bigrams: False
  add_trigrams: True
  rm_sw: False
  handle_emojis: True
  expand_contractions: True
  normalize_elongated: True
  add_pos_tags: True
  add_dep_tags: True
  use_tfidf: True
  tfidf_max_features: 200
  tfidf_ngram_range: 1
  tfidf_min_df: 2
  tfidf_max_df: 0.9

Detailed Metrics:
Multilabel accuracy (Jaccard score): 0.2533
MICRO recall: 0.5057, precision: 0.3134, f1: 0.387
MACRO recall: 0.3964, precision: 0.2085, f1: 0.2704
*** joy ***
recall: 0.3871, precision: 0.1739, f1: 0.24

*** sadness ***
recall: 0.6286, precision: 0.25, f1: 0.3577

*** surprise ***
recall: 0.0, precision: 0.0, f1: 0.0

*** fear ***
recall: 0.8413, precision: 0.5354, f1: 0.6543

*** anger ***
recall: 0.125, precision: 0.0833, f1: 0.1



Predicting: 100%|██████████| 8/8 [00:39<00:00,  4.93s/it]
Predicting: 100%|██████████| 8/8 [00:31<00:00,  3.96s/it]]
Grid Search:  75%|███████▌  | 3/4 [03:53<01:16, 76.44s/it]


New best Macro F1: 0.2714
Configuration:
  sep_pn: False
  rm_pn: False
  apply_lemmatization: False
  apply_stemming: False
  add_bigrams: False
  add_trigrams: False
  rm_sw: False
  handle_emojis: True
  expand_contractions: True
  normalize_elongated: True
  add_pos_tags: True
  add_dep_tags: True
  use_tfidf: True
  tfidf_max_features: 200
  tfidf_ngram_range: 1
  tfidf_min_df: 2
  tfidf_max_df: 0.9

Detailed Metrics:
Multilabel accuracy (Jaccard score): 0.2466
MICRO recall: 0.5057, precision: 0.3145, f1: 0.3878
MACRO recall: 0.3961, precision: 0.2082, f1: 0.2714
*** joy ***
recall: 0.4839, precision: 0.2206, f1: 0.303

*** sadness ***
recall: 0.5143, precision: 0.2222, f1: 0.3103

*** surprise ***
recall: 0.0, precision: 0.0, f1: 0.0

*** fear ***
recall: 0.8571, precision: 0.5243, f1: 0.6506

*** anger ***
recall: 0.125, precision: 0.0741, f1: 0.093



Predicting: 100%|██████████| 8/8 [00:38<00:00,  4.85s/it]
Grid Search: 100%|██████████| 4/4 [05:10<00:00, 77.57s/it]



Final Best Configuration (Macro F1):
Best Macro F1: 0.2714
Configuration:
  sep_pn: False
  rm_pn: False
  apply_lemmatization: False
  apply_stemming: False
  add_bigrams: False
  add_trigrams: False
  rm_sw: False
  handle_emojis: True
  expand_contractions: True
  normalize_elongated: True
  add_pos_tags: True
  add_dep_tags: True
  use_tfidf: True
  tfidf_max_features: 200
  tfidf_ngram_range: 1
  tfidf_min_df: 2
  tfidf_max_df: 0.9

Best Configurations Found:
Best Jaccard Score: 0.2533
Configuration:
  sep_pn: False
  rm_pn: False
  apply_lemmatization: False
  apply_stemming: False
  add_bigrams: False
  add_trigrams: True
  rm_sw: False
  handle_emojis: True
  expand_contractions: True
  normalize_elongated: True
  add_pos_tags: True
  add_dep_tags: True
  use_tfidf: True
  tfidf_max_features: 200
  tfidf_ngram_range: 1
  tfidf_min_df: 2
  tfidf_max_df: 0.9

Best Micro F1: 0.3878
Configuration:
  sep_pn: False
  rm_pn: False
  apply_lemmatization: False
  apply_stemming: False


Preprocessing validation: 100%|██████████| 116/116 [00:00<00:00, 123.67it/s]
Preprocessing test: 100%|██████████| 2767/2767 [00:19<00:00, 142.82it/s]


Getting validation predictions...


Predicting: 100%|██████████| 8/8 [00:40<00:00,  5.12s/it]


Getting test predictions...


Predicting: 100%|██████████| 173/173 [18:06<00:00,  6.28s/it]


Final Evaluation with Best Macro F1 Configuration:
Overall Metrics:
Multilabel accuracy (Jaccard score): 0.2466
MICRO recall: 0.5057, precision: 0.3145, f1: 0.3878
MACRO recall: 0.3961, precision: 0.2082, f1: 0.2714

Per-class Metrics:
*** joy ***
recall: 0.4839, precision: 0.2206, f1: 0.303

*** sadness ***
recall: 0.5143, precision: 0.2222, f1: 0.3103

*** surprise ***
recall: 0.0, precision: 0.0, f1: 0.0

*** fear ***
recall: 0.8571, precision: 0.5243, f1: 0.6506

*** anger ***
recall: 0.125, precision: 0.0741, f1: 0.093






In [12]:
# Save final predictions with macro F1 config
save_predictions(
    val_predictions,
    val['id'],
    f'../results/val_predictions_best_macro_f1_{timestamp}.csv'
)
save_predictions(
    test_predictions,
    test['id'],
    f'../results/test_predictions_best_macro_f1_{timestamp}.csv'
)

# For fine-tuning, use the macro F1 configuration
print("\nNow fine-tuning model with best Macro F1 configuration...")
# Preprocess all data with macro F1 config
train_processed = [pre_process(text, config) for text in tqdm(train['text'], desc="Preprocessing train")]
val_processed = [pre_process(text, config) for text in tqdm(val['text'], desc="Preprocessing val")]
test_processed = [pre_process(text, config) for text in tqdm(test['text'], desc="Preprocessing test")]

# Fine-tune model with macro F1 config
fine_tuned_model = fine_tune_model(
    model,
    train_processed,
    train[emotions].values,
    val_processed,
    val[emotions].values,
    {
        'learning_rate': 2e-5,
        'num_epochs': 3,
        'batch_size': 16
    }
)

Saved predictions to ../results/val_predictions_best_macro_f1_2025-01-30_12_33_01.csv
Saved predictions to ../results/test_predictions_best_macro_f1_2025-01-30_12_33_01.csv

Now fine-tuning model with best Macro F1 configuration...


Preprocessing train:   0%|          | 0/2768 [00:00<?, ?it/s]

Preprocessing train: 100%|██████████| 2768/2768 [00:38<00:00, 71.61it/s]
Preprocessing val: 100%|██████████| 116/116 [00:01<00:00, 73.31it/s]
Preprocessing test: 100%|██████████| 2767/2767 [00:21<00:00, 126.54it/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bhadresh-savani/bert-base-uncased-emotion and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([6]) in the checkpoint and torch.Size([5]) in the model instantiated
- classifier.weight: found shape torch.Size([6, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Model configuration:
Classifier weight shape: torch.Size([5, 768])
Classifier bias shape: torch.Size([5])
Number of labels: 5
Emotion labels: ['joy', 'sadness', 'surprise', 'fear', 'anger']


Epoch 1:  43%|████▎     | 74/173 [41:27<1:22:26, 49.97s/it]

### Grid Search for the best threshold:

In [None]:
# Using the best configuration based on the Macro F1 score from above
# Now do grid search for the best threshold

### Fine-tuning the model with the best threshold

# Using the best threshold found above, fine-tune the model using BERT emotion

Top 5 Configurations:

1. Jaccard Score: 0.3513
Configuration:
  sep_pn: False
  rm_pn: False
  apply_lemmatization: False
  apply_stemming: False
  add_bigrams: False
  rm_sw: False

2. Jaccard Score: 0.3455
Configuration:
  sep_pn: False
  rm_pn: False
  apply_lemmatization: True
  apply_stemming: False
  add_bigrams: False
  rm_sw: False

3. Jaccard Score: 0.3441
...
  apply_lemmatization: False
  apply_stemming: False
  add_bigrams: False
  rm_sw: False


Initial Results Without Preprocessing:

Evaluating validation predictions...
Overall Metrics:
Multilabel accuracy (Jaccard score): 0.347
MICRO recall: 0.3523, precision: 0.6139, f1: 0.4477
MACRO recall: 0.4, precision: 0.7202, f1: 0.4234

Per-class Metrics:
*** joy ***
recall: 0.5161, precision: 0.6957, f1: 0.5926

*** sadness ***
recall: 0.4, precision: 0.875, f1: 0.549

*** surprise ***
recall: 0.0323, precision: 1.0, f1: 0.0625

*** fear ***
recall: 0.3016, precision: 0.6552, f1: 0.413

*** anger ***
recall: 0.75, precision: 0.375, f1: 0.5