In [None]:
!pip install transformers

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from collections import defaultdict

import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Set initial variables and constants
%config InlineBackend.figure_format='retina'

# Plot styling: seaborn theme, custom colour palette and default figure size
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8

# Random seed for reproducibility (currently disabled, so numpy/torch randomness is unseeded)
# RANDOM_SEED = 42
# np.random.seed(RANDOM_SEED)
# torch.manual_seed(RANDOM_SEED)

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
!pip install gdown --quiet  

import gdown
import os

def downloadFiles(file_urls, target_dir):
    """Download every (file name, URL) pair into ``target_dir`` with gdown.

    Parameters:
        file_urls: Iterable of ``[file_name, file_url]`` pairs.
        target_dir (str): Destination directory; created (including parents)
            when it does not exist yet.
    """
    # exist_ok=True replaces the separate existence check, so re-running the
    # cell (or a concurrent creation of the folder) cannot raise.
    os.makedirs(target_dir, exist_ok=True)

    for file_name, file_url in file_urls:
        file_path = os.path.join(target_dir, file_name)
        print(f"Đang tải {file_name} vào {target_dir}...")
        gdown.download(file_url, file_path, quiet=False)

# [file name, Google Drive download URL] pairs for the StockEmo CSV files
file_list = [
    ["train_stockemo.csv", "https://drive.google.com/uc?id=14kpQhdpjt57ySe9omZSofbmFF4iYUIDc"],
    ["val_stockemo.csv", "https://drive.google.com/uc?id=1-8FC0f1RDCNSkRt8doTDMAPrmdmazQ5u"],
    ["test_stockemo.csv", "https://drive.google.com/uc?id=1-A1n7mRMbje-me1rQpce_QsfFnlH0av7"],
    ["processed_stockemo.csv", "https://drive.google.com/uc?id=1-7QLxjVIezZLJ_Og5m3DmmW32BabnH2i"]
]

# Download everything into the local "stockemo" folder (relative to the working directory)
downloadFiles(file_list, "stockemo")

# List the folder contents as a quick sanity check
print("\nDanh sách file trong folder 'stockemo':")
print(os.listdir("stockemo"))

In [None]:
# Read the splits from the same relative folder the download cell writes to,
# rather than a hardcoded absolute Kaggle path. When the working directory is
# /kaggle/working this resolves to the same files as before, and the notebook
# also runs in other environments.
DATA_DIR = "stockemo"
SHUFFLE_SEED = 42

df_train = pd.read_csv(os.path.join(DATA_DIR, 'train_stockemo.csv'))
df_test = pd.read_csv(os.path.join(DATA_DIR, 'test_stockemo.csv'))
df_val = pd.read_csv(os.path.join(DATA_DIR, 'val_stockemo.csv'))

# Combined frame (train + val + test), built from the unshuffled splits
df = pd.concat([df_train, df_val, df_test], axis=0, ignore_index=True)

# Shuffle every frame once with a fixed seed and rebuild a clean 0..n-1 index
df = df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
df_train = df_train.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
df_test = df_test.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
df_val = df_val.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

print(df_train.shape)
print(df_test.shape)
print(df_val.shape)
print(df.shape)

In [None]:
# Preview the first rows of the combined (train + val + test) frame
df.head()

In [None]:
# Count missing values per column in the combined frame
df.isnull().sum()

In [None]:
def to_sentiment_label(value):
    """Return 1 when ``value`` is the string 'bullish', otherwise 0."""
    return 1 if value == 'bullish' else 0

def to_emotion_label(value):
    """Map an emotion name to its integer class id.

    Ids follow the order of the tuple below (0 for 'ambiguous' up to 11 for
    'amusement'). Any value that is not a known emotion name maps to -1.
    """
    ordered_emotions = (
        'ambiguous',   # ambiguous (annotated as the most negative)
        'anxiety',     # anxiety
        'panic',       # panic
        'depression',  # depression
        'confusion',   # confusion
        'anger',       # anger
        'disgust',     # disgust
        'surprise',    # surprise
        'belief',      # belief
        'optimism',    # optimism
        'excitement',  # excitement
        'amusement',   # amusement (annotated as the most positive)
    )
    label_by_name = {name: label for label, name in enumerate(ordered_emotions)}

    # -1 marks an unrecognised emotion name
    return label_by_name.get(value, -1)

def remove_first_word(value):
    """
    Drop the first whitespace-separated word of a string.

    Parameters:
        value (str): Input string.

    Returns:
        str: The remaining words joined by single spaces, or an empty string
        when the input contains one word or none.
    """
    # Slicing past the first token yields an empty list for 0 or 1 words,
    # which joins to the empty string.
    remaining_words = value.split()[1:]
    return " ".join(remaining_words)

# Add the integer sentiment and emotion label columns to every split and to
# the combined frame.
for frame in (df_train, df_test, df_val, df):
    frame['sentiment_label'] = frame.senti_label.apply(to_sentiment_label)
    frame['emotion_label'] = frame.emo_label.apply(to_emotion_label)
    # Optional first-word stripping is left disabled:
    # frame['content'] = frame.processed.apply(remove_first_word)

In [None]:
# Human-readable class names; the list position is the integer label id
class_sentiments = ['negative', 'positive']
class_emotions = [
    'ambiguous',
    'anxiety',
    'panic',
    'depression',
    'confusion',
    'anger',
    'disgust',
    'surprise',
    'belief',
    'optimism',
    'excitement',
    'amusement',
]

In [None]:
# Pretrained checkpoint name used for both the tokenizer and the BERT encoder
MODEL_NAME = 'bert-base-cased'

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
tokenizer  # bare expression: shows the tokenizer's repr as the cell output

In [None]:
# Token length every example is encoded to (passed as max_length to the tokenizer)
MAX_LEN = 160

In [None]:
class StockEmoDataset(Dataset):
    """Torch dataset that tokenizes StockEmo texts on the fly.

    Args:
        contents: Indexable collection of texts (each item is cast to ``str``).
        targets: Indexable collection of integer class labels, aligned with ``contents``.
        tokenizer: Object exposing a Hugging Face style ``encode_plus`` method.
        max_len (int): Fixed token length every example is padded/truncated to.
    """

    def __init__(self, contents, targets, tokenizer, max_len):
        self.contents = contents
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of examples."""
        return len(self.contents)

    def __getitem__(self, item):
        """Return the raw text, 1-D ``input_ids``/``attention_mask`` tensors and the label tensor."""
        content = str(self.contents[item])
        target = self.targets[item]

        encoding = self.tokenizer.encode_plus(
            content,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
            padding='max_length',
            # Without explicit truncation, texts longer than max_len may keep
            # their full length, so examples could not be stacked into a batch.
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'content_text': content,
            # encode_plus returns (1, max_len) tensors; flatten drops the batch axis
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }
        
# Example: tokenize one short sentence to inspect the encoding
example_text = "Hello, how are you?"

encoding = tokenizer.encode_plus(
    example_text,
    add_special_tokens=True,  # add [CLS] at the start and [SEP] at the end
    max_length=10,            # target length of 10 tokens
    return_token_type_ids=False,
    padding='max_length',     # add [PAD] up to length 10 (replaces deprecated pad_to_max_length)
    truncation=True,          # cut longer inputs so the 10-token limit is actually enforced
    return_attention_mask=True,
    return_tensors='pt'       # return PyTorch tensors
)

print(encoding)

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False):
    """Wrap a dataframe in a ``StockEmoDataset`` and return a ``DataLoader`` over it.

    Args:
        df: Dataframe with a ``processed`` text column and an ``emotion_label`` column.
        tokenizer: Tokenizer handed to the dataset.
        max_len (int): Fixed token length per example.
        batch_size (int): Number of examples per batch.
        shuffle (bool): Reshuffle the examples every epoch. Defaults to False,
            which keeps the previous fixed-order behaviour; pass True for the
            training split so batch composition varies between epochs.

    Returns:
        DataLoader yielding dict batches produced by ``StockEmoDataset``.
    """
    ds = StockEmoDataset(
        contents=df.processed.to_numpy(),
        targets=df.emotion_label.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=0
    )

In [None]:
# Create train, test and val data loaders
BATCH_SIZE = 64

# Build one loader per split, in train / val / test order
train_data_loader, val_data_loader, test_data_loader = (
    create_data_loader(split_df, tokenizer, MAX_LEN, BATCH_SIZE)
    for split_df in (df_train, df_val, df_test)
)

In [None]:
# Standalone copy of the pretrained encoder. In the visible cells it is only
# used to print the hidden size; the classifier loads its own encoder.
bert_model = BertModel.from_pretrained(MODEL_NAME)

In [None]:
class SentimentClassifier(nn.Module):
    """BERT encoder followed by a two-layer classification head.

    Args:
        hidden_dim (int): Width of the hidden layer in the head.
        n_classes (int): Number of output classes (size of the logits vector).
    """

    def __init__(self, hidden_dim, n_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(MODEL_NAME)
        self.out = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size, hidden_dim),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(hidden_dim, n_classes)
        )

    def forward(self, input_ids, attention_mask):
        """Encode a batch and classify it.

        Returns:
            tuple: ``(logits, pooled_output)`` where ``pooled_output`` is the
            encoder's pooled sentence representation that the head consumes.
        """
        encoder_outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # Index instead of tuple-unpacking: recent transformers versions return
        # a dict-like ModelOutput, and unpacking that yields its *keys*
        # (strings), not tensors. Integer indexing works for both the legacy
        # tuple and ModelOutput; position 1 is the pooled output.
        pooled_output = encoder_outputs[1]
        logits = self.out(pooled_output)

        return logits, pooled_output

In [None]:
# Classifier with a 512-unit hidden layer and one output per emotion class,
# placed on the selected device
model = SentimentClassifier(512, len(class_emotions)).to(device)

In [None]:
# Size of the encoder's hidden representation (input width of the classification head)
print(bert_model.config.hidden_size)

In [None]:
# Number of passes over the training set
EPOCHS = 5

# AdamW optimizer with bias correction disabled
# NOTE(review): this AdamW is the one imported from `transformers`; newer
# releases appear to deprecate it in favour of torch.optim.AdamW — confirm
# against the installed version.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

# The scheduler is stepped once per batch, so the schedule spans every batch of every epoch
total_steps = len(train_data_loader) * EPOCHS

# Linear learning-rate schedule with zero warmup steps
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# Cross-entropy loss over the class logits
loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
def supervised_simcse_loss(embeddings, labels, temperature=0.05):
    """
    Compute a supervised contrastive (SimCSE-style) loss over one batch.

    For every anchor, the other samples with the same label are positives and
    all other samples in the batch form the softmax denominator. The anchor
    itself is excluded from both the positives and the denominator.

    Args:
        embeddings (torch.Tensor): Tensor of shape (batch_size, hidden_dim) containing sentence embeddings.
        labels (torch.Tensor): Tensor of shape (batch_size,) containing the class labels.
        temperature (float): Temperature the cosine similarities are divided by.

    Returns:
        torch.Tensor: Scalar loss averaged over the anchors that have at least
        one positive in the batch. A zero (still attached to the graph) is
        returned when no anchor has a positive, e.g. for a batch of size 1.
    """
    # Cosine similarity between all pairs, scaled by the temperature
    embeddings = F.normalize(embeddings, p=2, dim=1)
    similarity = torch.matmul(embeddings, embeddings.T) / temperature

    batch_size = labels.size(0)
    self_mask = torch.eye(batch_size, dtype=torch.bool, device=labels.device)

    # Positives share the anchor's label; the anchor itself is not a positive
    positive_mask = torch.eq(labels.unsqueeze(1), labels.unsqueeze(0)) & ~self_mask
    positives_per_anchor = positive_mask.sum(dim=1)
    has_positive = positives_per_anchor > 0

    if not bool(has_positive.any()):
        # Nothing to contrast against: return 0 without breaking backward()
        return similarity.sum() * 0.0

    # log-sum-exp over every *other* sample; avoids overflow of exp() for
    # small temperatures and removes the self-similarity from the denominator
    log_denominator = torch.logsumexp(
        similarity.masked_fill(self_mask, float('-inf')), dim=1, keepdim=True
    )
    log_prob = similarity - log_denominator

    # masked_fill (rather than multiplying by the mask) keeps non-positive
    # entries from contributing inf * 0 = nan
    positive_log_prob_sum = log_prob.masked_fill(~positive_mask, 0.0).sum(dim=1)

    # Mean log-probability of the positives, for anchors that have any
    per_anchor_loss = -positive_log_prob_sum[has_positive] / positives_per_anchor[has_positive]

    return per_anchor_loss.mean()

In [None]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples, temperature=0.05):
    """Train ``model`` for one pass over ``data_loader``.

    Each batch is optimised on the sum of the classification loss
    (``loss_fn`` on the logits) and the supervised contrastive loss on the
    embeddings returned by the model.

    Returns:
        tuple: ``(accuracy, mean classification loss, mean contrastive loss)``
        where accuracy is the number of correct predictions divided by
        ``n_examples``.
    """
    model = model.train()
    n_correct = 0
    ce_per_batch, contrastive_per_batch = [], []

    for batch in data_loader:
        targets = batch["targets"].to(device)

        # Forward pass: class logits plus the sentence embeddings
        logits, embeddings = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        )

        # The two loss terms and their sum, which is what gets optimised
        ce_term = loss_fn(logits, targets)
        contrastive_term = supervised_simcse_loss(embeddings, targets, temperature=temperature)
        total = ce_term + contrastive_term

        # Bookkeeping for the epoch-level metrics
        predicted = torch.max(logits, dim=1).indices
        n_correct += torch.sum(predicted == targets)
        ce_per_batch.append(ce_term.item())
        contrastive_per_batch.append(contrastive_term.item())

        # Backward pass with gradient clipping, then advance optimizer and LR schedule
        total.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    accuracy = n_correct.double() / n_examples
    return accuracy, np.mean(ce_per_batch), np.mean(contrastive_per_batch)

In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without gradient tracking.

    Only the classification loss is measured here; the embeddings returned by
    the model are ignored.

    Returns:
        tuple: ``(accuracy, mean loss)`` where accuracy is the number of
        correct predictions divided by ``n_examples``.
    """
    model = model.eval()
    n_correct = 0
    batch_losses = []

    with torch.no_grad():
        for batch in data_loader:
            targets = batch["targets"].to(device)

            logits, _ = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            )

            predicted = torch.max(logits, dim=1).indices
            batch_losses.append(loss_fn(logits, targets).item())
            n_correct += torch.sum(predicted == targets)

    accuracy = n_correct.double() / n_examples
    return accuracy, np.mean(batch_losses)

In [None]:
%%time

history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):
    
    # Show details 
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    print("-" * 10)
    
    # Train the model
    train_acc, train_class_loss, train_contrast_loss = train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        len(df_train),
        temperature=0.05
    )
    
    print(f"Train loss {train_class_loss:.4f} accuracy {train_acc:.4f}")
    print(f"Classification Loss {train_class_loss:.4f}")
    print(f"Contrastive Loss {train_contrast_loss:.4f}")
    
    # Get model performance (accuracy and loss)
    val_acc, val_loss = eval_model(
        model,
        val_data_loader,
        loss_fn,
        device,
        len(df_val)
    )
    
    print(f"Val loss {val_loss:.4f} accuracy {val_acc:.4f}")
    print()
    
    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_class_loss)
    history['train_contrast_loss'].append(train_contrast_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)
    
    # If we beat previous performance, save the model
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_accuracy = val_acc

    #Batch_size = 64
    #drop_out = 0.5

In [None]:
# Plot training and validation accuracy.
# History entries may be 0-d tensors living on the GPU, which matplotlib
# cannot convert to numpy; float() handles both tensors and plain numbers.
train_acc_curve = [float(acc) for acc in history['train_acc']]
val_acc_curve = [float(acc) for acc in history['val_acc']]

plt.plot(train_acc_curve, label='train accuracy')
plt.plot(val_acc_curve, label='validation accuracy')

# Title, axis labels, legend and a fixed 0-1 accuracy range
plt.title('Training history')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.ylim([0, 1]);

In [None]:
# Accuracy on the held-out test split (the loss returned alongside it is discarded)
test_acc, _ = eval_model(
  model,
  test_data_loader,
  loss_fn,
  device,
  len(df_test)
)

# Show the accuracy as a plain Python number
test_acc.item()

In [None]:
def get_predictions(model, data_loader, device=None):
    """Run ``model`` over ``data_loader`` and collect its predictions.

    Args:
        model: Module returning ``(logits, embeddings)`` for
            ``input_ids``/``attention_mask`` keyword arguments.
        data_loader: Iterable of dict batches with ``content_text``,
            ``input_ids``, ``attention_mask`` and ``targets`` entries.
        device: Device the batches are moved to. Defaults to the device of the
            model's first parameter (CPU for a parameter-free model), so the
            function no longer depends on a global ``device`` variable.

    Returns:
        tuple: ``(texts, predictions, prediction_probs, real_values)`` where
        ``texts`` is a list of the raw inputs and the rest are CPU tensors:
        predicted class ids, softmax class probabilities (one row per example)
        and true labels.
    """
    model = model.eval()

    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    review_texts = []
    pred_batches = []
    prob_batches = []
    target_batches = []

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            logits, _ = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(logits, dim=1)

            review_texts.extend(d["content_text"])
            pred_batches.append(preds.cpu())
            # Convert logits to probabilities so the returned values match their name
            prob_batches.append(F.softmax(logits, dim=1).cpu())
            target_batches.append(targets.cpu())

    if not pred_batches:
        # Empty loader: return empty tensors instead of failing on an empty concat
        empty_ids = torch.empty(0, dtype=torch.long)
        return review_texts, empty_ids, torch.empty(0, 0), empty_ids.clone()

    # Concatenating whole batches avoids splitting every batch into per-row tensors
    predictions = torch.cat(pred_batches)
    prediction_probs = torch.cat(prob_batches)
    real_values = torch.cat(target_batches)

    return review_texts, predictions, prediction_probs, real_values

In [None]:
# Collect texts, predicted classes, per-class model outputs and true labels for the test split
y_review_texts, y_pred, y_pred_probs, y_test = get_predictions(
    model,
    test_data_loader
)

In [None]:
# Per-class metrics report on the test split, labelled with the emotion names
print(classification_report(y_test, y_pred, target_names=class_emotions))

In [None]:
def show_confusion_matrix(confusion_matrix):
    """Draw ``confusion_matrix`` as an annotated integer heatmap on the current figure.

    Y tick labels are kept horizontal and X tick labels are rotated by 30
    degrees; both are right-aligned.
    """
    heatmap = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")

    # Same treatment for both axes, differing only in the rotation angle
    for axis, angle in ((heatmap.yaxis, 0), (heatmap.xaxis, 30)):
        axis.set_ticklabels(axis.get_ticklabels(), rotation=angle, ha='right')

    plt.ylabel('True sentiment')
    plt.xlabel('Predicted sentiment')

# Confusion matrix of true vs. predicted labels on the test split,
# wrapped in a dataframe so the heatmap axes show the emotion names
cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm, index=class_emotions, columns=class_emotions)
show_confusion_matrix(df_cm)

In [None]:
# Free-form sample text used for a single-example prediction below
review_text = "I love completing my todos! Best app ever!!!"

In [None]:
# Encode the sample text exactly like the dataset does: fixed length MAX_LEN,
# padded with [PAD] and truncated when longer.
encoded_review = tokenizer.encode_plus(
    review_text,
    max_length=MAX_LEN,
    add_special_tokens=True,
    return_token_type_ids=False,
    padding='max_length',  # replaces the deprecated pad_to_max_length=True
    truncation=True,       # enforce MAX_LEN for long inputs
    return_attention_mask=True,
    return_tensors='pt',
)

In [None]:
input_ids = encoded_review['input_ids'].to(device)
attention_mask = encoded_review['attention_mask'].to(device)

# Inference only: switch off dropout and gradient tracking explicitly instead
# of relying on an earlier cell having left the model in eval mode.
model.eval()
with torch.no_grad():
    logits, _ = model(input_ids, attention_mask)
_, prediction = torch.max(logits, dim=1)

print(f'Review text: {review_text}')
# .item() turns the 1-element tensor into a plain int list index
print(f'Sentiment  : {class_emotions[prediction.item()]}')