In [1]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

In [None]:
# 1. load data
train_df = pd.read_csv('./Data_csv/train.csv')
test_df = pd.read_csv('./Data_csv/test.csv')
train_df.head()


Unnamed: 0.1,Unnamed: 0,tweet_id,text,emotion
0,1380877,0x333571,If ur evr <LH> when you do chores. Put a music...,sadness
1,977634,0x2c1e11,Closed Sell 2.1 Lots EURUSD 1.17914 for +10.0 ...,joy
2,1341653,0x284456,KEEF #annoyed! cant date 😍😍 miss natalie ok...,anger
3,1486802,0x298e6c,Yesterday we beat the Royals 13 to 2. Today we...,surprise
4,817689,0x201880,@mayogaabanter @mayogaabanter #mayogaa by the ...,joy


In [3]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

train_df['emotion_encoded'] = label_encoder.fit_transform(train_df['emotion'])
train_df.head()

Unnamed: 0.1,Unnamed: 0,tweet_id,text,emotion,emotion_encoded
0,1380877,0x333571,If ur evr <LH> when you do chores. Put a music...,sadness,5
1,977634,0x2c1e11,Closed Sell 2.1 Lots EURUSD 1.17914 for +10.0 ...,joy,4
2,1341653,0x284456,KEEF #annoyed! cant date 😍😍 miss natalie ok...,anger,0
3,1486802,0x298e6c,Yesterday we beat the Royals 13 to 2. Today we...,surprise,6
4,817689,0x201880,@mayogaabanter @mayogaabanter #mayogaa by the ...,joy,4


In [4]:
# print(train_df[train_df['emotion_encoded']==7])
mylabels={
    'anger': 0,
    'anticipation':1,
    'disgust':2,
    'fear':3,
    'joy':4,
    'sadness':5,
    'surprise':6,
    'trust':7
}


In [5]:
class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
            self.texts = texts
            self.labels = labels
            self.tokenizer = tokenizer
            self.max_length = max_length
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)
        return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'label': torch.tensor(label)}

In [6]:
# train_df['text_length'] = train_df['text'].apply(lambda x: len(x.split())) 
# print(train_df['text_length'].describe())  

In [7]:
class BERTClassifier(nn.Module):
    def __init__(self, bert_model_name, num_classes):
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        x = self.dropout(pooled_output)
        logits = self.fc(x)
        return logits

In [8]:
def train(model, data_loader, optimizer, scheduler, device):
    model.train()
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = nn.CrossEntropyLoss()(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

In [9]:
def evaluate(model, data_loader, device):
    model.eval()
    predictions = []
    actual_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            predictions.extend(preds.cpu().tolist())
            actual_labels.extend(labels.cpu().tolist())
    return accuracy_score(actual_labels, predictions), classification_report(actual_labels, predictions)

In [10]:
def predict_sentiment(text, model, tokenizer, device, max_length=128):
    model.eval()
    encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)
        label = next((k for k, v in mylabels.items() if v == preds.item()), "Unknown")
        return label

In [None]:
bert_model_name = 'bert-base-uncased'
num_classes = 8
max_length = 40
batch_size = 16
num_epochs = 3
learning_rate = 2e-5

In [12]:
train_texts, val_texts, train_labels, val_labels = train_test_split(train_df['text'].tolist(), train_df['emotion_encoded'].tolist(), test_size=0.2, random_state=42)

In [13]:
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTClassifier(bert_model_name, num_classes).to(device)

In [15]:
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



In [16]:
for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        train(model, train_dataloader, optimizer, scheduler, device)
        accuracy, report = evaluate(model, val_dataloader, device)
        print(f"Validation Accuracy: {accuracy:.4f}")
        #print(report)

Epoch 1/5


  attn_output = torch.nn.functional.scaled_dot_product_attention(


Validation Accuracy: 0.6558
Epoch 2/5
Validation Accuracy: 0.6641
Epoch 3/5
Validation Accuracy: 0.6678
Epoch 4/5
Validation Accuracy: 0.6645
Epoch 5/5
Validation Accuracy: 0.6587


In [17]:
torch.save(model.state_dict(), "./model/bert_classifier_v2.pth")

In [18]:
test_df['predicted_emotion'] = test_df['text'].apply(
    lambda x: predict_sentiment(x, model, tokenizer, device, max_length)
)

In [19]:
output_df = test_df[['tweet_id', 'predicted_emotion']]
output_df.rename(columns={'tweet_id': 'id', 'predicted_emotion': 'emotion'}, inplace=True)
output_df.to_csv('part2_result/bert_classifier_v2.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output_df.rename(columns={'tweet_id': 'id', 'predicted_emotion': 'emotion'}, inplace=True)
