In [4]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

# Load data from CSV
train = pd.read_csv("data/train.csv", encoding='unicode_escape')
train = df.dropna(subset=['text', 'sentiment'])


In [8]:
# Preprocess data
# Assume 'text' column contains the text data and 'sentiment' column contains the sentiment labels
texts = train['text'].tolist()
sentiments = train['sentiment'].tolist()

# Encode labels
label_encoder = LabelEncoder()
sentiments_encoded = label_encoder.fit_transform(sentiments)

# Split data into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, sentiments_encoded, test_size=0.2, random_state=42)

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

# Tokenize and encode the data
def tokenize_and_encode(texts, labels):
    input_ids = []
    attention_masks = []

    for text in tqdm(texts, desc="Tokenizing"):
        encoded_dict = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=64,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)
    
    return input_ids, attention_masks, labels

train_input_ids, train_attention_masks, train_labels = tokenize_and_encode(train_texts, train_labels)
val_input_ids, val_attention_masks, val_labels = tokenize_and_encode(val_texts, val_labels)

# Create DataLoader
batch_size = 32

train_data = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Tokenizing: 100%|██████████████████████████████████████████████████████████████| 21984/21984 [00:08<00:00, 2544.80it/s]
Tokenizing: 100%|████████████████████████████████████████████████████████████████| 5496/5496 [00:02<00:00, 2389.41it/s]


In [9]:
val_data = TensorDataset(val_input_ids, val_attention_masks, val_labels)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

In [10]:
# Fine-tune the pre-trained BERT model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

epochs = 3

for epoch in range(epochs):
    model.train()
    total_train_loss = 0

    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1} Training"):
        input_ids, attention_masks, labels = batch
        input_ids, attention_masks, labels = input_ids.to(device), attention_masks.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
        loss = outputs.loss
        total_train_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"Avg Train Loss: {avg_train_loss}")

    # Validation
    model.eval()
    total_eval_accuracy = 0

    for batch in tqdm(val_dataloader, desc=f"Epoch {epoch + 1} Validation"):
        input_ids, attention_masks, labels = batch
        input_ids, attention_masks, labels = input_ids.to(device), attention_masks.to(device), labels.to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
        
        logits = outputs.logits
        _, preds = torch.max(logits, dim=1)
        total_eval_accuracy += torch.sum(preds == labels).item()
    
    avg_val_accuracy = total_eval_accuracy / len(val_texts)
    print(f"Avg Validation Accuracy: {avg_val_accuracy}")

Epoch 1 Training: 100%|████████████████████████████████████████████████████████████| 687/687 [2:34:52<00:00, 13.53s/it]


Avg Train Loss: 0.6085579006605968


Epoch 1 Validation: 100%|████████████████████████████████████████████████████████████| 172/172 [12:19<00:00,  4.30s/it]


Avg Validation Accuracy: 0.789665211062591


Epoch 2 Training: 100%|████████████████████████████████████████████████████████████| 687/687 [2:11:13<00:00, 11.46s/it]


Avg Train Loss: 0.42231182374461423


Epoch 2 Validation: 100%|████████████████████████████████████████████████████████████| 172/172 [10:12<00:00,  3.56s/it]


Avg Validation Accuracy: 0.7996724890829694


Epoch 3 Training: 100%|████████████████████████████████████████████████████████████| 687/687 [2:04:11<00:00, 10.85s/it]


Avg Train Loss: 0.29923520700835243


Epoch 3 Validation: 100%|████████████████████████████████████████████████████████████| 172/172 [10:18<00:00,  3.59s/it]

Avg Validation Accuracy: 0.7878457059679768





In [13]:
test = pd.read_csv("data/test.csv", encoding='unicode_escape')
test = df.dropna(subset=['text', 'sentiment'])


test_texts = test['text'].tolist()
test_sentiments = test['sentiment'].tolist()

# Encode labels
test_sentiments_encoded = label_encoder.transform(test_sentiments)

# Tokenize and encode the test data
test_input_ids, test_attention_masks, test_labels = tokenize_and_encode(test_texts, test_sentiments_encoded)

# Create DataLoader for test set
test_data = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

Tokenizing: 100%|██████████████████████████████████████████████████████████████| 27480/27480 [00:10<00:00, 2524.96it/s]


In [14]:
# Define a function for evaluation
def evaluate_model(model, dataloader):
    model.eval()
    total_correct = 0
    total_predictions = 0
    
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids, attention_masks, labels = batch
            input_ids, attention_masks, labels = input_ids.to(device), attention_masks.to(device), labels.to(device)

            outputs = model(input_ids, attention_mask=attention_masks)
            logits = outputs.logits
            _, predicted = torch.max(logits, 1)
            
            total_correct += (predicted == labels).sum().item()
            total_predictions += len(labels)
    
    accuracy = total_correct / total_predictions
    return accuracy

# Create DataLoader for test set
test_data = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

# Evaluate the model on the test set
test_accuracy = evaluate_model(model, test_dataloader)
print(f"Test Accuracy: {test_accuracy}")


Evaluating: 100%|████████████████████████████████████████████████████████████████████| 859/859 [59:43<00:00,  4.17s/it]

Test Accuracy: 0.9196506550218341





In [19]:
from sklearn.metrics import classification_report

# Define a function for evaluation
def evaluate_model(model, dataloader):
    model.eval()
    all_predictions = []
    all_labels = []
    
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids, attention_masks, labels = batch
            input_ids, attention_masks, labels = input_ids.to(device), attention_masks.to(device), labels.to(device)

            outputs = model(input_ids, attention_mask=attention_masks)
            logits = outputs.logits
            _, predicted = torch.max(logits, 1)
            
            all_predictions.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    return all_predictions, all_labels

# Create DataLoader for test set
test_data = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

# Evaluate the model on the test set and get predictions and true labels
predictions, true_labels = evaluate_model(model, test_dataloader)

# Print classification report
target_names = list(label_encoder.classes_)
print(classification_report(true_labels, predictions, target_names=target_names))


Evaluating: 100%|██████████████████████████████████████████████████████████████████| 859/859 [1:00:29<00:00,  4.23s/it]

              precision    recall  f1-score   support

    negative       0.88      0.95      0.92      7781
     neutral       0.94      0.87      0.90     11117
    positive       0.93      0.95      0.94      8582

    accuracy                           0.92     27480
   macro avg       0.92      0.93      0.92     27480
weighted avg       0.92      0.92      0.92     27480




