In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import wandb
from preprocess import FinanceNews, TwitterNews, merge_df
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

# Initialize WandB for logging metrics
wandb.init(project="finbert-sentiment-analysis", entity="your-wandb-username")

# Assuming merge_df, FinanceNews, TwitterNews, and Finbert are defined as in your example.

# Paths to dataset
finance = "../dataset/sentiment/all-data.csv"
twitter_train = "../dataset/twitter/sent_train.csv"
twitter_test = "../dataset/twitter/sent_valid.csv"

# Step 1: Load data
finance_news = FinanceNews(finance)
twitter_news = TwitterNews(twitter_train, twitter_test)

# Merge datasets (if you want to combine the two)
merged_data = merge_df(finance_news.train, twitter_news.train)

# Step 2: Prepare the dataset and dataloader
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.float)
        }

tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
texts = merged_data['text'].tolist()
labels = merged_data['label'].tolist()

train_dataset = SentimentDataset(texts, labels, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Step 3: Initialize the FinBERT model
model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone', num_labels=3)

# Step 4: Fine-tuning the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop with WandB logging
def train(model, dataloader, optimizer, device, epochs=3):
    model.train()
    for epoch in range(epochs):
        loop = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}")
        epoch_losses = []
        correct_predictions = 0
        total_predictions = 0

        for batch in loop:
            optimizer.zero_grad()

            # Get inputs
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels.long())
            loss = outputs.loss
            logits = outputs.logits

            # Backward pass
            loss.backward()
            optimizer.step()

            # Track loss and accuracy
            epoch_losses.append(loss.item())

            preds = torch.argmax(logits, dim=-1).cpu().numpy()
            labels = labels.cpu().numpy()
            correct_predictions += (preds == labels).sum()
            total_predictions += len(labels)

            # Update progress bar with current loss
            loop.set_postfix(loss=loss.item())

        # Calculate epoch loss and accuracy
        avg_loss = sum(epoch_losses) / len(epoch_losses)
        accuracy = correct_predictions / total_predictions

        # Log metrics to WandB
        wandb.log({
            'epoch': epoch + 1,
            'loss': avg_loss,
            'accuracy': accuracy
        })

        print(f"Epoch {epoch + 1} - Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")

# Start training and logging metrics
train(model, train_dataloader, optimizer, device)

# Step 5: Finish WandB logging
wandb.finish()


  from .autonotebook import tqdm as notebook_tqdm
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mssongjinseob[0m. Use [1m`wandb login --relogin`[0m to force relogin
wandb: ERROR Error while calling W&B API: entity your-wandb-username not found during upsertBucket (<Response [404]>)


CommError: It appears that you do not have permission to access the requested resource. Please reach out to the project owner to grant you access. If you have the correct permissions, verify that there are no issues with your networking setup.(Error 404: Not Found)