In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import wandb
from preprocess import FinanceNews, TwitterNews, merge_df
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm


# Start the Weights & Biases run that the wandb.log calls in train() report to.
wandb.init(project="finbert-sentiment-analysis",name="trainFinBert")


# Relative paths to the raw CSV datasets.
finance = "../../dataset/sentiment/all-data.csv"
twitter_train = "../../dataset/twitter/sent_train.csv"
# NOTE(review): the "test" path points at sent_valid.csv — confirm this is intended.
twitter_test = "../../dataset/twitter/sent_valid.csv"

# FinanceNews / TwitterNews / merge_df come from the preprocess module; only
# the `.train` attribute of each loader is used here.
# NOTE(review): merged_data is later indexed with ['text'] and ['label'], so
# merge_df is assumed to return a DataFrame with those columns — confirm.
finance_news = FinanceNews(finance)
twitter_news = TwitterNews(twitter_train, twitter_test)
merged_data = merge_df(finance_news.train, twitter_news.train)

class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Indexable, sized collection of raw texts. Each entry is
            converted with ``str()`` before tokenization.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Hugging Face style tokenizer, invoked as
            ``tokenizer(text, ...)``.
        max_length: Every example is padded or truncated to this many tokens.
    """

    def __init__(self, texts, labels, tokenizer, max_length=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples (the length of ``texts``)."""
        return len(self.texts)

    def __getitem__(self, item):
        """Tokenize example ``item`` and return its tensors.

        Returns:
            A dict with:
              * ``'input_ids'``: flattened token-id tensor,
              * ``'attention_mask'``: flattened attention-mask tensor,
              * ``'labels'``: scalar ``torch.float`` tensor holding the label.
        """
        text = str(self.texts[item])
        label = self.labels[item]

        # Call the tokenizer directly: ``encode_plus`` is deprecated in
        # transformers and ``__call__`` accepts the same arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # return_tensors='pt' adds a leading batch dimension; flatten()
            # removes it so the DataLoader can stack items itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # Kept as float for backward compatibility; the training loop
            # casts to long before passing labels to the model.
            'labels': torch.tensor(label, dtype=torch.float)
        }

# Tokenizer plus the raw texts/labels taken from the merged training frame.
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
texts = merged_data['text'].tolist()
labels = merged_data['label'].tolist()

train_dataset = SentimentDataset(texts, labels, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Use the GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone', num_labels=3)
model.to(device)

# transformers.AdamW is deprecated in favour of torch.optim.AdamW.
# eps and weight_decay are pinned to the values the transformers class used
# by default (1e-6 and 0.0) so the optimisation behaviour stays the same;
# torch's own defaults (1e-8 and 0.01) would silently change training.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-6, eps=1e-6, weight_decay=0.0)
# Training loop: fine-tunes the model, logs loss and accuracy to wandb after
# every batch, and saves a checkpoint every `save_interval` batches
# (200 by default).
def train(model, dataloader, optimizer, device, epochs=3, save_interval=200):
    """Fine-tune ``model`` on ``dataloader``, logging every batch to wandb.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``.loss`` and ``.logits``.
        dataloader: Iterable of batches, each a dict with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer that updates the model's parameters.
        device: Device the batch tensors are moved to before the forward pass.
        epochs: Number of passes over ``dataloader``.
        save_interval: Save a checkpoint every this many batches, counted
            across epochs. ``0`` or ``None`` disables checkpointing.

    Side effects:
        Logs ``batch_loss`` and ``batch_accuracy`` to wandb after each
        optimizer step and writes ``pretrained_<batch_count>.pt`` state-dict
        files using a relative path.
    """
    model.train()
    batch_count = 0  # running total across epochs; also names the checkpoints
    for epoch in range(epochs):
        loop = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}")
        for batch in loop:
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            # The labels may arrive as float tensors (SentimentDataset emits
            # torch.float); cast once to integer class indices.
            labels = batch['labels'].to(device).long()

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            loss.backward()
            optimizer.step()

            # Read the scalar loss a single time; each .item() call forces a
            # device-to-host synchronisation.
            loss_value = loss.item()
            preds = torch.argmax(outputs.logits, dim=-1)
            batch_accuracy = (preds == labels).sum().item() / len(labels)

            wandb.log({
                'batch_loss': loss_value,
                'batch_accuracy': batch_accuracy
            })

            batch_count += 1
            # A falsy interval (0 / None) turns checkpointing off instead of
            # raising ZeroDivisionError on the modulo.
            if save_interval and batch_count % save_interval == 0:
                model_save_path = f"pretrained_{batch_count}.pt"
                torch.save(model.state_dict(), model_save_path)
                print(f"Model saved as {model_save_path} at batch {batch_count}")

            loop.set_postfix(loss=loss_value, batch_accuracy=batch_accuracy)

# Run fine-tuning with train()'s defaults (epochs=3, save_interval=200).
train(model, train_dataloader, optimizer, device)


In [6]:
# Quick manual check: run the Finbert class imported from `model` on one sentence.
from model import Finbert
f = Finbert()

# NOTE(review): `infernece` looks like a misspelling of `inference`; it must
# match the method name defined on Finbert, so rename both together if fixed.
f.infernece(['the company is doing regular amount of work!'],enablePrint=True)

[{'label': 'Neutral', 'score': 0.999850869178772}]


[{'label': 'Neutral', 'score': 0.999850869178772}]