# In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

# Read the financial news corpus from a CSV file in the working directory.
data = pd.read_csv("financial_news.csv")

# Pull the model inputs out as plain Python lists: the 'text' column is used
# as the news text and the 'label' column as the target for each row.
texts, labels = data['text'].tolist(), data['label'].tolist()

# Two-stage split, both stages seeded with 42 so the partition is repeatable:
#   1) hold out 20% of all rows as the test set;
#   2) hold out 10% of what remains as the validation set.
# That leaves roughly 72% train / 8% validation / 20% test.
remaining_texts, test_texts, remaining_labels, test_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    remaining_texts,
    remaining_labels,
    test_size=0.1,
    random_state=42,
)

# Fetch the WordPiece tokenizer that matches the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode each split with the same settings, in train -> val -> test order.
# truncation=True cuts sequences that exceed the tokenizer's maximum length;
# padding=True pads every sequence within one call to that call's longest
# sequence, so each split may end up with a different padded width.
train_encodings, val_encodings, test_encodings = (
    tokenizer(split_texts, truncation=True, padding=True)
    for split_texts in (train_texts, val_texts, test_texts)
)

# Turn each split's label list into a tensor (the names are rebound in place).
train_labels, val_labels, test_labels = (
    torch.tensor(split_labels)
    for split_labels in (train_labels, val_labels, test_labels)
)


def _as_dataset(encodings, label_tensor):
    """Bundle token ids, attention mask and labels into one TensorDataset.

    Each item of the resulting dataset is the triple
    ``(input_ids, attention_mask, label)`` for one example.
    """
    return TensorDataset(
        torch.tensor(encodings['input_ids']),
        torch.tensor(encodings['attention_mask']),
        label_tensor,
    )


# One dataset per split, all built the same way.
train_dataset = _as_dataset(train_encodings, train_labels)
val_dataset = _as_dataset(val_encodings, val_labels)
test_dataset = _as_dataset(test_encodings, test_labels)

# Only the training loader shuffles; validation and test keep dataset order
# so predictions line up with the original rows of each split.
batch_size = 16
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
)

# Load pre-trained BERT with a sequence-classification head sized for
# two labels (num_labels=2).
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Define optimizer and learning rate scheduler.
# Use PyTorch's own AdamW: the AdamW class shipped with `transformers` is
# deprecated upstream in favour of torch.optim.AdamW. eps and weight_decay are
# pinned to the defaults of the transformers class (1e-6 and 0.0) so the
# optimisation hyper-parameters stay what they were before the switch.
# NOTE: the `AdamW` name imported from transformers at the top of the file is
# no longer used by this block.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=5e-5,
    eps=1e-6,
    weight_decay=0.0,
)
# With step_size=1, StepLR multiplies the learning rate by gamma (0.1) on
# every scheduler.step() call.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

# Training setup: prefer the GPU when CUDA is available, otherwise use the CPU,
# and move the model's parameters onto that device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
epochs = 3

for epoch in range(epochs):
    # Switch to training mode for the duration of the epoch.
    model.train()
    train_losses = []
    progress = tqdm(train_loader, desc=f'Epoch {epoch + 1}')
    for input_ids, attention_mask, labels in progress:
        # Batch tensors must live on the same device as the model.
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()
        # Passing labels makes the model return the loss with its outputs.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Record the scalar loss of this batch for the epoch summary.
        train_losses.append(loss.item())

    # The learning-rate schedule advances once per epoch, not per batch.
    scheduler.step()
    mean_train_loss = np.mean(train_losses)
    print(f"Epoch {epoch + 1}, Train loss: {mean_train_loss}")

def _evaluate(model, loader, device, desc):
    """Run ``model`` over every batch of ``loader`` without tracking gradients.

    Args:
        model: module called as
            ``model(input_ids, attention_mask=..., labels=...)`` whose output
            exposes ``.loss`` and ``.logits``.
        loader: iterable yielding ``(input_ids, attention_mask, labels)``
            batches of tensors.
        device: device each batch tensor is moved to before the forward pass.
        desc: label shown on the tqdm progress bar.

    Returns:
        A ``(losses, preds, targets)`` tuple of lists: the loss of each batch,
        and the predicted and true class ids of every example, in loader order.
    """
    # Evaluation mode; gradients are not needed for scoring.
    model.eval()
    losses, preds, targets = [], [], []
    with torch.no_grad():
        for input_ids, attention_mask, labels in tqdm(loader, desc=desc):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            losses.append(outputs.loss.item())
            # Predicted class = index of the largest logit for each example.
            preds.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
            targets.extend(labels.cpu().numpy())
    return losses, preds, targets


# Validation loop
val_losses, val_preds, val_targets = _evaluate(model, val_loader, device, "Validation")

print(f"Validation loss: {np.mean(val_losses)}")
print("Validation Accuracy:", accuracy_score(val_targets, val_preds))
print("Validation Classification Report:")
print(classification_report(val_targets, val_preds))

# Test loop
test_losses, test_preds, test_targets = _evaluate(model, test_loader, device, "Testing")

print(f"Test loss: {np.mean(test_losses)}")
print("Test Accuracy:", accuracy_score(test_targets, test_preds))
print("Test Classification Report:")
print(classification_report(test_targets, test_preds))


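# Recorded cell output: ModuleNotFoundError: No module named 'transformers'
# (the `transformers` package was not installed; `pip install transformers` resolves this).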