<a href="https://colab.research.google.com/github/b1lalhasham/finmodels/blob/main/finx_classifier_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install transformers datasets torch scikit-learn pandas accelerate




In [5]:
import os
os.environ["WANDB_DISABLED"] = "true"
import pandas as pd
import torch
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# Load dataset
df = pd.read_csv("train_data.csv")

# Encode labels
label_map = {label: idx for idx, label in enumerate(df["category"].unique())}
df["label"] = df["category"].map(label_map)

# Train/test split
train_texts, test_texts, train_labels, test_labels = train_test_split(df["prompt"], df["label"], test_size=0.2)

# Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(example):
    return tokenizer(example["prompt"], padding="max_length", truncation=True)

# Prepare datasets
train_dataset = Dataset.from_dict({"prompt": train_texts.tolist(), "label": train_labels.tolist()}).map(tokenize_function, batched=True)
test_dataset = Dataset.from_dict({"prompt": test_texts.tolist(), "label": test_labels.tolist()}).map(tokenize_function, batched=True)

# Load model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_map))

# Training args
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train
trainer.train()

# Save model
model.save_pretrained("finx_bert_model")
tokenizer.save_pretrained("finx_bert_model")


Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss
1,No log,1.197841
2,No log,1.200431
3,No log,1.20529


('finx_bert_model/tokenizer_config.json',
 'finx_bert_model/special_tokens_map.json',
 'finx_bert_model/vocab.txt',
 'finx_bert_model/added_tokens.json')

In [6]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load trained model
model_path = "finx_bert_model"
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)
model.eval()  # Set to evaluation mode

# Label mapping (same as training)
label_map = {0: "Research Expert", 1: "Financial Assistant Chatbot", 2: "Analysis & Insights Expert", 3: "Investing Expert"}

# Function to classify a prompt
def classify_prompt(prompt):
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True)

    with torch.no_grad():
        outputs = model(**inputs)

    probs = torch.nn.functional.softmax(outputs.logits, dim=1)  # Convert logits to probabilities
    confidence, predicted_class = torch.max(probs, dim=1)

    # Set a confidence threshold (e.g., 0.6)
    if confidence.item() < 0.6:
        return "Financial Assistant Chatbot (GPT-4 Mini)"

    return label_map[predicted_class.item()]

# Test classification
test_prompt = "Summarize Tesla's earnings call"
predicted = classify_prompt(test_prompt)
print(f"🔎 FinX Model Prediction: {predicted}")


🔎 FinX Model Prediction: Investing Expert


In [None]:
while True:
    user_input = input("\nEnter a prompt (or type 'exit'): ")
    if user_input.lower() == 'exit':
        break
    result = classify_prompt(user_input)
    print(f"🧠 FinX Model Prediction: {result}")




In [8]:
test_prompts = [
    "Summarize Tesla's earnings call",  # Research Expert ✅
    "What is a credit default swap?",  # Financial Assistant Chatbot ✅
    "How did the Fed rate hike impact stocks?",  # Analysis & Insights Expert ✅
    "Which sectors are best for portfolio diversification?",  # Investing Expert ✅
    "Tell me a joke",  # GPT-4 Mini (Low confidence) ✅
    "Translate 'Hello' to French",  # GPT-4 Mini ✅
    "Who is the president of the US?",  # GPT-4 Mini ✅
    "What is happening in the economy?",  # Analysis & Insights Expert ✅
    "How much rain will we get tomorrow?",  # GPT-4 Mini (Not a finance question) ✅
]

for prompt in test_prompts:
    print(f"🔎 Query: {prompt}")
    print(f"🤖 FinX Model Prediction: {classify_prompt(prompt)}\n")


🔎 Query: Summarize Tesla's earnings call
🤖 FinX Model Prediction: Investing Expert

🔎 Query: What is a credit default swap?
🤖 FinX Model Prediction: Investing Expert

🔎 Query: How did the Fed rate hike impact stocks?
🤖 FinX Model Prediction: Investing Expert

🔎 Query: Which sectors are best for portfolio diversification?
🤖 FinX Model Prediction: Investing Expert

🔎 Query: Tell me a joke
🤖 FinX Model Prediction: Financial Assistant Chatbot

🔎 Query: Translate 'Hello' to French
🤖 FinX Model Prediction: Investing Expert

🔎 Query: Who is the president of the US?
🤖 FinX Model Prediction: Investing Expert

🔎 Query: What is happening in the economy?
🤖 FinX Model Prediction: Investing Expert

🔎 Query: How much rain will we get tomorrow?
🤖 FinX Model Prediction: Investing Expert

