# GOAL

Classify financial news snippets as positive, neutral, or negative. The model is fine-tuned on a Hugging Face dataset, and a real-time RAG pipeline is implemented to supply additional context.

Dataset:
financial_phrasebank, sentences_75agree configuration. A collection of short financial phrases labeled for sentiment analysis.

Model:
distilbert-base-uncased. Effective and light-weight.

Compute: 
Free-tier friendly (ml.m4.xlarge CPU or local).

Framework: 
Hugging Face datasets + transformers (Trainer API).

# User Query

In [38]:
# Natural-language question reused by the news fetch, the retrieval step and the final classification.
query = "How is Nvidia doing in 2025?"

# Import Dataset

In [None]:
from datasets import load_dataset

# Load the "sentences_75agree" configuration, shuffle it with a fixed seed,
# then split its "train" portion 80/20 into train and test sets.
dataset = (
    load_dataset("financial_phrasebank", "sentences_75agree", trust_remote_code=True)
    .shuffle(seed=42)["train"]
    .train_test_split(test_size=0.2, seed=42)
)

# Sanity check: split layout plus one training example
print(dataset)
print(dataset["train"][0])


# Tokenize 

In [None]:
from transformers import AutoTokenizer

# Tokenizer for the distilbert-base-uncased checkpoint
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_function(example):
    """Tokenize the "sentence" field, padding/truncating every row to 512 tokens."""
    encoding_options = dict(padding="max_length", truncation=True, max_length=512)
    return tokenizer(example["sentence"], **encoding_options)


# Run the tokenizer over the dataset in batches
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Expose only the model inputs and the target, as torch tensors
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# Inspect the first encoded training sample
print(tokenized_dataset["train"][0])


# Fine-tune Model

In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# DistilBERT with a 3-label classification head (negative, neutral, positive)
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=3,
)

# Hyperparameters: evaluate and checkpoint once per epoch, and reload the
# best checkpoint (selected on the "accuracy" metric) when training ends.
training_args = TrainingArguments(
    output_dir="./results",
    # optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # evaluation / checkpointing
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# Metrics
def compute_metrics(eval_pred):
    """Turn a (logits, labels) pair into accuracy plus weighted F1/precision/recall.

    The predicted class of each row is the arg-max over axis 1 of the logits.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=1)

    scores = {"accuracy": accuracy_score(labels, predicted)}
    weighted_scorers = (
        ("f1", f1_score),
        ("precision", precision_score),
        ("recall", recall_score),
    )
    for metric_name, scorer in weighted_scorers:
        scores[metric_name] = scorer(labels, predicted, average="weighted")
    return scores

# Wire the model, hyperparameters, data splits and metric callback together
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

# Run fine-tuning
trainer.train()

# Persist model and tokenizer into the same directory so they can be reloaded together
model.save_pretrained("./finetuned-512-model")
tokenizer.save_pretrained("./finetuned-512-model")

# Run Inference

In [None]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Reload the fine-tuned model and tokenizer from the local save directory
model = AutoModelForSequenceClassification.from_pretrained("./finetuned-512-model")
tokenizer = AutoTokenizer.from_pretrained("./finetuned-512-model")

# Wrap both in a text-classification pipeline
sentiment_pipeline = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Class index -> readable name; built once instead of on every loop iteration
label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}

examples = ["The stock market is crashing."]

for sentence in examples:
    result = sentiment_pipeline(sentence)[0]
    # The label is expected to end in "_<index>" (e.g. "LABEL_2"); keep only the index
    label_id = int(result['label'].split('_')[-1])
    print(f"{sentence} → {label_map[label_id]} (score: {result['score']:.2f})")


# RAG Pipeline

News Parser

In [None]:
import feedparser
import re
from html import unescape

def clean_html(text):
    """Strip HTML markup from *text* and decode HTML entities.

    Anchor elements are dropped together with their link text; every other
    tag is removed while its inner text is kept.

    Args:
        text: HTML fragment, e.g. the summary of an RSS entry.

    Returns:
        Plain text with entities such as ``&amp;`` decoded and surrounding
        whitespace removed (including non-breaking spaces that come from
        ``&nbsp;``).
    """
    # "\b" keeps tags like <abbr> or <article> from being mistaken for <a>;
    # DOTALL lets an anchor whose text spans several lines be removed whole.
    text = re.sub(r'<a\b[^>]*>.*?</a>', '', text, flags=re.DOTALL | re.IGNORECASE)
    # "[^>]*" (unlike ".*?") also matches tags that are broken across lines.
    text = re.sub(r'<[^>]*>', '', text)
    # Decode entities BEFORE stripping so whitespace entities at the edges go too.
    return unescape(text).strip()

def fetch_google_news_clean(query, max_results=200):
    """Fetch Google News RSS search results for *query* as cleaned text snippets.

    Args:
        query: Free-text search phrase.
        max_results: Upper bound on the number of feed entries returned.

    Returns:
        A list of strings of the form ``"<title>. <summary>"``, one per feed
        entry, with HTML removed from the summary. The list is empty when the
        parsed feed contains no entries.
    """
    from urllib.parse import quote_plus

    # Percent-encode the user query so characters such as "?", "&" or "#"
    # cannot break or truncate the URL's query string.
    # NOTE(review): "when:7d" is presumably Google's "last 7 days" filter — confirm.
    url = (
        "https://news.google.com/rss/search"
        f"?q={quote_plus(query)}+when:7d&hl=en-US&gl=US&ceid=US:en"
    )
    feed = feedparser.parse(url)

    cleaned = []
    for entry in feed.entries[:max_results]:
        # .get() tolerates entries that lack a title or a summary
        title = entry.get("title", "")
        summary = clean_html(entry.get("summary", ""))
        cleaned.append(f"{title}. {summary}".strip())

    return cleaned

# Smoke test: fetch snippets for the notebook query and list them, numbered from 1
news = fetch_google_news_clean(query, max_results=200)
for position, headline in enumerate(news, start=1):
    print(f"{position}. {headline}")


Embed News with FAISS

In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Embedder (small, fast, free)
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Step 1: Fetch news
news = fetch_google_news_clean(query, max_results=200)
if not news:
    # An empty list would otherwise fail further down with a much less
    # helpful error when the embedding dimension is read.
    raise RuntimeError(f"No news found for query {query!r}; cannot build the FAISS index.")

# Step 2: Embed news headlines
news_embeddings = embedder.encode(news, convert_to_numpy=True)

# Step 3: Create FAISS structure (flat L2 index sized to the embedding dimension)
dim = news_embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(news_embeddings)

# Step 4: Function to query similar headlines
def retrieve_news(user_query, top_k=5):
    """Return the indexed news snippets closest to *user_query*.

    Args:
        user_query: Free-text query; it is embedded with the same embedder
            that produced the index vectors.
        top_k: Maximum number of snippets to return.

    Returns:
        A list of ``(snippet, distance)`` tuples in the order returned by the
        index search. It holds fewer than ``top_k`` items when the index
        contains fewer vectors, and is empty when nothing can be searched.
    """
    # Never ask for more neighbours than exist: FAISS fills missing hits with
    # index -1, which would silently resolve to news[-1] (the last headline).
    k = min(top_k, index.ntotal)
    if k <= 0:
        return []

    query_vec = embedder.encode([user_query], convert_to_numpy=True)
    distances, indices = index.search(query_vec, k)
    return [
        (news[i], dist)
        for i, dist in zip(indices[0], distances[0])
        if i >= 0  # defensive: drop any remaining "no result" placeholders
    ]

top_matches = retrieve_news(query)

# Show every hit with its rank and its distance rounded to two decimals
for rank, match in enumerate(top_matches, start=1):
    snippet, dist = match
    print(f"{rank}. [distance: {round(dist, 2)}] {snippet}")


Accessing RAG

In [None]:
from transformers import pipeline

# Classifier pipeline built from the model and tokenizer already in memory
sentiment_pipeline = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
)

# Readable names for class indices 0, 1 and 2
label_map = dict(enumerate(("Negative", "Neutral", "Positive")))

# Full RAG-enhanced classification
def classify_with_rag(user_query, top_k=5):
    """Classify the sentiment of *user_query* using retrieved news as context.

    Args:
        user_query: Free-text query to classify.
        top_k: Number of news snippets to retrieve as context.

    Returns:
        A ``(label, score, full_input)`` tuple: the label name looked up in
        ``label_map``, the pipeline score rounded to 3 decimals, and the full
        text handed to the classifier (before any tokenizer truncation).
    """
    # Retrieve relevant news
    top_news = retrieve_news(user_query, top_k=top_k)

    # Format RAG context and combine it with the query as model input
    context = " ".join(snippet for snippet, _ in top_news)
    full_input = f"<NEWS>: {context} <QUERY>: {user_query}"

    # Several concatenated snippets can exceed the 512-token length used at
    # training time; truncate in the tokenizer rather than letting the model
    # fail on an over-long sequence.
    # NOTE(review): truncation presumably drops the tail, i.e. the <QUERY> part, first — confirm.
    pred = sentiment_pipeline(full_input, truncation=True, max_length=512)[0]
    # The label is expected to end in "_<index>" (e.g. "LABEL_2")
    label_id = int(pred["label"].split("_")[-1])
    label = label_map[label_id]
    score = round(pred["score"], 3)

    return label, score, full_input

# Run the RAG classification on the notebook query and report the outcome
label, score, full_input = classify_with_rag(query)

print(
    f"\n🧠 Classified Query: '{query}'",
    f"🔎 Result: {label} • {score}",
    f"\n📎 Model Input: {full_input[:200]}...",
    sep="\n",
)
