# Business Description Clustering with FinBERT

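This notebook implements a pipeline for grouping companies by their business descriptions: each description is split into sentences, the sentences are embedded with FinBERT, and companies whose sentences are similar to a query sentence are retrieved using cosine similarity.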

## Pipeline Steps
1. Load and preprocess business descriptions
2. Generate embeddings using FinBERT
3. Filter relevant business function sentences
4. Compute similarity scores
5. Save filtered results

## Setup
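Import the required libraries, download the NLTK sentence-tokenizer data, and define the constants used throughout the pipeline. The third-party dependencies (pandas, numpy, torch, nltk, spacy, transformers) must already be installed.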

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
import nltk
import spacy
import re
import os
from nltk.tokenize import sent_tokenize
from transformers import AutoTokenizer, AutoModel

# Download required NLTK data.
# sent_tokenize relies on the Punkt sentence tokenizer. NLTK 3.9+ looks the
# model up under the "punkt_tab" resource name while older releases use
# "punkt", so both are requested. nltk.download returns False (it does not
# raise) for a resource it cannot fetch, so the extra request is harmless.
for _nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(_nltk_resource, quiet=True)

# Constants
# Hugging Face model identifier used for both the tokenizer and the encoder.
MODEL_NAME = "yiyanghkust/finbert-tone"
# Minimum cosine similarity for a sentence to count as a match.
SIMILARITY_THRESHOLD = 0.75
# Input CSV; must provide "Company Name" and "Business Description" columns.
DATA_PATH = "../data/raw/management_support.csv"
# Output CSV for the similarity search results.
RESULTS_PATH = "../data/results/filtered_companies.csv"

## Data Loading and Preprocessing
Note: FinBERT is case-sensitive as it was trained on financial documents where capitalization carries meaning.

In [None]:
def clean_text(text: str) -> str:
    """Remove non-letter characters and extra whitespace while preserving case.

    Digits and punctuation (including sentence-ending periods) are removed, so
    any sentence splitting has to happen before this function is applied.

    Args:
        text: Input text to clean. Non-string values, such as the NaN that
            pandas produces for a missing cell, are treated as empty text.

    Returns:
        Cleaned text containing only ASCII letters separated by single spaces,
        or an empty string when the input is not a string.
    """
    # Missing CSV cells arrive as float NaN; re.sub would raise TypeError.
    if not isinstance(text, str):
        return ""
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)
    # Collapse the whitespace runs left behind by the removed characters.
    return re.sub(r'\s+', ' ', letters_only).strip()

# Load and process dataset
df = pd.read_csv(DATA_PATH)
df.rename(columns={
    'Company Name': 'company_name',
    'Business Description': 'business_description'
}, inplace=True)

# Missing descriptions are read as NaN (a float); work on a string-only copy so
# the cleaning and tokenizing steps below never receive a non-string value.
raw_descriptions = df["business_description"].fillna("").astype(str)

# Clean descriptions
df["cleaned_description"] = raw_descriptions.apply(clean_text)

# Split into sentences BEFORE cleaning: clean_text strips all punctuation, and
# sent_tokenize needs the sentence-ending punctuation to find the boundaries.
# Each sentence is then cleaned on its own; sentences that end up empty
# (e.g. purely numeric fragments) are discarded.
df["sentences"] = raw_descriptions.apply(
    lambda description: [
        cleaned
        for cleaned in map(clean_text, sent_tokenize(description))
        if cleaned
    ]
)

# Create one row per sentence. A description without any usable sentence
# explodes to a single NaN row, which is dropped. The final reset_index keeps
# row labels equal to row positions for the steps that follow.
df_expanded = (
    df.explode("sentences")
    .rename(columns={"sentences": "individual_sentences"})
    .dropna(subset=["individual_sentences"])
    .reset_index(drop=True)
)

print("Processed", len(df), "companies with", len(df_expanded), "total sentences")

## Generate FinBERT Embeddings
Create sentence embeddings using the FinBERT model.

In [None]:
def encode_sentences(sentences: list, batch_size: int = 32) -> torch.Tensor:
    """Generate FinBERT embeddings for a list of sentences.

    Sentences are encoded in batches so that memory use is bounded by
    ``batch_size`` rather than by the size of the whole corpus. The embedding
    of a sentence is the final hidden state at position 0 of its token
    sequence.

    Uses the module-level ``tokenizer`` and ``model``, which must be
    initialized before this function is called.

    Args:
        sentences: List of sentences to encode
        batch_size: Number of sentences passed through the model at once

    Returns:
        Tensor with one embedding row per input sentence, in input order.
        An empty input yields a tensor with zero rows.

    Raises:
        ValueError: If batch_size is smaller than 1
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    sentences = list(sentences)
    if not sentences:
        # Keep the embedding width so callers can still compare or concatenate.
        return torch.empty((0, model.config.hidden_size))

    batch_embeddings = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
        # Inference only: skip gradient tracking to save memory.
        with torch.no_grad():
            outputs = model(**inputs)
        batch_embeddings.append(outputs.last_hidden_state[:, 0, :])

    return torch.cat(batch_embeddings, dim=0)

# Initialize model
# Both objects are module-level globals that encode_sentences reads.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Generate embeddings
# One embedding per row of df_expanded, in row order: row i of bd_embeddings
# belongs to row i of df_expanded.
bd_sentences = df_expanded["individual_sentences"].tolist()
bd_embeddings = encode_sentences(bd_sentences)

## Filter Business Function Sentences
Extract sentences that describe core business functions and activities.

In [None]:
# Load spaCy model
# Its part-of-speech tags and dependency labels are read by
# contains_business_activity below.
nlp = spacy.load("en_core_web_sm")

# Verbs that typically introduce what a company does.
FUNCTION_KEYWORDS = {
    "provides", "offers", "delivers", "specializes", "develops",
    "manufactures", "produces", "designs", "implements", "supports"
}

def describes_function(sentence: str) -> bool:
    """Check if sentence contains a business function keyword as a whole word.

    Matching is case-insensitive and done on whole words, so a keyword that
    merely appears inside a longer word ("offers" in "coffers") does not count.

    Args:
        sentence: Input sentence to check. Non-string values (e.g. NaN from a
            missing cell) are treated as containing no keywords.

    Returns:
        True if at least one word of the sentence is in FUNCTION_KEYWORDS
    """
    if not isinstance(sentence, str):
        return False
    # Split on anything that is not a letter to get the individual words.
    words = re.findall(r"[a-z]+", sentence.lower())
    return not FUNCTION_KEYWORDS.isdisjoint(words)

def contains_business_activity(sentence: str) -> bool:
    """Check if sentence has a verb or noun in a ROOT or pobj position.

    The sentence is parsed with the module-level spaCy pipeline ``nlp``. It
    qualifies when any token is tagged VERB or NOUN and its dependency label
    is ROOT or pobj.

    NOTE(review): this looks very permissive, since a parsed sentence normally
    has a ROOT token that is often a verb or noun. Confirm it filters as
    intended.

    Args:
        sentence: Input sentence to analyze. Non-string or blank values are
            treated as containing no business activity.

    Returns:
        True if sentence contains business activity indicators
    """
    # NaN from missing cells would make nlp() raise; blank text has no tokens
    # that could match, so both are answered without running the parser.
    if not isinstance(sentence, str) or not sentence.strip():
        return False

    doc = nlp(sentence)
    return any(
        token.pos_ in {"VERB", "NOUN"} and token.dep_ in {"ROOT", "pobj"}
        for token in doc
    )

# Keep only the sentences that look like a description of what the company
# does: either a function keyword is present or the parse shows an activity.
sentence_column = df_expanded["individual_sentences"]
is_relevant = sentence_column.apply(
    lambda text: describes_function(text) or contains_business_activity(text)
)
df_filtered = df_expanded.loc[is_relevant]

print(f"Filtered from {len(df_expanded)} to {len(df_filtered)} relevant sentences")

## Compute Similarity Scores
Calculate the cosine similarity between a query sentence and each business description sentence, and keep the sentences whose score is at or above the similarity threshold.

In [None]:
def find_similar_companies(query: str, threshold: float = SIMILARITY_THRESHOLD) -> pd.DataFrame:
    """Find companies with similar business descriptions.

    The query is embedded and compared, by cosine similarity, with the
    embeddings of the sentences kept in ``df_filtered``. Reads the module-level
    ``df_filtered`` and ``bd_embeddings``.

    Args:
        query: Search query sentence
        threshold: Minimum similarity score (default: SIMILARITY_THRESHOLD)

    Returns:
        DataFrame with one row per matching sentence and the columns
        company_name, business_description, similar_sentence and
        similarity_score (rounded to 4 decimals). Rows keep the order of
        df_filtered. The columns are present even when nothing matches.

    Raises:
        ValueError: If df_filtered refers to rows that bd_embeddings lacks
    """
    result_columns = [
        "company_name", "business_description",
        "similar_sentence", "similarity_score",
    ]
    if df_filtered.empty:
        return pd.DataFrame(columns=result_columns)

    # bd_embeddings holds one row per df_expanded row, and df_filtered is a
    # subset of df_expanded that keeps its 0..n-1 row labels. Those labels are
    # therefore the positions of the matching rows in bd_embeddings. Indexing
    # df_filtered by position with the unfiltered scores would pair each
    # score with the wrong sentence.
    embedding_rows = df_filtered.index.to_numpy()
    if embedding_rows.min() < 0 or embedding_rows.max() >= len(bd_embeddings):
        raise ValueError(
            "df_filtered is not aligned with bd_embeddings; "
            "regenerate the embeddings from the current df_expanded"
        )
    candidate_embeddings = bd_embeddings[torch.as_tensor(embedding_rows, dtype=torch.long)]

    query_embedding = encode_sentences([query])
    # The (1, dim) query broadcasts against the (n, dim) candidates.
    scores = F.cosine_similarity(query_embedding, candidate_embeddings)
    scores = scores.cpu().numpy().astype(float)

    # Filter results above threshold
    keep = scores >= threshold
    matches = (
        df_filtered.loc[keep, ["company_name", "business_description", "individual_sentences"]]
        .rename(columns={"individual_sentences": "similar_sentence"})
        .reset_index(drop=True)
    )
    matches["similarity_score"] = np.round(scores[keep], 4)
    return matches

# Example search: look up the description sentences that resemble one query
# sentence, using the default similarity threshold.
query = "Provides consultancy services."
results_df = find_similar_companies(query)
# Each result row is a matching sentence, so a company with several matching
# sentences is counted once per sentence.
print(f"Found {len(results_df)} similar companies")
# Last expression of the cell: shows the first rows as the cell output.
results_df.head()

## Save Results

In [None]:
# Ensure results directory exists.
# When RESULTS_PATH is a bare filename its dirname is "", and os.makedirs("")
# raises FileNotFoundError, so only create a directory when there is one.
results_dir = os.path.dirname(RESULTS_PATH)
if results_dir:
    os.makedirs(results_dir, exist_ok=True)

# Save filtered results
results_df.to_csv(RESULTS_PATH, index=False)
print(f"Results saved to {RESULTS_PATH}")