# **Business Description Clustering**
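This Jupyter Notebook implements a machine learning pipeline that groups companies by the semantic similarity of their business descriptions: it ranks individual description sentences against a query sentence and filters out companies that match a non-relevant category.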

**Steps:**
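1. Load and preprocess data (lowercase the descriptions and split them into sentences)
2. Encode the description sentences using `SentenceTransformer`
3. Compute cosine similarity between a query sentence and each description sentence
4. Filter out non-relevant companies and recompute the similarity ranking
5. Visualize results with a heatmap and save them to CSV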

**Dependencies:** `pandas`, `numpy`, `sentence-transformers`, `nltk`, `torch`, `seaborn`, `matplotlib`

## **1. Install & Import Required Libraries**

In [None]:
!pip install pandas numpy sentence-transformers nltk torch seaborn matplotlib

## **2. Load & Preprocess Data**

In [None]:

import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize

# Download the sentence-tokenizer data. Older NLTK releases read "punkt",
# newer ones (>= 3.8.2) look up "punkt_tab", so request both.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource, quiet=True)

# Load data
file_path = "data/raw/management_support.csv"  # Adjust path if needed
df = pd.read_csv(file_path)

# Rename columns
df = df.rename(columns={'Company Name': 'company_name', 'Business Description': 'business_description'})

# Drop companies without a description: a missing value stays NaN after
# lowercasing and would make sent_tokenize raise a TypeError
df = df.dropna(subset=["business_description"]).reset_index(drop=True)
df["cleaned_description"] = df["business_description"].astype(str).str.lower()

# Tokenize descriptions into sentences
df["sentences"] = df["cleaned_description"].apply(sent_tokenize)

# Expand DataFrame (one row per sentence). An empty description tokenizes to
# an empty list, which explode() turns into a NaN row; drop those so every
# remaining row holds a real sentence that can be encoded.
df_expanded = (
    df.explode("sentences")
    .rename(columns={"sentences": "individual_sentences"})
    .dropna(subset=["individual_sentences"])
    .reset_index(drop=True)
)
df_expanded.head()


## **3. Encode Sentences Using `SentenceTransformer`**

In [None]:

import torch
from sentence_transformers import SentenceTransformer

# Pre-trained sentence-embedding model shared by every encoding step below
model = SentenceTransformer("all-mpnet-base-v2")

# One embedding per row of df_expanded, in the same order as the rows
bd_sentences = df_expanded["individual_sentences"].tolist()
bd_embeddings = model.encode(
    bd_sentences,
    convert_to_tensor=True,
)


## **4. Compute Similarity Scores**

In [None]:

from sentence_transformers import util

# Input a query sentence (lowercased, like the description sentences)
input_sentences = ["provides consultancy services."]
input_sentences_processed = [s.lower() for s in input_sentences]
input_embedding = model.encode(input_sentences_processed, convert_to_tensor=True)

# Cosine similarity of the first query sentence against every description sentence
cosine_scores = util.cos_sim(input_embedding, bd_embeddings)[0]

# Get the top 25 most similar sentences. torch.topk raises if k exceeds the
# number of scores, so clamp k to what is actually available.
top_k = 25
top_results = torch.topk(cosine_scores, k=min(top_k, cosine_scores.numel()))

# Store results in DataFrame (one row lookup per match)
result_columns = ["company_name", "business_description", "similar_sentence", "similarity_score"]
results = []
for score, idx in zip(top_results.values.tolist(), top_results.indices.tolist()):
    match = df_expanded.iloc[idx]
    results.append({
        "company_name": match["company_name"],
        "business_description": match["business_description"],
        "similar_sentence": match["individual_sentences"],
        "similarity_score": round(score, 4),
    })

# Explicit columns keep the frame's schema even when there are no matches
results_df = pd.DataFrame(results, columns=result_columns)
results_df.head()


## **5. Filter Non-Similar Companies**

In [None]:

# Define non-relevant category
non_similar_sentences = ["company provides consumer loans"]
non_similar_sentences_processed = [s.lower() for s in non_similar_sentences]
non_similar_embeddings = model.encode(non_similar_sentences_processed, convert_to_tensor=True)

# Minimum cosine similarity at which a sentence counts as non-relevant
NON_SIMILAR_THRESHOLD = 0.6  # Adjust threshold if needed


def is_similar_to_non_similar(description_embedding, threshold=NON_SIMILAR_THRESHOLD):
    """Return True if a sentence embedding matches any non-relevant sentence.

    Args:
        description_embedding: Embedding of one sentence, as returned by
            ``model.encode``.
        threshold: Minimum cosine similarity that counts as a match.

    Returns:
        bool: True when the highest cosine similarity against
        ``non_similar_embeddings`` is at least ``threshold``.
    """
    cosine_scores = util.cos_sim(description_embedding, non_similar_embeddings)[0]
    return cosine_scores.max().item() >= threshold


# bd_embeddings already holds one embedding per row of df_expanded (same
# order), so score every sentence in a single call instead of re-encoding
# each company's sentences.
non_similar_scores = util.cos_sim(bd_embeddings, non_similar_embeddings)
is_non_relevant = (non_similar_scores.max(dim=1).values >= NON_SIMILAR_THRESHOLD).cpu().numpy()

# A company is excluded as soon as one of its sentences matches
companies_to_exclude = set(df_expanded.loc[is_non_relevant, "company_name"].dropna())

# Remove excluded companies from dataset
df_filtered = df_expanded[~df_expanded["company_name"].isin(companies_to_exclude)].reset_index(drop=True)
df_filtered.head()


## **6. Compute Similarity After Filtering**

In [None]:

# Recompute similarity scores with filtered dataset
filtered_bd_sentences = df_filtered["individual_sentences"].tolist()
filtered_bd_embeddings = model.encode(filtered_bd_sentences, convert_to_tensor=True)
filtered_cosine_scores = util.cos_sim(input_embedding, filtered_bd_embeddings)[0]

# Get the top matches after filtering. Filtering can leave fewer than top_k
# sentences, and torch.topk raises if k exceeds the number of scores, so clamp k.
top_results_filtered = torch.topk(
    filtered_cosine_scores,
    k=min(top_k, filtered_cosine_scores.numel()),
)

# Store results in DataFrame (one row lookup per match)
filtered_result_columns = ["company_name", "business_description", "similar_sentence", "similarity_score"]
filtered_results = []
for score, idx in zip(top_results_filtered.values.tolist(), top_results_filtered.indices.tolist()):
    match = df_filtered.iloc[idx]
    filtered_results.append({
        "company_name": match["company_name"],
        "business_description": match["business_description"],
        "similar_sentence": match["individual_sentences"],
        "similarity_score": round(score, 4),
    })

# Explicit columns keep the frame's schema even when there are no matches
filtered_results_df = pd.DataFrame(filtered_results, columns=filtered_result_columns)
filtered_results_df.head()


## **7. Visualize Similarity Scores Using Heatmap**

In [None]:

import seaborn as sns
import matplotlib.pyplot as plt

# Pivot data for heatmap: one row per company, one column per matched sentence.
# pivot_table is used instead of pivot because pivot raises a ValueError when
# the same (company, sentence) pair occurs more than once (e.g. duplicated
# rows in the input file); duplicates are collapsed to their highest score.
pivot_table = filtered_results_df.pivot_table(
    index="company_name",
    columns="similar_sentence",
    values="similarity_score",
    aggfunc="max",
)

# Plot heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(pivot_table, cmap="coolwarm", annot=False)
plt.title("Company Similarity Heatmap")
plt.xlabel("Similar Sentences")
plt.ylabel("Company Name")
plt.show()


## **8. Save Results**

In [None]:

from pathlib import Path

# Save results to CSV, creating the output directory first because to_csv
# fails if the parent folder does not exist
output_path = Path("data/results/filtered_companies.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
filtered_results_df.to_csv(output_path, index=False)
print("Results saved successfully! ✅")
