Consider a large collection of financial news articles where duplicate content. After preprocessing and normalization as described above, we generate embeddings and find the cosine similarity. For larger datasets, algithms like FAISS

In [12]:
# pip install faiss-cpu



In [44]:
from sentence_transformers import SentenceTransformer

from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

In [47]:
import numpy as np
import faiss


# sample financial articles. content should be cleaned for document cleaning
articles = [
    {
        "id": 1,
        "title": "Tech Stocks Surge Amid Market Rally",
        "publication_date": "2025-01-20",
        "content": ("Today, technology stocks surged as investors reacted positively to new earnings reports. "
                    "The market rally was led by significant gains in the tech sector, highlighting robust consumer confidence "
                    "and improved revenue forecasts.")
    },
    {
        "id": 2,
        "title": "Global Markets Rally on Positive Economic Data",
        "publication_date": "2025-01-20",
        "content": ("Global financial markets experienced a broad rally following the release of encouraging economic data. "
                    "Analysts noted that improved metrics are fueling optimism across sectors such as technology and manufacturing.")
    },
    {
        "id": 3,
        "title": "Tech Sector Drives Record-Breaking Market Performance",
        "publication_date": "2025-01-21",
        "content": ("In a record-breaking day for the stock market, the tech sector led gains. Experts attribute the surge "
                    "to strong earnings, increased innovation investments, and favorable market conditions.")
    },
    {
        "id": 4,
        "title": "Market Rally: Tech Stocks and Economic Recovery Fuel Optimism",
        "publication_date": "2025-01-20",
        "content": ("Investors remain optimistic as the market rally continues. A combination of soaring tech stocks and signs of "
                    "economic recovery signal sustained growth in financial markets.")
    },
    {
        "id": 5,
        "title": "Financial Analysts Forecast Continued Growth Amid Market Rally",
        "publication_date": "2025-01-22",
        "content": ("A group of financial analysts predicts that the ongoing market rally will persist in the coming weeks, "
                    "citing strong fundamentals and a recovering economy.")
    },
    {
        "id": 6,
        "title": "Energy Sector Experiences Decline Amid Falling Oil Prices",
        "publication_date": "2025-01-23",
        "content": ("The energy sector faced a downturn today as falling oil prices impacted revenue projections and investor sentiment. "
                    "This decline contrasts sharply with gains in other sectors.")
    },
    {
        "id": 7,
        "title": "Renewable Energy Investments Rise Despite Market Uncertainty",
        "publication_date": "2025-01-24",
        "content": ("In a surprising twist, investments in renewable energy have increased even as overall market conditions remain uncertain. "
                    "Analysts see this as a long-term trend driven by environmental policy shifts.")
    },
    {
        "id": 8,
        "title": "Banking Sector Under Pressure as Loan Defaults Increase",
        "publication_date": "2025-01-25",
        "content": ("Pressure mounts in the banking sector as a spike in loan defaults raises concerns about credit risk and financial stability. "
                    "Industry experts recommend caution in the near term.")
    },
    {
        "id": 9,
        "title": "Retail Stocks Steady Amid Economic Fluctuations",
        "publication_date": "2025-01-26",
        "content": ("Retail stocks have remained steady despite broader economic fluctuations. This stability is attributed to resilient consumer demand "
                    "and effective cost management strategies.")
    },
    {
        "id": 10,
        "title": "Consumer Confidence Rises as Unemployment Drops",
        "publication_date": "2025-01-27",
        "content": ("Recent reports indicate a rise in consumer confidence alongside a drop in unemployment rates. This improvement is seen as a positive "
                    "indicator for the broader economy and various market sectors.")
    }
]

# Step 1: Compute semantic embeddings for each article's content using SentenceTransformer.
model = SentenceTransformer('all-MiniLM-L6-v2')
texts = [article["content"] for article in articles]
embeddings = model.encode(texts)

# Step 2: Normalize the embeddings to unit length (this makes cosine similarity equivalent to inner product).
faiss.normalize_L2(embeddings)

# Step 3: Use FAISS k-means clustering to partition the 10 articles into 3 clusters.
d = embeddings.shape[1]        # dimension of embeddings
nclusters = 3                  # set at least 3 clusters
kmeans = faiss.Kmeans(d, nclusters, niter=20, verbose=True)
kmeans.train(embeddings)
# Assign each embedding to a cluster.
D, cluster_ids = kmeans.index.search(embeddings, 1)
cluster_ids = cluster_ids.flatten()

# Step 4: Group articles by cluster.
clusters = {}
for idx, cid in enumerate(cluster_ids):
    clusters.setdefault(cid, []).append(articles[idx])

# Step 5: Display clusters and suggest which article to retain from each cluster.
print("Clusters of similar articles and retention suggestions:\n")
for cid, group in clusters.items():
    print(f"Cluster {cid}:")
    for art in group:
        print(f" - Article {art['id']}: {art['title']} (Published: {art['publication_date']})")
    # Suggest retaining the article with the longest content (as a proxy for richest information).
    suggested = max(group, key=lambda x: len(x["content"]))
    print(f"  Suggested to retain: Article {suggested['id']} - {suggested['title']}\n")


Clusters of similar articles and retention suggestions:

Cluster 1:
 - Article 1: Tech Stocks Surge Amid Market Rally (Published: 2025-01-20)
 - Article 3: Tech Sector Drives Record-Breaking Market Performance (Published: 2025-01-21)
 - Article 6: Energy Sector Experiences Decline Amid Falling Oil Prices (Published: 2025-01-23)
  Suggested to retain: Article 1 - Tech Stocks Surge Amid Market Rally

Cluster 0:
 - Article 2: Global Markets Rally on Positive Economic Data (Published: 2025-01-20)
 - Article 4: Market Rally: Tech Stocks and Economic Recovery Fuel Optimism (Published: 2025-01-20)
 - Article 7: Renewable Energy Investments Rise Despite Market Uncertainty (Published: 2025-01-24)
 - Article 8: Banking Sector Under Pressure as Loan Defaults Increase (Published: 2025-01-25)
 - Article 9: Retail Stocks Steady Amid Economic Fluctuations (Published: 2025-01-26)
 - Article 10: Consumer Confidence Rises as Unemployment Drops (Published: 2025-01-27)
  Suggested to retain: Article 2 - G