In [3]:
import sqlite3, os
import pandas as pd, numpy as np
import faiss
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

In [4]:
def load_raw_articles():
    conn = sqlite3.connect("../data/financial_news.db")
    cur = conn.cursor()

    cur.execute(
        """
        SELECT id, title, content FROM raw_news ORDER BY id
        """
    )
    conn.commit()
    articles = cur.fetchall()

    cur.close()
    conn.close()
    return pd.DataFrame(articles)

df = load_raw_articles()
print(df.shape)
df.head()

(400, 3)


Unnamed: 0,0,1,2
0,1,Worried About Inflation? These 3 ETFs Offer Re...,Inflation has slowed but remains a major conce...
1,2,Intel’s Black Friday Breakout: Apple Rumors Fu...,A holiday stock surge fueled by credible Apple...
2,3,Klarna's Crypto Play: A Plan to Fix Its Profit...,Klarna's launch of a stablecoin is a strategic...
3,4,Meta Platforms May Ditch NVIDIA Chips—Here’s W...,Meta Platforms may be looking to alter where i...
4,5,SoFi Technologies: From Fintech Speculation to...,SoFi Technologies is proving its long-term val...


In [5]:
# Title
len(df[1].iloc[2]), df[1].iloc[2]

(54, "Klarna's Crypto Play: A Plan to Fix Its Profit Problem")

In [6]:
# Content
len(df[2].iloc[2]), df[2].iloc[2]

(151,
 "Klarna's launch of a stablecoin is a strategic pivot to lower transaction costs and challenge legacy payment networks, signaling a new era for fintech.")

In [7]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Embed each article by combining title and content
texts = (df[1] + " " + df[2]).tolist()
print(len(texts))

400


In [8]:
texts[2]

"Klarna's Crypto Play: A Plan to Fix Its Profit Problem Klarna's launch of a stablecoin is a strategic pivot to lower transaction costs and challenge legacy payment networks, signaling a new era for fintech."

In [9]:
print(texts[217], "\n---\n", texts[261])

Global air travel demand jumps 6.6% in October, as per IATA IATA reports global air passenger traffic grew 6.6% in October 2025, with Asia-Pacific and Europe leading gains and strong holiday travel expected. 
---
 Apple contests India’s new antitrust penalty law in Delhi High Court, flags $38 billion risk Apple has challenged India’s new antitrust penalty law in Delhi High Court, risking up to 38 billion dollars in fines, after CCI accused it of abusive conduct in the iOS app market.


In [10]:
embeddings = model.encode(texts, show_progress_bar=True)
embeddings = np.array(embeddings).astype("float32")
embeddings.shape

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches: 100%|██████████| 13/13 [00:07<00:00,  1.81it/s]


(400, 384)

#### **Build FAISS Index**

In [11]:
dim = embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(embeddings)

In [12]:
k = 6  
distances, indices = index.search(embeddings, k)

In [13]:
type(distances), type(indices)
distances[2]

array([0.       , 1.0971218, 1.2460732, 1.2585977, 1.270399 , 1.2768719],
      dtype=float32)

In [14]:
def l2_to_cosine(l2_dist):
    return 1 - (l2_dist / 2)

clusters = []
visited = set()

for i, neighbors in enumerate(indices):
    if i in visited:
        continue

    cluster = []
    for j, dist in zip(neighbors, distances[i]):
        cos_sim = l2_to_cosine(dist)
        if cos_sim > 0.80:
            cluster.append(j)
            visited.add(j)
    clusters.append(cluster)


len(clusters), clusters[210: 220]

(388,
 [[np.int64(212), np.int64(213)],
  [np.int64(214)],
  [np.int64(215)],
  [np.int64(216)],
  [np.int64(217)],
  [np.int64(218)],
  [np.int64(219)],
  [np.int64(220)],
  [np.int64(221)],
  [np.int64(222)]])

In [15]:
unique_stories = []
for group in clusters:
    # print(group)
    articles = [df.iloc[i] for i in group]
    # print(articles)
    

    unique_stories.append({
        "article_ids": [a[0] for a in articles],
        "num_articles": len(articles),
        "article_title": articles[0][1],
        "combined_text": " ".join([a[2] for a in articles]),
    })
    # print(unique_stories)
    

len(unique_stories)

388

In [16]:
def create_stories_table():
    conn = sqlite3.connect("../data/financial_news.db")
    cur = conn.cursor()

    cur.execute(
        """ 
        CREATE TABLE IF NOT EXISTS unique_news (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            article_ids TEXT,
            article_title TEXT,
            combined_text TEXT,
            num_articles INT
            )
        """
    )
    conn.commit()
    cur.close()
    conn.close()

In [17]:
for story in unique_stories:
    print(
            str(story["article_ids"]),"\n",
            story["article_title"],"\n",
            story["combined_text"],"\n",
            story["num_articles"]
        )
    break

[np.int64(1)] 
 Worried About Inflation? These 3 ETFs Offer Real Protection 
 Inflation has slowed but remains a major concern for many investors; these ETFs can help provide a buffer through the use of TIPS, commodities, or T-Bills. 
 1


In [18]:
def save_unique_story(story):
    conn = sqlite3.connect("../data/financial_news.db")
    cur = conn.cursor()

    cur.execute(
        """ 
        INSERT INTO unique_news (article_ids, article_title, combined_text, num_articles)
        VALUES (?, ?, ?, ?)
        """, (
            str(story["article_ids"]),
            story["article_title"],
            story["combined_text"],
            story["num_articles"]
        )
    )
    conn.commit()
    cur.close()
    conn.close()


create_stories_table()

for story in unique_stories:
    save_unique_story(story)

print("SUCCESS")

SUCCESS


In [19]:
for story in unique_stories[212:215]:
    print("Story:", story["article_title"])
    print("Content:", story["combined_text"])
    print("Articles:", story["article_ids"])
    print("Count:", story["num_articles"])
    print("---")

Story: Adani Group eyes up to $5 billion investment in Google’s India AI data centre
Content: Adani Group’s potential investment comes as global tech giants and Indian conglomerates accelerate spending on high-capacity data infrastructure. With Google planning a large AI-focused campus in Visakhapatnam, industry players say India’s data centre race is entering a new scale, driven by surging demand for computing power.
Articles: [np.int64(216)]
Count: 1
---
Story: WinZO promoters used 'deceptive' algorithm to cheat gamers: ED
Content: The promoters of online real money gaming company WinZO "cheated" genuine players by using a "deceptive" algorithm leading to the generation of illicit funds of ₹177 crore in about a year's time, the ED has alleged.
Articles: [np.int64(217)]
Count: 1
---
Story: Global air travel demand jumps 6.6% in October, as per IATA
Content: IATA reports global air passenger traffic grew 6.6% in October 2025, with Asia-Pacific and Europe leading gains and strong holida

In [20]:
conn = sqlite3.connect("../data/financial_news.db")
cur = conn.cursor()
cur.execute(
    """ 
    SELECT * FROM unique_news WHERE id BETWEEN 214 AND 220
    """
)
result = cur.fetchall()
cur.close()
conn.close()

In [21]:
result

[(214,
  '[np.int64(217)]',
  "WinZO promoters used 'deceptive' algorithm to cheat gamers: ED",
  'The promoters of online real money gaming company WinZO "cheated" genuine players by using a "deceptive" algorithm leading to the generation of illicit funds of ₹177 crore in about a year\'s time, the ED has alleged.',
  1),
 (215,
  '[np.int64(218)]',
  'Global air travel demand jumps 6.6% in October, as per IATA',
  'IATA reports global air passenger traffic grew 6.6% in October 2025, with Asia-Pacific and Europe leading gains and strong holiday travel expected.',
  1),
 (216,
  '[np.int64(219)]',
  'Loca Loka targets 5% of India’s tequila market in first year after $12.5 million fundraise',
  'Loca Loka’s founders say India’s tequila moment has finally arrived, and their strategy blends global validation with aggressive domestic expansion. After testing the brand in the US and securing competition wins, the trio is now rolling out the product across major Indian cities, confident that 