In [34]:
import sqlite3, os
import pandas as pd, numpy as np
import faiss
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

In [35]:
def load_raw_articles():
    conn = sqlite3.connect("../data/financial_news.db")
    cur = conn.cursor()

    cur.execute(
        """
        SELECT id, title, content FROM raw_news ORDER BY id
        """
    )
    conn.commit()
    articles = cur.fetchall()

    cur.close()
    conn.close()
    return pd.DataFrame(articles)

df = load_raw_articles()
print(df.shape)
df.head()

(400, 3)


Unnamed: 0,0,1,2
0,1,Worried About Inflation? These 3 ETFs Offer Re...,Inflation has slowed but remains a major conce...
1,2,Intel’s Black Friday Breakout: Apple Rumors Fu...,A holiday stock surge fueled by credible Apple...
2,3,Klarna's Crypto Play: A Plan to Fix Its Profit...,Klarna's launch of a stablecoin is a strategic...
3,4,Meta Platforms May Ditch NVIDIA Chips—Here’s W...,Meta Platforms may be looking to alter where i...
4,5,SoFi Technologies: From Fintech Speculation to...,SoFi Technologies is proving its long-term val...


In [36]:
# Title
len(df[1].iloc[2]), df[1].iloc[2]

(54, "Klarna's Crypto Play: A Plan to Fix Its Profit Problem")

In [37]:
# Content
len(df[2].iloc[2]), df[2].iloc[2]

(151,
 "Klarna's launch of a stablecoin is a strategic pivot to lower transaction costs and challenge legacy payment networks, signaling a new era for fintech.")

In [38]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Embed each article by combining title and content
texts = (df[1] + " " + df[2]).tolist()
print(len(texts))

400


In [39]:
texts[2]

"Klarna's Crypto Play: A Plan to Fix Its Profit Problem Klarna's launch of a stablecoin is a strategic pivot to lower transaction costs and challenge legacy payment networks, signaling a new era for fintech."

In [40]:
print(texts[217], "\n---\n", texts[261])

J&K Bank expects credit growth to exceed 12%, plans capital raise Jammu & Kashmir Bank is preparing to raise up to ₹1,250 crore in capital to fund expansion, likely in the fourth quarter or early next fiscal, depending on market valuation. J&amp;K Bank continues to maintain its profitability guidance and sees its stock as undervalued while loan growth and digital initiatives drive momentum. 
---
 J&K Bank to raise ₹750 crore equity via QIP, ₹500 crore through tier 2 bonds In addition, the board approved raising up to ₹500 crore via non-convertible, redeemable, unsecured, Basel III-compliant Tier 2 bonds. Shares of Jammu and Kashmir Bank Ltd ended at ₹106.40, up by ₹1.20, or 1.14%, on the BSE.


In [41]:
embeddings = model.encode(texts, show_progress_bar=True)
embeddings = np.array(embeddings).astype("float32")
embeddings.shape

Batches: 100%|██████████| 13/13 [00:08<00:00,  1.59it/s]


(400, 384)

#### **Build FAISS Index**

In [42]:
dim = embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(embeddings)

In [43]:
k = 6  
distances, indices = index.search(embeddings, k)

In [44]:
type(distances), type(indices)
distances[2]

array([0.       , 1.097122 , 1.2353373, 1.2460734, 1.2585977, 1.2703987],
      dtype=float32)

In [45]:
def l2_to_cosine(l2_dist):
    return 1 - (l2_dist / 2)

clusters = []
visited = set()

for i, neighbors in enumerate(indices):
    if i in visited:
        continue

    cluster = []
    for j, dist in zip(neighbors, distances[i]):
        cos_sim = l2_to_cosine(dist)
        if cos_sim > 0.80:
            cluster.append(j)
            visited.add(j)
    clusters.append(cluster)


len(clusters), clusters[210: 220]

(386,
 [[np.int64(214)],
  [np.int64(215)],
  [np.int64(216)],
  [np.int64(217), np.int64(261)],
  [np.int64(218)],
  [np.int64(219)],
  [np.int64(220), np.int64(185)],
  [np.int64(221)],
  [np.int64(222)],
  [np.int64(223)]])

In [46]:
unique_stories = []
for group in clusters:
    # print(group)
    articles = [df.iloc[i] for i in group]
    # print(articles)
    

    unique_stories.append({
        "article_ids": articles[0].tolist(),
        "num_articles": len(articles),
        "article_title": articles[0][1],
        "combined_text": " ".join([a[2] for a in articles]),
    })
    # print(unique_stories)
    # break

len(unique_stories)

386

In [47]:
def create_stories_table():
    conn = sqlite3.connect("../data/financial_news.db")
    cur = conn.cursor()

    cur.execute(
        """ 
        CREATE TABLE IF NOT EXISTS unique_news (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            article_ids TEXT,
            article_title TEXT,
            combined_text TEXT,
            num_articles INT
            )
        """
    )
    conn.commit()
    cur.close()
    conn.close()

In [48]:
def save_unique_story(story):
    conn = sqlite3.connect("../data/financial_news.db")
    cur = conn.cursor()

    cur.execute(
        """ 
        INSERT INTO unique_news (article_ids, article_title, combined_text, num_articles)
        VALUES (?, ?, ?, ?)
        """, (
            str(story["article_ids"]),
            story["article_title"],
            story["combined_text"],
            story["num_articles"]
        )
    )
    conn.commit()
    cur.close()
    conn.close()


create_stories_table()

for story in unique_stories:
    save_unique_story(story)

print("SUCCESS")

SUCCESS


In [49]:
for story in unique_stories[212:215]:
    print("Story:", story["article_title"])
    print("Content:", story["combined_text"])
    print("Articles:", story["article_ids"])
    print("Count:", story["num_articles"])
    print("---")

Story: How the New Labour Codes will expand insurance and social security coverage
Content: The Government of India’s Four Labour Codes unify 29 laws, extending EPF, ESI, gratuity, and social security to all workers, with Aadhaar-linked portability and standardised workplace benefits.
Articles: [np.int64(217), 'How the New Labour Codes will expand insurance and social security coverage', 'The Government of India’s Four Labour Codes unify 29 laws, extending EPF, ESI, gratuity, and social security to all workers, with Aadhaar-linked portability and standardised workplace benefits.']
Count: 1
---
Story: J&K Bank expects credit growth to exceed 12%, plans capital raise
Content: Jammu & Kashmir Bank is preparing to raise up to ₹1,250 crore in capital to fund expansion, likely in the fourth quarter or early next fiscal, depending on market valuation. J&amp;K Bank continues to maintain its profitability guidance and sees its stock as undervalued while loan growth and digital initiatives drive

In [50]:
conn = sqlite3.connect("../data/financial_news.db")
cur = conn.cursor()
cur.execute(
    """ 
    SELECT * FROM unique_news WHERE id BETWEEN 214 AND 220
    """
)
result = cur.fetchall()
cur.close()
conn.close()

In [51]:
result

[(214,
  "[np.int64(218), 'J&K Bank expects credit growth to exceed 12%, plans capital raise', 'Jammu & Kashmir Bank is preparing to raise up to ₹1,250 crore in capital to fund expansion, likely in the fourth quarter or early next fiscal, depending on market valuation. J&amp;K Bank continues to maintain its profitability guidance and sees its stock as undervalued while loan growth and digital initiatives drive momentum.']",
  'J&K Bank expects credit growth to exceed 12%, plans capital raise',
  'Jammu & Kashmir Bank is preparing to raise up to ₹1,250 crore in capital to fund expansion, likely in the fourth quarter or early next fiscal, depending on market valuation. J&amp;K Bank continues to maintain its profitability guidance and sees its stock as undervalued while loan growth and digital initiatives drive momentum. In addition, the board approved raising up to ₹500 crore via non-convertible, redeemable, unsecured, Basel III-compliant Tier 2 bonds. Shares of Jammu and Kashmir Bank Lt