In [1]:
import sqlite3, os
import pandas as pd, numpy as np
import faiss
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_raw_articles():
    conn = sqlite3.connect("../data/financial_news.db")
    cur = conn.cursor()

    cur.execute(
        """
        SELECT id, title, content FROM raw_news ORDER BY id
        """
    )
    conn.commit()
    articles = cur.fetchall()

    cur.close()
    conn.close()
    return pd.DataFrame(articles)

df = load_raw_articles()
print(df.shape)
df.head()

(400, 3)


Unnamed: 0,0,1,2
0,1,Worried About Inflation? These 3 ETFs Offer Re...,Inflation has slowed but remains a major conce...
1,2,Intel’s Black Friday Breakout: Apple Rumors Fu...,A holiday stock surge fueled by credible Apple...
2,3,Klarna's Crypto Play: A Plan to Fix Its Profit...,Klarna's launch of a stablecoin is a strategic...
3,4,Meta Platforms May Ditch NVIDIA Chips—Here’s W...,Meta Platforms may be looking to alter where i...
4,5,SoFi Technologies: From Fintech Speculation to...,SoFi Technologies is proving its long-term val...


In [3]:
# Title
len(df[1].iloc[2]), df[1].iloc[2]

(54, "Klarna's Crypto Play: A Plan to Fix Its Profit Problem")

In [4]:
# Content
len(df[2].iloc[2]), df[2].iloc[2]

(151,
 "Klarna's launch of a stablecoin is a strategic pivot to lower transaction costs and challenge legacy payment networks, signaling a new era for fintech.")

In [5]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Embed each article by combining title and content
texts = (df[1] + " " + df[2]).tolist()
print(len(texts))

400


In [6]:
texts[2]

"Klarna's Crypto Play: A Plan to Fix Its Profit Problem Klarna's launch of a stablecoin is a strategic pivot to lower transaction costs and challenge legacy payment networks, signaling a new era for fintech."

In [7]:
embeddings = model.encode(texts, show_progress_bar=True)
embeddings = np.array(embeddings).astype("float32")
embeddings.shape

Batches: 100%|██████████| 13/13 [00:07<00:00,  1.74it/s]


(400, 384)

#### **Build FAISS Index**

In [8]:
dim = embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(embeddings)

In [9]:
k = 6  
distances, indices = index.search(embeddings, k)

In [10]:
type(distances), type(indices)
distances[2]

array([0.       , 1.097122 , 1.2353373, 1.2460734, 1.2585977, 1.2703987],
      dtype=float32)

In [11]:
def l2_to_cosine(l2_dist):
    return 1 - (l2_dist / 2)

clusters = []
visited = set()

for i, neighbors in enumerate(indices):
    if i in visited:
        continue

    cluster = []
    for j, dist in zip(neighbors, distances[i]):
        cos_sim = l2_to_cosine(dist)
        if cos_sim > 0.80:
            cluster.append(j)
            visited.add(j)
    clusters.append(cluster)


len(clusters),

(386,)

In [12]:
unique_stories = []
for group in clusters:
    articles = df.iloc[group]
    
    story_text = " ".join(articles[1].tolist())

    unique_stories.append({
        "article_ids": articles[0].tolist(),
        "num_articles": len(articles),
        "article_title": articles[1].iloc[0],
        "combined_text": story_text
    })

len(unique_stories)

386

In [13]:
def create_stories_table():
    conn = sqlite3.connect("../data/financial_news.db")
    cur = conn.cursor()

    cur.execute(
        """ 
        CREATE TABLE IF NOT EXISTS unique_news (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            article_ids TEXT,
            article_title TEXT,
            combined_text TEXT,
            num_articles INT
            )
        """
    )
    conn.commit()
    cur.close()
    conn.close()

In [14]:
def save_unique_story(story):
    conn = sqlite3.connect("../data/financial_news.db")
    cur = conn.cursor()

    cur.execute(
        """ 
        INSERT INTO unique_news (article_ids, article_title, combined_text, num_articles)
        VALUES (?, ?, ?, ?)
        """, (
            str(story["article_ids"]),
            story["article_title"],
            story["combined_text"],
            story["num_articles"]
        )
    )
    conn.commit()
    cur.close()
    conn.close()


create_stories_table()

for story in unique_stories:
    save_unique_story(story)

print("SUCCESS")

OperationalError: table unique_news has no column named article_ids

In [15]:
for story in unique_stories[:5]:
    print("Story:", story["article_title"])
    print("Articles:", story["article_ids"])
    print("Count:", story["num_articles"])
    print("---")

Story: Worried About Inflation? These 3 ETFs Offer Real Protection
Articles: [1]
Count: 1
---
Story: Intel’s Black Friday Breakout: Apple Rumors Fuel a Holiday Rally
Articles: [2]
Count: 1
---
Story: Klarna's Crypto Play: A Plan to Fix Its Profit Problem
Articles: [3]
Count: 1
---
Story: Meta Platforms May Ditch NVIDIA Chips—Here’s Why Investors Care
Articles: [4]
Count: 1
---
Story: SoFi Technologies: From Fintech Speculation to Profit Engine
Articles: [5]
Count: 1
---
