In [1]:
import time

import argparse
import os
import random
import re
from datetime import datetime, timedelta

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import NearestNeighbors

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from tqdm import tqdm
from collections import Counter

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/simoneritt/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/simoneritt/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/simoneritt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
%load_ext line_profiler

In [3]:
# Constants
DEFAULT_NUM_ARTICLES = 3

START_DATE = '2025-03-01'
END_DATE = '2025-03-03'
INPUT_PATH = './news/news_2025_03.csv'
OUTPUT_PATH = './result'

In [7]:
def load_data(file_path):
    df = pd.read_csv(file_path)
    df.drop_duplicates(subset=['text'], keep='first', inplace=True)
    df.drop_duplicates(subset=['title'], keep='first', inplace=True)
    # df = df[df['text'].apply(len) > 800]
    # df = df[df['text'].apply(len) < 10000]
    # df = df[df['title'].str.contains('Opinion:') == False]
    df = df.reset_index(drop = True)
    return df

def preprocess_text(text):
    """Clean and preprocess text data."""
    tokens = nltk.word_tokenize(text)
    tokens = [re.sub(r'[^a-zA-Z]', '', token.lower()) for token in tokens]
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)


def vectorize_text(news_text, n_components=100):
    """
    TF-IDF vectorization and dimension reducation of text data.
    """
    print("Preprocessing text data...")
    preprocessed_text = [preprocess_text(text) for text in news_text]
    
    print("Vectorizing text data with TF-IDF...")
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(preprocessed_text)
    
    if n_components and len(news_text) >= n_components:
        print("Reducing dimensionality with PCA...")
        pca = PCA(n_components=n_components)
        X = pca.fit_transform(X.toarray())
    
    return X


def adjust_dbscan_params(X, k=5):
    """
    Adjust DBSCAN parameters `eps` and `min_samples` based on the dataset X.
    """
    if X.shape[0] < k:
        return 0.5, 2  # Default fallback for small datasets
    
    nbrs = NearestNeighbors(n_neighbors=k).fit(X)
    distances, _ = nbrs.kneighbors(X)
    sorted_distances = np.sort(distances[:, k - 1], axis=0)
    
    eps = np.percentile(sorted_distances, 90)
    min_samples = max(2, int(np.log(len(X))))
    
    print(f"Adjusted DBSCAN Params → eps: {eps:.4f}, min_samples: {min_samples}")
    return eps, min_samples


def cluster_texts(X, eps=0.5, min_samples=3):
    """
    Cluster vectorized news articles using DBSCAN.
    """
    print("Clustering text data using DBSCAN...")
    dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric='cosine')
    dbscan.fit(X)
    return dbscan.labels_


def select_top_articles(data, labels, X, avg_distance_threshold=0.6):
    """
    Select top articles - one from each top-k valid clusters with largest cluster size.
    Parameters:
        - data (pd.DataFrame): The article dataset.
        - labels (array): Cluster labels from DBSCAN.
        - X (array): Cluster data points.
        - avg_distance_threshold (float): Max average pairwise distance for valid clusters.
    Returns:
        - selected_articles (pd.DataFrame): Selected top articles.
    """
    # Group indices by cluster
    clusters = {}
    for i, label in enumerate(labels):
        if label not in clusters:
            clusters[label] = []
        clusters[label].append(i)
    
    valid_clusters = []
    
    # Check for validity of each cluster based on average pairwise distance
    for cluster_id, indices in clusters.items():
        if cluster_id == -1 or len(indices) < 2:
            continue  # Skip noise and tiny clusters
        
        cluster_points = X[indices]
        avg_distance = np.mean(pairwise_distances(cluster_points, metric='cosine'))
        
        # Only consider clusters with avg_distance <= avg_distance_threshold
        if avg_distance <= avg_distance_threshold:
            valid_clusters.append((cluster_id, len(indices), avg_distance))
    
    # Sort valid clusters by size (descending)
    valid_clusters = sorted(valid_clusters, key=lambda x: x[1], reverse=True)
    
    # Sanity check
    print(f"--- Titles in Top-{DEFAULT_NUM_ARTICLES} Valid Clusters ---")
    for i, (cluster_id, size, avg_distance) in enumerate(valid_clusters[:DEFAULT_NUM_ARTICLES], start=1):
        cluster_indices = [idx for idx, lbl in enumerate(labels) if lbl == cluster_id]
        cluster_titles = data.iloc[cluster_indices]['title'].tolist()
        print(f"\nCluster {i} (ID: {cluster_id}, Size: {size}, Avg Distance: {avg_distance:.4f}) Titles:")
        for title in cluster_titles:
            print(f"- {title}")
    print("--------------------------------------\n")
    
    selected_indices = set()
    selected_articles = []
    
    # Pick one random article from each of the top-K valid clusters
    for cluster_id, _, _ in valid_clusters[:DEFAULT_NUM_ARTICLES]:
        idx = random.choice(clusters[cluster_id])
        selected_indices.add(idx)
        selected_articles.append(data.iloc[[idx]])
    
    # Add random articles if fewer than DEFAULT_NUM_ARTICLES are selected
    while len(selected_articles) < DEFAULT_NUM_ARTICLES:
        idx = random.randint(0, len(data) - 1)
        if idx not in selected_indices:
            selected_indices.add(idx)
            selected_articles.append(data.iloc[[idx]])
    
    return pd.concat(selected_articles, ignore_index=True)


def process_date(date, data, output_path):
    """
    Process and save selected articles for a specific date.
    """
    daily_data = data[data['date'] == date]
    if daily_data.empty:
        print(f"No data for date: {date}")
        return
    
    news_text = daily_data['text'].tolist()
    X = vectorize_text(news_text)
    # eps, min_samples = adjust_dbscan_params(X)
    eps, min_samples = 0.5, 3
    labels = cluster_texts(X, eps=eps, min_samples=min_samples)
    
    # Sanity check - cluster statistics
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise_points = list(labels).count(-1)
    n_grouped_points = len(labels) - n_noise_points
    total_samples = len(labels)
    
    print(f"\n--- Cluster Statistics ---")
    print(f"Number of clusters: {n_clusters}")
    print(f"Total number of samples: {total_samples}")
    print(f"Number of grouped points: {n_grouped_points}")
    print(f"Number of noise points: {n_noise_points}")
    print("---------------------------\n")
    
    selected_articles = select_top_articles(
        daily_data,
        labels,
        X,
        avg_distance_threshold=0.7
    )
    
    save_path = os.path.join(output_path, date)
    os.makedirs(save_path, exist_ok=True)
    selected_articles.to_csv(os.path.join(save_path, 'articles_selected.csv'), index=False)
    print(f"Articles for {date} saved successfully!")


def main():
    data = load_data(INPUT_PATH)
    
    start_date = datetime.strptime(START_DATE, "%Y-%m-%d")
    end_date = datetime.strptime(END_DATE, "%Y-%m-%d")
    
    date_range = pd.date_range(start=start_date, end=end_date)
    
    for current_date in tqdm(date_range, desc="Processing Dates"):
        date_str = current_date.strftime("%Y-%m-%d")
        print(f"\nProcessing date: {date_str}")
        process_date(date_str, data, OUTPUT_PATH) 
    
    return data

In [8]:
%lprun -f main data = main()

Processing Dates:   0%|          | 0/3 [00:00<?, ?it/s]


Processing date: 2025-03-01
Preprocessing text data...


Processing Dates:  33%|███▎      | 1/3 [00:00<00:01,  1.16it/s]

Vectorizing text data with TF-IDF...
Reducing dimensionality with PCA...
Clustering text data using DBSCAN...

--- Cluster Statistics ---
Number of clusters: 7
Total number of samples: 131
Number of grouped points: 29
Number of noise points: 102
---------------------------

--- Titles in Top-3 Valid Clusters ---

Cluster 1 (ID: 0, Size: 11, Avg Distance: 0.4470) Titles:
- Inside the 139 minutes that upended the US-Ukraine alliance
- Zelensky says Trump’s backing is ‘crucial’ after US president berated him at White House
- Trump objected to Zelenskyy's tone and body language in Oval Office clash, White House says
- Zelenskyy seeks support at emergency European summit after bruising Trump encounter
- NATO's Rutte urges Zelenskyy to mend his relationship with Trump
- White House clash boosts pressure on Europe to aid Ukraine without U.S.
- Opinion | Zelensky doesn’t hold the cards. But he can still make a deal.
- How the Trump-Zelensky Oval Office meeting spiraled into chaos
- The debacle

Processing Dates:  67%|██████▋   | 2/3 [00:01<00:00,  1.11it/s]

Clustering text data using DBSCAN...

--- Cluster Statistics ---
Number of clusters: 6
Total number of samples: 111
Number of grouped points: 23
Number of noise points: 88
---------------------------

--- Titles in Top-3 Valid Clusters ---

Cluster 1 (ID: 0, Size: 8, Avg Distance: 0.5255) Titles:
- Europe seeks to take control of Ukraine negotiations, after Zelensky’s nightmare Trump visit sparks panic
- White House amplifies rave reviews for Trump’s handling of Zelensky showdown as Europe rallies around Ukraine
- Starmer says Europe faces a 'once in a generation moment' as leaders discuss ending war in Ukraine
- After Trump clash, Ukraine's Zelenskyy gets warm UK welcome before European summit
- European leaders ‘doubling down’ on backing Zelensky after Trump blowup
- Zelensky should apologize to Trump to help Ukraine - The Washington Post
- Trump-Zelensky meeting: The end of the geopolitical ‘West’
- UK prime minister unveils steps toward a Ukraine peace deal, urges US cooperation

C

Processing Dates: 100%|██████████| 3/3 [00:03<00:00,  1.12s/it]

Clustering text data using DBSCAN...

--- Cluster Statistics ---
Number of clusters: 21
Total number of samples: 271
Number of grouped points: 159
Number of noise points: 112
---------------------------

--- Titles in Top-3 Valid Clusters ---

Cluster 1 (ID: 8, Size: 46, Avg Distance: 0.4418) Titles:
- The crypto president has some ideas for your tax dollars
- Bitcoin tumbles 9%, reversing most of the rally from Trump's crypto reserve announcement
- Bitcoin gives back gains driven by President Trump's crypto reserve announcement: CNBC Crypto World
- Leon Cooperman says he's selling into market strength and holding lots of cash
- Southeast Asia and India are defining fintech around the world: Pine Labs CEO
- Bitcoin erases gains from Trump's crypto reserve announcement
- Watch Monday's full episode of Fast Money — March 3, 2025
- Pantera's legal chief on how a U.S. crypto reserve could impact digital asset prices
- Fast Money: SIEGY, GOOGL, MCHI, CI
- Bitcoin gives back gains after Trum




Timer unit: 1e-09 s

Total time: 3.69143 s
File: /var/folders/tv/ry2w9b4n2p5bl45w_bdb30r80000gn/T/ipykernel_5683/1272839139.py
Function: main at line 174

Line #      Hits         Time  Per Hit   % Time  Line Contents
   174                                           def main():
   175         1  323480000.0    3e+08      8.8      data = load_data(INPUT_PATH)
   176                                               
   177         1      29000.0  29000.0      0.0      start_date = datetime.strptime(START_DATE, "%Y-%m-%d")
   178         1       8000.0   8000.0      0.0      end_date = datetime.strptime(END_DATE, "%Y-%m-%d")
   179                                               
   180         1     141000.0 141000.0      0.0      date_range = pd.date_range(start=start_date, end=end_date)
   181                                               
   182         4    3256000.0 814000.0      0.1      for current_date in tqdm(date_range, desc="Processing Dates"):
   183         3      79000.0  26333.

In [9]:
%lprun -f process_date process_date('2025-03-01', data, OUTPUT_PATH)

Preprocessing text data...
Vectorizing text data with TF-IDF...
Reducing dimensionality with PCA...
Clustering text data using DBSCAN...

--- Cluster Statistics ---
Number of clusters: 7
Total number of samples: 131
Number of grouped points: 30
Number of noise points: 101
---------------------------

--- Titles in Top-3 Valid Clusters ---

Cluster 1 (ID: 0, Size: 11, Avg Distance: 0.4411) Titles:
- Inside the 139 minutes that upended the US-Ukraine alliance
- Zelensky says Trump’s backing is ‘crucial’ after US president berated him at White House
- Trump objected to Zelenskyy's tone and body language in Oval Office clash, White House says
- Zelenskyy seeks support at emergency European summit after bruising Trump encounter
- NATO's Rutte urges Zelenskyy to mend his relationship with Trump
- White House clash boosts pressure on Europe to aid Ukraine without U.S.
- Opinion | Zelensky doesn’t hold the cards. But he can still make a deal.
- How the Trump-Zelensky Oval Office meeting spiral

Timer unit: 1e-09 s

Total time: 0.893436 s
File: /var/folders/tv/ry2w9b4n2p5bl45w_bdb30r80000gn/T/ipykernel_5683/1272839139.py
Function: process_date at line 133

Line #      Hits         Time  Per Hit   % Time  Line Contents
   133                                           def process_date(date, data, output_path):
   134                                               """
   135                                               Process and save selected articles for a specific date.
   136                                               """
   137         1    1971000.0    2e+06      0.2      daily_data = data[data['date'] == date]
   138         1      10000.0  10000.0      0.0      if daily_data.empty:
   139                                                   print(f"No data for date: {date}")
   140                                                   return
   141                                               
   142         1      76000.0  76000.0      0.0      news_text = daily_data['text