In [1]:
import json
from collections import Counter
from pathlib import Path

import pandas as pd
from tqdm.auto import tqdm

# Clustering & classification
import hdbscan
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import normalize

from processing import prepare_text, extract_keywords, embed_texts, analyze_sentiment

def get_keywords_per_cluster(
    df,
    keywords_col="keywords",
    cluster_col="cluster_id",
    top_n=10
):
    """
    Summarise every cluster by its most frequent keywords.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``cluster_col`` and ``keywords_col``.
    keywords_col : str
        Column holding one list of keywords per row. Cells that are not
        ``list`` instances (e.g. ``None``) are ignored.
    cluster_col : str
        Column holding the cluster label of each row. The label ``-1``
        (HDBSCAN noise) is skipped.
    top_n : int
        Maximum number of keywords reported per cluster.

    Returns
    -------
    pandas.DataFrame
        Columns ``cluster_id | top_keywords``, one row per cluster that has
        at least one keyword, ordered by ascending cluster id. The two
        columns are present even when no cluster qualifies, so callers can
        always select them.
    """
    results = []

    # Ignore HDBSCAN noise
    clusters = sorted(c for c in df[cluster_col].unique() if c != -1)

    for cluster_id in clusters:
        keyword_lists = df.loc[df[cluster_col] == cluster_id, keywords_col]

        # Flatten and count in one pass; iteration order follows row order,
        # which is what decides ties in most_common().
        counter = Counter(
            kw
            for kws in keyword_lists
            if isinstance(kws, list)
            for kw in kws
        )

        if not counter:
            continue

        results.append({
            "cluster_id": cluster_id,
            "top_keywords": [kw for kw, _ in counter.most_common(top_n)]
        })

    # Explicit columns keep the schema stable when `results` is empty.
    return pd.DataFrame(results, columns=["cluster_id", "top_keywords"])

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# ================= LOAD DATA ================= #
df1 = pd.read_csv("dawn_content.csv")
df2 = pd.read_csv("dawn_links.csv")

# Left-join the second table onto the first by "id" (clashing right-hand
# columns get a "_y" suffix), expose the raw "text" column as "content",
# and keep only the columns the pipeline uses.
df = (
    df1.merge(df2, on="id", how="left", suffixes=("", "_y"))
    .rename(columns={"text": "content"})
    [["id", "title", "content", "url", "date", "location"]]
)

# Registers DataFrame/Series.progress_apply
tqdm.pandas()

# ---- Step 1: Prepare text (ALL rows) ----
df["text"] = df.progress_apply(
    lambda row: prepare_text(row["title"], row["content"]),
    axis=1,
)

# ---- Step 2: Keywords (FIRST 20,000 rows ONLY) ----
N_KEYWORDS = 20_000

df["keywords"] = None

# .loc slices are label-based and inclusive, hence the "- 1"
keyword_rows = slice(None, N_KEYWORDS - 1)
df.loc[keyword_rows, "keywords"] = df.loc[keyword_rows, "text"].progress_apply(
    extract_keywords
)

# ---- Step 3: Embeddings (ALL rows, unchanged) ----
embeddings = embed_texts(df["text"].tolist())


100%|██████████| 88851/88851 [00:01<00:00, 56970.12it/s]
100%|██████████| 20000/20000 [2:56:07<00:00,  1.89it/s]  
Batches: 100%|██████████| 1389/1389 [1:40:43<00:00,  4.35s/it]


ValueError: Unrecognized metric 'cosine'

In [None]:
# ---- Step 4: Clustering ----
# Cluster a fixed-seed random sample rather than every row.
N_CLUSTER_SAMPLE = 30_000

df_cluster = df.sample(
    n=min(N_CLUSTER_SAMPLE, len(df)),  # never ask for more rows than exist
    random_state=42
)

# NOTE(review): indexing `embeddings` with df_cluster.index assumes row i of
# `embeddings` belongs to the df row labelled i (default RangeIndex) — confirm.
emb_cluster = embeddings[df_cluster.index]

# Rows are L2-normalised (scikit-learn's default norm), so Euclidean distance
# between them orders pairs the same way cosine distance would.
emb_cluster = normalize(emb_cluster)

clusterer = hdbscan.HDBSCAN(
    min_cluster_size=40,
    min_samples=10,
    metric="euclidean",
    cluster_selection_method="eom"
)

# Fit first: df_cluster has no "cluster_id" column until fit_predict has run,
# so the labels must exist before they can be copied back to df.
df_cluster["cluster_id"] = clusterer.fit_predict(emb_cluster)

# Copy the sample's labels back onto the full frame; rows outside the sample
# keep -1.
df["cluster_id"] = -1
df.loc[df_cluster.index, "cluster_id"] = df_cluster["cluster_id"]


# # ---- Step 5: Cluster keywords ----
# cluster_keywords = get_keywords_per_cluster(df_cluster)
# print(cluster_keywords)

   cluster_id                                       top_keywords
0           0  [grand prix qualifying, astonishing formula po...
1           1  [barca held celta, relegation zone coming, str...
2           2  [open, women draw, kyrgios australian open, na...
3           3  [said, coronavirus, covid 19, confirmed, sindh...
4           4  [votes biden leading, tragedy joe biden, trump...


In [19]:
# Propagate cluster labels back to full df:
# rows that were part of the clustered sample take their label from
# df_cluster, every other row is filled with -1.
df["cluster_id"] = df_cluster["cluster_id"].reindex(df.index, fill_value=-1)

In [None]:
# Share of sampled rows labelled -1 (the noise label).
noise_ratio = df_cluster["cluster_id"].eq(-1).mean()
print(f"Noise ratio: {noise_ratio:.2%}")


Noise ratio: 30.01%


',cluster_id,top_keywords\n0,0,"[\'grand prix qualifying\', \'astonishing formula pole\', \'sergio perez\', \'istanbul canadian lance\', \'said stroll\', \'point mexican\', \'switch wet intermediate\', \'weeks feels really\', \'sochi track mercedes\', \'hamilton grabs\']"\n1,1,"[\'barca held celta\', \'relegation zone coming\', \'striker luis suarez\', \'aspas said\', \'demotion draw opening\', \'point real\', \'margin error getting\', \'feeling lost important\', \'barca real lose\', \'openers clasico clash\']"\n2,2,"[\'open\', \'women draw\', \'kyrgios australian open\', \'nadal kept thoughts\', \'tension affected men\', \'rose roared waved\', \'ahead fourth seemingly\', \'younger flashier\', \'double faulting create\', \'wrapped set point\']"\n3,3,"[\'said\', \'coronavirus\', \'covid 19\', \'confirmed\', \'sindh high court\', \'covid 19 pandemic\', \'2020\', \'months\', \'karachi\', \'officials said\']"\n4,4,"[\'votes biden leading\', \'tragedy joe biden\', \'trump divisive presidenc

In [17]:

# Build the per-cluster keyword table in this cell so the export does not rely
# on a variable left over from an earlier kernel session, then write it out.
cluster_keywords = get_keywords_per_cluster(df_cluster)
cluster_keywords.to_csv('kw.csv')

In [None]:

# ---- Step 6: Cluster → Category (MANUAL MAP) ----
# NOTE(review): the ids below refer to one particular clustering run — confirm
# the mapping against that run's cluster keywords before trusting it.
CLUSTER_TO_CATEGORY = {
    0: "Business",
    1: "Sports",
    2: "Tennis",
    3: "Politics",
    4: "World"
}

# Provisional labels straight from the map ("Other" for unmapped ids).
# Step 8 at the end of this cell overwrites this column.
df["category"] = df["cluster_id"].map(CLUSTER_TO_CATEGORY).fillna("Other")

# ---- Step 7: Train classifier ----
# Train only on sampled rows that are not noise and whose cluster has a
# hand-assigned category.
mask = (
    df_cluster["cluster_id"].ne(-1)
    & df_cluster["cluster_id"].isin(CLUSTER_TO_CATEGORY)
)
df_train = df_cluster.loc[mask]

# Encode category names as integer targets
le = LabelEncoder()
y = le.fit_transform(df_train["cluster_id"].map(CLUSTER_TO_CATEGORY))

clf = LogisticRegression(class_weight="balanced", max_iter=2000)
clf.fit(embeddings[df_train.index], y)

# ---- Step 8: Predict categories for ALL rows ----
predicted = clf.predict(embeddings)
df["category"] = le.inverse_transform(predicted)


  1%|          | 717/88851 [00:58<1:59:04, 12.34it/s]


KeyboardInterrupt: 

In [None]:
# ---- Step 9: Sentiment (FIRST 20,000 rows ONLY) ----
N_SENTIMENT = 20_000

df["sentiment"] = None

# .loc slices are label-based and inclusive, hence the "- 1"
sentiment_rows = slice(None, N_SENTIMENT - 1)
df.loc[sentiment_rows, "sentiment"] = df.loc[sentiment_rows, "text"].progress_apply(
    analyze_sentiment
)

# ================= CSV OUTPUTS ================= #

# to_csv does not create missing parent directories, so make sure the target
# folder exists before the first write.
OUTPUT_DIR = Path("output")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# CSV1: id, date, category, location, sentiment
csv1 = df[["id", "date", "category", "location", "sentiment"]]
csv1.to_csv(OUTPUT_DIR / "csv1_core.csv", index=False)

# CSV2: id, embedding (each vector serialised as a JSON array string)
csv2 = pd.DataFrame({
    "id": df["id"],
    "embedding": [json.dumps(e.tolist()) for e in embeddings]
})
csv2.to_csv(OUTPUT_DIR / "csv2_embeddings.csv", index=False)

# CSV3: id, date, keyword — one output row per (article, keyword) pair.
# Iterating the three columns in parallel avoids building a Series per row,
# and rows without a keyword list are skipped.
rows = [
    {"id": doc_id, "date": doc_date, "keyword": kw}
    for doc_id, doc_date, kws in zip(df["id"], df["date"], df["keywords"])
    if isinstance(kws, list)
    for kw in kws
]

# Explicit columns keep the header in the file even when `rows` is empty.
pd.DataFrame(rows, columns=["id", "date", "keyword"]).to_csv(
    OUTPUT_DIR / "csv3_keywords.csv", index=False
)

# CSV4: id, title, url
csv4 = df[["id", "title", "url"]]
csv4.to_csv(OUTPUT_DIR / "csv4_metadata.csv", index=False)

print("Cached pipeline complete")

  1%|          | 133/20000 [00:10<24:54, 13.29it/s]