In [2]:
import os
import io
import json
import math
from typing import Tuple, List, Dict, Any
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from pymongo import MongoClient
from tqdm.auto import tqdm

# NLP & embedding libs
from sentence_transformers import SentenceTransformer
import umap
import hdbscan
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import pairwise_distances_argmin_min

# Image libs
from PIL import Image
import warnings

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Ignore every warning of category UserWarning from here on.
# NOTE(review): this filter is global, so it also hides UserWarnings raised by
# the imported libraries; consider narrowing it with `module=`/`message=`.
warnings.filterwarnings("ignore", category=UserWarning)

In [4]:
# Database settings. Each value is read from the environment variable of the
# same name and falls back to the local default given here.
MONGO_URI = os.environ.get("MONGO_URI", "mongodb://localhost:27017")
DB_NAME = os.environ.get("DB_NAME", "life_db")
CHATS_COLLECTION = os.environ.get("CHATS_COLLECTION", "chats")
MEDIA_COLLECTION = os.environ.get("MEDIA_COLLECTION", "media")

In [5]:
# Directory for pipeline artefacts (overridable via the OUTPUT_DIR environment
# variable). It is created on the spot; an already existing directory is fine.
OUTPUT_DIR = os.environ.get("OUTPUT_DIR", "pipeline_outputs")
os.makedirs(name=OUTPUT_DIR, exist_ok=True)

In [6]:
# Model identifiers, overridable through environment variables of the same name.
# TEXT_EMBED_MODEL is the default sentence-embedding model for topic clustering.
# CLIP_MODEL is not used in the cells visible here (presumably image embeddings
# further down the notebook; verify).
TEXT_EMBED_MODEL = os.environ.get("TEXT_EMBED_MODEL", "paraphrase-multilingual-MiniLM-L12-v2")
CLIP_MODEL = os.environ.get("CLIP_MODEL", "clip-ViT-B-32")

In [7]:
def temporal_aggregations(chats: pd.DataFrame, media: pd.DataFrame, tz: str = None):
    """Aggregate chat and media activity per day, week, month and year.

    Parameters
    ----------
    chats : pd.DataFrame
        Must contain a datetime column ``Datetime`` plus ``Message``,
        ``CharCount``, ``EmojiCount`` and ``WordCount``.
    media : pd.DataFrame
        Must contain a datetime column ``Datetime`` plus ``FileName`` and
        ``HasGPS``.
    tz : str, optional
        Accepted for interface compatibility but currently not used; the
        timestamps are bucketed exactly as stored.

    Returns
    -------
    dict[str, pd.DataFrame]
        Keys ``"day"``, ``"week"``, ``"month"`` and ``"year"``. Each frame has
        one row per period with the columns ``period_start``, ``msg_count``,
        ``total_chars``, ``emoji_count``, ``avg_words``, ``media_count``,
        ``has_gps``, ``msg_count_z``, ``media_count_z``, ``activity_score``
        and ``period_label`` ("Quiet" / "Typical" / "Active").

    Notes
    -----
    * ``period_start`` is the bin label produced by ``pd.Grouper``. pandas
      labels week/month/year-end bins by their right edge by default, so for
      those frequencies the value is the period's *end* date despite the
      column name (kept for backward compatibility).
    * Labels use the 20% / 80% quantiles of ``activity_score``. When both
      quantiles coincide (flat series, or a single period) values sitting on
      that shared threshold are "Typical" rather than all being "Quiet".
    """
    # Offset objects instead of the "D"/"W"/"M"/"Y" strings: the "M" and "Y"
    # aliases are deprecated in recent pandas releases, whereas these classes
    # describe the same bins on every version.
    periods = [
        (pd.offsets.Day(), "day"),
        (pd.offsets.Week(weekday=6), "week"),   # weeks ending on Sunday, same as "W"
        (pd.offsets.MonthEnd(), "month"),
        (pd.offsets.YearEnd(), "year"),
    ]

    out = {}
    for freq, name in periods:
        chat_agg = chats.groupby(pd.Grouper(key="Datetime", freq=freq)).agg(
            msg_count=("Message", "count"),
            total_chars=("CharCount", "sum"),
            emoji_count=("EmojiCount", "sum"),
            avg_words=("WordCount", "mean"),
        ).fillna(0)  # empty bins have a NaN mean
        media_agg = media.groupby(pd.Grouper(key="Datetime", freq=freq)).agg(
            media_count=("FileName", "count"),
            has_gps=("HasGPS", "sum"),
        ).fillna(0)

        # Outer join keeps periods that only have chats or only have media.
        df = chat_agg.join(media_agg, how="outer").fillna(0)
        df = df.reset_index().rename(columns={"Datetime": "period_start"})

        for col in ("msg_count", "media_count"):
            spread = df[col].std(ddof=0)
            if spread > 0:
                # The epsilon is redundant given the guard, but is kept so the
                # z-scores stay numerically identical to earlier outputs.
                df[f"{col}_z"] = (df[col] - df[col].mean()) / (spread + 1e-9)
            else:
                # Constant (or empty) column: no meaningful z-score.
                df[f"{col}_z"] = 0.0

        score = 0.7 * df["msg_count_z"] + 0.3 * df["media_count_z"]
        df["activity_score"] = score

        lo = score.quantile(0.2)
        hi = score.quantile(0.8)
        # With lo < hi these masks equal `score <= lo` / `score >= hi`.
        # With lo == hi the extra strict comparison leaves values that sit on
        # the collapsed threshold as "Typical".
        is_quiet = (score <= lo) & (score < hi)
        is_active = (score >= hi) & (score > lo)

        label = pd.Series("Typical", index=df.index, dtype=object)
        label.loc[is_quiet] = "Quiet"
        label.loc[is_active] = "Active"
        df["period_label"] = label

        out[name] = df
    return out

In [None]:
# -------------------------
# 2) Topic-based Clustering (Chats)
# -------------------------
def topic_cluster_chats(chats: pd.DataFrame,
                        text_col: str = "CleanMessage",
                        min_len: int = 3,
                        embed_model_name: str = TEXT_EMBED_MODEL,
                        umap_n_neighbors: int = 15,
                        umap_min_dist: float = 0.0,
                        umap_dims: int = 64,
                        hdb_min_cluster_size: int = 30,
                        use_pca: bool = True,
                        pca_dim: int = 64):
    """
    Cluster chat messages into topics and summarise each topic.

    Pipeline: sentence embeddings -> optional PCA -> UMAP -> HDBSCAN, followed
    by a keyword label per cluster.

    Parameters
    ----------
    chats : pd.DataFrame
        Messages; must contain ``text_col``. A ``Datetime`` column is optional
        and only used for ``first_seen`` / ``last_seen``.
    text_col : str
        Column holding the text to embed.
    min_len : int
        Messages with fewer characters than this are dropped.
    embed_model_name : str
        Name passed to ``SentenceTransformer``.
    umap_n_neighbors, umap_min_dist, umap_dims
        UMAP ``n_neighbors``, ``min_dist`` and ``n_components``. The neighbour
        and component counts are capped by the number of messages so small
        corpora do not make UMAP fail.
    hdb_min_cluster_size : int
        HDBSCAN ``min_cluster_size``. With fewer messages than this no cluster
        can exist, so all messages are marked as noise without running the
        embedding pipeline.
    use_pca : bool
        Apply PCA before UMAP when the embedding width exceeds ``pca_dim``.
    pca_dim : int
        Target PCA dimensionality (capped by the number of messages).

    Returns
    -------
    (messages_df, topic_summary)
        ``messages_df`` is a filtered copy of ``chats`` (index reset) with the
        extra columns ``len`` and ``topic_id`` (``-1`` is HDBSCAN's noise
        label). ``topic_summary`` has one row per topic id with ``topic_id``,
        ``n_msgs``, ``topic_label``, ``first_seen`` and ``last_seen``
        (timestamps as strings, ``None`` when unavailable), sorted by
        ``n_msgs`` descending. If no message survives the length filter the
        result is ``(empty frame, empty DataFrame)``.

    Raises
    ------
    ValueError
        If ``text_col`` is not a column of ``chats``.
    """
    df = chats.copy()
    if text_col not in df.columns:
        raise ValueError(f"'{text_col}' not in chats DataFrame")
    df[text_col] = df[text_col].fillna("").astype(str)
    df["len"] = df[text_col].str.len()
    df = df[df["len"] >= min_len].reset_index(drop=True)
    if df.empty:
        return df, pd.DataFrame()

    n_samples = len(df)
    if n_samples < hdb_min_cluster_size:
        # A cluster needs at least hdb_min_cluster_size members, so nothing can
        # be clustered; skip the expensive model load and the size-sensitive
        # PCA/UMAP steps and mark everything as noise.
        print("[topic] fewer messages than hdb_min_cluster_size; marking all as noise")
        labels = np.full(n_samples, -1, dtype=int)
    else:
        print("[topic] loading text embedding model:", embed_model_name)
        txt_model = SentenceTransformer(embed_model_name)
        embeddings = txt_model.encode(
            df[text_col].tolist(),
            batch_size=64,
            show_progress_bar=True,
            normalize_embeddings=True,
        )

        # Optional PCA. n_components may not exceed the number of samples, so
        # cap it; with enough samples this is exactly pca_dim as before.
        if use_pca and embeddings.shape[1] > pca_dim:
            n_pca = min(pca_dim, n_samples)
            print(f"[topic] reducing dims with PCA -> {n_pca}")
            embeddings = PCA(n_components=n_pca, random_state=42).fit_transform(embeddings)

        # UMAP gives HDBSCAN a compact low-dimensional representation.
        # Its spectral initialisation needs n_components + 1 < n_samples, and
        # n_neighbors cannot exceed n_samples - 1, hence the caps below.
        print("[topic] running UMAP")
        reducer = umap.UMAP(
            n_neighbors=min(umap_n_neighbors, max(2, n_samples - 1)),
            min_dist=umap_min_dist,
            n_components=min(umap_dims, max(1, n_samples - 2)),
            metric="cosine",
            random_state=42,
        )
        X_umap = reducer.fit_transform(embeddings)

        print("[topic] running HDBSCAN")
        clusterer = hdbscan.HDBSCAN(
            min_cluster_size=hdb_min_cluster_size,
            metric="euclidean",
            cluster_selection_method="eom",
        )
        labels = clusterer.fit_predict(X_umap)

    df["topic_id"] = labels

    # One summary row per topic id (including the noise id -1 when present).
    has_datetime = "Datetime" in df.columns
    topic_list = []
    for topic_id, group in df.groupby("topic_id", sort=True):
        first_seen = group["Datetime"].min() if has_datetime else None
        last_seen = group["Datetime"].max() if has_datetime else None
        topic_list.append({
            "topic_id": int(topic_id),
            "n_msgs": int(len(group)),
            "topic_label": _topic_keyword_label(group[text_col].tolist()),
            # Keep missing timestamps as None instead of the strings "None"/"NaT".
            "first_seen": None if first_seen is None or pd.isna(first_seen) else str(first_seen),
            "last_seen": None if last_seen is None or pd.isna(last_seen) else str(last_seen),
        })

    topic_summary = pd.DataFrame(topic_list).sort_values("n_msgs", ascending=False).reset_index(drop=True)
    return df, topic_summary


def _topic_keyword_label(texts: list) -> str:
    """Return a short label for one cluster: its four terms with the highest mean TF-IDF."""
    # Imported lazily so the vectoriser is only loaded when topics are labelled.
    from sklearn.feature_extraction.text import TfidfVectorizer

    try:
        tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=3000, stop_words="english")
        X = tfidf.fit_transform(texts)
        scores = np.asarray(X.mean(axis=0)).ravel()
        top_idx = scores.argsort()[::-1][:4]
        terms = np.array(tfidf.get_feature_names_out())[top_idx]
        return ", ".join(terms)
    except Exception:
        # Deliberate best-effort fallback: e.g. TfidfVectorizer raises
        # ValueError when a cluster holds nothing but stop words. Label the
        # cluster with its most frequent raw tokens instead.
        words = " ".join(texts).lower().split()
        common = pd.Series(words).value_counts().head(5).index.tolist()
        return " ".join(common)