In [1]:
import duckdb
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import scipy.sparse as sp
import numpy as np
from factorize import factorize
from tqdm import tqdm
import pickle
from collections import defaultdict
import faiss
import os
import json
import numpy as np
import scipy.sparse as sp
import hashlib
from datetime import datetime, timedelta


def get_mapping_hash(mapping):
    """
    Create a deterministic hash of a mapping dictionary.
    """
    # Convert mapping to a sorted list of tuples to ensure consistent ordering
    sorted_items = sorted(mapping.items())
    # Convert to string and encode to bytes
    mapping_str = json.dumps(sorted_items)
    return hashlib.sha256(mapping_str.encode()).hexdigest()

def load_mapping(mapping_file):
    """
    Load a mapping from a JSON file. If the file doesn't exist, return an empty dict.
    """
    if os.path.exists(mapping_file):
        with open(mapping_file, "r") as f:
            mapping = json.load(f)
    else:
        mapping = {}
    return mapping

def update_mapping(mapping, new_items):
    """
    Update the mapping with new items. New items are appended by assigning 
    them an index equal to the current length of the mapping.
    """
    for item in new_items:
        if item not in mapping:
            mapping[item] = len(mapping)
    return mapping


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
con = duckdb.connect('/home/sgan/scan_results.duckdb')
torch.manual_seed(42)  # IMPORTANT: temporary solution for deterministic results. Need this so that consumer_embeddings stays the same across runs.

def create_user_embedding(end_date):
    train_producer_df = con.execute(f"""
    WITH producers AS (
        SELECT 
            json_extract_string(record, '$.subject') as producer_did
        FROM records 
        WHERE collection = 'app.bsky.graph.follow'
        AND createdAt < '{end_date}'  -- before training cutoff date
        AND createdAt >= '2023-01-01' 
        GROUP BY json_extract_string(record, '$.subject')
        HAVING COUNT(*) >= 30
    )
    SELECT producer_did
    FROM producers
    """).fetchdf()

    # Get the edges (consumer-producer relationships)
    train_edges_df = con.execute("""
    SELECT 
        repo as consumer_did,
        json_extract_string(record, '$.subject') as producer_did
    FROM records
    WHERE 
        collection = 'app.bsky.graph.follow'
        AND json_extract_string(record, '$.subject') IN (SELECT producer_did FROM train_producer_df)
    """).fetchdf()

    
    # File paths for persistent mappings
    consumer_mapping_file = 'consumer_mapping.json'
    producer_mapping_file = 'producer_mapping.json'
    hash_file = 'mappings_hash.json'
    
    # Load existing mappings (or create new ones if they don't exist)
    consumer_to_idx = load_mapping(consumer_mapping_file)
    producer_to_idx = load_mapping(producer_mapping_file)
    
    # Store original hashes
    original_hashes = {
        'consumer': get_mapping_hash(consumer_to_idx),
        'producer': get_mapping_hash(producer_to_idx)
    }
    
    # Get new DIDs from the current training data
    new_consumers = train_edges_df['consumer_did'].unique().tolist()
    new_producers = train_producer_df['producer_did'].unique().tolist()
    
    # Update the mappings with any new DIDs
    consumer_to_idx = update_mapping(consumer_to_idx, new_consumers)
    producer_to_idx = update_mapping(producer_to_idx, new_producers)
    
    # Get new hashes
    new_hashes = {
        'consumer': get_mapping_hash(consumer_to_idx),
        'producer': get_mapping_hash(producer_to_idx)
    }
    
    # Check if mappings changed
    mappings_changed = (original_hashes != new_hashes)
    
    if mappings_changed:
        print("Warning: Mappings have changed! You should recompute post embeddings.")
        # Save the updated mappings to disk
        with open(consumer_mapping_file, 'w') as f:
            json.dump(consumer_to_idx, f)
        with open(producer_mapping_file, 'w') as f:
            json.dump(producer_to_idx, f)
        # Save the new hashes
        with open(hash_file, 'w') as f:
            json.dump(new_hashes, f)
    else:
        print("Mappings unchanged, safe to use existing post embeddings.")

    # Create sparse matrix in COO format; each edge has weight 1
    rows = [consumer_to_idx[consumer] for consumer in train_edges_df['consumer_did']]
    cols = [producer_to_idx[producer] for producer in train_edges_df['producer_did']]
    data = np.ones(len(rows))
    
    # Build the sparse matrix (then convert to CSR format for efficient multiplication)
    matrix = sp.coo_matrix(
        (data, (rows, cols)),
        shape=(len(consumer_to_idx), len(producer_to_idx))
    )
    
    print("Matrix shape:", matrix.shape)

    producer_community_affinities, consumer_embeddings, kmeans_cluster_centers = factorize(
        matrix, 
        algorithm='svd',
        n_components=64,
        n_clusters=100,
        device=device
    )

    return consumer_embeddings, consumer_to_idx

In [2]:
user_dynamic_features = {}
# Define start and end dates
start_date = datetime.strptime("2023-03-15", "%Y-%m-%d")
end_date = datetime.strptime("2023-06-30", "%Y-%m-%d")
embedding_dim=64

# Iterate through each day in the range
current_date = start_date
while current_date <= end_date:
    date_str = current_date.strftime("%Y-%m-%d")
    date_int = int(current_date.timestamp())

    # Get embeddings and consumer ID mapping
    consumer_embeddings, consumer_to_idx = create_user_embedding(date_str)

    # Query likes data for the given day
    likes_df = con.execute(f"""
        SELECT DISTINCT repo AS userId
        FROM records
        WHERE createdAt >= '{date_str}' 
            AND createdAt < '{(current_date + timedelta(days=1)).strftime("%Y-%m-%d")}'
            AND collection = 'app.bsky.feed.like'
    """).fetchdf()

    # Initialize dictionary for the current date
    user_dynamic_features[date_int] = {}

    # Process users
    for _, row in likes_df.iterrows():
        try:
            user_dynamic_features[date_int][row['userId']] = consumer_embeddings[consumer_to_idx[row['userId']]]
        except KeyError:  # If user is not found, assign zero vector
            user_dynamic_features[date_int][row['userId']] = np.zeros(embedding_dim)

    # Move to the next day
    current_date += timedelta(days=1)

print("Finished processing all dates.")

save_path = "/home/sgan/private/DyGLib/DG_data/bluesky/user_dynamic_features.pkl"

with open(save_path, "wb") as f:
    pickle.dump(user_dynamic_features, f)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Matrix shape: (56572, 677)
Matrix shape: (56596, 683)
Matrix shape: (56686, 691)
Matrix shape: (56816, 706)
Matrix shape: (57168, 713)
Matrix shape: (57226, 717)
Matrix shape: (57233, 721)
Matrix shape: (57382, 746)
Matrix shape: (57554, 767)
Matrix shape: (57591, 775)
Matrix shape: (57656, 781)
Matrix shape: (57701, 787)
Matrix shape: (57704, 793)
Matrix shape: (57864, 798)
Matrix shape: (58091, 810)
Matrix shape: (58133, 821)
Matrix shape: (58134, 826)
Matrix shape: (58146, 828)
Matrix shape: (58208, 835)
Matrix shape: (58248, 838)
Matrix shape: (58398, 839)
Matrix shape: (58405, 842)
Matrix shape: (58453, 845)
Matrix shape: (60871, 1137)
Matrix shape: (61619, 1313)
Matrix shape: (62500, 1435)
Matrix shape: (64011, 1611)
Matrix shape: (70320, 1932)
Matrix shape: (72357, 2277)
Matrix shape: (79128, 2635)
Matrix shape: (82039, 3079)
Matrix shape: (82783, 3331)
Matrix shape: (83693, 3555)
Matrix shape: (84845, 3832)
Matrix shape: (86017, 4287)
Matrix shape: (87581, 4757)
Matrix shape: (

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Matrix shape: (125391, 22937)
Matrix shape: (125741, 23591)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Matrix shape: (126182, 24454)
Matrix shape: (126549, 25322)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Matrix shape: (126922, 26060)
Matrix shape: (127230, 26580)
Matrix shape: (127472, 27026)
Matrix shape: (127765, 27644)
Matrix shape: (127995, 28109)
Matrix shape: (128183, 28473)
Matrix shape: (128464, 28994)
Matrix shape: (128695, 29531)
Matrix shape: (128919, 29877)
Matrix shape: (129082, 30196)
Matrix shape: (129237, 30579)
Matrix shape: (129604, 30962)
Matrix shape: (129773, 31469)
Matrix shape: (130077, 31756)
Matrix shape: (130211, 31990)
Matrix shape: (130400, 32228)
Matrix shape: (130523, 32436)
Matrix shape: (130646, 32695)
Matrix shape: (130745, 32928)
Matrix shape: (130850, 33145)
Matrix shape: (130938, 33355)
Matrix shape: (131044, 33565)
Matrix shape: (131214, 33798)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Matrix shape: (131302, 34027)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Matrix shape: (131398, 34225)
Matrix shape: (131478, 34416)
Matrix shape: (131539, 34577)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Matrix shape: (131688, 34850)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Matrix shape: (131749, 35041)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Matrix shape: (131877, 35267)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Matrix shape: (131998, 35522)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Matrix shape: (132088, 35762)
Matrix shape: (132211, 35979)
Matrix shape: (132302, 36208)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Matrix shape: (132403, 36450)
Matrix shape: (132494, 36744)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Matrix shape: (132613, 37166)
Matrix shape: (132766, 37546)
Matrix shape: (132877, 37871)
Matrix shape: (132963, 38122)
Matrix shape: (133024, 38346)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Matrix shape: (133086, 38585)
Matrix shape: (133139, 38809)
Matrix shape: (133269, 39283)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Matrix shape: (133622, 41073)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Matrix shape: (133818, 42023)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Matrix shape: (133903, 42575)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Matrix shape: (133980, 43028)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Matrix shape: (134108, 43572)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Matrix shape: (134298, 44670)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Matrix shape: (134414, 45353)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Matrix shape: (134505, 45882)
Finished processing all dates.


In [3]:
user_mapping_path = "/home/sgan/private/DyGLib/DG_data/bluesky/user_mapping.pkl"

with open(user_mapping_path, "rb") as file:
    user_mapping = pickle.load(file)

In [4]:
missing_users_per_date = {}

# Iterate over each date in user_dynamic_features
for date, users in user_dynamic_features.items():
    # Get user IDs from the current date
    user_ids = users.keys()

    # Find users not in user_mapping
    missing_users = [user for user in user_ids if user not in user_mapping]

    # Store missing users if any exist for this date
    if missing_users:
        missing_users_per_date[date] = missing_users
        print(f"{len(missing_users)} users missing in user_mapping for {date}")

# Summary
if missing_users_per_date:
    print("\nSome users are missing in user_mapping across multiple dates.")
else:
    print("\nAll users in user_dynamic_features exist in user_mapping.")



All users in user_dynamic_features exist in user_mapping.


In [5]:
# Replace user IDs with their mapped indices
user_dynamic_features_mapped = {
    date: {user_mapping[user]: emb for user, emb in users.items()} 
    for date, users in user_dynamic_features.items()
}

print("User IDs in user_dynamic_features have been replaced with user_mapping indices.")

User IDs in user_dynamic_features have been replaced with user_mapping indices.


In [6]:
save_path = "/home/sgan/private/DyGLib/DG_data/bluesky/user_dynamic_features.pkl"

with open(save_path, "wb") as f:
    pickle.dump(user_dynamic_features_mapped, f)