In [1]:
import duckdb
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import scipy.sparse as sp
import numpy as np

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# pd.set_option('display.max_colwidth', None)

# Create/connect to DuckDB database
con = duckdb.connect('scan_results.duckdb')
con.execute("SHOW TABLES").fetchall()

[('records',)]

In [2]:
con.execute("SELECT * FROM records WHERE collection = 'app.bsky.feed.like' LIMIT 10").fetchdf()

Unnamed: 0,repo,collection,rkey,at_rev,created_at,createdAt,deleted,record
0,did:plc:42kmtf65uqs765coei7bimwx,app.bsky.feed.like,3ju7xbomxzo2w,3ldmz4frpsl2f,2024-12-20 04:20:19.074,2023-04-25 20:53:03.947,False,"{""$type"":""app.bsky.feed.like"",""subject"":{""cid""..."
1,did:plc:42kmtf65uqs765coei7bimwx,app.bsky.feed.like,3jumsneq2im2p,3ldmz4frpsl2f,2024-12-20 04:20:19.075,2023-04-30 23:34:44.283,False,"{""$type"":""app.bsky.feed.like"",""subject"":{""cid""..."
2,did:plc:42kmtf65uqs765coei7bimwx,app.bsky.feed.like,3juodgk5sp62q,3ldmz4frpsl2f,2024-12-20 04:20:19.090,2023-05-01 14:07:48.386,False,"{""$type"":""app.bsky.feed.like"",""subject"":{""cid""..."
3,did:plc:42kmtf65uqs765coei7bimwx,app.bsky.feed.like,3juodma2lef2o,3ldmz4frpsl2f,2024-12-20 04:20:19.092,2023-05-01 14:10:59.179,False,"{""$type"":""app.bsky.feed.like"",""subject"":{""cid""..."
4,did:plc:42kmtf65uqs765coei7bimwx,app.bsky.feed.like,3juogabmpkc2h,3ldmz4frpsl2f,2024-12-20 04:20:19.086,2023-05-01 14:57:59.426,False,"{""$type"":""app.bsky.feed.like"",""subject"":{""cid""..."
5,did:plc:42kmtf65uqs765coei7bimwx,app.bsky.feed.like,3juorcqcslm2f,3ldmz4frpsl2f,2024-12-20 04:20:19.095,2023-05-01 18:16:13.037,False,"{""$type"":""app.bsky.feed.like"",""subject"":{""cid""..."
6,did:plc:42kmtf65uqs765coei7bimwx,app.bsky.feed.like,3juorkrtjvo2g,3ldmz4frpsl2f,2024-12-20 04:20:19.093,2023-05-01 18:20:43.066,False,"{""$type"":""app.bsky.feed.like"",""subject"":{""cid""..."
7,did:plc:42kmtf65uqs765coei7bimwx,app.bsky.feed.like,3juoroisbdo24,3ldmz4frpsl2f,2024-12-20 04:20:19.097,2023-05-01 18:22:47.800,False,"{""$type"":""app.bsky.feed.like"",""subject"":{""cid""..."
8,did:plc:42kmtf65uqs765coei7bimwx,app.bsky.feed.like,3juouevbgl22z,3ldmz4frpsl2f,2024-12-20 04:20:19.094,2023-05-01 19:11:06.569,False,"{""$type"":""app.bsky.feed.like"",""subject"":{""cid""..."
9,did:plc:42kmtf65uqs765coei7bimwx,app.bsky.feed.like,3juoufapfja2r,3ldmz4frpsl2f,2024-12-20 04:20:19.073,2023-05-01 19:11:18.560,False,"{""$type"":""app.bsky.feed.like"",""subject"":{""cid""..."


In [3]:
con.execute("SELECT COUNT(*) FROM records WHERE collection = 'app.bsky.feed.like'").fetchall()

[(22133792,)]

warning: there are followed user dids that do not exist as a repo in the records table. This is likely because they were banned or deleted.

In [4]:
producer_df = con.execute("""
WITH followed_users AS (
    SELECT 
        json_extract_string(record, '$.subject') as producer_did,
        COUNT(*) as follower_count
    FROM records 
    WHERE collection = 'app.bsky.graph.follow'
    GROUP BY json_extract_string(record, '$.subject')
    HAVING COUNT(*) >= 30
)
SELECT 
    producer_did,
    follower_count
FROM followed_users
WHERE producer_did IN (
    SELECT DISTINCT repo 
    FROM records 
    WHERE collection = 'app.bsky.feed.post'
)
ORDER BY follower_count DESC
            """).fetchdf()
producer_df

Unnamed: 0,producer_did,follower_count
0,did:plc:z72i7hdynmk6r22z27h6tvur,25219
1,did:plc:p7gxyfr5vii5ntpwo7f6dhe2,20953
2,did:plc:oky5czdrnfjpqslsw2a5iclo,19270
3,did:plc:6wpkkitfdkgthatfvspcfmjo,14964
4,did:plc:ragtjsm2j2vknwkz3zp4oxrd,13353
...,...,...
40259,did:plc:t2t4ua6ztqmpk6utkiyy6rse,30
40260,did:plc:4eu3tde7ath6e4av5xl3e4bj,30
40261,did:plc:oij3yw4sbjm6v2xyts5verp5,30
40262,did:plc:3zynnjgncvzyvj3ffwbdkbq7,30


In [6]:
# Get the edges (consumer-producer relationships)
edges_df = con.execute("""
WITH producers AS (
    -- First get our valid producers (30+ followers with posts)
    SELECT 
        json_extract_string(record, '$.subject') as producer_did,
        COUNT(*) as follower_count
    FROM records 
    WHERE collection = 'app.bsky.graph.follow'
    GROUP BY json_extract_string(record, '$.subject')
    HAVING COUNT(*) >= 30
    AND producer_did IN (
        SELECT DISTINCT repo 
        FROM records 
        WHERE collection = 'app.bsky.feed.post'
    )
)
SELECT 
    repo as consumer_did,
    json_extract_string(record, '$.subject') as producer_did
FROM records
WHERE 
    collection = 'app.bsky.graph.follow'
    AND json_extract_string(record, '$.subject') IN (SELECT producer_did FROM producers)
""").fetchdf()

edges_df

Unnamed: 0,consumer_did,producer_did
0,did:plc:7hxhbhphfselzxjxhrxfykzr,did:plc:nvog7rczakwzh5ckxnjnwqdd
1,did:plc:7hxhbhphfselzxjxhrxfykzr,did:plc:ohvstchboonnmbplvwkl33ko
2,did:plc:7hxhbhphfselzxjxhrxfykzr,did:plc:2mq2phxyc6clfn3olwsq324i
3,did:plc:7hxhbhphfselzxjxhrxfykzr,did:plc:won64wy47o3efqq576yitp3k
4,did:plc:7hxhbhphfselzxjxhrxfykzr,did:plc:ct44vuzkgksyh2cexn3ifh6b
...,...,...
5516127,did:plc:f2xxcgq4mdjt2vnr4gaufirz,did:plc:34w2qqfauelc42s57x2mc4dt
5516128,did:plc:f2xxcgq4mdjt2vnr4gaufirz,did:plc:gpunjjgvlyb4racypz3yfiq4
5516129,did:plc:f2xxcgq4mdjt2vnr4gaufirz,did:plc:7cydhemkldfiqllszdmx6tyv
5516130,did:plc:f2xxcgq4mdjt2vnr4gaufirz,did:plc:rlililcfm7c5ux4au2sn35vz


# TODO: Persistent mapping of DIDs to indices

In [5]:
# Create mappings of DIDs to indices
producer_to_idx = {did: idx for idx, did in enumerate(producer_df['producer_did'].unique())}
consumer_to_idx = {did: idx for idx, did in enumerate(edges_df['consumer_did'].unique())}

# Create sparse matrix in COO format (will convert to CSR after)
rows = [consumer_to_idx[consumer] for consumer in edges_df['consumer_did']]
cols = [producer_to_idx[producer] for producer in edges_df['producer_did']]
data = np.ones(len(rows))  # Each edge has weight 1

# Create the sparse matrix
# Shape is (n_consumers, n_producers)
matrix = sp.coo_matrix(
    (data, (rows, cols)), 
    shape=(len(consumer_to_idx), len(producer_to_idx))
).tocsr()  # Convert to CSR format for efficient multiplication
matrix.shape

(131924, 40264)

In [53]:
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
import numpy as np

# Choose decomposition method
algorithm = 'svd'  # Options: 'nmf' or 'svd'
n_components = 20  # number of dimensions to reduce to

if algorithm == 'nmf':
    # Use Non-negative Matrix Factorization
    nmf = NMF(n_components=n_components, init='nndsvd')
    producer_embeddings = nmf.fit_transform(matrix.T)  # (40264, 20)
    consumer_embeddings = matrix @ producer_embeddings  # (131924, 20)
elif algorithm == 'svd':
    # Use Singular Value Decomposition
    svd = TruncatedSVD(n_components=n_components)
    producer_embeddings = svd.fit_transform(matrix.T)  # (40264, 20)
    consumer_embeddings = matrix @ producer_embeddings  # (131924, 20)

# L2 normalize both producer and consumer embeddings before clustering
producer_embeddings_norm = normalize(producer_embeddings, norm='l2')
consumer_embeddings_norm = normalize(consumer_embeddings, norm='l2')

# Cluster the normalized embeddings
kmeans = KMeans(n_clusters=100)
producer_communities = kmeans.fit_predict(producer_embeddings_norm)

# Get affinity scores (0-1) for how strongly each producer belongs to their assigned cluster only
assigned_distances = np.zeros(len(producer_embeddings_norm))
for i, (producer, cluster) in enumerate(zip(producer_embeddings_norm, producer_communities)):
    # Calculate distance to assigned cluster center only
    distance = np.linalg.norm(producer - kmeans.cluster_centers_[cluster])
    assigned_distances[i] = distance

producer_community_affinities = 1 - (assigned_distances / assigned_distances.max())  # Convert to 0-1 scale

In [54]:
print("Min value:", producer_embeddings_norm.min())
print("Max value:", producer_embeddings_norm.max())
producer_embeddings_norm

Min value: -0.8793239151213066
Max value: 0.966168127134238


array([[ 0.21349931, -0.55067247, -0.28522976, ...,  0.05554103,
         0.01010565,  0.20167031],
       [ 0.25106899, -0.79990034, -0.02635134, ...,  0.02182669,
        -0.01460902,  0.05589873],
       [ 0.2580131 , -0.61044055, -0.40529104, ..., -0.11384615,
         0.18824316, -0.01314027],
       ...,
       [ 0.79749464,  0.17382489,  0.28014759, ..., -0.00415421,
         0.02707997,  0.03453673],
       [ 0.74505227,  0.11638777,  0.28715673, ..., -0.11345078,
         0.0335287 , -0.01746893],
       [ 0.05488355, -0.10151267, -0.07807169, ...,  0.20806883,
        -0.35396918,  0.41635161]])

In [40]:
print(f"Average affinity: {producer_community_affinities.mean():.3f}")
print(f"Median affinity: {np.median(producer_community_affinities):.3f}")
print(f"25th percentile: {np.percentile(producer_community_affinities, 25):.3f}")
print(f"75th percentile: {np.percentile(producer_community_affinities, 75):.3f}")
print(f"Number of producers with affinity < 0.25: {(producer_community_affinities < 0.50).sum()}")
producer_community_affinities

Average affinity: 0.733
Median affinity: 0.760
25th percentile: 0.671
75th percentile: 0.817
Number of producers with affinity < 0.25: 1918


array([0.16982994, 0.48919644, 0.21123965, ..., 0.88530252, 0.83899085,
       0.45885741])

In [39]:
# Get producer 23's DID and community
producer_did = producer_df['producer_did'].iloc[23]
community = producer_communities[23]
affinity = producer_community_affinities[23]

print(f"Producer 23:")
print(f"DID: {producer_did}")
print(f"Profile: https://bsky.app/profile/{producer_did}")
print(f"Community: {community}")
print(f"Affinity score: {affinity:.3f}")

Producer 23:
DID: did:plc:oc6vwdlmk2kqyida5i74d3p5
Profile: https://bsky.app/profile/did:plc:oc6vwdlmk2kqyida5i74d3p5
Community: 66
Affinity score: 0.267


In [45]:
# Create a DataFrame to show producer DIDs with their corresponding communities and profile links
community_df = pd.DataFrame({
    'producer_did': producer_df['producer_did'],
    'community': producer_communities,
    'profile_link': 'https://bsky.app/profile/' + producer_df['producer_did']
})

# Filter to show only producers in community 10
pd.set_option('display.max_colwidth', None)
community_df[community_df['community'] == 5]['profile_link']

802      https://bsky.app/profile/did:plc:ohvstchboonnmbplvwkl33ko
1990     https://bsky.app/profile/did:plc:umarumf3h5t4q2ecsmj7om23
2006     https://bsky.app/profile/did:plc:t775cnvnigwc5lcgha77gn6j
2465     https://bsky.app/profile/did:plc:nvog7rczakwzh5ckxnjnwqdd
2856     https://bsky.app/profile/did:plc:otqqwe3tzvtkazlpjiscz7pq
                                   ...                            
39280    https://bsky.app/profile/did:plc:wmt3sznujfli4qifja7f223q
40111    https://bsky.app/profile/did:plc:wcunb4t46jd7dyw3jxliy2tx
40166    https://bsky.app/profile/did:plc:t6ukjxgvhjytyyyyjfxwf3as
40188    https://bsky.app/profile/did:plc:3ztu4viwmkfff7kowtcshcxh
40263    https://bsky.app/profile/did:plc:ddscecurxqh6jqr2xsdcq5by
Name: profile_link, Length: 123, dtype: object

At this point we have Producers x Communities matrix. (remember to measure quality)

In [55]:
community_df.groupby('community').size().sort_values(ascending=False).reset_index(name='size')[-20:-10]

Unnamed: 0,community,size
80,67,181
81,87,178
82,83,177
83,16,171
84,56,162
85,84,153
86,72,152
87,36,151
88,1,142
89,79,142


Do Louvain/Leiden, Label Propagation, Personalized PageRank

In [13]:
import time
import numpy as np
from scipy.sparse import diags
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.utils.extmath import randomized_svd

def run_BGC(W, k, dim_factor=5, alpha=0.3):
    """
    Runs the BGC clustering algorithm on a given bipartite graph matrix W (CSR format).

    Parameters:
      W : csr_matrix
          Bipartite consumer-producer matrix (shape: [n_consumers, n_producers])
      k : int
          Number of clusters
      dim_factor : float, default=5
          Factor used to set the reduced embedding dimension: dimension = int(dim_factor * k)
      alpha : float, default=0.3
          Alpha parameter as used in the scaling step.

    Returns:
      labels : np.ndarray
          Cluster labels for each consumer.
    """

    # Compute inverse-scaling vector for columns (producers)
    c = np.array(np.sqrt(W.sum(axis=0)))
    c[c == 0] = 1
    c = 1.0 / c
    # Convert to a 1D array then create diagonal matrix
    cinv = diags(np.array(c).flatten())
    
    # Normalize rows of W (each row sums to 1)
    F = preprocessing.normalize(W, norm='l1', axis=1)

    # Get transpose of W (B: producers x consumers)
    B = W.T

    # Scale B by the inverse vector (each column of B is scaled)
    Bc = cinv.dot(B)

    # Compute row scaling for W (for consumers)
    r = np.array(np.sqrt(W.sum(axis=1)))
    r[r == 0] = 1
    r = 1.0 / r
    r = diags(np.array(r).flatten())
    
    # Construct the intermediate matrix L = Bc * r
    L = Bc.dot(r)

    # Set embedding dimension and run randomized SVD on L
    dim = int(dim_factor * k)
    start = time.time()
    U, s, V = randomized_svd(L, n_components=dim, n_iter=20)
    # Square singular values
    s = s ** 2

    # Scale singular values using alpha factor
    s = (1.0 - alpha) / (1.0 - alpha * s)
    # Convert np.matrix to np.array (since diags(todense()) returns a matrix)
    s = np.asarray(diags(s).todense())
    
    # Multiply U by the scaled singular values, then map back using F
    U = U.dot(s)
    U = F.dot(U)
    U = np.asarray(U)  # ensure U is a numpy array (not np.matrix)
    
    # Normalize rows of U to unit L2 norm
    U = preprocessing.normalize(U, norm='l2', axis=1)

    # Run k-means on the row embeddings
    clustering = KMeans(n_clusters=k, random_state=1024).fit(U)
    labels = clustering.labels_

    elapsedTime = time.time() - start
    print("Elapsed time (secs) for BGC clustering: {:.3f}".format(elapsedTime))
    
    return labels

# Example usage:
# Assuming 'matrix' is your consumer-producer csr_matrix, and you want k=10 clusters:
labels = run_BGC(matrix, k=10, dim_factor=20, alpha=0.3)

Elapsed time (secs) for BGC clustering: 71.821


In [14]:
# Create producer-producer similarity matrix through shared followers
# Convert CSR matrix to PyTorch sparse tensor
matrix_tensor = torch.sparse_coo_tensor(
    torch.LongTensor(np.vstack([matrix.nonzero()])),
    torch.FloatTensor(matrix.data),
    torch.Size(matrix.shape)
).to(device)

# Compute producer similarity using PyTorch sparse matrix multiplication
producer_similarity = torch.sparse.mm(matrix_tensor.t(), matrix_tensor).cpu().numpy()
# Or consumer-consumer similarity
consumer_similarity = matrix @ matrix.T  # This gives us consumers that follow similar producers

# You could then run traditional community detection on either projection
# For example, using producer similarity:
from sklearn.cluster import SpectralClustering
# Normalize the similarity matrix
producer_similarity_normalized = producer_similarity / producer_similarity.diagonal().max()
clustering = SpectralClustering(n_clusters=10, affinity='precomputed')
producer_communities = clustering.fit_predict(producer_similarity_normalized.toarray())

RuntimeError: CUDA error: insufficient resources when calling `cusparseSpGEMM_workEstimation( handle, opA, opB, &alpha, matA, matB, &beta, matC, computeType, CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, &bufferSize1, dBuffer1)`