In [48]:
import duckdb
import pandas as pd
import json

con = duckdb.connect('bluesky_180MB.duckdb')

collections_df = con.execute("""
    SELECT 
        collection,
        COUNT(*) as count,
        ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM records), 2) as percentage
    FROM records 
    GROUP BY collection 
    ORDER BY count DESC 
    LIMIT 10
""").fetchdf()

total_rows = con.execute("SELECT COUNT(*) as total FROM records").fetchdf().iloc[0,0]
print(f"\nTotal records: {total_rows:,}")
print("\nTop 10 collections by record count:")
display(collections_df)


Total records: 900,000

Top 10 collections by record count:


Unnamed: 0,collection,count,percentage
0,app.bsky.feed.like,453558,50.4
1,app.bsky.graph.follow,205106,22.79
2,app.bsky.feed.repost,122556,13.62
3,app.bsky.feed.post,96415,10.71
4,app.bsky.graph.block,11719,1.3
5,app.bsky.graph.listitem,4926,0.55
6,app.bsky.actor.profile,4055,0.45
7,chat.bsky.actor.declaration,634,0.07
8,app.bsky.graph.listblock,448,0.05
9,app.bsky.feed.postgate,272,0.03


In [49]:
con.execute("""
    WITH parsed_records AS (
        SELECT 
            *,
            JSON_EXTRACT_STRING(record, '$.createdAt') as profile_created_at
        FROM records 
        WHERE collection = 'app.bsky.actor.profile'
    )
    SELECT
        MIN(profile_created_at) as earliest_profile,
        MAX(profile_created_at) as latest_profile
    FROM parsed_records
""").fetchdf()


Unnamed: 0,earliest_profile,latest_profile
0,2024-02-07T20:08:11.868Z,2025-01-08T17:59:24.403Z


In [50]:
# Query one random example from each collection
for collection_name in collections_df['collection'].head(7):
    query = f"""
    SELECT repo, rkey, at_rev, record
    FROM records 
    WHERE collection = '{collection_name}'
    ORDER BY RANDOM()
    LIMIT 1
    """
    
    result = con.execute(query).fetchdf()
    
    if not result.empty:
        # Extract all fields
        record = json.loads(result['record'][0])
        repo = result['repo'][0]
        rkey = result['rkey'][0]
        at_rev = result['at_rev'][0]
        
        filename = f"examples/{collection_name.replace('.', '_')}.json"
        
        # Combine all fields into one JSON object
        json_data = {
            'repo': repo,
            'rkey': rkey,
            'at_rev': at_rev,
            'record': record
        }
        
        with open(filename, 'w') as f:
            json.dump(json_data, f, indent=2)
        
        print(f"Saved example for {collection_name} to {filename}")

Saved example for app.bsky.feed.like to examples/app_bsky_feed_like.json
Saved example for app.bsky.graph.follow to examples/app_bsky_graph_follow.json
Saved example for app.bsky.feed.repost to examples/app_bsky_feed_repost.json
Saved example for app.bsky.feed.post to examples/app_bsky_feed_post.json
Saved example for app.bsky.graph.block to examples/app_bsky_graph_block.json
Saved example for app.bsky.graph.listitem to examples/app_bsky_graph_listitem.json
Saved example for app.bsky.actor.profile to examples/app_bsky_actor_profile.json


In [51]:
posts_query = """
WITH user_interactions AS (
    -- Posts
    SELECT 
        repo as user_id,
        'at://' || repo || '/app.bsky.feed.post/' || rkey as post_uri,
        'create' as interaction_type,
        TRY_CAST(JSON_EXTRACT_STRING(record, '$.createdAt') AS TIMESTAMP) as timestamp
    FROM records
    WHERE collection = 'app.bsky.feed.post'
        AND LENGTH(record) > 0
)
SELECT 
    user_id,
    post_uri,
    interaction_type,
    timestamp
FROM user_interactions
LIMIT 1000
"""

posts_df = con.execute(posts_query).fetchdf()
print("Shape:", posts_df.shape)
display(posts_df.head())

Shape: (1000, 4)


Unnamed: 0,user_id,post_uri,interaction_type,timestamp
0,did:plc:ti3b2ukif6nnkcljssqhjjxu,at://did:plc:ti3b2ukif6nnkcljssqhjjxu/app.bsky...,create,2023-11-01 20:05:04.988
1,did:plc:ti3b2ukif6nnkcljssqhjjxu,at://did:plc:ti3b2ukif6nnkcljssqhjjxu/app.bsky...,create,2023-11-06 17:23:12.606
2,did:plc:ti3b2ukif6nnkcljssqhjjxu,at://did:plc:ti3b2ukif6nnkcljssqhjjxu/app.bsky...,create,2023-11-06 17:26:16.625
3,did:plc:ti3b2ukif6nnkcljssqhjjxu,at://did:plc:ti3b2ukif6nnkcljssqhjjxu/app.bsky...,create,2024-01-21 14:31:26.472
4,did:plc:ti3b2ukif6nnkcljssqhjjxu,at://did:plc:ti3b2ukif6nnkcljssqhjjxu/app.bsky...,create,2024-02-06 17:08:48.610


In [58]:
likes_query = """
WITH user_interactions AS (
    -- Likes
    SELECT 
        repo as user_id,
        JSON_EXTRACT_STRING(record, '$.subject.uri') as post_uri,
        'like' as interaction_type,
        TRY_CAST(JSON_EXTRACT_STRING(record, '$.createdAt') AS TIMESTAMP) as timestamp
    FROM records
    WHERE collection = 'app.bsky.feed.like'
        AND LENGTH(record) > 0
)
SELECT 
    user_id,
    post_uri,
    interaction_type,
    timestamp
FROM user_interactions
LIMIT 1000
"""

likes_df = con.execute(likes_query).fetchdf()
print("Shape:", likes_df.shape)
display(likes_df.head())

Shape: (1000, 4)


Unnamed: 0,user_id,post_uri,interaction_type,timestamp
0,did:plc:gvbw6a2aljsqdwldefshadqu,at://did:plc:2gmt6lkqkh4uzrr2ovk5ypdu/app.bsky...,like,2024-08-29 22:18:43.095
1,did:plc:gvbw6a2aljsqdwldefshadqu,at://did:plc:twzs2gd63zghqlqbzzpnmchi/app.bsky...,like,2024-08-29 22:20:00.555
2,did:plc:gvbw6a2aljsqdwldefshadqu,at://did:plc:25w2qwhc5a44hj7goepvty4s/app.bsky...,like,2024-08-29 22:20:16.804
3,did:plc:gvbw6a2aljsqdwldefshadqu,at://did:plc:e4tomjseguozopelge6siysc/app.bsky...,like,2024-08-29 22:21:10.290
4,did:plc:gvbw6a2aljsqdwldefshadqu,at://did:plc:5trbo6ijzaizkszgcpvx7gnf/app.bsky...,like,2024-08-30 18:37:58.288


In [56]:
# Combine posts and likes dataframes
combined_df = pd.concat([posts_df, likes_df])

# Create user and post mappings to convert IDs to indices
user_to_idx = {uid: idx for idx, uid in enumerate(combined_df['user_id'].unique())}
post_to_idx = {pid: idx for idx, pid in enumerate(combined_df['post_uri'].unique())}

# Import required libraries
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD

# Create sparse interaction matrix
rows = [user_to_idx[user] for user in combined_df['user_id']]
cols = [post_to_idx[post] for post in combined_df['post_uri']]
data = [1.0 if interaction == 'like' else 0.5 for interaction in combined_df['interaction_type']]  # Weight likes more than creates

interaction_matrix = csr_matrix((data, (rows, cols)), 
                              shape=(len(user_to_idx), len(post_to_idx)))

# Create embeddings using SVD
n_components = 32  # You can adjust this
svd = TruncatedSVD(n_components=n_components, random_state=42)
user_embeddings = svd.fit_transform(interaction_matrix)
post_embeddings = svd.components_.T

# Print shapes to verify
print(f"User embeddings shape: {user_embeddings.shape}")
print(f"Post embeddings shape: {post_embeddings.shape}")
print(f"Explained variance ratio: {svd.explained_variance_ratio_.sum():.3f}")

User embeddings shape: (24, 24)
Post embeddings shape: (2000, 24)
Explained variance ratio: 1.000


In [54]:
user_embeddings

array([[ 2.40044084e-15, -4.54207119e-15,  7.24568837e+00,
        -3.18621742e-16, -5.38548436e-16,  1.46471749e-16,
         2.35348178e-17,  6.06282298e-19, -1.03408888e-19,
         2.00262218e-19,  1.11145625e-19,  1.93180581e-20,
        -1.96375506e-20, -2.09721679e-20,  6.98768033e-21,
         5.08756833e-22, -7.63429781e-23,  7.89661300e-23,
        -4.15763687e-23, -3.55485692e-24, -9.90383780e-25,
         7.52984784e-26,  3.22126937e-26,  1.28240538e-26],
       [ 4.79494856e-17, -6.13837306e-16,  1.39404483e-17,
        -1.11436757e-16, -5.60177111e-16,  8.08336012e-16,
         3.70809924e+00, -1.55653359e-15,  3.09679314e-17,
         1.97726024e-16, -1.93539778e-16,  6.40944141e-18,
        -1.35279472e-17,  2.29151277e-17,  3.30406116e-18,
        -9.37951253e-17,  5.59224645e-17, -2.57129470e-16,
        -2.86813855e-16,  6.11510635e-17,  1.90296387e-17,
        -5.57751457e-18, -1.19467765e-17,  5.24320765e-18],
       [ 1.74766655e-15,  8.07774721e+00,  1.09505257e