In [2]:
import json
import os

import duckdb
import pandas as pd

# Open the local DuckDB file; `con` is the connection the later cells query.
con = duckdb.connect('bluesky_180MB.duckdb')

# Row count of the whole `records` table.
TOTAL_ROWS_SQL = "SELECT COUNT(*) AS total FROM records"

# Ten most frequent collections, each with its row count and its share
# (in percent, rounded to 2 decimals) of all rows in `records`.
TOP_COLLECTIONS_SQL = """
    SELECT
        collection,
        COUNT(*) AS count,
        ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM records), 2) AS percentage
    FROM records
    GROUP BY collection
    ORDER BY count DESC
    LIMIT 10
"""

collections_df = con.execute(TOP_COLLECTIONS_SQL).df()

# The count query yields a 1x1 frame; pull the scalar out of row 0, column 0.
total_rows = con.execute(TOTAL_ROWS_SQL).df().iat[0, 0]

print(f"\nTotal records: {total_rows:,}")
print("\nTop 10 collections by record count:")
display(collections_df)


Total records: 900,000

Top 10 collections by record count:


Unnamed: 0,collection,count,percentage
0,app.bsky.feed.like,453558,50.4
1,app.bsky.graph.follow,205106,22.79
2,app.bsky.feed.repost,122556,13.62
3,app.bsky.feed.post,96415,10.71
4,app.bsky.graph.block,11719,1.3
5,app.bsky.graph.listitem,4926,0.55
6,app.bsky.actor.profile,4055,0.45
7,chat.bsky.actor.declaration,634,0.07
8,app.bsky.graph.listblock,448,0.05
9,app.bsky.feed.postgate,272,0.03


In [3]:
# Every profile record belonging to a repo that has more than one profile
# record in `records`, sorted by repo and then by created_at.
REPEATED_PROFILES_SQL = """
    WITH repeated_profile_repos AS (
        SELECT repo
        FROM records
        WHERE collection = 'app.bsky.actor.profile'
        GROUP BY repo
        HAVING COUNT(*) > 1
    )
    SELECT *
    FROM records
    WHERE collection = 'app.bsky.actor.profile'
      AND repo IN (SELECT repo FROM repeated_profile_repos)
    ORDER BY repo, created_at
"""

# Bare last expression so the notebook renders the resulting DataFrame.
con.execute(REPEATED_PROFILES_SQL).df()


Unnamed: 0,repo,collection,rkey,at_rev,created_at,deleted,record
0,did:plc:24nm3tw7chtmhvcyjpkeeda5,app.bsky.actor.profile,self,3ldu6z22rk22t,2024-12-22 03:16:26.835,False,"{""$type"":""app.bsky.actor.profile"",""avatar"":{""r..."
1,did:plc:24nm3tw7chtmhvcyjpkeeda5,app.bsky.actor.profile,self,3le6fncdjso23,2024-12-26 02:14:05.031,False,"{""$type"":""app.bsky.actor.profile"",""avatar"":{""r..."
2,did:plc:24nm3tw7chtmhvcyjpkeeda5,app.bsky.actor.profile,self,3le6ya7virx2t,2024-12-26 16:12:55.854,False,"{""$type"":""app.bsky.actor.profile"",""avatar"":{""r..."
3,did:plc:24nm3tw7chtmhvcyjpkeeda5,app.bsky.actor.profile,self,3lemiylyvoy24,2024-12-31 16:51:22.312,False,"{""$type"":""app.bsky.actor.profile"",""avatar"":{""r..."
4,did:plc:2bf6txy4s3oruf7hjdxot4bu,app.bsky.actor.profile,self,3lehjxzdi4u2j,2024-12-30 11:12:07.133,False,"{""$type"":""app.bsky.actor.profile"",""avatar"":{""r..."
...,...,...,...,...,...,...,...
352,did:plc:zkpopwaikky2b376gx6j4svn,app.bsky.actor.profile,self,3ldqsvoa4vo2h,2024-12-20 16:40:26.866,False,"{""$type"":""app.bsky.actor.profile"",""avatar"":{""r..."
353,did:plc:zkpopwaikky2b376gx6j4svn,app.bsky.actor.profile,self,3ldtoz7efkb2m,2024-12-21 20:02:31.097,False,"{""$type"":""app.bsky.actor.profile"",""avatar"":{""r..."
354,did:plc:zkpopwaikky2b376gx6j4svn,app.bsky.actor.profile,self,3ldtozmj6sn2q,2024-12-21 20:02:44.880,False,"{""$type"":""app.bsky.actor.profile"",""avatar"":{""r..."
355,did:plc:zkpopwaikky2b376gx6j4svn,app.bsky.actor.profile,self,3ldwxkb2qdp2m,2024-12-23 08:40:52.486,False,"{""$type"":""app.bsky.actor.profile"",""avatar"":{""r..."


In [4]:
# Smallest and largest `createdAt` value stored inside the JSON payload of
# profile records. JSON_EXTRACT_STRING yields text, so MIN/MAX compare the
# values as strings rather than as parsed timestamps.
PROFILE_CREATED_RANGE_SQL = """
    SELECT
        MIN(JSON_EXTRACT_STRING(record, '$.createdAt')) AS earliest_profile,
        MAX(JSON_EXTRACT_STRING(record, '$.createdAt')) AS latest_profile
    FROM records
    WHERE collection = 'app.bsky.actor.profile'
"""

# Bare last expression so the notebook renders the one-row result.
con.execute(PROFILE_CREATED_RANGE_SQL).df()


Unnamed: 0,earliest_profile,latest_profile
0,2024-02-07T20:08:11.868Z,2025-01-08T17:59:24.403Z


In [6]:
# Save one randomly chosen record from each of the first N_EXAMPLE_COLLECTIONS
# collections in `collections_df` as a pretty-printed JSON file under
# EXAMPLES_DIR. ORDER BY RANDOM() is unseeded, so each run may pick a
# different record and overwrite the previous example file.
N_EXAMPLE_COLLECTIONS = 7
EXAMPLES_DIR = "examples"

# The collection name is bound as a query parameter (`?`) rather than spliced
# into the SQL text, so quotes or other special characters in a name cannot
# alter the statement.
RANDOM_RECORD_SQL = """
    SELECT record
    FROM records
    WHERE collection = ?
    ORDER BY RANDOM()
    LIMIT 1
"""

# Create the output folder up front; open(..., 'w') does not create parents.
os.makedirs(EXAMPLES_DIR, exist_ok=True)

for collection_name in collections_df['collection'].head(N_EXAMPLE_COLLECTIONS):
    row = con.execute(RANDOM_RECORD_SQL, [collection_name]).fetchone()

    # fetchone() returns None when the query produced no rows.
    if row is None:
        continue

    filename = f"{EXAMPLES_DIR}/{collection_name.replace('.', '_')}.json"

    # Round-trip through the json module to re-serialize with indentation.
    json_data = json.loads(row[0])

    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(json_data, f, indent=2)

    print(f"Saved example for {collection_name} to {filename}")

Saved example for app.bsky.feed.like to examples/app_bsky_feed_like.json
Saved example for app.bsky.graph.follow to examples/app_bsky_graph_follow.json
Saved example for app.bsky.feed.repost to examples/app_bsky_feed_repost.json
Saved example for app.bsky.feed.post to examples/app_bsky_feed_post.json
Saved example for app.bsky.graph.block to examples/app_bsky_graph_block.json
Saved example for app.bsky.graph.listitem to examples/app_bsky_graph_listitem.json
Saved example for app.bsky.actor.profile to examples/app_bsky_actor_profile.json
