In [1]:
import duckdb 
import pandas as pd
import pickle

con = duckdb.connect()

# Read from Parquet file instead of CSV
con.execute("""
    CREATE TABLE embeddings AS SELECT * FROM read_parquet('data/bluesky_text_embeddings (1).parquet');
""")

# Let's see what columns we have
df = con.execute("""
    SELECT *
    FROM embeddings
""").fetchdf()

df.head()

Unnamed: 0,item_id,embeddings
0,3460233,"[185, 145, 87, 71, 59, 189, 187, 14, 75, 170, ..."
1,3044498,"[153, 144, 97, 118, 55, 247, 251, 7, 67, 72, 2..."
2,1582998,"[153, 56, 113, 103, 57, 243, 123, 68, 79, 88, ..."
3,5436174,"[187, 16, 101, 38, 57, 239, 91, 77, 203, 88, 1..."
4,1582999,"[153, 56, 113, 103, 105, 247, 123, 229, 77, 20..."


In [2]:
df['item_id'].max()

np.int64(5866225)

In [3]:
len(df)

4349970

In [4]:
df['embeddings'].iloc[0]

bytearray(b'\xb9\x91WG;\xbd\xbb\x0eK\xaa\xfd{\xcd\xb6\xce\xf5')

In [5]:
# Get count of unique item_ids
unique_items = con.execute("""
    SELECT COUNT(DISTINCT item_id) as unique_count
    FROM embeddings
""").fetchdf()

print(f"Number of unique item_ids: {unique_items['unique_count'][0]}")

# Get the data with unique item_ids (removing duplicates if any)
unique_df = con.execute("""
    SELECT DISTINCT item_id, embeddings
    FROM embeddings
""").fetchdf()

print(f"\nShape of dataframe with unique items: {unique_df.shape}")
unique_df.head() 

Number of unique item_ids: 4349970

Shape of dataframe with unique items: (4349970, 2)


Unnamed: 0,item_id,embeddings
0,3606809,"[187, 144, 85, 54, 120, 173, 99, 159, 99, 200,..."
1,5184074,"[57, 184, 92, 211, 148, 92, 193, 156, 103, 156..."
2,3799640,"[57, 48, 127, 115, 168, 118, 208, 174, 97, 229..."
3,1658601,"[185, 16, 85, 247, 57, 229, 246, 37, 99, 73, 2..."
4,5347981,"[169, 240, 21, 22, 86, 124, 110, 95, 216, 72, ..."


In [6]:
# Read the pickle file
with open('data/user_mapping.pkl', 'rb') as f:
    user_mapping = pickle.load(f)

with open('data/post_mapping.pkl', 'rb') as f:
    post_mapping = pickle.load(f)

user_mapping

{'did:plc:42kmtf65uqs765coei7bimwx': np.int64(0),
 'did:plc:6nm57nkqm642dipprjutntgy': np.int64(1),
 'did:plc:rf4k4tnlgn2rhpp2c24fmihd': np.int64(2),
 'did:plc:7bo3bipb4qeg43bm5v5oawlu': np.int64(3),
 'did:plc:h7kqugmh2mvqzemxpaoxakyg': np.int64(4),
 'did:plc:wjmpnnc2xgfzhbyrwywn3vfx': np.int64(5),
 'did:plc:zfs3uyktjfleqriqixjzufim': np.int64(6),
 'did:plc:i2klgxl4rzuym26g4gvfdvit': np.int64(7),
 'did:plc:6rtyggd257wbge7oauwvolkn': np.int64(8),
 'did:plc:nzvgo63bw5h5p7qg3zfm7q5y': np.int64(9),
 'did:plc:eegoq56xcpkc77rlo2s4seoi': np.int64(10),
 'did:plc:vb2tn23gmof5swuml3mlskw7': np.int64(11),
 'did:plc:byypxurudix7f5kom4747rnz': np.int64(12),
 'did:plc:ljmyslsp4xo2y7zulw6c5rgm': np.int64(13),
 'did:plc:getdgobkkcfs5bkefgorqyuv': np.int64(14),
 'did:plc:yd7pivfgibnxcect3aktpj6m': np.int64(15),
 'did:plc:p77mazfeyxc57ylgxx5zsfxn': np.int64(16),
 'did:plc:rteljm56xd4zgrfudkbbrvya': np.int64(17),
 'did:plc:mx7ifsn5pvbl5ahusidj6bnt': np.int64(18),
 'did:plc:6pym3d247baw7juwisya3lho': np.i

In [7]:
post_mapping

{'did:plc:5tgxxpsiv36w3e37im6kd2se_3ju7w7aqxoj2z': 0,
 'did:plc:p2cp5gopk7mgjegy6wadk3ep_3jumsjxf4o22b': 1,
 'did:plc:ey4an5j5m2h5v5qhcqs4hji2_3juodfizb7s2w': 2,
 'did:plc:njwcnt4qsdno2lhebdq34m2o_3jun7d2waiq2r': 3,
 'did:plc:42kmtf65uqs765coei7bimwx_3juoga45vsq2z': 4,
 'did:plc:oky5czdrnfjpqslsw2a5iclo_3juoqmpindd2r': 5,
 'did:plc:5cn7tila5pqvqk7jbkgfz6hd_3juorjhlzoq2n': 6,
 'did:plc:nzhvyk2tcsfhr4t6oijysqlc_3juormab5v42x': 7,
 'did:plc:nzhvyk2tcsfhr4t6oijysqlc_3juorqcnyzi2n': 8,
 'did:plc:kcydvqziedhbz7jyw4oydfkr_3juoteiluvz2g': 9,
 'did:plc:6vxkf3dayl4uyc77vv6deraj_3juouiqwx7a2r': 10,
 'did:plc:42kmtf65uqs765coei7bimwx_3jupbdpdacr2v': 11,
 'did:plc:e53k7tya42bgurd3cucp34a7_3juqqhigxks2o': 12,
 'did:plc:5lbewu4koddkz6lgwtmiv7mi_3juqziupj4u2t': 13,
 'did:plc:buofnbcavecxm3kr6x5npusi_3juqz3rdhpu2t': 14,
 'did:plc:5lbewu4koddkz6lgwtmiv7mi_3juqz3iurl52t': 15,
 'did:plc:buofnbcavecxm3kr6x5npusi_3juqz4npaj52t': 16,
 'did:plc:buofnbcavecxm3kr6x5npusi_3juqzqrcxzn2z': 17,
 'did:plc:cvzdmr3ssm

In [8]:
len(user_mapping)

106367

In [9]:
len(post_mapping)

5866226

In [10]:
# Read from CSV file
con.execute("""
    CREATE TABLE bluesky AS SELECT * FROM read_csv('data/bluesky.csv');
""")

<duckdb.duckdb.DuckDBPyConnection at 0x7f3472cce4b0>

In [11]:
df = con.execute("""
    SELECT *
    FROM bluesky
""").fetchdf()

df.head()

Unnamed: 0,source_node,destination_node,timestamp,edge_label
0,12248,1349,20230101024321,0
1,50947,3044497,20230101024954,0
2,24218,2347863,20230101035202,0
3,13743,1349,20230101051655,0
4,50947,1349,20230101053502,0


In [12]:
df['edge_label'].value_counts()

edge_label
0    22131398
Name: count, dtype: int64

In [13]:
import pandas as pd
import numpy as np
import duckdb

# 1. Load and unpack embeddings
con = duckdb.connect()
con.execute("""
    CREATE TABLE embeddings AS SELECT * FROM read_parquet('data/bluesky_text_embeddings (2).parquet');
""")
post_embeddings_df = con.execute("SELECT * FROM embeddings").fetchdf()

# def unpack_embeddings(packed_bytes):
#     return np.unpackbits(np.frombuffer(packed_bytes, dtype=np.uint8))

# post_embeddings_df['embeddings'] = post_embeddings_df['embeddings'].apply(unpack_embeddings)

# 2. Load interactions
con.execute("""
    CREATE TABLE interactions AS SELECT * FROM read_csv('data/bluesky.csv');
""")
interactions_df = con.execute("SELECT * FROM interactions").fetchdf()

# 3. Join interactions with post embeddings
joined_df = interactions_df.merge(
    post_embeddings_df,
    left_on='destination_node',
    right_on='item_id',
    how='inner'
)

# 4. Group by user and create user embeddings
user_embeddings = joined_df.groupby('source_node')['embeddings'].agg(
    lambda x: np.mean(list(x), axis=0)
).reset_index()
user_embeddings.columns = ['user_id', 'user_embedding']

# 5. Create final DataFrame with all information
final_df = joined_df.merge(
    user_embeddings,
    left_on='source_node',
    right_on='user_id',
    how='inner'
)

# Verify the data
print("Final DataFrame shape:", final_df.shape)
print("\nColumns:", final_df.columns.tolist())
print("\nSample user-post pair:")
sample = final_df.iloc[0]
print(f"User ID: {sample['source_node']}")
print(f"Post ID: {sample['destination_node']}")
print(f"User embedding (first 10):", sample['user_embedding'][:10])
print(f"Post embedding (first 10):", sample['embeddings'][:10])

Final DataFrame shape: (16943200, 8)

Columns: ['source_node', 'destination_node', 'timestamp', 'edge_label', 'item_id', 'embeddings', 'user_id', 'user_embedding']

Sample user-post pair:
User ID: 50947
Post ID: 3460233
User embedding (first 10): [ 0.15769888 -0.22164886  0.07231304 -0.00327762  0.13493281 -0.12219387
 -0.07193487  0.22267221 -0.01009528 -0.00120526]
Post embedding (first 10): [ 0.2746582  -0.1050415   0.07116699  0.0051651   0.07971191 -0.03967285
 -0.04690552  0.27392578  0.02571106 -0.04318237]


In [14]:
final_df

Unnamed: 0,source_node,destination_node,timestamp,edge_label,item_id,embeddings,user_id,user_embedding
0,50947,3460233,20230101054209,0,3460233,"[0.274658203125, -0.10504150390625, 0.07116699...",50947,"[0.15769888380192854, -0.22164885611644441, 0...."
1,50947,1582998,20230101062342,0,1582998,"[0.09759521484375, -0.233642578125, -0.0789794...",50947,"[0.15769888380192854, -0.22164885611644441, 0...."
2,24218,1582998,20230101065337,0,1582998,"[0.09759521484375, -0.233642578125, -0.0789794...",24218,"[0.13439939289449532, -0.2052124593859521, 0.0..."
3,65606,1582998,20230101072700,0,1582998,"[0.09759521484375, -0.233642578125, -0.0789794...",65606,"[0.1590409960065569, -0.22170182112809067, 0.0..."
4,95617,5436174,20230101085031,0,5436174,"[0.1181640625, -0.2105712890625, 0.00327110290...",95617,"[0.13951278411032197, -0.20915239184872478, 0...."
...,...,...,...,...,...,...,...,...
16943195,26730,2466070,20230630235958,0,2466070,"[0.1097412109375, -0.11407470703125, 0.0979614...",26730,"[0.13095803925248442, -0.22063742048753415, 0...."
16943196,94142,380415,20230630235958,0,380415,"[0.0293731689453125, -0.1512451171875, 0.22021...",94142,"[0.13742323905702622, -0.22445696876162574, 0...."
16943197,60002,172743,20230630235958,0,172743,"[0.121826171875, -0.1566162109375, 0.153442382...",60002,"[0.13576889038085938, -0.19163131713867188, 0...."
16943198,79881,4836642,20230630235958,0,4836642,"[0.162109375, -0.3369140625, 0.09149169921875,...",79881,"[0.15100748504823594, -0.21996993962518002, 0...."


In [23]:
# Create reverse mapping for posts
reverse_post_mapping = {v: k for k, v in post_mapping.items()}
reverse_user_mapping = {v: k for k, v in user_mapping.items()}

# Convert embeddings to numpy arrays and normalize them for cosine similarity
post_data = final_df[['destination_node', 'embeddings']].drop_duplicates('destination_node')
post_embeddings = np.stack(post_data['embeddings'].values)
post_ids = post_data['destination_node'].values

# Normalize the embeddings for faster cosine similarity computation
# normalized_embeddings = post_embeddings / np.linalg.norm(post_embeddings, axis=1)[:, np.newaxis]

def get_post_url(post_id):
    """Convert post ID to URL using the post_mapping"""
    if post_id in reverse_post_mapping:
        full_id = reverse_post_mapping[post_id]
        did, post_ref = full_id.split('_')
        return f"https://bsky.app/profile/{did}/post/{post_ref}"
    return f"Post ID: {post_id} (not found in mapping)"

# Sample some random posts and find their nearest neighbors
sample_indices = np.random.choice(len(post_embeddings), 5)

for idx in sample_indices:
    query_embedding = post_embeddings[idx]
    
    # Compute similarities with all posts
    similarities = np.dot(post_embeddings, query_embedding)
    
    # Get top 6 (including self) most similar posts
    most_similar_indices = np.argsort(similarities)[-6:][::-1]
    
    print(f"\nQuery Post: {get_post_url(post_ids[idx])}")
    print("\nMost similar posts:")
    
    for rank, similar_idx in enumerate(most_similar_indices[1:]):  # Skip first one (self)
        similarity = similarities[similar_idx]
        print(f"\nRank {rank + 1}: {get_post_url(post_ids[similar_idx])}")
        print(f"Similarity: {similarity:.4f}")


Query Post: https://bsky.app/profile/did:plc:gnvhtgkmq53r6tcfvvbphyx4/post/3jurgn2ek522s

Most similar posts:

Rank 1: https://bsky.app/profile/did:plc:gkhlgiv6hsui5gqyye2zrbd3/post/3jwzxgnopvc2k
Similarity: 0.8924

Rank 2: https://bsky.app/profile/did:plc:6lsbgmtok4wqpt4zyspevk3v/post/3juuplzmqfm27
Similarity: 0.8886

Rank 3: https://bsky.app/profile/did:plc:pfs5plbvxqwicww6fky6lmsl/post/3ju52pzktrn2u
Similarity: 0.8812

Rank 4: https://bsky.app/profile/did:plc:no4ezg3ozd6jmlpq4lrmedux/post/3jzakluw6lf26
Similarity: 0.8808

Rank 5: https://bsky.app/profile/did:plc:bguyhmlfkmegq37galldygmd/post/3jzdzudnkml22
Similarity: 0.8749

Query Post: https://bsky.app/profile/did:plc:lcytlkvzs3wslcgbk7i3ygak/post/3jwnp44c5o22b

Most similar posts:

Rank 1: https://bsky.app/profile/did:plc:yzaunobfvtqk2dze55f7k7oo/post/3jz2ivrejfr2x
Similarity: 0.8916

Rank 2: https://bsky.app/profile/did:plc:scppfp4gxifrjzax2pbud3g3/post/3jvxwdzihgc2c
Similarity: 0.8858

Rank 3: https://bsky.app/profile/did:plc:n