In [1]:
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch
from tqdm import tqdm
import pickle
from model import SimpleTwoTowerModel
import duckdb
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the parquet file
df = pd.read_parquet("data/bluesky_text_embeddings (2).parquet")

# Unpack the binary embeddings
# def unpack_embeddings(packed_bytes):
#     return np.unpackbits(np.frombuffer(packed_bytes, dtype=np.uint8))

# Apply unpacking to get original binary embeddings
# df['embeddings'] = df['embeddings'].apply(unpack_embeddings)

# Now you can look at the first few rows to verify
print("First embedding shape:", len(df['embeddings'].iloc[0]))
print(df[['item_id', 'embeddings']].head())

First embedding shape: 128
   item_id                                         embeddings
0  3460233  [0.274658203125, -0.10504150390625, 0.07116699...
1  3044498  [0.18701171875, -0.1715087890625, -0.009239196...
2  1582998  [0.09759521484375, -0.233642578125, -0.0789794...
3  5436174  [0.1181640625, -0.2105712890625, 0.00327110290...
4  1582999  [0.106201171875, -0.227294921875, -0.018081665...


## Load the Embeddings and make finals_df and interactions_df

In [2]:
# 1. Load and unpack embeddings
con = duckdb.connect()
con.execute("""
    CREATE TABLE embeddings AS SELECT * FROM read_parquet('data/bluesky_text_embeddings (2).parquet');
""")
post_embeddings_df = con.execute("SELECT * FROM embeddings").fetchdf()

# def unpack_embeddings(packed_bytes):
#     return np.unpackbits(np.frombuffer(packed_bytes, dtype=np.uint8))

# post_embeddings_df['embeddings'] = post_embeddings_df['embeddings'].apply(unpack_embeddings)

# 2. Load interactions
con.execute("""
    CREATE TABLE interactions AS SELECT * FROM read_csv('data/bluesky.csv');
""")
interactions_df = con.execute("SELECT * FROM interactions").fetchdf()

# 3. Join interactions with post embeddings
joined_df = interactions_df.merge(
    post_embeddings_df,
    left_on='destination_node',
    right_on='item_id',
    how='inner'
)

# 4. Group by user and create user embeddings
user_embeddings = joined_df.groupby('source_node')['embeddings'].agg(
    lambda x: np.mean(list(x), axis=0)
).reset_index()
user_embeddings.columns = ['user_id', 'user_embedding']

# 5. Create final DataFrame with all information
final_df = joined_df.merge(
    user_embeddings,
    left_on='source_node',
    right_on='user_id',
    how='inner'
)

# Verify the data
print("Final DataFrame shape:", final_df.shape)
print("\nColumns:", final_df.columns.tolist())
print("\nSample user-post pair:")
sample = final_df.iloc[0]
print(f"User ID: {sample['source_node']}")
print(f"Post ID: {sample['destination_node']}")
print(f"User embedding (first 10):", sample['user_embedding'][:10])
print(f"Post embedding (first 10):", sample['embeddings'][:10])

Final DataFrame shape: (16943200, 8)

Columns: ['source_node', 'destination_node', 'timestamp', 'edge_label', 'item_id', 'embeddings', 'user_id', 'user_embedding']

Sample user-post pair:
User ID: 50947
Post ID: 3460233
User embedding (first 10): [ 0.15769888 -0.22164886  0.07231304 -0.00327762  0.13493281 -0.12219387
 -0.07193487  0.22267221 -0.01009528 -0.00120526]
Post embedding (first 10): [ 0.2746582  -0.1050415   0.07116699  0.0051651   0.07971191 -0.03967285
 -0.04690552  0.27392578  0.02571106 -0.04318237]


In [3]:
final_df

Unnamed: 0,source_node,destination_node,timestamp,edge_label,item_id,embeddings,user_id,user_embedding
0,50947,3460233,20230101054209,0,3460233,"[0.274658203125, -0.10504150390625, 0.07116699...",50947,"[0.15769888380192854, -0.22164885611644441, 0...."
1,50947,1582998,20230101062342,0,1582998,"[0.09759521484375, -0.233642578125, -0.0789794...",50947,"[0.15769888380192854, -0.22164885611644441, 0...."
2,24218,1582998,20230101065337,0,1582998,"[0.09759521484375, -0.233642578125, -0.0789794...",24218,"[0.13439939289449532, -0.2052124593859521, 0.0..."
3,65606,1582998,20230101072700,0,1582998,"[0.09759521484375, -0.233642578125, -0.0789794...",65606,"[0.1590409960065569, -0.22170182112809067, 0.0..."
4,95617,5436174,20230101085031,0,5436174,"[0.1181640625, -0.2105712890625, 0.00327110290...",95617,"[0.13951278411032197, -0.20915239184872478, 0...."
...,...,...,...,...,...,...,...,...
16943195,26730,2466070,20230630235958,0,2466070,"[0.1097412109375, -0.11407470703125, 0.0979614...",26730,"[0.13095803925248442, -0.22063742048753415, 0...."
16943196,94142,380415,20230630235958,0,380415,"[0.0293731689453125, -0.1512451171875, 0.22021...",94142,"[0.13742323905702622, -0.22445696876162574, 0...."
16943197,60002,172743,20230630235958,0,172743,"[0.121826171875, -0.1566162109375, 0.153442382...",60002,"[0.13576889038085938, -0.19163131713867188, 0...."
16943198,79881,4836642,20230630235958,0,4836642,"[0.162109375, -0.3369140625, 0.09149169921875,...",79881,"[0.15100748504823594, -0.21996993962518002, 0...."


In [4]:
interactions_df

Unnamed: 0,source_node,destination_node,timestamp,edge_label
0,12248,1349,20230101024321,0
1,50947,3044497,20230101024954,0
2,24218,2347863,20230101035202,0
3,13743,1349,20230101051655,0
4,50947,1349,20230101053502,0
...,...,...,...,...
22131393,79881,4836642,20230630235958,0
22131394,103308,47287,20230630235959,0
22131395,87720,1073032,20230630235959,0
22131396,27780,1077586,20230630235959,0


In [5]:
# Pick a random user_id to examine
user_id = final_df['user_id'].iloc[21]

# Get their embedding
user_emb = final_df[final_df['user_id'] == user_id]['user_embedding'].iloc[0]

# Get all posts this user liked
liked_posts = final_df[final_df['user_id'] == user_id]['destination_node'].values

# Get embeddings for these posts
liked_post_embeddings = post_embeddings_df[post_embeddings_df['item_id'].isin(liked_posts)]['embeddings'].values

print(f"User {user_id}:")
print(f"Number of liked posts: {len(liked_posts)}")
print(f"\nUser embedding (first 20 values):\n{user_emb[:20]}")
print(f"\nLiked posts embeddings (first 3 posts, first 20 values):")
for i, emb in enumerate(liked_post_embeddings[:9]):
    print(f"Post {i}: {emb[:20]}")

# Verify that user embedding is indeed the average
avg_liked_embeddings = np.mean(liked_post_embeddings, axis=0)
print(f"\nVerification - are user embeddings the average of liked posts?")
print(f"Max difference: {np.max(np.abs(user_emb - avg_liked_embeddings))}")

User 35444:
Number of liked posts: 299

User embedding (first 20 values):
[ 0.14168766 -0.21814904  0.07155541  0.01467382  0.12980633 -0.12021306
 -0.0813206   0.21462916 -0.01470117  0.00618066 -0.03683629  0.23589965
 -0.02831887 -0.08918404 -0.12999587 -0.03339204 -0.15233823  0.11898526
  0.00622435  0.01602863]

Liked posts embeddings (first 3 posts, first 20 values):
Post 0: [ 0.18701172 -0.17150879 -0.0092392   0.06329346  0.05584717 -0.31567383
 -0.02438354  0.3347168   0.01186371 -0.04571533 -0.03271484  0.15649414
 -0.05749512 -0.08392334 -0.29541016 -0.09289551 -0.1776123   0.11749268
  0.02186584 -0.03771973]
Post 1: [ 0.10620117 -0.22729492 -0.01808167  0.00167561  0.1484375  -0.10430908
 -0.17626953  0.08087158 -0.05752563 -0.0049057   0.03503418  0.2097168
  0.10949707 -0.2244873  -0.21923828 -0.1116333  -0.20629883  0.11004639
  0.04299927  0.02284241]
Post 2: [ 0.08703613 -0.2019043  -0.01100922  0.09820557  0.14025879 -0.13244629
 -0.13562012  0.10418701  0.09564209 

## Create the Dataset and Dataloader and Train the Model

In [6]:
class UserPostDataset(Dataset):
    def __init__(self, df, negative_samples=1):
        self.df = df
        self.negative_samples = negative_samples
        # Precompute user's positive posts for faster lookup
        self.user_positives = {
            user: set(group['destination_node'].values) 
            for user, group in df.groupby('source_node')
        }
        self.all_posts = df['destination_node'].unique()
        # Calculate total length including negative samples
        self.length = len(df) * (self.negative_samples + 1)
        
    def __len__(self):
        return self.length  # Return integer length
    
    def __getitem__(self, idx):
        interaction_idx = idx // (self.negative_samples + 1)
        is_positive = idx % (self.negative_samples + 1) == 0
        
        row = self.df.iloc[interaction_idx]
        user_id = row['source_node']
        
        if is_positive:
            post_emb = row['embeddings']
            user_emb = row['user_embedding']
        else:
            # Simple random sampling without checking
            neg_post_idx = np.random.choice(len(self.df))
            neg_post = self.df.iloc[neg_post_idx]
            post_emb = neg_post['embeddings']
            user_emb = row['user_embedding']
        
        return (
            torch.tensor(user_emb, dtype=torch.float32),
            torch.tensor(post_emb, dtype=torch.float32),
            torch.tensor(1.0 if is_positive else 0.0, dtype=torch.float32)
        )

# Create datasets and dataloaders
train_df, val_df = train_test_split(final_df, test_size=0.2, random_state=42)

train_dataset = UserPostDataset(train_df)
val_dataset = UserPostDataset(val_df)

train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=1024, shuffle=False, num_workers=4)

embedding_dim = 128

In [7]:
model = SimpleTwoTowerModel(embedding_dim).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCEWithLogitsLoss()

# Training loop with gradient norm monitoring

# for batch_idx, (user_features, post_features, labels) in enumerate(tqdm(train_loader)):
#     user_features = user_features.to(device)
#     post_features = post_features.to(device)
#     labels = labels.to(device)
    
#     optimizer.zero_grad()
#     user_emb, post_emb = model(user_features, post_features)
#     scores = torch.sum(user_emb * post_emb, dim=1)
#     loss = criterion(scores, labels)
    
#     loss.backward()
    
#     # Monitor gradients
#     total_norm = 0
#     for p in model.parameters():
#         if p.grad is not None:
#             param_norm = p.grad.data.norm(2)
#             total_norm += param_norm.item() ** 2
#     total_norm = total_norm ** 0.5
    
#     optimizer.step()
    
#     if batch_idx % 3000 == 0:
#         print(f"Batch {batch_idx} statistics:")
#         print(f"  Loss: {loss.item():.4f}")
#         print(f"  Gradient norm: {total_norm:.4f}")
#         print(f"  Score range: [{scores.min().item():.4f}, {scores.max().item():.4f}]")
#         print(f"  Prediction mean: {torch.sigmoid(scores).mean().item():.4f}")

In [8]:
# 1. Save the model. Comment this out if you don't want to save the model to proceed to just loading the model.
# Save model
# torch.save({
#     'model_state_dict': model.state_dict(),
#     'optimizer_state_dict': optimizer.state_dict(),
# }, 'data/two_tower_model.pt')

## Evaluate the Model

In [11]:
# Load model
model = SimpleTwoTowerModel(embedding_dim).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

checkpoint = torch.load('data/two_tower_model.pt')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

# 2. Test the model with some examples
model.eval()  # Set to evaluation mode

# Load mappings
with open('data/user_mapping.pkl', 'rb') as f:
    user_mapping = pickle.load(f)
reverse_user_mapping = {v: k for k, v in user_mapping.items()}

with open('data/post_mapping.pkl', 'rb') as f:
    post_mapping = pickle.load(f)
reverse_post_mapping = {v: k for k, v in post_mapping.items()}

def get_post_info(post_id):
    """Extract post info from mapping key"""
    full_id = reverse_post_mapping.get(post_id)
    if full_id:
        did, post_ref = full_id.split('_')
        return did, post_ref
    return None, None

def get_recommendations(user_id, n_recommendations=5):
    # Get user's embedding
    user_data = final_df[final_df['source_node'] == user_id].iloc[0]
    user_features = torch.tensor(user_data['user_embedding'], dtype=torch.float32).unsqueeze(0).to(device)
    
    # Get all post embeddings
    all_posts = final_df[['destination_node', 'embeddings']].drop_duplicates('destination_node')
    post_features = torch.tensor(np.stack(all_posts['embeddings']), dtype=torch.float32).to(device)
    
    with torch.no_grad():
        user_emb, post_emb = model(user_features, post_features)
        scores = torch.sum(user_emb * post_emb, dim=1)
        
    # Get top K recommendations
    top_k_scores, top_k_indices = torch.topk(scores, k=n_recommendations)
    recommendations = all_posts.iloc[top_k_indices.cpu().numpy()]
    
    return (
        recommendations['destination_node'].tolist(), 
        top_k_scores.cpu().numpy(),
        user_emb.cpu().numpy(),
        post_emb[top_k_indices].cpu().numpy(),
        user_features.cpu().numpy(),
        torch.tensor(np.stack(recommendations['embeddings'])).numpy()
    )

# Test for a few users
test_users = final_df['source_node'].unique()[1000:1003]

for user_id in test_users:
    user_did = reverse_user_mapping.get(user_id)
    print(f"\nUser: https://bsky.app/profile/{user_did}")
    
    # Get user's actual liked posts
    liked_posts = set(final_df[final_df['source_node'] == user_id]['destination_node'])
    print(f"Number of posts user actually liked: {len(liked_posts)}")
    
    # Get recommendations and embeddings
    rec_posts, rec_scores, user_emb, post_embs, orig_user_feat, orig_post_feats = get_recommendations(user_id, n_recommendations=5)
    
    print("\nTop 5 recommended posts:")
    for idx, (post_id, score, post_emb, orig_post_feat) in enumerate(zip(rec_posts, rec_scores, post_embs, orig_post_feats)):
        status = "Actually liked" if post_id in liked_posts else "Not liked"
        
        # Get post info
        author_did, post_ref = get_post_info(post_id)
        if author_did and post_ref:
            print(f"\nPost {idx+1}: https://bsky.app/profile/{author_did}/post/{post_ref}")
        else:
            print(f"\nPost {idx+1} ID {post_id} not found in mapping")
            
        print(f"Score: {score:.3f}, {status}")
        
        # Calculate and print similarity details
        cosine_sim = np.dot(user_emb[0], post_emb) / (np.linalg.norm(user_emb[0]) * np.linalg.norm(post_emb))
        print(f"Cosine Similarity: {cosine_sim:.3f}")

  checkpoint = torch.load('data/two_tower_model.pt')



User: https://bsky.app/profile/did:plc:jiymvt4mz2vyerrhe7o7sgdn
Number of posts user actually liked: 467

Top 5 recommended posts:

Post 1: https://bsky.app/profile/did:plc:p4474re3x6buxlzdeampcwqn/post/3jxnsga3xkf2r
Score: 0.992, Not liked
Cosine Similarity: 0.992

Post 2: https://bsky.app/profile/did:plc:oe52hqd375jatrxjdugzjnap/post/3jz5ciwxv3m2u
Score: 0.991, Not liked
Cosine Similarity: 0.991

Post 3: https://bsky.app/profile/did:plc:wd5brnxbsbcexgmvnkenkfm3/post/3jxn7rjrwfk2t
Score: 0.991, Not liked
Cosine Similarity: 0.991

Post 4: https://bsky.app/profile/did:plc:e2gj6s2ljmyen7msreqed536/post/3jwzv3aup6t2g
Score: 0.991, Not liked
Cosine Similarity: 0.991

Post 5: https://bsky.app/profile/did:plc:tbl752vbbw3buafdqzc4d7oz/post/3jwhyhnrj4p2m
Score: 0.990, Not liked
Cosine Similarity: 0.990

User: https://bsky.app/profile/did:plc:53ufa4fnn5w5jdtwnd6747th
Number of posts user actually liked: 23

Top 5 recommended posts:

Post 1: https://bsky.app/profile/did:plc:txandrhc7afdozk6a2it

In [13]:
def calculate_recall_at_k(model, test_df, k_values=[5, 10, 20, 50]):
    """Calculate recall@k for multiple k values"""
    model.eval()
    recalls = {k: [] for k in k_values}
    
    # Get unique users
    unique_users = test_df['source_node'].unique()
    
    # Get all posts for recommendations
    all_posts = test_df[['destination_node', 'embeddings']].drop_duplicates('destination_node')
    all_post_features = torch.tensor(np.stack(all_posts['embeddings']), dtype=torch.float32).to(device)
    
    for user_id in tqdm(unique_users):
        # Get user's actual liked posts
        actual_likes = set(test_df[test_df['source_node'] == user_id]['destination_node'])
        
        # Get user embedding
        user_data = test_df[test_df['source_node'] == user_id].iloc[0]
        user_features = torch.tensor(user_data['user_embedding'], dtype=torch.float32).unsqueeze(0).to(device)
        
        # Get recommendations
        with torch.no_grad():
            user_emb, post_emb = model(user_features, all_post_features)
            scores = torch.sum(user_emb * post_emb, dim=1)
            
        # Calculate recall@k for each k
        _, top_indices = torch.topk(scores, k=max(k_values))
        recommended_posts = all_posts.iloc[top_indices.cpu().numpy()]['destination_node'].tolist()
        
        for k in k_values:
            recommended_at_k = set(recommended_posts[:k])
            recall = len(recommended_at_k.intersection(actual_likes)) / len(actual_likes)
            recalls[k].append(recall)
    
    # Calculate average recall for each k
    avg_recalls = {k: np.mean(values) for k, values in recalls.items()}
    return avg_recalls

# Calculate recall@k for both validation set
print("Calculating recall@k on validation set...")
val_recalls = calculate_recall_at_k(model, val_df)
for k, recall in val_recalls.items():
    print(f"Recall@{k}: {recall:.3f}")

# Also calculate recall@k using direct embeddings for comparison
def calculate_direct_recall_at_k(test_df, k_values=[5, 10, 20, 50]):
    """Calculate recall@k using original embeddings directly"""
    recalls = {k: [] for k in k_values}
    unique_users = test_df['source_node'].unique()
    
    for user_id in tqdm(unique_users):
        # Get user's actual liked posts
        actual_likes = set(test_df[test_df['source_node'] == user_id]['destination_node'])
        
        # Get user embedding and all posts
        user_embedding = test_df[test_df['source_node'] == user_id]['user_embedding'].iloc[0]
        all_posts = test_df[['destination_node', 'embeddings']].drop_duplicates('destination_node')
        
        # Calculate similarities
        similarities = [
            np.dot(post_emb, user_embedding) / (np.linalg.norm(post_emb) * np.linalg.norm(user_embedding))
            for post_emb in all_posts['embeddings']
        ]
        
        # Get top k recommendations
        top_indices = np.argsort(similarities)[-max(k_values):][::-1]
        recommended_posts = all_posts.iloc[top_indices]['destination_node'].tolist()
        
        for k in k_values:
            recommended_at_k = set(recommended_posts[:k])
            recall = len(recommended_at_k.intersection(actual_likes)) / len(actual_likes)
            recalls[k].append(recall)
    
    avg_recalls = {k: np.mean(values) for k, values in recalls.items()}
    return avg_recalls

print("\nCalculating recall@k using direct embeddings...")
direct_recalls = calculate_direct_recall_at_k(val_df)
for k, recall in direct_recalls.items():
    print(f"Direct Recall@{k}: {recall:.3f}")

Calculating recall@k on validation set...


100%|██████████| 78322/78322 [44:59<00:00, 29.02it/s]


Recall@5: 0.000
Recall@10: 0.000
Recall@20: 0.000
Recall@50: 0.001

Calculating recall@k using direct embeddings...


  1%|          | 468/78322 [1:08:46<190:39:59,  8.82s/it]


KeyboardInterrupt: 

In [11]:
final_df

Unnamed: 0,source_node,destination_node,timestamp,edge_label,item_id,embeddings,user_id,user_embedding
0,50947,3460233,20230101054209,0,3460233,"[0.274658203125, -0.10504150390625, 0.07116699...",50947,"[0.15769888380192854, -0.22164885611644441, 0...."
1,50947,1582998,20230101062342,0,1582998,"[0.09759521484375, -0.233642578125, -0.0789794...",50947,"[0.15769888380192854, -0.22164885611644441, 0...."
2,24218,1582998,20230101065337,0,1582998,"[0.09759521484375, -0.233642578125, -0.0789794...",24218,"[0.13439939289449532, -0.2052124593859521, 0.0..."
3,65606,1582998,20230101072700,0,1582998,"[0.09759521484375, -0.233642578125, -0.0789794...",65606,"[0.1590409960065569, -0.22170182112809067, 0.0..."
4,95617,5436174,20230101085031,0,5436174,"[0.1181640625, -0.2105712890625, 0.00327110290...",95617,"[0.13951278411032197, -0.20915239184872478, 0...."
...,...,...,...,...,...,...,...,...
16943195,26730,2466070,20230630235958,0,2466070,"[0.1097412109375, -0.11407470703125, 0.0979614...",26730,"[0.13095803925248442, -0.22063742048753415, 0...."
16943196,94142,380415,20230630235958,0,380415,"[0.0293731689453125, -0.1512451171875, 0.22021...",94142,"[0.13742323905702622, -0.22445696876162574, 0...."
16943197,60002,172743,20230630235958,0,172743,"[0.121826171875, -0.1566162109375, 0.153442382...",60002,"[0.13576889038085938, -0.19163131713867188, 0...."
16943198,79881,4836642,20230630235958,0,4836642,"[0.162109375, -0.3369140625, 0.09149169921875,...",79881,"[0.15100748504823594, -0.21996993962518002, 0...."


In [12]:
def get_direct_recommendations(user_id, n_recommendations=5):
    """Get recommendations using original embeddings directly"""
    user_embedding = final_df[final_df['source_node'] == user_id]['user_embedding'].iloc[0]
    all_posts = final_df[['destination_node', 'embeddings']].drop_duplicates('destination_node')
    
    similarities = [
        np.dot(post_emb, user_embedding) / (np.linalg.norm(post_emb) * np.linalg.norm(user_embedding))
        for post_emb in all_posts['embeddings']
    ]
    
    top_k_indices = np.argsort(similarities)[-n_recommendations:][::-1]
    top_k_scores = [similarities[i] for i in top_k_indices]
    recommendations = all_posts.iloc[top_k_indices]
    
    return recommendations['destination_node'].tolist(), top_k_scores

# Test for a few users
test_users = final_df['source_node'].unique()[1000:1006]

for user_id in test_users:
    user_did = reverse_user_mapping.get(user_id)
    print(f"\nUser: https://bsky.app/profile/{user_did}")
    
    # Show user's recent likes
    user_likes = final_df[
        (final_df['source_node'] == user_id)
    ].sort_values('timestamp', ascending=False).head(5)
    
    print("\nUser's 5 Most Recent Likes:")
    for _, like in user_likes.iterrows():
        post_id = like['destination_node']
        author_did, post_ref = get_post_info(post_id)
        if author_did and post_ref:
            print(f"Post: https://bsky.app/profile/{author_did}/post/{post_ref}")
            print(f"Timestamp: {like['timestamp']}")
            print("---")
    
    # Get user's all liked posts
    liked_posts = set(final_df[final_df['source_node'] == user_id]['destination_node'])
    print(f"\nTotal number of posts user liked: {len(liked_posts)}")
    
    print("\nDirect Recommendations (using original embeddings):")
    rec_posts, rec_scores = get_direct_recommendations(user_id, n_recommendations=5)
    
    for post_id, score in zip(rec_posts, rec_scores):
        status = "Actually liked" if post_id in liked_posts else "Not liked"
        
        # Get post info
        author_did, post_ref = get_post_info(post_id)
        if author_did and post_ref:
            print(f"\nPost: https://bsky.app/profile/{author_did}/post/{post_ref}")
        else:
            print(f"\nPost ID {post_id} not found in mapping")
            
        print(f"Cosine Similarity: {score:.3f}, {status}")
        print("---")

    print("\nModel Recommendations (for comparison):")
    model_rec_posts, model_scores = get_recommendations(user_id, n_recommendations=5)
    
    for post_id, score in zip(model_rec_posts, model_scores):
        status = "Actually liked" if post_id in liked_posts else "Not liked"
        
        author_did, post_ref = get_post_info(post_id)
        if author_did and post_ref:
            print(f"\nPost: https://bsky.app/profile/{author_did}/post/{post_ref}")
        else:
            print(f"\nPost ID {post_id} not found in mapping")
            
        print(f"Model Score: {score:.3f}, {status}")
        print("---")


User: https://bsky.app/profile/did:plc:jiymvt4mz2vyerrhe7o7sgdn

User's 5 Most Recent Likes:
Post: https://bsky.app/profile/did:plc:33ihf27ze66oz3plaxucn47t/post/3jyvcdfbvuc2q
Timestamp: 20230624061811
---
Post: https://bsky.app/profile/did:plc:7gasecytjqmgvoptx6ypyhvo/post/3jybv5iftgk25
Timestamp: 20230618023235
---
Post: https://bsky.app/profile/did:plc:hwspbpfumiqyqzfeik7vilbu/post/3jxy2hdnhp72s
Timestamp: 20230612152619
---
Post: https://bsky.app/profile/did:plc:26xvddqtbyafothayvgvflzz/post/3jxxtyxhn4g2l
Timestamp: 20230612145333
---
Post: https://bsky.app/profile/did:plc:hwspbpfumiqyqzfeik7vilbu/post/3jxxt7jq7fa2j
Timestamp: 20230612145331
---

Total number of posts user liked: 467

Direct Recommendations (using original embeddings):

Post: https://bsky.app/profile/did:plc:ilsyluda2ek7zviuxr7k23yd/post/3jwr3ccfj2d2v
Cosine Similarity: 0.897, Not liked
---

Post: https://bsky.app/profile/did:plc:vipregezugaizr3kfcjijzrv/post/3jvntkqedvl26
Cosine Similarity: 0.897, Not liked
---



ValueError: too many values to unpack (expected 2)