In [1]:
import torch
from data_processing import load_interactions, create_interaction_matrices, create_adj_matrix
from model import LightGCN, UltraGCN
from training import train_lightgcn, train_ultragcn
from recommend import get_recommendations
from scipy.sparse import csr_matrix
from typing import List, Tuple, Dict
import umap
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
import duckdb


def print_user_interactions_and_recommendations(
    user_id: str,
    interaction_matrix: csr_matrix,
    follow_matrix: csr_matrix,
    recommendations: List[Tuple[str, str]],
    user_mapping: Dict,
    post_mapping: Dict,
    n_likes: int = 5,
    n_following: int = 10
):
    """Print a user's existing likes, follows, and recommendations with rich content."""
    # Connect to DuckDB
    con = duckdb.connect('../random_tests/scan_results.duckdb')
    
    # Get user's follows
    user_idx = user_mapping[user_id]
    user_follows = follow_matrix[user_idx].tocoo()
    following_indices = user_follows.col
    
    if len(following_indices) > n_following:
        following_indices = np.random.choice(following_indices, n_following, replace=False)
    
    # Convert indices back to URIs
    reverse_user_mapping = {v: k for k, v in user_mapping.items()}
    following_users = [reverse_user_mapping[idx] for idx in following_indices]
    
    # Print following users
    print(f"\nUser {user_id}'s {len(following_users)} random follows:")
    for i, following_uri in enumerate(following_users, 1):
        web_url = f"https://bsky.app/profile/{following_uri}"
        print(f"\n{i}. {following_uri}")
        print(f"   URL: {web_url}")
    
    # Get user's liked posts
    user_interactions = interaction_matrix[user_idx].tocoo()
    liked_post_indices = user_interactions.col[:n_likes]
    
    # Convert indices back to URIs
    reverse_post_mapping = {v: k for k, v in post_mapping.items()}
    liked_posts = [reverse_post_mapping[idx] for idx in liked_post_indices]
    
    # Fetch post content
    liked_posts_content = con.execute("""
        SELECT 
            'at://' || repo || '/app.bsky.feed.post/' || rkey as post_uri,
            json_extract_string(record, '$.text') as text,
            repo as author,
            createdAt
        FROM records 
        WHERE collection = 'app.bsky.feed.post'
            AND 'at://' || repo || '/app.bsky.feed.post/' || rkey IN (SELECT UNNEST(?))
    """, [liked_posts]).fetchdf()
    
    # Create post lookup
    post_content = {
        row['post_uri']: {
            'text': row['text'],
            'author': row['author'],
            'created_at': row['createdAt']
        }
        for _, row in liked_posts_content.iterrows()
    }
    
    # Print liked posts with content
    print(f"\nUser {user_id}'s {n_likes} most recent likes:")
    for i, post_uri in enumerate(liked_posts, 1):
        web_url = f"https://bsky.app/profile/{post_uri.split('/')[2]}/post/{post_uri.split('/')[-1]}"
        content = post_content.get(post_uri, {'text': 'Post not found', 'author': 'Unknown', 'created_at': 'Unknown'})
        print(f"\n{i}. By @{content['author']}")
        print(f"   {web_url}")
        print(f"   Posted: {content['created_at']}")
        print(f"   Text: {content['text'][:200]}...")  # Truncate long posts
    
    # Fetch recommendation content
    rec_uris = [uri for uri, _ in recommendations]
    rec_content = con.execute("""
        SELECT 
            'at://' || repo || '/app.bsky.feed.post/' || rkey as post_uri,
            json_extract_string(record, '$.text') as text,
            repo as author,
            createdAt
        FROM records 
        WHERE collection = 'app.bsky.feed.post'
            AND 'at://' || repo || '/app.bsky.feed.post/' || rkey IN (SELECT UNNEST(?))
    """, [rec_uris]).fetchdf()
    
    # Create recommendation lookup
    rec_lookup = {
        row['post_uri']: {
            'text': row['text'],
            'author': row['author'],
            'created_at': row['createdAt']
        }
        for _, row in rec_content.iterrows()
    }
    
    # Print recommendations with content
    print(f"\nTop {len(recommendations)} recommendations:")
    for i, (uri, web_url) in enumerate(recommendations, 1):
        content = rec_lookup.get(uri, {'text': 'Post not found', 'author': 'Unknown', 'created_at': 'Unknown'})
        print(f"\n{i}. By @{content['author']}")
        print(f"   {web_url}")
        print(f"   Posted: {content['created_at']}")
        print(f"   Text: {content['text'][:200]}...")  # Truncate long posts
    
    con.close()

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load and process data
print("Loading data...")
likes_df, follows_df, posts_df = load_interactions()
interaction_matrix, follow_matrix, user_mapping, post_mapping = create_interaction_matrices(likes_df, follows_df)

  from .autonotebook import tqdm as notebook_tqdm


Loading data...
Loaded 3618997 likes, 2426775 follows, and 3618975 unique posts before May 2023


In [2]:
# Print matrix dimensions
print(f"Interaction matrix shape: {interaction_matrix.shape}")
print(f"Follow matrix shape: {follow_matrix.shape}")
print(f"Number of users: {len(user_mapping)}")
print(f"Number of posts: {len(post_mapping)}")
print(f"Number of likes: {interaction_matrix.nnz}")
print(f"Number of follows: {follow_matrix.nnz}")

# Create adjacency matrix
print("\nCreating adjacency matrix...")
adj_matrix = create_adj_matrix(interaction_matrix, follow_matrix)
print(f"Adjacency matrix shape: {adj_matrix.size()}")

# Initialize model
print("\nInitializing model...")
model = LightGCN(
    num_users=len(user_mapping),
    num_items=len(post_mapping),
    embedding_dim=128,  # Reduced from 64
    num_layers=1      # Reduced from 3
)

# Training setup with memory-efficient parameters
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-4)

# Train model with smaller batch size
print("\nTraining model...")
train_lightgcn(
    model=model,
    optimizer=optimizer,
    train_data=interaction_matrix,
    adj_matrix=adj_matrix,
    epochs=2,
    batch_size=128,  # Smaller batch size
    device=device
)

# Example: Get recommendations for a user
print("\nGetting recommendations...")
user_id = list(user_mapping.keys())[17]  # Get first user as example
recommendations = get_recommendations(
    model=model,
    user_id=user_id,
    user_mapping=user_mapping,
    post_mapping=post_mapping,
    adj_matrix=adj_matrix,
    top_k=20,  # Explicitly setting to 20
    device=device
)

# Print both interactions and recommendations
print_user_interactions_and_recommendations(
    user_id=user_id,
    interaction_matrix=interaction_matrix,
    follow_matrix=follow_matrix,  # Added follow_matrix
    recommendations=recommendations,
    user_mapping=user_mapping,
    post_mapping=post_mapping,
    n_likes=10,
    n_following=10  # Added n_following
)

Interaction matrix shape: (61192, 1023480)
Follow matrix shape: (61192, 61192)
Number of users: 61192
Number of posts: 1023480
Number of likes: 3618040
Number of follows: 2412410

Creating adjacency matrix...


  adj = torch.sparse.FloatTensor(


Adjacency matrix shape: torch.Size([1084672, 1084672])

Initializing model...

Training model...


Training: 100%|██████████| 2/2 [02:26<00:00, 73.18s/it, avg_loss=0.6931]



Getting recommendations...

User did:plc:dhfmzwcqn6wbniomsekyidhy's 10 random follows:

1. did:plc:lesnvfd3lu3l4nv6c7w5j32r
   URL: https://bsky.app/profile/did:plc:lesnvfd3lu3l4nv6c7w5j32r

2. did:plc:hwwuhb7euc3tsivmoo4oq5u3
   URL: https://bsky.app/profile/did:plc:hwwuhb7euc3tsivmoo4oq5u3

3. did:plc:ocoqcmblonv3x3ffdztovf7t
   URL: https://bsky.app/profile/did:plc:ocoqcmblonv3x3ffdztovf7t

4. did:plc:lp6p54mbbesqispchdqwjx6y
   URL: https://bsky.app/profile/did:plc:lp6p54mbbesqispchdqwjx6y

5. did:plc:njguw7whj5obo6pt44xtg5hz
   URL: https://bsky.app/profile/did:plc:njguw7whj5obo6pt44xtg5hz

6. did:plc:kvdzawjtrc2h5rreicsh4noq
   URL: https://bsky.app/profile/did:plc:kvdzawjtrc2h5rreicsh4noq

7. did:plc:nm6og6lh5vd6obuozanjm4od
   URL: https://bsky.app/profile/did:plc:nm6og6lh5vd6obuozanjm4od

8. did:plc:x33ztdhow5mohxrpgvkbaacd
   URL: https://bsky.app/profile/did:plc:x33ztdhow5mohxrpgvkbaacd

9. did:plc:qafkjckuduuwecw62d6yen7d
   URL: https://bsky.app/profile/did:plc:qafkjckudu

In [3]:
# Print matrix dimensions
print(f"Interaction matrix shape: {interaction_matrix.shape}")
print(f"Follow matrix shape: {follow_matrix.shape}")
print(f"Number of users: {len(user_mapping)}")
print(f"Number of posts: {len(post_mapping)}")
print(f"Number of likes: {interaction_matrix.nnz}")
print(f"Number of follows: {follow_matrix.nnz}")

# Initialize model
print("\nInitializing model...")
model = UltraGCN(
    num_users=len(user_mapping),
    num_items=len(post_mapping),
    embedding_dim=128  # Reduced from 64
)

# Training setup
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Train model
print("\nTraining model...")
train_ultragcn(
    model=model,
    optimizer=optimizer,
    train_data=interaction_matrix,
    epochs=1,
    batch_size=128,
    device=device
)

# Example: Get recommendations for a user
print("\nGetting recommendations...")
user_id = list(user_mapping.keys())[17]  # Get first user as example
recommendations = get_recommendations(
    model=model,
    user_id=user_id,
    user_mapping=user_mapping,
    post_mapping=post_mapping,
    adj_matrix=create_adj_matrix(interaction_matrix, follow_matrix),
    top_k=20,
    device=device
)

# Print both interactions and recommendations
print_user_interactions_and_recommendations(
    user_id=user_id,
    interaction_matrix=interaction_matrix,
    follow_matrix=follow_matrix,  # Added follow_matrix
    recommendations=recommendations,
    user_mapping=user_mapping,
    post_mapping=post_mapping,
    n_likes=10,
    n_following=10  # Added n_following
)

Interaction matrix shape: (61192, 1023480)
Follow matrix shape: (61192, 61192)
Number of users: 61192
Number of posts: 1023480
Number of likes: 3618040
Number of follows: 2412410

Initializing model...

Training model...
Epoch 0: Loss = 0.7261

Getting recommendations...

User did:plc:dhfmzwcqn6wbniomsekyidhy's 10 random follows:

1. did:plc:7wzhuhnkgly4chjtbmv5i2ai
   URL: https://bsky.app/profile/did:plc:7wzhuhnkgly4chjtbmv5i2ai

2. did:plc:4kbj6g3iqy64qvkdfgzehgx6
   URL: https://bsky.app/profile/did:plc:4kbj6g3iqy64qvkdfgzehgx6

3. did:plc:onanopigdjgmhbjcvc7qe653
   URL: https://bsky.app/profile/did:plc:onanopigdjgmhbjcvc7qe653

4. did:plc:hoj2ei7terv5yqpq4sfppfb4
   URL: https://bsky.app/profile/did:plc:hoj2ei7terv5yqpq4sfppfb4

5. did:plc:rbbw3odu2obiuytel6vbx7db
   URL: https://bsky.app/profile/did:plc:rbbw3odu2obiuytel6vbx7db

6. did:plc:6gb3zuae6f3sohjauvnmhmji
   URL: https://bsky.app/profile/did:plc:6gb3zuae6f3sohjauvnmhmji

7. did:plc:x33ztdhow5mohxrpgvkbaacd
   URL: https

In [4]:
user_ids = list(user_mapping.keys())
print("Got user ids")
# Get number of likes and follows for each user
user_stats = []
for user_id in user_ids[:1000]:  # First 1000 users
    user_idx = user_mapping[user_id]
    n_likes = interaction_matrix[user_idx].nnz
    n_follows = follow_matrix[user_idx].nnz
    n_followers = follow_matrix[:,user_idx].nnz
    user_stats.append({
        'user_id': user_id,
        'likes': n_likes,
        'follows': n_follows,
        'followers': n_followers
    })

# Convert to DataFrame for nice display
import pandas as pd
stats_df = pd.DataFrame(user_stats)
print("\nUser Statistics:")
print(stats_df.head(20))

# Print some summary statistics
print("\nSummary Statistics:")
print(stats_df.describe())

Got user ids

User Statistics:
                             user_id  likes  follows  followers
0   did:plc:42kmtf65uqs765coei7bimwx      2      197         27
1   did:plc:7bo3bipb4qeg43bm5v5oawlu    336      118         90
2   did:plc:h7kqugmh2mvqzemxpaoxakyg      1       20         36
3   did:plc:i2klgxl4rzuym26g4gvfdvit      4       21         23
4   did:plc:nzvgo63bw5h5p7qg3zfm7q5y     15       14         17
5   did:plc:eegoq56xcpkc77rlo2s4seoi      1        2         16
6   did:plc:vb2tn23gmof5swuml3mlskw7      1        2          9
7   did:plc:yd7pivfgibnxcect3aktpj6m      2       10          5
8   did:plc:rteljm56xd4zgrfudkbbrvya      1        0         11
9   did:plc:yv7e3hl2eoi3ntqxa6ovisqw     11       12         20
10  did:plc:bjary5ts3e3fzasmuemntzdq      6       32         49
11  did:plc:lcv6xa22hl2qwqkjgyw422rf     31       17         40
12  did:plc:y7a6wpo3j2oqctfze77xw26l    231      102         72
13  did:plc:pwumafx2wqfczpwe2pfbnnmi     74       72         54
14  did:p

In [5]:
def visualize_embeddings(user_embeddings, item_embeddings, user_mapping, post_mapping, n_samples=1000):
    """Visualize user and item embeddings using UMAP with clickable links."""
    # Convert embeddings to CPU numpy arrays
    user_emb = user_embeddings.cpu().detach().numpy()
    item_emb = item_embeddings.cpu().detach().numpy()
    
    # Sample if too many points
    user_indices = np.arange(len(user_emb))
    item_indices = np.arange(len(item_emb))
    if len(user_emb) > n_samples:
        user_indices = np.random.choice(len(user_emb), n_samples, replace=False)
        user_emb = user_emb[user_indices]
    if len(item_emb) > n_samples:
        item_indices = np.random.choice(len(item_emb), n_samples, replace=False)
        item_emb = item_emb[item_indices]
    
    # Combine embeddings for UMAP
    combined_emb = np.vstack([user_emb, item_emb])
    
    # Fit UMAP
    reducer = umap.UMAP(random_state=42)
    embedding = reducer.fit_transform(combined_emb)
    
    # Split back into users and items
    user_umap = embedding[:len(user_emb)]
    item_umap = embedding[len(user_emb):]
    
    # Create hover texts with links
    reverse_user_mapping = {v: k for k, v in user_mapping.items()}
    reverse_post_mapping = {v: k for k, v in post_mapping.items()}
    
    # Create user links
    user_links = []
    for idx in user_indices:
        uri = reverse_user_mapping[idx]  # Get the full URI
        url = f"https://bsky.app/profile/{uri}"  # Use full URI
        user_links.append(f"<a href='{url}' target='_blank'>View Profile</a>")
    
    # Create post links
    post_links = []
    for idx in item_indices:
        uri = reverse_post_mapping[idx]
        parts = uri.split('/')
        did = parts[2]
        post_id = parts[-1]
        url = f"https://bsky.app/profile/{did}/post/{post_id}"
        post_links.append(f"<a href='{url}' target='_blank'>View Post</a>")
    
    # Create figure
    fig = go.Figure()
    
    # Add users
    fig.add_trace(
        go.Scatter(
            x=user_umap[:, 0],
            y=user_umap[:, 1],
            mode='markers',  # Removed text mode
            name='Users',
            hovertext=user_links,  # Changed to hovertext
            marker=dict(size=8, color='blue', opacity=0.6),
            hoverinfo='text'
        )
    )
    
    # Add posts
    fig.add_trace(
        go.Scatter(
            x=item_umap[:, 0],
            y=item_umap[:, 1],
            mode='markers',  # Removed text mode
            name='Posts',
            hovertext=post_links,  # Changed to hovertext
            marker=dict(size=8, color='red', opacity=0.6),
            hoverinfo='text'
        )
    )
    
    # Update layout
    fig.update_layout(
        title='UMAP visualization of user and post embeddings',
        showlegend=True,
        hovermode='closest'
    )
    
    fig.show()

# For LightGCN
user_embeddings, item_embeddings = model(adj_matrix)
visualize_embeddings(user_embeddings, item_embeddings, user_mapping, post_mapping)

# # For UltraGCN
# user_embeddings, item_embeddings = model.get_embeddings()
# visualize_embeddings(user_embeddings, item_embeddings)

TypeError: UltraGCN.forward() missing 1 required positional argument: 'items'