In [134]:
from collections import defaultdict

import json
import time
import datetime
import torch
from torch_geometric.data import HeteroData

In [135]:
# Load the three raw JSON dumps (users, problems, and the holds of each problem).
# JSON text is UTF-8 by specification, so the encoding is passed explicitly:
# without it open() falls back to the platform locale (e.g. cp1252 on Windows)
# and non-ASCII text could be mis-decoded or raise UnicodeDecodeError.
with open("../data/all_users.json", "r", encoding="utf-8") as f:
    users = json.load(f)

with open("../data/all_problems.json", "r", encoding="utf-8") as f:
    problems = json.load(f)

with open("../data/all_holds.json", "r", encoding="utf-8") as f:
    problem_holds = json.load(f)

Clean the data
- Remove any users without any solved problems
- Remove any problems that have not been solved by any user
- Save the cleaned data back to JSON files

In [136]:
# Invert problem_holds: map every distinct hold to the problems that use it,
# bucketed by the section of the problem ("start", "middle" or "end") in
# which the hold appears.
holds = {}

for problem_name, sections in problem_holds.items():
    for section_name in ("start", "middle", "end"):
        for hold_name in sections[section_name]:
            # The first sighting of a hold creates its three empty buckets.
            buckets = holds.setdefault(
                hold_name, {"start": [], "middle": [], "end": []}
            )
            buckets[section_name].append(problem_name)

print(f"Number of unique holds: {len(holds)}")

Number of unique holds: 198


In [137]:
# Swap spaces for underscores in the problem names so that the keys of
# `problems` match the ones used in the holds data.
problems = dict(
    (name.replace(" ", "_"), info) for name, info in problems.items()
)

In [138]:
# Problems that do not appear in problem_holds have no hold information
# and are dropped below.
valid_problems = set(problem_holds)
problems_no_holds = [name for name in problems if name not in valid_problems]

print(f"Number of problems with no holds: {len(problems_no_holds)}")

# Walk over the users, keeping only their logged problems that are valid.
# Users with no "problems" field, or with nothing left after filtering,
# are collected for removal.
empty_users = []
for user_key, user_record in users.items():
    if "problems" not in user_record:
        empty_users.append(user_key)
        continue

    # Normalise each problem name (spaces -> underscores) before the lookup.
    renamed = (
        (name.replace(" ", "_"), entry)
        for name, entry in user_record["problems"].items()
    )
    kept = {name: entry for name, entry in renamed if name in valid_problems}

    if kept:
        users[user_key]["problems"] = kept
    else:
        empty_users.append(user_key)

print(f"Number of empty users: {len(empty_users)}")

# Deletions happen after the loops so no dict changes size while iterated.
for problem_id in problems_no_holds:
    del problems[problem_id]
for user_id in empty_users:
    del users[user_id]

Number of problems with no holds: 4084
Number of empty users: 39


In [139]:
# Fix an ordering for each node type (dict insertion order).
user_ids = [*users]
problem_ids = [*problems]
hold_ids = [*holds]

# PyG tensors address nodes by integer position, so map each id to its
# position in the corresponding list.
user_id_to_idx = dict(zip(user_ids, range(len(user_ids))))
problem_id_to_idx = dict(zip(problem_ids, range(len(problem_ids))))
hold_id_to_idx = dict(zip(hold_ids, range(len(hold_ids))))

print(f"Num of user_ids: {len(user_ids)}\nNum of problems: {len(problem_ids)}\nNum of holds: {len(hold_ids)}")

Num of user_ids: 25826
Num of problems: 34572
Num of holds: 198


### Feature "Engineering" (Converting non-numeric to numeric)

### Grades

In [140]:
# Gather every grade that occurs anywhere in the data: each user's highest
# grade, the grade logged on each user-problem interaction, and each
# problem's own grade. None marks a missing grade and is left out.
grades = set()
for u in users.values():
    seen = [u['highest_grade']]
    seen.extend(pinfo['grade'] for pinfo in u['problems'].values())
    grades.update(g for g in seen if g is not None)

grades.update(p['grade'] for p in problems.values() if p['grade'] is not None)

# Plain sort; a grade's position in this order becomes its integer code.
grades = sorted(grades)
grade_to_idx = dict(zip(grades, range(len(grades))))

print(f"Grade to index mapping: {list(grade_to_idx.items())[:5]}...")

def encode_grade(g):
    """Return the integer index of grade ``g`` in ``grade_to_idx``.

    ``None`` (no grade recorded) is encoded as ``-1``. A grade that is not a
    key of ``grade_to_idx`` raises ``KeyError``.
    """
    return -1 if g is None else grade_to_idx[g]

Grade to index mapping: [('5+', 0), ('6A', 1), ('6A+', 2), ('6B', 3), ('6B+', 4)]...


TODO: User Bio\
I see two options here:
- Use simple sentiment analysis to convert comments to numeric values between -1 and 1 (simple, low effort, low memory, low expressiveness)
- Ignore comments for now (easy, but loses information)
Computing embeddings would be too memory-intensive for possibly not much gain.

TODO: User problem comment
- Same as for User Bio

TODO: Problem Setter
- Hashing?

TODO: Problem holds ("Any marked holds" / "Footless" / "Screw ons only" / ...)
- One-hot encoding or hashing?

### Node feature matrices

In [141]:
# Build one numeric row per user, in user_ids order:
#   [ranking, highest_grade index, height, weight, problems_sent]
# Missing (None) values become 0.0; a missing highest grade becomes -1.0
# through encode_grade. The user bio is not included yet.
user_features = []

for uid in user_ids:
    record = users[uid]
    row = []
    for field in ('ranking', 'highest_grade', 'height', 'weight', 'problems_sent'):
        value = record[field]
        if field == 'highest_grade':
            row.append(float(encode_grade(value)))
        elif value is None:
            row.append(0.0)
        else:
            row.append(float(value))
    user_features.append(row)

user_x = torch.tensor(user_features, dtype=torch.float)
print(f"User feature matrix shape: {user_x.shape}")

User feature matrix shape: torch.Size([25826, 5])


In [142]:
# Build one numeric row per problem, in problem_ids order:
#   [grade index, rating, num_sends]
# A missing grade is encoded as -1.0 by encode_grade; a missing rating or
# send count becomes 0.0. Setter and holds are not included yet.
problem_features = []

for pid in problem_ids:
    info = problems[pid]
    row = [float(encode_grade(info['grade']))]
    for field in ('rating', 'num_sends'):
        value = info[field]
        row.append(0.0 if value is None else float(value))
    problem_features.append(row)

problem_x = torch.tensor(problem_features, dtype=torch.float)
print(f"Problem feature matrix shape: {problem_x.shape}")

Problem feature matrix shape: torch.Size([34572, 3])


### Graph edge index matrices with edge attributes

User-Problem edges

In [143]:
# Build the user -> problem edges: one edge per (user, logged problem) pair,
# with [grade index, rating, attempts] as edge attributes and the send date
# as a separate per-edge timestamp.
up_user_indices = []
up_problem_indices = []
up_edge_grades = []
up_edge_ratings = []
up_edge_dates = []
up_edge_attempts = []
# edge_comments = []

for uid, u in users.items():
    u_idx = user_id_to_idx[uid]
    for prob_name, interaction in u["problems"].items():
        if prob_name not in problem_id_to_idx:
            # Just in case we missed removing some problems
            continue

        p_idx = problem_id_to_idx[prob_name]

        up_user_indices.append(u_idx)
        up_problem_indices.append(p_idx)

        # Missing grade -> -1.0 (via encode_grade); missing rating/attempts -> 0.0
        up_edge_grades.append(float(encode_grade(interaction["grade"])))
        up_edge_ratings.append(
            float(interaction["rating"]) if interaction["rating"] is not None else 0.0
        )
        up_edge_attempts.append(
            float(interaction["attempts"])
            if interaction["attempts"] is not None
            else 0.0
        )

        # Unix timestamp of the send date, taken as midnight UTC.
        # time.mktime() would read the date in the machine's local timezone,
        # so the same data would give different timestamps on different
        # machines. Midnight-UTC values are multiples of 86400 (= 128 * 675),
        # which float32 represents exactly below 2**31; local-time offsets
        # generally would not be.
        sent_on = datetime.datetime.strptime(interaction["date"], "%Y-%m-%d")
        up_edge_dates.append(
            sent_on.replace(tzinfo=datetime.timezone.utc).timestamp()
        )
        # edge_comments.append(...)

up_edge_index = torch.tensor([up_user_indices, up_problem_indices], dtype=torch.long)
up_edge_attr = torch.tensor(
    list(zip(up_edge_grades, up_edge_ratings, up_edge_attempts)),  # shape: [num_edges, 3]
    dtype=torch.float,
)
up_edge_time = torch.tensor(up_edge_dates, dtype=torch.float)

print(f"Edge index shape: {up_edge_index.shape}")
print(f"Edge attribute shape: {up_edge_attr.shape}")
print(f"Edge time shape: {up_edge_time.shape}")

Edge index shape: torch.Size([2, 1627580])
Edge attribute shape: torch.Size([1627580, 3])
Edge time shape: torch.Size([1627580])


Problem-Hold edges

In [144]:
# Build the hold -> problem edges: one edge per (hold, problem, section)
# occurrence. Row 0 of hp_edge_index holds HOLD indices and row 1 holds
# PROBLEM indices. The edge attribute is a one-hot of the section
# [is_start, is_middle, is_end].
hp_hold_indices = []
hp_problem_indices = []
hp_is_start = []
hp_is_middle = []
hp_is_end = []

# The loop variables are deliberately NOT named `problems` / `type`: at
# notebook (global) scope those names would overwrite the `problems` dict
# loaded earlier and shadow the `type` builtin, breaking any re-run of the
# cells above.
for hold, hold_problems in holds.items():
    h_idx = hold_id_to_idx[hold]
    for section in ("start", "middle", "end"):
        for problem in hold_problems[section]:
            if problem not in problem_id_to_idx:
                # Same guard as for the user-problem edges: skip problems
                # that have hold data but no problem node.
                continue

            p_idx = problem_id_to_idx[problem]

            hp_hold_indices.append(h_idx)
            hp_problem_indices.append(p_idx)

            hp_is_start.append(int(section == "start"))
            hp_is_middle.append(int(section == "middle"))
            hp_is_end.append(int(section == "end"))

hp_edge_index = torch.tensor([hp_hold_indices, hp_problem_indices], dtype=torch.long)
hp_edge_attr = torch.tensor(list(zip(hp_is_start, hp_is_middle, hp_is_end)), dtype=torch.float)

print(f"Edge index shape: {hp_edge_index.shape}")
print(f"Edge attribute shape: {hp_edge_attr.shape}")

Edge index shape: torch.Size([2, 304852])
Edge attribute shape: torch.Size([304852, 3])


### Assembling the HeteroData object (heterogeneous graph)

- This graph is perfect for PinSAGE, which prefers heterogeneous graphs with node and edge features.
- For GFormer, we might need to make some adjustments.
- For LightGCN, we need a homogeneous graph.

In [145]:
# Assemble the heterogeneous graph. For every edge type (src, rel, dst),
# row 0 of edge_index must index `src` nodes and row 1 must index `dst` nodes.
hetero_data = HeteroData()

# Add the nodes and their features
hetero_data['user'].x = user_x                    # [num_users, user_feat_dim]
hetero_data['problem'].x = problem_x              # [num_problems, problem_feat_dim]
hetero_data['hold'].x = torch.eye(len(hold_ids))  # One-hot encoding for holds

# Add edges between users and problems (up_edge_index is [user, problem])
hetero_data['user', 'rates', 'problem'].edge_index = up_edge_index      # [2, num_edges]
hetero_data['user', 'rates', 'problem'].edge_attr = up_edge_attr        # [num_edges, edge_feat_dim]
hetero_data['user', 'rates', 'problem'].edge_time = up_edge_time        # [num_edges,]

# Add reverse edges so messages can also flow from problems to users.
# flip(0) swaps the rows only; the edge order is unchanged, so the same
# attribute and time tensors still line up edge-for-edge.
hetero_data['problem', 'rated_by', 'user'].edge_index = up_edge_index.flip(0)
hetero_data['problem', 'rated_by', 'user'].edge_attr = up_edge_attr
hetero_data['problem', 'rated_by', 'user'].edge_time = up_edge_time

# Add edges between problems and holds.
# hp_edge_index is built as [hold, problem], so it has to be flipped for
# the problem -> hold direction. Using it unflipped here would put problem
# indices in the hold slot and point past the end of the hold nodes.
hetero_data['problem', 'contains', 'hold'].edge_index = hp_edge_index.flip(0)
hetero_data['problem', 'contains', 'hold'].edge_attr = hp_edge_attr

# Add reverse edges (hold -> problem is the orientation hp_edge_index already has)
hetero_data['hold', 'contained_in', 'problem'].edge_index = hp_edge_index
hetero_data['hold', 'contained_in', 'problem'].edge_attr = hp_edge_attr

### Assembling the Data object (graph)
- This is for LightGCN, which only supports homogeneous graphs.

In [146]:
# Collapse the heterogeneous graph into a single homogeneous graph object.
# NOTE(review): the node types have different feature widths (user 5,
# problem 3, hold one-hot of len(hold_ids)) -- confirm how to_homogeneous()
# treats `x` in that case before relying on `data.x`. TODO confirm.
data = hetero_data.to_homogeneous()