In [4]:
import json
import time
import datetime
import torch
from torch_geometric.data import HeteroData, Data

In [5]:
# Load the raw user and problem dictionaries.
# The encoding is pinned to UTF-8 (the standard JSON encoding) so that reading
# does not depend on the platform's default locale encoding.
with open("../data/all_users.json", "r", encoding="utf-8") as f:
    users = json.load(f)

with open("../data/all_problems.json", "r", encoding="utf-8") as f:
    problems = json.load(f)

Clean the data
- Remove any users without any solved problems
- Remove any problems that have not been solved by any user
- Save the cleaned data back to JSON files

In [6]:
# TODO: Here we will also need to remove any problems
# for which we couldn't get the hold information

# NOTE: `users` and `problems` are modified in place, so re-running this cell
# without re-loading the JSON files reports 0 for both counts.
# NOTE(review): the notes for this step mention saving the cleaned data back to
# JSON files, but nothing is written to disk in this cell.

# Pass 1: find users without solved problems and collect the id of every
# problem that at least one user has solved.
empty_users = []
problems_solved = set()
for key, user in users.items():
    solved = user.get("problems")
    if not solved:
        # Missing key, None, or an empty mapping all count as "nothing solved".
        empty_users.append(key)
    else:
        problems_solved.update(solved.keys())
print(f"Number of empty users: {len(empty_users)}")


# Pass 2: problems that do not appear in any user's solved problems.
empty_problems = [key for key in problems if key not in problems_solved]
print(f"Number of unsolved problems: {len(empty_problems)}")

# The ids were collected first because entries cannot be deleted from a dict
# while iterating over it.
for user_id in empty_users:
    del users[user_id]
for problem_id in empty_problems:
    del problems[problem_id]

Number of empty users: 28
Number of unsolved problems: 4


In [7]:
# Fix an ordering of the remaining users and problems; the position of an id
# in these lists is the integer index used for that node.
user_ids = [uid for uid in users]
problem_ids = [pid for pid in problems]

# PyG tensors need integer ids, so map every string id to its list position.
user_id_to_idx = dict(zip(user_ids, range(len(user_ids))))
problem_id_to_idx = dict(zip(problem_ids, range(len(problem_ids))))

print(f"Num of user_ids: {len(user_ids)}\nNum of problems: {len(problem_ids)}")

Num of user_ids: 25837
Num of problems: 38659


### Feature "Engineering" (Converting non-numeric to numeric)

### Grades

In [8]:
# Build the grade vocabulary from the three places a grade can appear:
# a user's highest grade, the grade a user logged for a problem, and the
# grade stored on the problem itself.
user_top_grades = {u['highest_grade'] for u in users.values()}
logged_grades = {
    pinfo['grade']
    for u in users.values()
    for pinfo in u['problems'].values()
}
problem_grades = {p['grade'] for p in problems.values()}

# Missing grades (None) are not part of the vocabulary.
# sorted() orders the grade strings lexicographically.
# NOTE(review): that matches difficulty order for the grades shown in the
# output ('5+' < '6A' < '6A+' < '6B' ...); confirm it holds for every grade
# string in the data.
grades = sorted((user_top_grades | logged_grades | problem_grades) - {None})
grade_to_idx = dict(zip(grades, range(len(grades))))

print(f"Grade to index mapping: {list(grade_to_idx.items())[:5]}...")

def encode_grade(g):
    """Return the integer index of grade ``g`` according to ``grade_to_idx``.

    A missing grade (``None``) is encoded as ``-1``. Any other value that is
    not a key of ``grade_to_idx`` raises ``KeyError``.
    """
    return -1 if g is None else grade_to_idx[g]

Grade to index mapping: [('5+', 0), ('6A', 1), ('6A+', 2), ('6B', 3), ('6B+', 4)]...


TODO: User Bio\
I see two options here:
- Use simple sentiment analysis to convert comments to numeric values between -1 and 1 (simple, low effort, low memory, low expressiveness)
- Ignore comments for now (easy, but loses information)

Computing text embeddings would be too memory-intensive for possibly not much gain.

TODO: User problem comment
- Same as for User Bio

TODO: Problem Setter
- Hashing?

TODO: Problem holds ("Any marked holds" / "Footless" / "Screw ons only" / ...)
- One-hot encoding or hashing?

### Node feature matrices

In [9]:
# One feature row per user, in `user_ids` order, so row i describes user_ids[i].
# Columns: ranking, highest-grade index, height, weight, problems sent.
# Missing numeric values become 0.0; a missing highest grade becomes -1.0
# (see encode_grade).
user_features = []

for uid in user_ids:
    profile = users[uid]
    row = [
        0.0 if profile['ranking'] is None else float(profile['ranking']),
        float(encode_grade(profile['highest_grade'])),
        0.0 if profile['height'] is None else float(profile['height']),
        0.0 if profile['weight'] is None else float(profile['weight']),
        0.0 if profile['problems_sent'] is None else float(profile['problems_sent']),
        # bio is not encoded yet
    ]
    user_features.append(row)

user_x = torch.tensor(user_features, dtype=torch.float)
print(f"User feature matrix shape: {user_x.shape}")

User feature matrix shape: torch.Size([25837, 5])


In [10]:
# One feature row per problem, in `problem_ids` order, so row i describes
# problem_ids[i]. Columns: grade index, rating, number of sends.
# Missing rating / num_sends become 0.0; a missing grade becomes -1.0
# (see encode_grade).
problem_features = []

for pid in problem_ids:
    entry = problems[pid]
    row = [
        float(encode_grade(entry['grade'])),
        0.0 if entry['rating'] is None else float(entry['rating']),
        0.0 if entry['num_sends'] is None else float(entry['num_sends']),
        # setter and holds are not encoded yet
    ]
    problem_features.append(row)

problem_x = torch.tensor(problem_features, dtype=torch.float)
print(f"Problem feature matrix shape: {problem_x.shape}")

Problem feature matrix shape: torch.Size([38659, 3])


In [11]:
# Parallel per-edge lists: entry k of every list describes the k-th
# (user, problem) interaction.
user_indices = []
problem_indices = []
edge_grades = []
edge_ratings = []
edge_dates = []
edge_attempts = []
# edge_comments = []

for uid, u in users.items():
    u_idx = user_id_to_idx[uid]
    for prob_name, interaction in u["problems"].items():
        if prob_name not in problem_id_to_idx:
            # The user logged a problem that has no entry in `problems`
            # (or we missed removing it), so there is no node to connect to.
            continue

        p_idx = problem_id_to_idx[prob_name]

        user_indices.append(u_idx)
        problem_indices.append(p_idx)

        # Missing grade -> -1.0 (encode_grade); missing rating/attempts -> 0.0.
        edge_grades.append(float(encode_grade(interaction["grade"])))
        edge_ratings.append(
            float(interaction["rating"]) if interaction["rating"] is not None else 0.0
        )
        edge_attempts.append(
            float(interaction["attempts"])
            if interaction["attempts"] is not None
            else 0.0
        )

        # Unix timestamp of the interaction date at 00:00 UTC.
        # The date is anchored to UTC explicitly: converting a naive datetime
        # with time.mktime() uses the machine's local timezone, which makes the
        # resulting edge times differ from one machine to another.
        interaction_day = datetime.datetime.strptime(interaction["date"], "%Y-%m-%d")
        edge_dates.append(
            interaction_day.replace(tzinfo=datetime.timezone.utc).timestamp()
        )
        # edge_comments.append(...)

edge_index = torch.tensor([user_indices, problem_indices], dtype=torch.long)
edge_attr = torch.tensor(
    list(zip(edge_grades, edge_ratings, edge_attempts)),  # shape: [num_edges, 3]
    dtype=torch.float,
)
# UTC-midnight timestamps are multiples of 86400 = 675 * 2**7, which float32
# stores without rounding for dates before 2038.
edge_time = torch.tensor(edge_dates, dtype=torch.float)

print(f"Edge index shape: {edge_index.shape}")
print(f"Edge attribute shape: {edge_attr.shape}")
print(f"Edge time shape: {edge_time.shape}")

Edge index shape: torch.Size([2, 1668865])
Edge attribute shape: torch.Size([1668865, 3])
Edge time shape: torch.Size([1668865])


### Assembling the HeteroData object (heterogeneous graph)

- This graph is perfect for PinSAGE, which prefers heterogeneous graphs with node and edge features.
- For GFormer, we might need to make some adjustments.
- For LightGCN, we need a homogeneous graph.

In [None]:
hetero_data = HeteroData()

# Node feature matrices, one per node type.
hetero_data['user'].x = user_x                 # [num_users, user_feat_dim]
hetero_data['problem'].x = problem_x           # [num_problems, problem_feat_dim]

# Forward relation: user -> problem.
rates = hetero_data['user', 'rates', 'problem']
rates.edge_index = edge_index                  # [2, num_edges]
rates.edge_attr = edge_attr                    # [num_edges, edge_feat_dim]
rates.edge_time = edge_time                    # [num_edges,]

# Reverse relation: problem -> user (apparently good for GNN message passing).
# Flipping the two rows of edge_index swaps source and target; edge order is
# unchanged, so the same attribute and time tensors are attached as-is.
rev_edge_index = edge_index.flip(0)
rev_rates = hetero_data['problem', 'rev_rates', 'user']
rev_rates.edge_index = rev_edge_index
rev_rates.edge_attr = edge_attr
rev_rates.edge_time = edge_time


### Assembling the Data object (graph)
- This is for LightGCN, which only supports homogeneous graphs.

In [13]:
# Collapse the heterogeneous graph into a single homogeneous graph object.
# NOTE(review): user_x has 5 columns and problem_x has 3 (see the shapes
# printed above), so the two feature matrices cannot simply be stacked.
# Verify what to_homogeneous() does with `x` in that case (presumably it is
# left out of the result) before relying on `data.x`.
data = hetero_data.to_homogeneous()