In [None]:
import sys, os
sys.path.append(os.path.abspath(".."))

from utils.graph_creation import create_hetero_graph
from utils.training_utils import train_val_test_split

Prepare data.

In [None]:
# Relation whose edges are partitioned into message / train / val / test sets.
edge_type = ("user", "rates", "problem")

# Build the heterogeneous graph with holds as nodes and without standardization.
hetero_data = create_hetero_graph(holds_as_nodes=True, standardize=False)

# Split settings gathered in one place so they are easy to tune.
# NOTE(review): the fractions sum to 0.9; presumably the remaining 0.1 of the
# edges becomes the test set -- confirm against train_val_test_split.
split_options = dict(
    edge_type=edge_type,
    message_p=0.7,
    train_p=0.1,
    val_p=0.1,
    by_user=True,
    standardize=False,
)
message_data, train_data, val_data, test_data = train_val_test_split(
    hetero_data, **split_options
)

In [4]:
import numpy as np
# Node feature matrices read repeatedly below.
user_x = val_data["user"].x
problem_x = val_data["problem"].x
l = len(problem_x)  # number of problem nodes
num_users = len(user_x)

# Per-user boolean mask over problems: True where the user already has an edge
# in the message, train or validation split.
train_problems = {u: np.zeros(l, dtype=bool) for u in range(num_users)}
for data in (message_data, train_data, val_data):
    for u, p in data[edge_type].edge_index.t().tolist():
        train_problems[u][p] = True

# Column 0 of the user features, used downstream as the user's max climbed
# grade -- presumably unstandardized; verify against create_hetero_graph.
max_grade = {u: user_x[u, 0].item() for u in range(num_users)}

# Held-out positives per user, taken from the test split.
test_problems = {}
for u, p in test_data[edge_type].edge_index.t().tolist():
    test_problems.setdefault(u, []).append(p)

# One row per problem: [problem index, feature column 0, feature column 2].
# Later cells treat column 1 as the grade and column 2 as the popularity.
problem_grade_popularity = np.zeros((l, 3))
for p in range(l):
    problem_grade_popularity[p, :] = [
        p,
        problem_x[p, 0].item(),
        problem_x[p, 2].item(),
    ]

In [13]:
def recall_at_k(topk, test_problems, k=20):
    """Compute per-user recall@k.

    Args:
        topk: mapping from user id to a ranked sequence of recommended item
            ids, best first. Only the first ``k`` entries are considered.
        test_problems: mapping from user id to the list of held-out positive
            item ids for that user.
        k: cutoff applied to each user's recommendation list.

    Returns:
        A list with one recall value per user in ``test_problems`` that has at
        least one positive item, in the iteration order of ``test_problems``.
        Users without positives are skipped because their recall is undefined;
        users absent from ``topk`` contribute a recall of 0.
    """
    recalls = []
    for u, pos_items in test_problems.items():
        if len(pos_items) == 0:
            # Recall is undefined without positives; skip instead of dividing by zero.
            continue
        # Enforce the cutoff here so the metric is recall@k even when the
        # caller hands over a longer ranking.
        recommended = topk.get(u, ())[:k]
        hit_count = len(set(pos_items) & set(recommended))
        recalls.append(hit_count / len(pos_items))

    return recalls

def mean_se(arr):
    """Return ``(mean, standard error)`` of the values in ``arr``.

    The standard error is the population standard deviation (NumPy's default,
    ``ddof=0``) divided by the square root of the number of values.
    """
    values = np.asarray(arr)
    count = len(arr)
    center = np.mean(values)
    spread = np.std(values)
    return center, spread / np.sqrt(count)

### Regardless of grade

In [16]:
# Popularity baseline: recommend every user the k most popular problems they
# have not interacted with yet, regardless of grade.
k = 20
topk = {}
for u, ps in train_problems.items():
    # `ps` is the user's boolean mask of already-seen problems; keep the rest.
    arr = problem_grade_popularity[~ps]
    # Sort the remaining rows by column 2 (popularity), highest first.
    arr = arr[arr[:, 2].argsort()[::-1]]
    # Column 0 holds the problem index; store plain ints rather than numpy floats.
    topk[u] = arr[:k, 0].astype(int).tolist()

# Use the same cutoff for the ranking, the metric and the report.
recall = recall_at_k(topk, test_problems, k=k)
mean, se = mean_se(recall)
print(f"Popularity baseline Recall@{k}: {mean:.4} +- {se:.4}")

Popularity baseline Recall@20: 0.344 +- 0.002881


### Grades from n grades below to m grades above max climbed grade

In [17]:
# Grade-aware popularity baseline: for each user, recommend the k most popular
# unseen problems whose grade lies within [max_grade - n, max_grade + m], with
# the lower bound clipped at 0. Sweep n (grades below) and m (grades above).
k = 20
grades = problem_grade_popularity[:, 1]  # loop-invariant grade column
for n in [5, 6, 7, 8]:
    for m in [0, 1, 2, 3, 4]:
        topk = {}
        for u, ps in train_problems.items():
            lowest = max(max_grade[u] - n, 0)
            highest = max_grade[u] + m
            # Unseen problems inside the user's grade window.
            eligible = (~ps) & (grades >= lowest) & (grades <= highest)
            arr = problem_grade_popularity[eligible]
            # Sort by column 2 (popularity), highest first.
            arr = arr[arr[:, 2].argsort()[::-1]]
            # Column 0 holds the problem index; store plain ints.
            topk[u] = arr[:k, 0].astype(int).tolist()
        # Use the same cutoff for the ranking, the metric and the report.
        recall = recall_at_k(topk, test_problems, k=k)
        mean, se = mean_se(recall)
        print(f"Recall@{k} for {n} grades below, {m} grades above max climbed: {mean:.4} +- {se:.4}")

Recall@20 for 5 grades below, 0 grades above max climbed: 0.3207 +- 0.002708
Recall@20 for 5 grades below, 1 grades above max climbed: 0.3132 +- 0.002671
Recall@20 for 5 grades below, 2 grades above max climbed: 0.3073 +- 0.002646
Recall@20 for 5 grades below, 3 grades above max climbed: 0.3038 +- 0.002634
Recall@20 for 5 grades below, 4 grades above max climbed: 0.3034 +- 0.002632
Recall@20 for 6 grades below, 0 grades above max climbed: 0.3212 +- 0.002724
Recall@20 for 6 grades below, 1 grades above max climbed: 0.3137 +- 0.002687
Recall@20 for 6 grades below, 2 grades above max climbed: 0.3078 +- 0.002662
Recall@20 for 6 grades below, 3 grades above max climbed: 0.3043 +- 0.00265
Recall@20 for 6 grades below, 4 grades above max climbed: 0.3039 +- 0.002649
Recall@20 for 7 grades below, 0 grades above max climbed: 0.3191 +- 0.002733
Recall@20 for 7 grades below, 1 grades above max climbed: 0.3117 +- 0.002696
Recall@20 for 7 grades below, 2 grades above max climbed: 0.3057 +- 0.002671
