In [1]:
import json
import random
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import torch
from torch import nn, optim, Tensor

from torch_sparse import SparseTensor, matmul

from torch_geometric.utils import structured_negative_sampling, negative_sampling
from torch_geometric.nn.conv.gcn_conv import gcn_norm
from torch_geometric.nn.conv import MessagePassing

In [3]:
# defines LightGCN model
class LightGCN(MessagePassing):
    """LightGCN model as proposed in https://arxiv.org/abs/2002.02126,
    extended with an optional side-feature branch for items (courses).

    When ``course_ada_emb`` is given, each item's layer-0 embedding is
    ``emb_transform(course_ada_emb) * weight + items_emb.weight``; otherwise it
    is the plain learnable ``items_emb.weight`` as in vanilla LightGCN.
    """

    def __init__(self, num_users, num_items, course_ada_emb=None, embedding_dim=64, K=3, add_self_loops=False):
        """Initializes LightGCN Model

        Args:
            num_users (int): Number of users
            num_items (int): Number of items
            course_ada_emb (Tensor, optional): Pre-computed item feature matrix whose
                last dimension is the feature size. It is concatenated after the user
                embeddings in ``forward``, so it presumably has ``num_items`` rows —
                verify against the caller. Defaults to None (no side features).
            embedding_dim (int, optional): Dimensionality of embeddings. Defaults to 64.
            K (int, optional): Number of message passing layers. Defaults to 3.
            add_self_loops (bool, optional): Whether to add self loops for message passing. Defaults to False.
        """
        super().__init__()
        self.num_users, self.num_items = num_users, num_items
        self.embedding_dim, self.K = embedding_dim, K
        self.add_self_loops = add_self_loops
        # The attribute name (including its spelling) is kept on purpose: this
        # notebook loads a whole pickled model, and unpickling restores instance
        # attributes by name.
        # NOTE: this is a plain attribute, not a registered buffer, so it does
        # not follow ``model.to(device)``.
        self.cours_ada_emb = course_ada_emb

        # pre_diffusion embedding will be used for regularization computing
        self.users_emb = nn.Embedding(num_embeddings=self.num_users, embedding_dim=self.embedding_dim) # e_u^0
        self.items_emb = nn.Embedding(num_embeddings=self.num_items, embedding_dim=self.embedding_dim) # e_i^0

        if course_ada_emb is not None:
            ada_emb_len = course_ada_emb.shape[-1]
            # projects the side features into the embedding space
            self.emb_transform = nn.Sequential(nn.Linear(in_features=ada_emb_len, out_features=embedding_dim),
                                               nn.GELU(),
                                               nn.LayerNorm(embedding_dim),
                                               nn.Linear(in_features=embedding_dim, out_features=embedding_dim),
                                               nn.GELU(),
                                               nn.LayerNorm(embedding_dim))
        else:
            # No side features: previously this path crashed on ``None.shape``.
            self.emb_transform = None
        # frozen identity matrix; not used by forward()
        self.score_mat = nn.Parameter(torch.eye(embedding_dim), requires_grad=False)

        # embedding after multi-scale diffusion
        # this will be used to give final recommendation/compute brp loss
        self.users_emb_final = None
        self.items_emb_final = None

        nn.init.normal_(self.users_emb.weight, std=0.1)
        nn.init.normal_(self.items_emb.weight, std=0.1)

    def forward(self, edge_index: SparseTensor, weight=1):
        """Forward propagation of LightGCN Model.

        Also caches the propagated embeddings on ``self.users_emb_final`` and
        ``self.items_emb_final``.

        Args:
            edge_index (SparseTensor): adjacency matrix
            weight (optional): Multiplier applied to the transformed side-feature
                embedding before it is added to ``items_emb.weight``. Ignored when
                the model has no side features. Defaults to 1.

        Returns:
            tuple (Tensor): e_u_k, e_u_0, e_i_k, e_i_0. Note that the returned
            e_i_0 is ``items_emb.weight`` alone, without the side-feature term.
        """
        # compute \tilde{A}: symmetrically normalized adjacency matrix
        edge_index_norm = gcn_norm(edge_index, add_self_loops=self.add_self_loops)

        course_emb = self.items_emb.weight
        if self.cours_ada_emb is not None:
            course_emb = self.emb_transform(self.cours_ada_emb) * weight + course_emb

        emb_0 = torch.cat([self.users_emb.weight, course_emb]) # E^0
        embs = [emb_0]
        emb_k = emb_0

        # multi-scale diffusion
        for _ in range(self.K):
            emb_k = self.propagate(edge_index_norm, x=emb_k)
            embs.append(emb_k)

        # final embedding is the mean over layers 0..K
        emb_final = torch.mean(torch.stack(embs, dim=1), dim=1) # E^K

        users_emb_final, items_emb_final = torch.split(emb_final, [self.num_users, self.num_items]) # splits into e_u^K and e_i^K

        self.users_emb_final = users_emb_final
        self.items_emb_final = items_emb_final

        # returns the embedding of both the original and after multiscale diffusion
        return users_emb_final, self.users_emb.weight, items_emb_final, self.items_emb.weight

    def message(self, x_j: Tensor) -> Tensor:
        """Passes neighbor features through unchanged."""
        return x_j

    def message_and_aggregate(self, adj_t: SparseTensor, x: Tensor) -> Tensor:
        """Fused message + aggregation step: a single sparse matmul."""
        # computes \tilde{A} @ x
        return matmul(adj_t, x)

In [4]:
# The checkpoint is a whole pickled model, not a state_dict, so it needs full
# unpickling: recent PyTorch releases default to weights_only=True, which
# rejects custom classes such as LightGCN.
# SECURITY: full unpickling can execute arbitrary code — only load checkpoints
# from a trusted source.
model: LightGCN = torch.load(
    'LightGCN/lightgcn_12_10.pt',
    # Keep the saved device when CUDA is available; otherwise fall back to CPU
    # so a GPU-saved checkpoint still loads on a CPU-only machine.
    map_location=None if torch.cuda.is_available() else 'cpu',
    weights_only=False,
)

## evaluation

### load data

In [7]:
# Load every student's planned courses: {student: {quarter: [course, ...]}}
with open('new_planned_courses.json') as f:
    planned_courses = json.load(f)

# Tally how often each course appears across all students. The 'None' quarter
# bucket and 'RESTRICTED' placeholder entries are left out of the tally.
course_counter = {}
for student, quarters in planned_courses.items():
    for quarter, courses in quarters.items():
        if quarter == 'None':
            continue
        for course in courses:
            if course != 'RESTRICTED':
                course_counter[course] = course_counter.get(course, 0) + 1

# Keep only courses that reach the minimum enrollment count
min_enrollments_thre = 4
course_count_df = pd.DataFrame(course_counter.items(), columns=['Course', 'Enrollments'])
course_count_df = course_count_df[course_count_df.Enrollments >= min_enrollments_thre]
selected_courses = course_count_df.Course.tolist()

In [8]:
userIds = {}
courseIds = {}
numEdges = 0
min_courses_thre = 4

# clean up data, create userIds and courseIds
num_course_each_student = []
Ids = []

# Set copy of the selected courses: membership is tested once per course
# occurrence, and a list lookup there makes the loop quadratic.
_selected_course_set = set(selected_courses)

# Iterate over a snapshot of the keys because sparse users are deleted in-loop.
for user in list(planned_courses.keys()):

    # Pt.1 count every course the user has listed (across all quarters,
    # before any filtering) and drop users with too few of them
    num = sum(len(quarter_courses) for quarter_courses in planned_courses[user].values())

    if num < min_courses_thre:
        del planned_courses[user]
        continue
    userIds[user] = len(userIds)

    # Pt.2 build the courseIds dict and count edges; only courses that are not
    # 'RESTRICTED' and passed the enrollment filter become graph edges
    for quarter in planned_courses[user]:
        for course in planned_courses[user][quarter]:
            if course == 'RESTRICTED' or course not in _selected_course_set:
                continue
            numEdges += 1
            if course not in courseIds:
                courseIds[course] = len(courseIds)
    Ids.append(len(userIds))
    num_course_each_student.append(num)

# Course node ids come after the user node ids: shift them by the user count
for item in courseIds:
    courseIds[item] += len(userIds)

print('# of real users:', len(userIds))
print('# of courses:', len(courseIds))
print('# of edges:', numEdges)

# of real users: 1895
# of courses: 1518
# of edges: 27086


### create edge index, get num users and courses

In [11]:
# create edge_index
SPLIT_SEED = 42       # fixed seed so the train/val split is reproducible across runs
TRAIN_FRACTION = 0.8  # share of each user's edges used for training / message passing

# Dedicated RNG instance: reproducible without touching the global random state.
_split_rng = random.Random(SPLIT_SEED)
# Set copy for O(1) membership tests inside the loop.
_selected_course_set = set(selected_courses)

all_edges = []
train_edge_index = []
val_edge_index_sup = []

for user in planned_courses:
    user_edge_index = []
    for quarter in planned_courses[user]:
        for course in planned_courses[user][quarter]:
            if course == 'RESTRICTED' or course not in _selected_course_set:
                continue
            user_edge_index.append((userIds[user], courseIds[course]))
    all_edges.extend(user_edge_index)

    # Per-user split: TRAIN_FRACTION of the user's edges (rounded down) go to
    # training, the remainder become validation supervision edges.
    user_num_courses = len(user_edge_index)
    user_train_indices = _split_rng.sample(range(user_num_courses), k=int(TRAIN_FRACTION * user_num_courses))
    _train_index_set = set(user_train_indices)
    user_val_indices = [i for i in range(user_num_courses) if i not in _train_index_set]
    train_edge_index += [user_edge_index[i] for i in user_train_indices]
    val_edge_index_sup += [user_edge_index[i] for i in user_val_indices]

# Guard against stale state: a mismatch means the id-building cell and this
# cell ran on different data (previously that left zero-padded (0, 0) edges).
j = len(all_edges)
assert j == numEdges, f"built {j} edges but numEdges is {numEdges}; re-run the cells above"

# reshape(-1, 2) keeps the (2, N) layout even when a list is empty
edge_index = torch.tensor(all_edges, dtype=torch.long).reshape(-1, 2).T.contiguous()
train_edge_index = torch.tensor(train_edge_index, dtype=torch.long).reshape(-1, 2).T
val_edge_index_sup = torch.tensor(val_edge_index_sup, dtype=torch.long).reshape(-1, 2).T
# validation message-passing edges are the training edges
val_edge_index_msg = train_edge_index

num_users, num_courses = len(userIds), len(courseIds)
numNodes = num_users + num_courses

### helper functions for evaluation

In [9]:
# helper function to get N_u
def get_user_positive_items(edge_index):
    """Groups edge targets by their source user.

    Args:
        edge_index (torch.Tensor): 2 by N list of edges; row 0 holds the users,
            row 1 the items.

    Returns:
        dict: maps each user to the list of its items, in edge order
            (repeated edges are kept).
    """
    sources = edge_index[0].tolist()
    targets = edge_index[1].tolist()

    positives = {}
    for src, dst in zip(sources, targets):
        positives.setdefault(src, []).append(dst)
    return positives

In [12]:
# Put the model in evaluation mode and build the lookup tables used by the
# helpers below: user node id -> item node ids, and item node id -> course name.
model.eval()
user_pos_items = get_user_positive_items(edge_index)
id_to_course = dict(zip(courseIds.values(), courseIds.keys()))

def make_predictions(user, num_recs, only_new=True):
    """Prints a user's known courses followed by the top recommended courses.

    Courses are ranked by the dot product between the user's and each item's
    cached final embedding (``model.users_emb_final`` / ``model.items_emb_final``).

    Args:
        user (int): User node id (a value of ``userIds``).
        num_recs (int): Number of not-yet-taken courses to recommend.
        only_new (bool, optional): If True, print only courses the user does not
            already have. If False, already-taken courses that rank above the
            recommendations are printed too, marked with ``(*)``. Defaults to True.

    Raises:
        RuntimeError: If the model has no cached final embeddings.
    """
    if model.users_emb_final is None or model.items_emb_final is None:
        raise RuntimeError(
            "model has no cached final embeddings; run model.forward(...) once before predicting"
        )

    # A user without any edges simply has no known courses.
    taken = user_pos_items.get(user, [])
    taken_set = set(taken)  # O(1) membership instead of repeated list scans

    with torch.no_grad():  # inference only, no autograd graph needed
        scores = model.items_emb_final @ model.users_emb_final[user]
        # Enough candidates to still yield num_recs new courses after skipping
        # the taken ones, but never more than the number of items.
        k = min(len(taken) + num_recs, scores.shape[0])
        _, top_items = torch.topk(scores, k=k)
    # Item indices are relative to the item block; node ids come after the users.
    ranked_nodes = [item + num_users for item in top_items.tolist()]

    print(f"Here are classes user {user} has already taken:")
    for node in taken:
        print(id_to_course[node])
    print()

    print(f'Here are the top recommended courses{" (* means already taken)" if not only_new else ""}:')
    not_yet_taken = 0
    for node in ranked_nodes:
        if not_yet_taken >= num_recs:
            break
        already_taken = node in taken_set
        if not only_new or not already_taken:
            print(f"{id_to_course[node]} {'(*)' if already_taken else ''}")
        if not already_taken:
            not_yet_taken += 1

### find user

In [41]:
def find_user_with_classes(classes):
    """Prints every user whose known courses include all of ``classes``.

    Args:
        classes (iterable of str): Course names that must all be present.

    For each matching user, prints the user id and the full set of that
    user's course names.
    """
    wanted = set(classes)
    for user, item_ids in user_pos_items.items():
        user_classes = {id_to_course[item_id] for item_id in item_ids}
        if wanted <= user_classes:
            print(user, user_classes)

In [49]:
# Print every user whose course set contains all four of these courses.
find_user_with_classes(["CS109", "ME30", "BIO81", "CS11SI"])

480 {'CS109', 'ME108', 'CS103', 'JAPAN82N', 'PHYSWELL103', 'ME248', 'EMED124', 'PHYSICS41', 'CME100', 'CS106B', 'COLLEGE101', 'CS12SI', 'PWR1RB', 'CS129', 'STATS216', 'CME102', 'ME298', 'STATS203', 'DATASCI112', 'EMED127', 'CS107', 'PSYCH15N', 'ME30', 'ENGR14', 'BIO81', 'CS111', 'CS106A', 'CS131', 'STATS200', 'ME1', 'COLLEGE102', 'CS11SI', 'CS124'}


### final cell: make predictions

In [50]:
# User node id to recommend for; the trailing ids are presumably other users of interest.
USER_ID = 480 #115, 675, 208
# Number of not-yet-taken courses to recommend
NUM_RECS = 15

# Prints the user's known courses, then the top NUM_RECS new courses.
make_predictions(USER_ID, NUM_RECS)

Here are classes user 480 has already taken:
CME100
COLLEGE101
CS106A
ME1
COLLEGE102
CS106B
ME30
PHYSICS41
CS107
ENGR14
JAPAN82N
PWR1RB
BIO81
CS109
CS111
CS11SI
CME102
CS103
CS124
CS129
CS12SI
CS131
DATASCI112
EMED124
EMED127
ME298
PHYSWELL103
PSYCH15N
STATS200
STATS203
STATS216
ME108
ME248

Here are the top recommended courses:
CS107E 
PHYSICS43 
DESIGN172 
UAR101J 
MATH51 
COLLEGE112 
MATH104 
CS161 
CME192 
CLASSICS84 
FRENLANG1 
ECON1 
MATH53 
ME210 
PHYSICS41E 
