In [685]:
import numpy as np
import torch
import pandas as pd
import torch.nn.functional as F
from torch.nn import Linear
import torch_geometric.transforms as T
from torch_geometric.nn import SAGEConv, to_hetero, GCNConv
from torch_geometric.transforms import RandomLinkSplit, ToUndirected
from torch_geometric.data import HeteroData

data_folder = "../data/interm/"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = "cpu"

In [686]:
users = pd.read_csv(data_folder + "users.csv")
items = pd.read_csv(data_folder + "items.csv")
ratings = pd.read_csv(data_folder + "ratings.csv")
genres = pd.read_csv("../data/raw/ml-100k/u.genre", delimiter="|", names=["name","index"])

Edges: ratings
Nodes: users, items
Graph type: bipartite

In [687]:
def create_torch_edges(ratings):
    src = ratings["user_id"] - 1
    dst = ratings["item_id"] - 1
    attrs = ratings["rating"]
    
    edge_index = torch.tensor([src, dst], dtype=torch.int64)
    edge_attr = torch.tensor(attrs)
    
    return edge_index, edge_attr

In [688]:
edge_index, edge_attr = create_torch_edges(ratings)

In [689]:
edge_index.shape

torch.Size([2, 100000])

In [690]:
items

Unnamed: 0,movie_id,movie_title,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,release_year
0,1,Toy Story (1995),0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,1995.0
1,2,GoldenEye (1995),0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1995.0
2,3,Four Rooms (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1995.0
3,4,Get Shorty (1995),0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1995.0
4,5,Copycat (1995),0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,1995.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1998.0
1678,1679,B. Monkey (1998),0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,1998.0
1679,1680,Sliding Doors (1998),0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1998.0
1680,1681,You So Crazy (1994),0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1994.0


In [691]:
from sentence_transformers import SentenceTransformer

def SequenceEncoder(movie_titles , model_name=None):
    model = SentenceTransformer(model_name, device=device)
    title_embeddings = model.encode(movie_titles, show_progress_bar=True,
                              convert_to_tensor=True, device=device)
    
    return title_embeddings.to("cpu")

item_title = SequenceEncoder(items["movie_title"], model_name='all-MiniLM-L6-v2')
item_genres = torch.tensor(items[genres.name].to_numpy(), dtype=torch.bool)
item_release_year = torch.tensor(items["release_year"].to_numpy()[:,np.newaxis], dtype=torch.int32)

item_x = torch.cat((item_title, item_genres), dim=-1).float()

Batches:   0%|          | 0/53 [00:00<?, ?it/s]

In [692]:
users

Unnamed: 0,user_id,age,zip_code,male,female,occupation_technician,occupation_other,occupation_writer,occupation_executive,occupation_administrator,...,occupation_librarian,occupation_homemaker,occupation_artist,occupation_engineer,occupation_marketing,occupation_none,occupation_healthcare,occupation_retired,occupation_salesman,occupation_doctor
0,1,24,85711,True,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,2,53,94043,False,True,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,3,23,32067,True,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
3,4,24,43537,True,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,5,33,15213,False,True,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
938,939,26,33319,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
939,940,32,02215,True,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
940,941,20,97229,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
941,942,48,78209,False,True,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False


In [693]:
users.zip_code.to_numpy()

array(['85711', '94043', '32067', '43537', '15213', '98101', '91344',
       '05201', '01002', '90703', '30329', '06405', '29206', '55106',
       '97301', '10309', '06355', '37212', '02138', '95660', '30068',
       '40206', '48197', '94533', '55107', '21044', '30030', '55369',
       '94043', '55436', '10003', '78741', '27510', '42141', '42459',
       '93117', '55105', '54467', '01040', '27514', '80525', '17870',
       '20854', '46260', '50233', '46538', '07102', '12550', '76111',
       '52245', '16509', '55105', '55414', '66315', '01331', '46260',
       '84010', '52246', '08403', '06472', '30040', '97214', '75240',
       '43202', '48118', '80521', '60402', '22904', '55337', '60067',
       '98034', '73034', '41850', 'T8H1N', '08816', '02215', '29379',
       '61801', '03755', '52241', '21218', '22902', '44133', '55369',
       '20003', '46005', '89503', '11701', '68106', '78155', '01913',
       '80525', '23112', '71457', '10707', '75206', '98006', '90291',
       '63129', '902

In [694]:
user_ages = torch.tensor(users["age"].to_numpy()[:,np.newaxis], dtype=torch.uint8)
user_sex = torch.tensor(users[["male", "female"]].to_numpy(), dtype=torch.bool)
occupations = [i for i in users.keys() if i.startswith("occupation_")]
user_occupation = torch.tensor(users[occupations].to_numpy(), dtype=torch.bool)
user_x = torch.cat((user_ages, user_sex, user_occupation), dim=-1).float()

In [695]:
data = HeteroData()

# data['user'].num_nodes = len(users)
# data['user'].x = torch.eye(data['user'].num_nodes, device=device)
# del data['user'].num_nodes
data['user'].x = user_x
data['item'].x = item_x
data['user', 'rates', 'item'].edge_index = edge_index
data['user', 'rates', 'item'].edge_label = edge_attr

In [696]:
data = ToUndirected()(data)
del data['item', 'rev_rates', 'user'].edge_label  # Remove "reverse" label.
data = data.to(device)

# Perform a link-level split into training, validation, and test edges.
train_data, val_data, test_data = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    neg_sampling_ratio=0.0,
    edge_types=[('user', 'rates', 'item')],
    rev_edge_types=[('item', 'rev_rates', 'user')],
)(data)

In [697]:
weight = torch.bincount(train_data['user', 'rates', 'item'].edge_label)
weight = weight.max() / weight
def weighted_mse_loss(pred, target, weight=None):
    weight = 1. if weight is None else weight[target].to(pred.dtype)
    return (weight * (pred - target.to(pred.dtype)).pow(2)).mean()

In [698]:
from torch.nn import Dropout
from torch_geometric.nn import GATv2Conv, RGCNConv, HeteroConv, GINConv
from torch_geometric.utils.dropout import *


class GNNEncoder(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        # these convolutions have been replicated to match the number of edge types\
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), hidden_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x
class EdgeDecoder(torch.nn.Module):
    def __init__(self, n_factors, hidden_channels):
        super().__init__()
        self.lin1 = Linear(2 * n_factors, hidden_channels)
        self.dropout1 = Dropout(p=0.5)
        self.lin2 = Linear(hidden_channels, hidden_channels)
        self.dropout2 = Dropout(p=0.5)
        self.lin3 = Linear(hidden_channels, hidden_channels)
        self.dropout3 = Dropout(p=0.25)
        self.lin4 = Linear(hidden_channels, 1)
        
    def forward(self, z_dict, edge_label_index):
        row, col = edge_label_index
        # concat user and movie embeddings
        z = torch.cat([z_dict['user'][row], z_dict['item'][col]], dim=-1)
        # concatenated embeddings passed to linear layer
        z = self.lin1(z).relu()
        z = self.dropout1(z)
        z = self.lin2(z).relu()
        z = self.dropout2(z)
        z = self.lin3(z).relu()
        z = self.dropout3(z)
        z = self.lin4(z)
        return z.view(-1)
class Model(torch.nn.Module):
    def __init__(self, n_factors, hidden_channels):
        super().__init__()
        self.encoder = GNNEncoder(n_factors)
        self.encoder = to_hetero(self.encoder, data.metadata(), aggr='sum')
        self.decoder = EdgeDecoder(n_factors, hidden_channels)

    def forward(self, x_dict, edge_index_dict, edge_label_index):
        # z_dict contains dictionary of movie and user embeddings returned from GraphSage
        edge_label_index, mask = dropout_edge(edge_label_index, p=0.02, training=self.training)
        z_dict = self.encoder(x_dict, edge_index_dict)
        output = self.decoder(z_dict, edge_label_index)
        output = torch.sigmoid(output)
        output = output * 4 + 1
        return output, mask
model = Model(n_factors=150, hidden_channels=200).to(device)
# Due to lazy initialization, we need to run one model step so the number
# of parameters can be inferred:
with torch.no_grad():
    model.encoder(train_data.x_dict, train_data.edge_index_dict)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [699]:
print(model)

Model(
  (encoder): GraphModule(
    (conv1): ModuleDict(
      (user__rates__item): SAGEConv((-1, -1), 150, aggr=mean)
      (item__rev_rates__user): SAGEConv((-1, -1), 150, aggr=mean)
    )
    (conv2): ModuleDict(
      (user__rates__item): SAGEConv((-1, -1), 150, aggr=mean)
      (item__rev_rates__user): SAGEConv((-1, -1), 150, aggr=mean)
    )
  )
  (decoder): EdgeDecoder(
    (lin1): Linear(in_features=300, out_features=200, bias=True)
    (dropout1): Dropout(p=0.5, inplace=False)
    (lin2): Linear(in_features=200, out_features=200, bias=True)
    (dropout2): Dropout(p=0.5, inplace=False)
    (lin3): Linear(in_features=200, out_features=200, bias=True)
    (dropout3): Dropout(p=0.25, inplace=False)
    (lin4): Linear(in_features=200, out_features=1, bias=True)
  )
)


In [700]:
from torch.nn.functional import mse_loss
from torch.nn import MSELoss

loss_f = MSELoss()

def train():
    model.train()
    optimizer.zero_grad()
    pred, mask = model(train_data.x_dict, train_data.edge_index_dict,
                 train_data['user', 'rates', 'item'].edge_label_index)
    target = train_data['user', 'rates', 'item'].edge_label
    loss = weighted_mse_loss(pred, target[mask], weight)
    loss.backward()
    optimizer.step()
    return float(loss)

In [701]:
@torch.no_grad()
def test(data):
    model.eval()
    pred, _ = model(data.x_dict, data.edge_index_dict,
                 data['user', 'rates', 'item'].edge_label_index)
    target = data['user', 'rates', 'item'].edge_label.float()
    rmse = F.mse_loss(pred, target).sqrt()
    return float(rmse)

In [709]:
for epoch in range(1, 100):
    loss = train()
    train_rmse = test(train_data)
    val_rmse = test(val_data)
    test_rmse = test(test_data)
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {train_rmse:.4f}, '
          f'Val: {val_rmse:.4f}, Test: {test_rmse:.4f}')

Epoch: 001, Loss: 3.3564, Train: 1.2022, Val: 1.2068, Test: 1.2006
Epoch: 002, Loss: 3.3476, Train: 1.2554, Val: 1.2599, Test: 1.2529
Epoch: 003, Loss: 3.3454, Train: 1.2023, Val: 1.2067, Test: 1.2007
Epoch: 004, Loss: 3.3234, Train: 1.2388, Val: 1.2429, Test: 1.2364
Epoch: 005, Loss: 3.3044, Train: 1.1974, Val: 1.2015, Test: 1.1958
Epoch: 006, Loss: 3.2884, Train: 1.2843, Val: 1.2887, Test: 1.2817
Epoch: 007, Loss: 3.3063, Train: 1.1239, Val: 1.1287, Test: 1.1247
Epoch: 008, Loss: 3.4592, Train: 1.1989, Val: 1.2038, Test: 1.1985
Epoch: 009, Loss: 3.2758, Train: 1.3758, Val: 1.3815, Test: 1.3748
Epoch: 010, Loss: 3.4464, Train: 1.2596, Val: 1.2654, Test: 1.2597
Epoch: 011, Loss: 3.3096, Train: 1.2118, Val: 1.2174, Test: 1.2125
Epoch: 012, Loss: 3.3261, Train: 1.2003, Val: 1.2058, Test: 1.2009
Epoch: 013, Loss: 3.3287, Train: 1.2066, Val: 1.2121, Test: 1.2070
Epoch: 014, Loss: 3.3177, Train: 1.2205, Val: 1.2255, Test: 1.2204
Epoch: 015, Loss: 3.3214, Train: 1.2116, Val: 1.2162, Test: 1.

In [710]:
print(len(items))
print(items["movie_id"].max())

1682
1682


In [711]:
from tqdm import tqdm

model.eval()
total_users = len(users)
total_movies = len(items)
movie_recs = []
for user_id in tqdm(range(0, 5)):
    all_movie_ids = torch.arange(total_movies)
    seen_movie_ids = ratings[ratings["user_id"] == user_id + 1]["item_id"].unique()
    seen_movie_ids = np.array(seen_movie_ids)
    check_movies = []
    for i in all_movie_ids:
        if not np.any(seen_movie_ids == i):
            check_movies.append(i)
    check_movies = torch.tensor(check_movies)
    user_row = torch.tensor([user_id] * check_movies.shape[0])
    edge_label_index = torch.stack([user_row, check_movies], dim=0)
    pred, _ = model(data.x_dict, data.edge_index_dict,
             edge_label_index)
    print("Max-min diff:", torch.max(pred) - torch.min(pred))
    # we will only select movies for the user where the predicting rating is =5
    rec_movie_ids = torch.argsort(pred, descending=True)[:10]
    top_ten_recs = [(rec_movies + 1, pred[rec_movies].item()) for rec_movies in rec_movie_ids.tolist()] 
    movie_recs.append({'user': user_id + 1, 'rec_movies': top_ten_recs})

  0%|          | 0/5 [00:00<?, ?it/s]

Max-min diff:

 20%|██        | 1/5 [00:00<00:00,  5.62it/s]

 tensor(2.9038, device='cuda:0', grad_fn=<SubBackward0>)
Max-min diff: tensor(2.8942, device='cuda:0', grad_fn=<SubBackward0>)


100%|██████████| 5/5 [00:00<00:00,  8.37it/s]

Max-min diff: tensor(2.9220, device='cuda:0', grad_fn=<SubBackward0>)
Max-min diff: tensor(2.8833, device='cuda:0', grad_fn=<SubBackward0>)
Max-min diff: tensor(2.8587, device='cuda:0', grad_fn=<SubBackward0>)





In [712]:
movie_recs

[{'user': 1,
  'rec_movies': [(1453, 4.2366132736206055),
   (1455, 4.215634822845459),
   (1201, 4.142112731933594),
   (1447, 4.1298723220825195),
   (1452, 4.123170852661133),
   (1461, 4.070978164672852),
   (1515, 4.070932388305664),
   (822, 4.04030704498291),
   (1460, 4.039809226989746),
   (1458, 4.036356449127197)]},
 {'user': 2,
  'rec_movies': [(1453, 4.247032165527344),
   (1455, 4.228955268859863),
   (1201, 4.156125068664551),
   (1447, 4.144463539123535),
   (1452, 4.137075424194336),
   (1515, 4.085538387298584),
   (1461, 4.085407257080078),
   (1460, 4.055061340332031),
   (822, 4.052642822265625),
   (1458, 4.050773620605469)]},
 {'user': 3,
  'rec_movies': [(1453, 4.190402030944824),
   (1455, 4.171685218811035),
   (1201, 4.095126152038574),
   (1447, 4.082378387451172),
   (1452, 4.075459003448486),
   (1461, 4.02131462097168),
   (1515, 4.021267890930176),
   (822, 3.9893100261688232),
   (1460, 3.989076852798462),
   (1458, 3.9855778217315674)]},
 {'user': 4,
 

In [713]:
user = 4

In [714]:
movie_ids = ratings[ratings.user_id == user].iterrows()
for i in movie_ids:
    _, row = i
    item = items[items["movie_id"] == row.item_id]
    mean_rating = ratings[ratings["item_id"] == row.item_id]["rating"].mean()
    print(item["movie_title"].tolist()[0], row.rating, str(mean_rating)[:3])

Mimic (1997) 3 2.7
Ulee's Gold (1997) 5 3.6
Incognito (1997) 5 3.5
One Flew Over the Cuckoo's Nest (1975) 4 4.2
Event Horizon (1997) 4 2.5
Client, The (1994) 3 3.3
Liar Liar (1997) 5 3.1
Scream (1996) 4 3.4
Star Wars (1977) 5 4.3
Wedding Singer, The (1998) 5 3.4
Starship Troopers (1997) 4 3.2
Air Force One (1997) 5 3.6
Conspiracy Theory (1997) 3 3.4
Contact (1997) 5 3.8
Indiana Jones and the Last Crusade (1989) 3 3.9
Desperate Measures (1998) 5 3.3
Seven (Se7en) (1995) 4 3.8
Cop Land (1997) 5 3.3
Lost Highway (1997) 5 3.1
Assignment, The (1997) 5 3.5
Blues Brothers 2000 (1998) 5 2.8
Spawn (1997) 2 2.6
Wonderland (1997) 5 3.2
In & Out (1997) 5 3.3


In [715]:
for i, r in movie_recs[user - 1]["rec_movies"]:
    movie = items[items["movie_id"] == i]
    movie_id = movie["movie_id"].tolist()[0]
    mean_rating = ratings[ratings["item_id"] == movie_id]["rating"].mean()
    rated = ratings[ratings["item_id"] == movie_id]["user_id"].notnull().sum()
    print(movie["movie_title"].tolist()[0], str(r)[:3], str(mean_rating)[:3], rated)

Angel on My Shoulder (1946) 4.2 2.0 1
Outlaw, The (1943) 4.2 2.5 2
Marlene Dietrich: Shadow and Light (1996)  4.1 5.0 1
Century (1993) 4.1 3.0 1
Lady of Burlesque (1943) 4.1 4.0 1
Here Comes Cookie (1935) 4.0 2.0 1
Wings of Courage (1995) 4.0 4.0 1
Faces (1968) 4.0 3.0 4
Sleepover (1995) 4.0 3.0 1
Damsel in Distress, A (1937) 4.0 4.0 1
