In [97]:
import numpy as np
import torch
import pandas as pd
import torch.nn.functional as F
from torch.nn import Linear
import torch_geometric.transforms as T
from torch_geometric.nn import SAGEConv, to_hetero, GCNConv
from torch_geometric.transforms import RandomLinkSplit, ToUndirected
from torch_geometric.data import HeteroData

# Directory holding the intermediate CSV tables read by the next cell.
data_folder = "../data/interm/"

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
# To force CPU execution, overwrite `device` with "cpu" after this block.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [98]:
# Load the three intermediate tables (users, items, ratings) from `data_folder`.
users, items, ratings = (
    pd.read_csv(f"{data_folder}{table}.csv")
    for table in ("users", "items", "ratings")
)
# Genre lookup from the raw ml-100k folder: pipe-separated, no header row,
# so the column names are supplied explicitly.
genres = pd.read_csv("../data/raw/ml-100k/u.genre", sep="|", names=["name", "index"])

Graph construction:

- Edges: ratings (one user → item edge per rating)
- Nodes: users and items
- Graph type: bipartite

In [99]:
def create_torch_edges(ratings):
    """Convert a ratings table into edge tensors for a user -> item graph.

    Parameters
    ----------
    ratings : pandas.DataFrame
        Must provide the columns ``user_id``, ``item_id`` and ``rating``.
        Both id columns are shifted down by one to obtain node indices.

    Returns
    -------
    edge_index : torch.Tensor
        ``int64`` tensor of shape ``(2, len(ratings))``; row 0 holds the
        user indices and row 1 the item indices.
    edge_attr : torch.Tensor
        The ``rating`` column as a 1-D tensor, in the row order of ``ratings``.
    """
    # Work on NumPy arrays instead of pandas Series: building a tensor from a
    # list of Series walks the data element by element and looks items up by
    # index *label*, which is slow and breaks when the frame's index does not
    # start at 0 (e.g. after filtering).
    src = ratings["user_id"].to_numpy() - 1
    dst = ratings["item_id"].to_numpy() - 1
    attrs = ratings["rating"].to_numpy()

    edge_index = torch.tensor(np.stack([src, dst]), dtype=torch.int64)
    edge_attr = torch.tensor(attrs)

    return edge_index, edge_attr

In [100]:
# Build the user -> item edge index and the per-edge rating labels from the ratings table.
edge_index, edge_attr = create_torch_edges(ratings)

In [101]:
edge_index.shape

torch.Size([2, 100000])

In [102]:
items

Unnamed: 0,movie_id,movie_title,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,release_year
0,1,Toy Story (1995),0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,1995.0
1,2,GoldenEye (1995),0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1995.0
2,3,Four Rooms (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1995.0
3,4,Get Shorty (1995),0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1995.0
4,5,Copycat (1995),0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,1995.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1998.0
1678,1679,B. Monkey (1998),0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,1998.0
1679,1680,Sliding Doors (1998),0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1998.0
1680,1681,You So Crazy (1994),0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1994.0


In [103]:
from sentence_transformers import SentenceTransformer

def SequenceEncoder(movie_titles, model_name=None):
    """Embed movie titles with a SentenceTransformer model.

    Parameters
    ----------
    movie_titles : str or iterable of str
        Titles to embed. Any non-string iterable (list, pandas Series, ...)
        is materialised into a plain list before encoding.
    model_name : str, optional
        Model identifier forwarded to ``SentenceTransformer``.
        NOTE(review): the only call in this notebook passes an explicit
        name; behaviour with the default ``None`` is not exercised here.

    Returns
    -------
    torch.Tensor
        The embeddings returned by ``model.encode``, moved to the CPU.
    """
    # A pandas Series is indexed by label, not by position. Passing a plain
    # list keeps the encoder independent of whatever index the caller's
    # frame carries. A single string is left untouched so it is still
    # encoded as one sentence rather than split into characters.
    if not isinstance(movie_titles, str):
        movie_titles = list(movie_titles)

    model = SentenceTransformer(model_name, device=device)
    title_embeddings = model.encode(
        movie_titles,
        show_progress_bar=True,
        convert_to_tensor=True,
        device=device,
    )

    return title_embeddings.to("cpu")

# Sentence embedding of every movie title.
item_title = SequenceEncoder(items["movie_title"], model_name='all-MiniLM-L6-v2')
# One boolean column per genre listed in the `genres` table.
item_genres = torch.tensor(items[genres["name"]].to_numpy(), dtype=torch.bool)
# Release year as a single column.
# NOTE(review): this tensor is not part of `item_x` below.
item_release_year = torch.tensor(
    items["release_year"].to_numpy().reshape(-1, 1), dtype=torch.int32
)

# Item node features: title embedding followed by the genre flags, as float32.
item_x = torch.cat([item_title, item_genres], dim=-1).to(torch.float32)

Batches:   0%|          | 0/53 [00:00<?, ?it/s]

In [104]:
users

Unnamed: 0,user_id,age,zip_code,male,female,occupation_technician,occupation_other,occupation_writer,occupation_executive,occupation_administrator,...,occupation_librarian,occupation_homemaker,occupation_artist,occupation_engineer,occupation_marketing,occupation_none,occupation_healthcare,occupation_retired,occupation_salesman,occupation_doctor
0,1,24,85711,True,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,2,53,94043,False,True,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,3,23,32067,True,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
3,4,24,43537,True,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,5,33,15213,False,True,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
938,939,26,33319,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
939,940,32,02215,True,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
940,941,20,97229,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
941,942,48,78209,False,True,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False


In [105]:
users.zip_code.to_numpy()

array(['85711', '94043', '32067', '43537', '15213', '98101', '91344',
       '05201', '01002', '90703', '30329', '06405', '29206', '55106',
       '97301', '10309', '06355', '37212', '02138', '95660', '30068',
       '40206', '48197', '94533', '55107', '21044', '30030', '55369',
       '94043', '55436', '10003', '78741', '27510', '42141', '42459',
       '93117', '55105', '54467', '01040', '27514', '80525', '17870',
       '20854', '46260', '50233', '46538', '07102', '12550', '76111',
       '52245', '16509', '55105', '55414', '66315', '01331', '46260',
       '84010', '52246', '08403', '06472', '30040', '97214', '75240',
       '43202', '48118', '80521', '60402', '22904', '55337', '60067',
       '98034', '73034', '41850', 'T8H1N', '08816', '02215', '29379',
       '61801', '03755', '52241', '21218', '22902', '44133', '55369',
       '20003', '46005', '89503', '11701', '68106', '78155', '01913',
       '80525', '23112', '71457', '10707', '75206', '98006', '90291',
       '63129', '902

In [106]:
# Age as a single uint8 column (one row per user).
user_ages = torch.tensor(users["age"].to_numpy().reshape(-1, 1), dtype=torch.uint8)
# The two boolean sex indicator columns.
user_sex = torch.tensor(users[["male", "female"]].to_numpy(), dtype=torch.bool)
# Every one-hot occupation column, in the order it appears in the table.
occupations = [column for column in users.columns if column.startswith("occupation_")]
user_occupation = torch.tensor(users[occupations].to_numpy(), dtype=torch.bool)
# User node features: age, sex flags and occupation flags, as float32.
user_x = torch.cat([user_ages, user_sex, user_occupation], dim=-1).to(torch.float32)

In [107]:
# Heterogeneous graph: 'user' and 'item' nodes joined by 'rates' edges whose
# supervision label is the rating stored in `edge_attr`.
data = HeteroData({
    'user': {'x': user_x},
    'item': {'x': item_x},
    ('user', 'rates', 'item'): {
        'edge_index': edge_index,
        'edge_label': edge_attr,
    },
})

In [108]:
# Add the reverse ('item', 'rev_rates', 'user') edge type so that messages can
# flow from items back to users as well.
data = ToUndirected()(data)
del data['item', 'rev_rates', 'user'].edge_label  # Remove "reverse" label.
data = data.to(device)

# Seed the torch RNG so the random edge split below is the same on every run;
# without it the train/val/test membership (and every metric) changes between runs.
SPLIT_SEED = 42
torch.manual_seed(SPLIT_SEED)

# Perform a link-level split into training, validation, and test edges.
# 10% of the 'rates' edges go to validation and 10% to test; no negative edges
# are sampled because the task is rating regression, not link existence.
train_data, val_data, test_data = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    neg_sampling_ratio=0.0,
    edge_types=[('user', 'rates', 'item')],
    rev_edge_types=[('item', 'rev_rates', 'user')],
)(data)

In [109]:
# Per-rating loss weights: count how often each integer label occurs among the
# training edges, then scale so the most frequent label gets weight 1 and
# rarer labels get proportionally larger weights (a label value with zero
# occurrences ends up with an infinite weight).
weight = train_data['user', 'rates', 'item'].edge_label.bincount()
weight = weight.max().div(weight)
def weighted_mse_loss(pred, target, weight=None):
    """Mean squared error with an optional per-label weight.

    Parameters
    ----------
    pred : torch.Tensor
        Predicted values.
    target : torch.Tensor
        Ground-truth labels. When ``weight`` is given they are also used as
        indices into it, so they must have an integer dtype.
    weight : torch.Tensor, optional
        Lookup table mapping a label value to its weight. ``None`` means
        every sample has weight 1.

    Returns
    -------
    torch.Tensor
        Scalar: the mean of ``weight * (pred - target) ** 2``.
    """
    if weight is None:
        scale = 1.
    else:
        # Look up one weight per sample from its label.
        scale = weight[target].to(pred.dtype)
    squared_error = (pred - target.to(pred.dtype)) ** 2
    return (scale * squared_error).mean()

In [110]:
from torch.nn import Dropout
from torch_geometric.nn import GATv2Conv, RGCNConv, HeteroConv, GINConv
from torch_geometric.utils.dropout import *


class GNNEncoder(torch.nn.Module):
    """Two stacked GATv2 layers with a ReLU in between.

    Written for a single node/edge type; ``Model`` wraps it with
    ``to_hetero`` so it can run on the heterogeneous graph.
    """

    def __init__(self, hidden_channels):
        """Create both layers with ``hidden_channels`` output features each."""
        super().__init__()
        # (-1, -1) leaves the source/target input sizes to be inferred lazily;
        # self loops are disabled for both layers.
        conv_kwargs = dict(out_channels=hidden_channels, add_self_loops=False)
        self.conv1 = GATv2Conv((-1, -1), **conv_kwargs)
        self.conv2 = GATv2Conv((-1, -1), **conv_kwargs)

    def forward(self, x, edge_index):
        """Return the node embeddings produced by the second layer."""
        hidden = self.conv1(x, edge_index)
        hidden = hidden.relu()
        return self.conv2(hidden, edge_index)
class EdgeDecoder(torch.nn.Module):
    """MLP that scores (user, item) pairs from their node embeddings.

    Three hidden Linear -> ReLU -> Dropout stages followed by a linear layer
    with a single output per pair.
    """

    def __init__(self, n_factors, hidden_channels):
        """
        Parameters
        ----------
        n_factors : int
            Embedding size of one node; the first layer takes ``2 * n_factors``
            inputs because user and item embeddings are concatenated.
        hidden_channels : int
            Width of the three hidden layers.
        """
        super().__init__()
        self.lin1 = Linear(2 * n_factors, hidden_channels)
        # A single Dropout module (default probability) reused after every hidden layer.
        self.dropout = Dropout()
        self.lin2 = Linear(hidden_channels, hidden_channels)
        self.lin3 = Linear(hidden_channels, hidden_channels)
        self.lin4 = Linear(hidden_channels, 1)

    def forward(self, z_dict, edge_label_index):
        """Return one score per column of ``edge_label_index`` as a 1-D tensor.

        ``z_dict`` must hold embeddings under the keys ``'user'`` and
        ``'item'``; row 0 of ``edge_label_index`` indexes users, row 1 items.
        """
        user_idx, item_idx = edge_label_index
        user_emb = z_dict['user'][user_idx]
        item_emb = z_dict['item'][item_idx]
        # Pair representation: user embedding followed by item embedding.
        hidden = torch.cat([user_emb, item_emb], dim=-1)
        for layer in (self.lin1, self.lin2, self.lin3):
            hidden = self.dropout(layer(hidden).relu())
        # Final projection to one value per pair, flattened to shape (num_pairs,).
        return self.lin4(hidden).view(-1)
class Model(torch.nn.Module):
    """Rating predictor: heterogeneous GATv2 encoder plus MLP edge decoder.

    Parameters
    ----------
    n_factors : int
        Size of the node embeddings produced by the encoder.
    hidden_channels : int
        Width of the decoder's hidden layers.
    edge_dropout : float, optional
        Probability with which each supervision edge is dropped during
        training (default 0.25, the value that used to be hard-coded).
    metadata : tuple, optional
        Graph metadata passed to ``to_hetero``. Defaults to the metadata of
        the notebook-level ``data`` graph, which was previously the only option.
    """

    def __init__(self, n_factors, hidden_channels, edge_dropout=0.25, metadata=None):
        super().__init__()
        if metadata is None:
            # Backward-compatible fallback to the global graph.
            metadata = data.metadata()
        self.edge_dropout = edge_dropout
        # to_hetero turns the single-type encoder into one that runs on every
        # edge type and sums the messages arriving at each node type.
        self.encoder = to_hetero(GNNEncoder(n_factors), metadata, aggr='sum')
        self.decoder = EdgeDecoder(n_factors, hidden_channels)

    def forward(self, x_dict, edge_index_dict, edge_label_index):
        """Score the pairs in ``edge_label_index``.

        Returns
        -------
        (torch.Tensor, torch.Tensor)
            The predictions for the edges that were kept, and the boolean
            mask over the columns of ``edge_label_index`` marking which
            edges those are. Outside training mode every edge is kept.
        """
        # Randomly drop a share of the supervision edges while training; the
        # caller needs `mask` to select the matching targets.
        edge_label_index, mask = dropout_edge(
            edge_label_index, p=self.edge_dropout, training=self.training
        )
        # z_dict maps each node type to the embeddings produced by the GATv2 encoder.
        z_dict = self.encoder(x_dict, edge_index_dict)
        return self.decoder(z_dict, edge_label_index), mask
model = Model(n_factors=150, hidden_channels=500)
model = model.to(device)

# The encoder layers are lazily initialised, so their parameter shapes are only
# known after data has passed through them once. Run the encoder on the
# training graph (no gradients needed) to materialise them.
with torch.no_grad():
    model.encoder(train_data.x_dict, train_data.edge_index_dict)

# Create the optimizer once the parameters exist.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [111]:
from torch.nn import MSELoss

# NOTE(review): loss_f is not referenced by any visible cell; train() below
# uses weighted_mse_loss instead.
loss_f = MSELoss()

def train():
    """Run one full-batch optimisation step on the training graph.

    Uses the notebook-level ``model``, ``optimizer``, ``train_data`` and
    ``weight``. Returns the weighted MSE loss of the step as a Python float.
    """
    rating_edges = train_data['user', 'rates', 'item']

    model.train()
    optimizer.zero_grad()
    pred, kept = model(
        train_data.x_dict,
        train_data.edge_index_dict,
        rating_edges.edge_label_index,
    )
    # The model drops some supervision edges in training mode; `kept` selects
    # the labels that belong to the predictions that remain.
    loss = weighted_mse_loss(pred, rating_edges.edge_label[kept], weight)
    loss.backward()
    optimizer.step()
    return loss.item()

In [112]:
@torch.no_grad()
def test(data):
    """Return the RMSE of the model on the 'rates' edges of ``data``.

    The notebook-level ``model`` is switched to eval mode and its predictions
    are clamped to the range [0, 5] before comparing them with the labels.
    """
    model.eval()
    rating_edges = data['user', 'rates', 'item']
    pred, _ = model(data.x_dict, data.edge_index_dict, rating_edges.edge_label_index)
    pred = torch.clamp(pred, min=0, max=5)
    target = rating_edges.edge_label.float()
    return F.mse_loss(pred, target).sqrt().item()

In [None]:
# Full-batch training: one optimisation step per epoch, followed by an RMSE
# evaluation on each of the three splits.
# NOTE(review): range(1, 1000) runs 999 epochs.
for epoch in range(1, 1000):
    loss = train()
    train_rmse, val_rmse, test_rmse = (
        test(split) for split in (train_data, val_data, test_data)
    )
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {train_rmse:.4f}, '
          f'Val: {val_rmse:.4f}, Test: {test_rmse:.4f}')

Epoch: 001, Loss: 19.1996, Train: 3.0232, Val: 3.0255, Test: 3.0288
Epoch: 002, Loss: 11.4756, Train: 1.9265, Val: 1.9305, Test: 1.9363
Epoch: 003, Loss: 4.7593, Train: 1.4633, Val: 1.4573, Test: 1.4460
Epoch: 004, Loss: 11.8741, Train: 1.1884, Val: 1.1886, Test: 1.1911
Epoch: 005, Loss: 5.1398, Train: 1.7156, Val: 1.7183, Test: 1.7225
Epoch: 006, Loss: 4.0277, Train: 2.1691, Val: 2.1709, Test: 2.1750
Epoch: 007, Loss: 5.3483, Train: 2.3788, Val: 2.3803, Test: 2.3837
Epoch: 008, Loss: 6.3330, Train: 2.4130, Val: 2.4144, Test: 2.4173
Epoch: 009, Loss: 6.5116, Train: 2.3166, Val: 2.3178, Test: 2.3204
Epoch: 010, Loss: 6.0788, Train: 2.1055, Val: 2.1065, Test: 2.1088
Epoch: 011, Loss: 5.1364, Train: 1.7929, Val: 1.7936, Test: 1.7953
Epoch: 012, Loss: 4.1781, Train: 1.4297, Val: 1.4294, Test: 1.4304
Epoch: 013, Loss: 3.7496, Train: 1.1775, Val: 1.1759, Test: 1.1756
Epoch: 014, Loss: 4.3516, Train: 1.1249, Val: 1.1225, Test: 1.1215
Epoch: 015, Loss: 5.1964, Train: 1.1384, Val: 1.1363, Test:

In [None]:
from tqdm import tqdm

total_users = len(users)
total_movies = len(items)
movie_recs = []

# Inference only: eval mode switches dropout off so the scores are
# deterministic, and no_grad avoids building an autograd graph per user.
model.eval()
with torch.no_grad():
    # The node embeddings do not depend on which user is being scored, so the
    # encoder runs once here instead of once per user.
    z_dict = model.encoder(data.x_dict, data.edge_index_dict)
    all_movie_ids = torch.arange(total_movies, device=device)

    for user_id in tqdm(range(0, total_users)):
        # Pair this user with every movie: row 0 = user index, row 1 = movie index.
        user_row = torch.full_like(all_movie_ids, user_id)
        edge_label_index = torch.stack([user_row, all_movie_ids], dim=0)
        # Calling the decoder directly (instead of model(...)) guarantees one
        # prediction per movie, so position k of `pred` is always movie index k.
        pred = model.decoder(z_dict, edge_label_index)
        # Recommend only the movies whose predicted rating is essentially 5.
        rec_movie_ids = (pred > 4.9).nonzero(as_tuple=True)
        # Convert 0-based movie indices back to 1-based movie ids.
        top_ten_recs = [rec_movies + 1 for rec_movies in rec_movie_ids[0].tolist()]
        movie_recs.append({'user': user_id + 1, 'rec_movies': top_ten_recs})

In [None]:
movie_recs

In [None]:
# user_id (1-based, matched against ratings.user_id) of the user inspected in the cells below.
user = 481

In [None]:
# For every movie the selected user rated, print its title, the user's own
# rating and the movie's mean rating over all users (first three characters).
user_ratings = ratings[ratings.user_id == user]
for _, row in user_ratings.iterrows():
    title = items.loc[items["movie_id"] == row.item_id, "movie_title"].tolist()[0]
    mean_rating = ratings.loc[ratings["item_id"] == row.item_id, "rating"].mean()
    print(title, row.rating, str(mean_rating)[:3])

In [None]:
# For every movie recommended to the selected user, print its title, its mean
# rating (first three characters) and how many ratings it has received.
for rec_id in movie_recs[user - 1]["rec_movies"]:
    movie = items[items["movie_id"] == rec_id]
    movie_id = movie["movie_id"].tolist()[0]
    # Filter the ratings of this movie once and reuse them for both statistics.
    movie_ratings = ratings[ratings["item_id"] == movie_id]
    mean_rating = movie_ratings["rating"].mean()
    rated = movie_ratings["user_id"].notnull().sum()
    print(movie["movie_title"].tolist()[0], str(mean_rating)[:3], rated)