In [1]:
import numpy as np
import torch
import pandas as pd
import torch.nn.functional as F
from torch.nn import Linear
import torch_geometric.transforms as T
from torch_geometric.nn import SAGEConv, to_hetero, GCNConv
from torch_geometric.transforms import RandomLinkSplit, ToUndirected
from torch_geometric.data import HeteroData

data_folder = "../data/interm/"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = "cpu"

In [2]:
users = pd.read_csv(data_folder + "users.csv")
items = pd.read_csv(data_folder + "items.csv")
ratings = pd.read_csv(data_folder + "ratings.csv")
genres = pd.read_csv("../data/raw/ml-100k/u.genre", delimiter="|", names=["name","index"])

Edges: ratings
Nodes: users, items
Graph type: bipartite

In [3]:
def create_torch_edges(ratings):
    src = ratings["user_id"] - 1
    dst = ratings["item_id"] - 1
    attrs = ratings["rating"]
    
    edge_index = torch.tensor([src, dst], dtype=torch.int64)
    edge_attr = torch.tensor(attrs)
    
    return edge_index, edge_attr

In [4]:
edge_index, edge_attr = create_torch_edges(ratings)

In [5]:
edge_index.shape

torch.Size([2, 100000])

In [6]:
items

Unnamed: 0,movie_id,movie_title,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,release_year
0,1,Toy Story (1995),0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,1995.0
1,2,GoldenEye (1995),0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1995.0
2,3,Four Rooms (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1995.0
3,4,Get Shorty (1995),0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1995.0
4,5,Copycat (1995),0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,1995.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1998.0
1678,1679,B. Monkey (1998),0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,1998.0
1679,1680,Sliding Doors (1998),0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1998.0
1680,1681,You So Crazy (1994),0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1994.0


In [7]:
from sentence_transformers import SentenceTransformer

def SequenceEncoder(movie_titles , model_name=None):
    model = SentenceTransformer(model_name, device=device)
    title_embeddings = model.encode(movie_titles, show_progress_bar=True,
                              convert_to_tensor=True, device=device)
    
    return title_embeddings.to("cpu")

item_title = SequenceEncoder(items["movie_title"], model_name='all-MiniLM-L6-v2')
item_genres = torch.tensor(items[genres.name].to_numpy(), dtype=torch.bool)
item_release_year = torch.tensor(items["release_year"].to_numpy()[:,np.newaxis], dtype=torch.int32)

item_x = torch.cat((item_title, item_genres), dim=-1).float()

Batches:   0%|          | 0/53 [00:00<?, ?it/s]

In [8]:
users

Unnamed: 0,user_id,age,zip_code,male,female,occupation_technician,occupation_other,occupation_writer,occupation_executive,occupation_administrator,...,occupation_librarian,occupation_homemaker,occupation_artist,occupation_engineer,occupation_marketing,occupation_none,occupation_healthcare,occupation_retired,occupation_salesman,occupation_doctor
0,1,24,85711,True,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,2,53,94043,False,True,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,3,23,32067,True,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
3,4,24,43537,True,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,5,33,15213,False,True,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
938,939,26,33319,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
939,940,32,02215,True,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
940,941,20,97229,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
941,942,48,78209,False,True,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False


In [9]:
users.zip_code.to_numpy()

array(['85711', '94043', '32067', '43537', '15213', '98101', '91344',
       '05201', '01002', '90703', '30329', '06405', '29206', '55106',
       '97301', '10309', '06355', '37212', '02138', '95660', '30068',
       '40206', '48197', '94533', '55107', '21044', '30030', '55369',
       '94043', '55436', '10003', '78741', '27510', '42141', '42459',
       '93117', '55105', '54467', '01040', '27514', '80525', '17870',
       '20854', '46260', '50233', '46538', '07102', '12550', '76111',
       '52245', '16509', '55105', '55414', '66315', '01331', '46260',
       '84010', '52246', '08403', '06472', '30040', '97214', '75240',
       '43202', '48118', '80521', '60402', '22904', '55337', '60067',
       '98034', '73034', '41850', 'T8H1N', '08816', '02215', '29379',
       '61801', '03755', '52241', '21218', '22902', '44133', '55369',
       '20003', '46005', '89503', '11701', '68106', '78155', '01913',
       '80525', '23112', '71457', '10707', '75206', '98006', '90291',
       '63129', '902

In [10]:
user_ages = torch.tensor(users["age"].to_numpy()[:,np.newaxis], dtype=torch.uint8)
user_sex = torch.tensor(users[["male", "female"]].to_numpy(), dtype=torch.bool)
occupations = [i for i in users.keys() if i.startswith("occupation_")]
user_occupation = torch.tensor(users[occupations].to_numpy(), dtype=torch.bool)
user_x = torch.cat((user_ages, user_sex, user_occupation), dim=-1).float()

In [11]:
data = HeteroData()

data['user'].x = user_x
data['item'].x = item_x
data['user', 'rates', 'item'].edge_index = edge_index
data['user', 'rates', 'item'].edge_label = edge_attr

In [12]:
data = ToUndirected()(data)
del data['item', 'rev_rates', 'user'].edge_label  # Remove "reverse" label.
data = data.to(device)

# Perform a link-level split into training, validation, and test edges.
train_data, val_data, test_data = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    neg_sampling_ratio=0.0,
    edge_types=[('user', 'rates', 'item')],
    rev_edge_types=[('item', 'rev_rates', 'user')],
)(data)

In [13]:
weight = torch.bincount(train_data['user', 'rates', 'item'].edge_label)
weight = weight.max() / weight
def weighted_mse_loss(pred, target, weight=None):
    weight = 1. if weight is None else weight[target].to(pred.dtype)
    return (weight * (pred - target.to(pred.dtype)).pow(2)).mean()

In [14]:
from torch_geometric.nn import GATv2Conv, RGCNConv, HeteroConv, GINConv
from torch_geometric.utils.dropout import *


class GNNEncoder(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # these convolutions have been replicated to match the number of edge types\
        self.conv1 = GATv2Conv((-1, -1), 32, add_self_loops=False)
        self.conv2 = GATv2Conv((-1, -1), 32, add_self_loops=False)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x
class EdgeDecoder(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.lin1 = Linear(2 * hidden_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, 1)
        
    def forward(self, z_dict, edge_label_index):
        row, col = edge_label_index
        # concat user and movie embeddings
        z = torch.cat([z_dict['user'][row], z_dict['item'][col]], dim=-1)
        # concatenated embeddings passed to linear layer
        z = self.lin1(z).relu()
        z = self.lin2(z)
        return z.view(-1)
class Model(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.encoder = GNNEncoder()
        self.encoder = to_hetero(self.encoder, data.metadata(), aggr='sum')
        self.decoder = EdgeDecoder(hidden_channels)

    def forward(self, x_dict, edge_index_dict, edge_label_index):
        # z_dict contains dictionary of movie and user embeddings returned from GraphSage
        edge_label_index, mask = dropout_edge(edge_label_index, p=0.25, training=self.training)
        z_dict = self.encoder(x_dict, edge_index_dict)
        return self.decoder(z_dict, edge_label_index), mask
model = Model(hidden_channels=32).to(device)
# Due to lazy initialization, we need to run one model step so the number
# of parameters can be inferred:
with torch.no_grad():
    model.encoder(train_data.x_dict, train_data.edge_index_dict)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [15]:
from torch.nn import MSELoss

loss_f = MSELoss()

def train():
    model.train()
    optimizer.zero_grad()
    pred, mask = model(train_data.x_dict, train_data.edge_index_dict,
                 train_data['user', 'rates', 'item'].edge_label_index)
    target = train_data['user', 'rates', 'item'].edge_label
    loss = weighted_mse_loss(pred, target[mask], weight)
    loss.backward()
    optimizer.step()
    return float(loss)

In [16]:
@torch.no_grad()
def test(data):
    model.eval()
    pred, _ = model(data.x_dict, data.edge_index_dict,
                 data['user', 'rates', 'item'].edge_label_index)
    pred = pred.clamp(min=0, max=5)
    target = data['user', 'rates', 'item'].edge_label.float()
    rmse = F.mse_loss(pred, target).sqrt()
    return float(rmse)

In [17]:
for epoch in range(1, 1000):
    loss = train()
    train_rmse = test(train_data)
    val_rmse = test(val_data)
    test_rmse = test(test_data)
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {train_rmse:.4f}, '
          f'Val: {val_rmse:.4f}, Test: {test_rmse:.4f}')

Epoch: 001, Loss: 40.4109, Train: 3.7056, Val: 3.7032, Test: 3.7023
Epoch: 002, Loss: 33.1963, Train: 3.7056, Val: 3.7032, Test: 3.7023
Epoch: 003, Loss: 27.0470, Train: 3.7056, Val: 3.7032, Test: 3.7023
Epoch: 004, Loss: 22.3955, Train: 3.6809, Val: 3.6785, Test: 3.6778
Epoch: 005, Loss: 18.5881, Train: 3.3596, Val: 3.3571, Test: 3.3570
Epoch: 006, Loss: 15.3204, Train: 3.0564, Val: 3.0538, Test: 3.0544
Epoch: 007, Loss: 12.5113, Train: 2.7686, Val: 2.7659, Test: 2.7671
Epoch: 008, Loss: 10.2765, Train: 2.4999, Val: 2.4971, Test: 2.4991
Epoch: 009, Loss: 8.3768, Train: 2.2629, Val: 2.2600, Test: 2.2627
Epoch: 010, Loss: 6.9199, Train: 2.0467, Val: 2.0437, Test: 2.0471
Epoch: 011, Loss: 5.7861, Train: 1.8492, Val: 1.8463, Test: 1.8504
Epoch: 012, Loss: 4.9025, Train: 1.6670, Val: 1.6641, Test: 1.6690
Epoch: 013, Loss: 4.2639, Train: 1.5022, Val: 1.4995, Test: 1.5052
Epoch: 014, Loss: 3.7881, Train: 1.3654, Val: 1.3629, Test: 1.3694
Epoch: 015, Loss: 3.5150, Train: 1.2598, Val: 1.2577, 

In [18]:
from tqdm import tqdm

total_users = len(users)
total_movies = len(items)
movie_recs = []
for user_id in tqdm(range(0, total_users)):
    user_row = torch.tensor([user_id] * total_movies)
    all_movie_ids = torch.arange(total_movies)
    edge_label_index = torch.stack([user_row, all_movie_ids], dim=0)
    pred, _ = model(data.x_dict, data.edge_index_dict,
             edge_label_index)
    # we will only select movies for the user where the predicting rating is =5
    rec_movie_ids = (pred > 4.9).nonzero(as_tuple=True)
    top_ten_recs = [rec_movies + 1 for rec_movies in rec_movie_ids[0].tolist()] 
    movie_recs.append({'user': user_id + 1, 'rec_movies': top_ten_recs})

100%|██████████| 943/943 [00:03<00:00, 277.41it/s]


In [19]:
movie_recs

[{'user': 1, 'rec_movies': []},
 {'user': 2, 'rec_movies': []},
 {'user': 3, 'rec_movies': []},
 {'user': 4, 'rec_movies': []},
 {'user': 5, 'rec_movies': []},
 {'user': 6, 'rec_movies': []},
 {'user': 7, 'rec_movies': []},
 {'user': 8, 'rec_movies': []},
 {'user': 9, 'rec_movies': []},
 {'user': 10, 'rec_movies': []},
 {'user': 11, 'rec_movies': []},
 {'user': 12, 'rec_movies': []},
 {'user': 13, 'rec_movies': []},
 {'user': 14, 'rec_movies': []},
 {'user': 15, 'rec_movies': []},
 {'user': 16, 'rec_movies': []},
 {'user': 17, 'rec_movies': []},
 {'user': 18, 'rec_movies': []},
 {'user': 19, 'rec_movies': []},
 {'user': 20, 'rec_movies': []},
 {'user': 21, 'rec_movies': []},
 {'user': 22, 'rec_movies': []},
 {'user': 23, 'rec_movies': []},
 {'user': 24, 'rec_movies': []},
 {'user': 25, 'rec_movies': []},
 {'user': 26, 'rec_movies': []},
 {'user': 27, 'rec_movies': []},
 {'user': 28, 'rec_movies': []},
 {'user': 29, 'rec_movies': []},
 {'user': 30, 'rec_movies': []},
 {'user': 31, 'rec_

In [20]:
user = 481

In [21]:
movie_ids = ratings[ratings.user_id == user].iterrows()
for i in movie_ids:
    _, row = i
    item = items[items["movie_id"] == row.item_id]
    mean_rating = ratings[ratings["item_id"] == row.item_id]["rating"].mean()
    print(item["movie_title"].tolist()[0], row.rating, str(mean_rating)[:3])

Schindler's List (1993) 1 4.4
Indiana Jones and the Last Crusade (1989) 4 3.9
Speed 2: Cruise Control (1997) 3 2.1
While You Were Sleeping (1995) 3 3.5
Titanic (1997) 4 4.2
Clerks (1994) 3 3.8
Hunchback of Notre Dame, The (1996) 4 3.3
Great Dictator, The (1940) 5 4.0
Streetcar Named Desire, A (1951) 4 3.8
Butch Cassidy and the Sundance Kid (1969) 5 3.9
Annie Hall (1977) 4 3.9
Seventh Seal, The (Sjunde inseglet, Det) (1957) 3 3.5
When Harry Met Sally... (1989) 5 3.9
Raising Arizona (1987) 4 3.8
Quiet Man, The (1952) 5 4.0
Clueless (1995) 3 3.4
Arsenic and Old Lace (1944) 5 4.0
Dumb & Dumber (1994) 1 3.0
Dial M for Murder (1954) 5 4.0
American President, The (1995) 4 3.6
Get Shorty (1995) 3 3.5
Silence of the Lambs, The (1991) 4 4.2
Hamlet (1996) 4 4.0
To Kill a Mockingbird (1962) 4 4.2
Graduate, The (1967) 3 4.1
Fish Called Wanda, A (1988) 5 3.7
Henry V (1989) 5 4.1
Volcano (1997) 3 2.8
Star Wars (1977) 4 4.3
Duck Soup (1933) 4 4.0
Emma (1996) 5 3.7
Fargo (1996) 4 4.1
Return of the Pink

In [22]:
for i in movie_recs[user - 1]["rec_movies"]:
    movie = items[items["movie_id"] == i]
    movie_id = movie["movie_id"].tolist()[0]
    mean_rating = ratings[ratings["item_id"] == movie_id]["rating"].mean()
    rated = ratings[ratings["item_id"] == movie_id]["user_id"].notnull().sum()
    print(movie["movie_title"].tolist()[0], str(mean_rating)[:3], rated)