In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GCNConv, GATConv, SAGEConv, global_mean_pool
from datasets import load_dataset
import networkx as nx
from sklearn.model_selection import train_test_split


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_All_Beauty", trust_remote_code=True)


In [3]:
dataset

DatasetDict({
    full: Dataset({
        features: ['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'],
        num_rows: 701528
    })
})

In [4]:
import torch
from torch_geometric.data import Data

def create_graph_data(review):
    # Her bir incelemeyi bir embedding vektörüne dönüştürme
    review_vector = torch.randn(128)  # Her düğüm için 768 boyutlu özellik vektörü

    # Basit bir self-loop kenar yapısı oluşturuyoruz (her node kendisine bağlı)
    edge_index = torch.tensor([[0], [0]], dtype=torch.long)  # [2, num_edges] boyutunda olmalı

    # Node'u oluşturmak
    node = Data(x=review_vector.unsqueeze(0), edge_index=edge_index, y=torch.tensor([review['rating']], dtype=torch.float))
    
    return node

# İlk 1000 örneği alıyoruz
reviews = dataset['full'].select(range(100000))  # Dataset'in ilk 1000 örneğini seçiyoruz

# Her bir incelemeyi işleyerek graph verisi oluşturuyoruz
graph_data = [create_graph_data(review) for review in reviews]

train_data, test_data = train_test_split(graph_data, test_size=0.2, random_state=42)

train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)




In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.nn import GCNConv, GATConv, global_mean_pool, SAGEConv
from torch_geometric.data import Data, DataLoader

class Encoder(nn.Module):
    def __init__(self, in_channels, hidden_channels):
        super(Encoder, self).__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = torch.relu(x)
        x = self.conv2(x, edge_index)
        return x

class Decoder(nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super(Decoder, self).__init__()
        self.linear = nn.Linear(hidden_channels, out_channels)

    def forward(self, x):
        x = self.linear(x)
        return x

class GNNModel(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(GNNModel, self).__init__()
        self.encoder = Encoder(in_channels, hidden_channels)
        self.decoder = Decoder(hidden_channels, out_channels)

    def forward(self, x, edge_index, batch):
        x = self.encoder(x, edge_index)
        x = global_mean_pool(x, batch)  # Global mean pooling over all nodes
        x = self.decoder(x)
        return x


In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GNNModel(in_channels=128, hidden_channels=128, out_channels=1).to(device)  # 768 input features, 128 hidden units, 1 output (rating)
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.MSELoss()  # Mean Squared Error Loss, çünkü rating tahmin ediyoruz

def train():
    model.train()
    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        out = model(data.x, data.edge_index, data.batch)
        loss = criterion(out.squeeze(), data.y)
        loss.backward()
        optimizer.step()

def test(loader):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for data in loader:
            data = data.to(device)
            out = model(data.x, data.edge_index, data.batch)
            loss = criterion(out.squeeze(), data.y)
            total_loss += loss.item()
    return total_loss / len(loader)

for epoch in range(1, 100):  # 100 epoch boyunca eğitiyoruz
    train()
    train_loss = test(train_loader)
    test_loss = test(test_loader)
    print(f'Epoch {epoch}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}')


Epoch 1, Train Loss: 2.2323, Test Loss: 2.2759
Epoch 2, Train Loss: 2.1863, Test Loss: 2.1911
Epoch 3, Train Loss: 2.1928, Test Loss: 2.2004
Epoch 4, Train Loss: 2.2306, Test Loss: 2.2190
Epoch 5, Train Loss: 2.3041, Test Loss: 2.2629
Epoch 6, Train Loss: 2.0136, Test Loss: 2.0084
Epoch 7, Train Loss: 1.9954, Test Loss: 1.9998
Epoch 8, Train Loss: 1.9863, Test Loss: 1.9898
Epoch 9, Train Loss: 2.4622, Test Loss: 2.4415
Epoch 10, Train Loss: 1.9667, Test Loss: 1.9635
Epoch 11, Train Loss: 1.9570, Test Loss: 1.9540
Epoch 12, Train Loss: 2.1394, Test Loss: 2.0847
Epoch 13, Train Loss: 1.9691, Test Loss: 1.9629
Epoch 14, Train Loss: 1.9660, Test Loss: 1.9616
Epoch 15, Train Loss: 1.9693, Test Loss: 1.9705
Epoch 16, Train Loss: 1.9390, Test Loss: 1.9402
Epoch 17, Train Loss: 2.1679, Test Loss: 2.0448
Epoch 18, Train Loss: 1.9365, Test Loss: 1.9392
Epoch 19, Train Loss: 1.9816, Test Loss: 1.9742
Epoch 20, Train Loss: 2.1389, Test Loss: 2.0444
Epoch 21, Train Loss: 1.9410, Test Loss: 1.9401
E

In [7]:
# Encoder katmanlarındaki ağırlıkları kaydetme
encoder_state_dict = {
    'conv1': model.encoder.conv1.state_dict(),
    'conv2': model.encoder.conv2.state_dict(),
}

In [8]:
for param_name, param_tensor in encoder_state_dict['conv2'].items():
    print(f"{param_name}: {param_tensor.shape}")

lin_l.weight: torch.Size([128, 128])
lin_l.bias: torch.Size([128])
lin_r.weight: torch.Size([128, 128])


In [9]:
import torch
from torch import Tensor
from torch.nn.functional import normalize
from torch_geometric_temporal.nn.recurrent import GConvGRU
print(torch.__version__)

2.3.0+cu121


In [10]:
torch.cuda.is_available()

True

In [11]:
# Install required packages.
import os
import pandas as pd
import numpy as np
os.environ['TORCH'] = torch.__version__

# !pip install torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
# !pip install torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
# !pip install pyg-lib -f https://data.pyg.org/whl/nightly/torch-${TORCH}.html
# !pip install git+https://github.com/pyg-team/pytorch_geometric.git

In [12]:
os.getcwd()

'/home/cevher/movielens'

### Link Prediction on MovieLens

### Heterogeneous Graph Creation

In [13]:
movies_path = './ml-latest-small/movies_cleaned.csv'
ratings_path = './ml-latest-small/ratings_cleaned.csv'
import pandas as pd

print('movies.csv:')
print('===========')
movies = pd.read_csv(movies_path)
print(movies[["movieId", "genres"]].head())
print()
print('ratings.csv:')
print('============')
ratings_df = pd.read_csv(ratings_path)
print(ratings_df[["userId", "movieId"]].head())

movies.csv:
   movieId                                             genres
0        1  ['Adventure', 'Animation', 'Children', 'Comedy...
1        2               ['Adventure', 'Children', 'Fantasy']
2        3                              ['Comedy', 'Romance']
3        4                     ['Comedy', 'Drama', 'Romance']
4        5                                         ['Comedy']

ratings.csv:
   userId  movieId
0       1        1
1       1        3
2       1        6
3       1       47
4       1       50


In [14]:
movies.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,movieId,title,genres,wikipedia_page_name,movie_plot,token_ids,plot_ids,mean_tokens
0,0,0,0,1,Toy Story (1995),"['Adventure', 'Animation', 'Children', 'Comedy...",Toy Story,"A group of sentient toys, who pretend to be li...","[1037, 2177, 1997, 2741, 11638, 10899, 1010, 2...",[ 1037. 2177. 1997. 2741. 11638. 10899. 10...,4984.423077
1,1,1,1,2,Jumanji (1995),"['Adventure', 'Children', 'Fantasy']",Jumanji,"In 1969, Alan Parrish lives in Brantford, New ...","[1999, 3440, 1010, 5070, 11968, 18774, 3268, 1...",[ 1999. 3440. 1010. 5070. 11968. 18774. 32...,4644.15875
2,2,2,2,3,Grumpier Old Men (1995),"['Comedy', 'Romance']",Grumpier Old Men,The feud between Max and John has cooled and t...,"[1996, 13552, 2090, 4098, 1998, 2198, 2038, 12...",[ 1996. 13552. 2090. 4098. 1998. 2198. 20...,4044.667638
3,3,3,3,4,Waiting to Exhale (1995),"['Comedy', 'Drama', 'Romance']",Waiting to Exhale,"Four friends (Savannah, Robin, Bernadine, and ...","[2176, 2814, 1006, 10891, 1010, 5863, 1010, 16...",[ 2176. 2814. 1006. 10891. 1010. 5863. 10...,4013.604545
4,4,4,4,5,Father of the Bride Part II (1995),['Comedy'],Father of the Bride Part II,"Four years after the events of the first film,...","[2176, 2086, 2044, 1996, 2824, 1997, 1996, 203...",[ 2176. 2086. 2044. 1996. 2824. 1997. 19...,4053.462801


In [15]:
ratings_df.head()

Unnamed: 0.1,Unnamed: 0,userId,movieId,rating,timestamp,month
0,0,1,1,4.0,964982703,7
1,1,1,3,4.0,964981247,7
2,2,1,6,4.0,964982224,7
3,3,1,47,5.0,964983815,7
4,4,1,50,5.0,964982931,7


In [16]:
movies2 = pd.read_csv('./ml-latest-small/movies.csv')
movies_df = pd.merge(movies, movies2, on='movieId', how='left')
movies_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,movieId,title_x,genres_x,wikipedia_page_name,movie_plot,token_ids,plot_ids,mean_tokens,title_y,genres_y
0,0,0,0,1,Toy Story (1995),"['Adventure', 'Animation', 'Children', 'Comedy...",Toy Story,"A group of sentient toys, who pretend to be li...","[1037, 2177, 1997, 2741, 11638, 10899, 1010, 2...",[ 1037. 2177. 1997. 2741. 11638. 10899. 10...,4984.423077,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,1,1,2,Jumanji (1995),"['Adventure', 'Children', 'Fantasy']",Jumanji,"In 1969, Alan Parrish lives in Brantford, New ...","[1999, 3440, 1010, 5070, 11968, 18774, 3268, 1...",[ 1999. 3440. 1010. 5070. 11968. 18774. 32...,4644.15875,Jumanji (1995),Adventure|Children|Fantasy
2,2,2,2,3,Grumpier Old Men (1995),"['Comedy', 'Romance']",Grumpier Old Men,The feud between Max and John has cooled and t...,"[1996, 13552, 2090, 4098, 1998, 2198, 2038, 12...",[ 1996. 13552. 2090. 4098. 1998. 2198. 20...,4044.667638,Grumpier Old Men (1995),Comedy|Romance
3,3,3,3,4,Waiting to Exhale (1995),"['Comedy', 'Drama', 'Romance']",Waiting to Exhale,"Four friends (Savannah, Robin, Bernadine, and ...","[2176, 2814, 1006, 10891, 1010, 5863, 1010, 16...",[ 2176. 2814. 1006. 10891. 1010. 5863. 10...,4013.604545,Waiting to Exhale (1995),Comedy|Drama|Romance
4,4,4,4,5,Father of the Bride Part II (1995),['Comedy'],Father of the Bride Part II,"Four years after the events of the first film,...","[2176, 2086, 2044, 1996, 2824, 1997, 1996, 203...",[ 2176. 2086. 2044. 1996. 2824. 1997. 19...,4053.462801,Father of the Bride Part II (1995),Comedy


In [17]:
movies_df = movies_df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1', 'title_x', 'genres_x'])
movies_df.rename(columns={'title_y':'title', 'genres_y':'genres' },inplace=True)
movies_df.head()

Unnamed: 0,Unnamed: 0.1.1,movieId,wikipedia_page_name,movie_plot,token_ids,plot_ids,mean_tokens,title,genres
0,0,1,Toy Story,"A group of sentient toys, who pretend to be li...","[1037, 2177, 1997, 2741, 11638, 10899, 1010, 2...",[ 1037. 2177. 1997. 2741. 11638. 10899. 10...,4984.423077,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,2,Jumanji,"In 1969, Alan Parrish lives in Brantford, New ...","[1999, 3440, 1010, 5070, 11968, 18774, 3268, 1...",[ 1999. 3440. 1010. 5070. 11968. 18774. 32...,4644.15875,Jumanji (1995),Adventure|Children|Fantasy
2,2,3,Grumpier Old Men,The feud between Max and John has cooled and t...,"[1996, 13552, 2090, 4098, 1998, 2198, 2038, 12...",[ 1996. 13552. 2090. 4098. 1998. 2198. 20...,4044.667638,Grumpier Old Men (1995),Comedy|Romance
3,3,4,Waiting to Exhale,"Four friends (Savannah, Robin, Bernadine, and ...","[2176, 2814, 1006, 10891, 1010, 5863, 1010, 16...",[ 2176. 2814. 1006. 10891. 1010. 5863. 10...,4013.604545,Waiting to Exhale (1995),Comedy|Drama|Romance
4,4,5,Father of the Bride Part II,"Four years after the events of the first film,...","[2176, 2086, 2044, 1996, 2824, 1997, 1996, 203...",[ 2176. 2086. 2044. 1996. 2824. 1997. 19...,4053.462801,Father of the Bride Part II (1995),Comedy


In [18]:
print(f'There are {movies_df["movie_plot"].isna().sum()} NaN movie plots')

There are 0 NaN movie plots


In [19]:
movies_plots = movies_df.index[movies_df['movie_plot'] == ''].tolist()

In [20]:
len(movies_df)

8168

In [21]:
movies_df.head()

Unnamed: 0,Unnamed: 0.1.1,movieId,wikipedia_page_name,movie_plot,token_ids,plot_ids,mean_tokens,title,genres
0,0,1,Toy Story,"A group of sentient toys, who pretend to be li...","[1037, 2177, 1997, 2741, 11638, 10899, 1010, 2...",[ 1037. 2177. 1997. 2741. 11638. 10899. 10...,4984.423077,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,2,Jumanji,"In 1969, Alan Parrish lives in Brantford, New ...","[1999, 3440, 1010, 5070, 11968, 18774, 3268, 1...",[ 1999. 3440. 1010. 5070. 11968. 18774. 32...,4644.15875,Jumanji (1995),Adventure|Children|Fantasy
2,2,3,Grumpier Old Men,The feud between Max and John has cooled and t...,"[1996, 13552, 2090, 4098, 1998, 2198, 2038, 12...",[ 1996. 13552. 2090. 4098. 1998. 2198. 20...,4044.667638,Grumpier Old Men (1995),Comedy|Romance
3,3,4,Waiting to Exhale,"Four friends (Savannah, Robin, Bernadine, and ...","[2176, 2814, 1006, 10891, 1010, 5863, 1010, 16...",[ 2176. 2814. 1006. 10891. 1010. 5863. 10...,4013.604545,Waiting to Exhale (1995),Comedy|Drama|Romance
4,4,5,Father of the Bride Part II,"Four years after the events of the first film,...","[2176, 2086, 2044, 1996, 2824, 1997, 1996, 203...",[ 2176. 2086. 2044. 1996. 2824. 1997. 19...,4053.462801,Father of the Bride Part II (1995),Comedy


#### ÖNEMLİ::: Wikipedia'da plot'ı olmayan filmleri çıkartmıştık. Bunlar ratings'de duruyor. Ratings'den bunları da temizlememiz gerekiyor

Ya da # ratings_df = pd.read_csv('./ml-latest-small/ratings_cleanest.csv')

In [22]:
movies_df.reset_index()

Unnamed: 0,index,Unnamed: 0.1.1,movieId,wikipedia_page_name,movie_plot,token_ids,plot_ids,mean_tokens,title,genres
0,0,0,1,Toy Story,"A group of sentient toys, who pretend to be li...","[1037, 2177, 1997, 2741, 11638, 10899, 1010, 2...",[ 1037. 2177. 1997. 2741. 11638. 10899. 10...,4984.423077,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,1,2,Jumanji,"In 1969, Alan Parrish lives in Brantford, New ...","[1999, 3440, 1010, 5070, 11968, 18774, 3268, 1...",[ 1999. 3440. 1010. 5070. 11968. 18774. 32...,4644.158750,Jumanji (1995),Adventure|Children|Fantasy
2,2,2,3,Grumpier Old Men,The feud between Max and John has cooled and t...,"[1996, 13552, 2090, 4098, 1998, 2198, 2038, 12...",[ 1996. 13552. 2090. 4098. 1998. 2198. 20...,4044.667638,Grumpier Old Men (1995),Comedy|Romance
3,3,3,4,Waiting to Exhale,"Four friends (Savannah, Robin, Bernadine, and ...","[2176, 2814, 1006, 10891, 1010, 5863, 1010, 16...",[ 2176. 2814. 1006. 10891. 1010. 5863. 10...,4013.604545,Waiting to Exhale (1995),Comedy|Drama|Romance
4,4,4,5,Father of the Bride Part II,"Four years after the events of the first film,...","[2176, 2086, 2044, 1996, 2824, 1997, 1996, 203...",[ 2176. 2086. 2044. 1996. 2824. 1997. 19...,4053.462801,Father of the Bride Part II (1995),Comedy
...,...,...,...,...,...,...,...,...,...,...
8163,8163,9734,193571,Silver Spoon (manga),After failing to pass the entrance examination...,"[2044, 7989, 2000, 3413, 1996, 4211, 14912, 20...",[ 2044. 7989. 2000. 3413. 1996. 4211. 149...,3927.593220,Silver Spoon (2014),Comedy|Drama
8164,8164,9735,193573,Love Live! The School Idol Movie,The movie begins with a scene from the second ...,"[1996, 3185, 4269, 2007, 1037, 3496, 2013, 199...",[1996. 3185. 4269. ... 2345. 2836. 1012.],4155.394251,Love Live! The School Idol Movie (2015),Animation
8165,8165,9737,193581,Black Butler: Book of the Atlantic,The story is based on the sinking of the Titan...,"[1996, 2466, 2003, 2241, 2006, 1996, 10186, 19...",[ 1996. 2466. 2003. 2241. 2006. 1996. 101...,5494.832941,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
8166,8166,9738,193583,No Game No Life: Zero,"In the present, Izuna and Tet play a game of c...","[1999, 1996, 2556, 1010, 1045, 9759, 2532, 199...",[ 1999. 1996. 2556. 1010. 1045. 9759. 25...,4360.566018,No Game No Life: Zero (2017),Animation|Comedy|Fantasy


In [23]:
valid_movie_ids = movies_df['movieId'].tolist()
valid_ratings_movie_ids = ratings_df['movieId'].tolist()
movies_df = movies_df[movies_df['movieId'].isin(valid_ratings_movie_ids)]
# ratings_df'deki movieId değerlerini filtreleyelim

len(movies_df)

8152

In [24]:
ratings_df = ratings_df[ratings_df['movieId'].isin(valid_movie_ids)]
ratings_df.shape

(93589, 6)

In [25]:
len(ratings_df['movieId'].unique())

8152

In [26]:
def normalize_timestamps(df, movie_col='movieId', timestamp_col='timestamp'):
    """
    Normalize the timestamp values within the dataframe for each unique movieId.
    
    Parameters:
    df (pd.DataFrame): DataFrame containing the data
    movie_col (str): Name of the column containing movie IDs
    timestamp_col (str): Name of the column containing timestamp values
    
    Returns:
    pd.DataFrame: DataFrame with an additional column 'normalized_timestamp'
    """
    min_timestamp = df[timestamp_col].min()
    max_timestamp = df[timestamp_col].max()
    
    # Compute normalized timestamp for each row
    df['normalized_timestamp'] = (df[timestamp_col] - min_timestamp) / (max_timestamp - min_timestamp)
    
    return df

In [27]:
avg_timestamps = normalize_timestamps(ratings_df)

In [28]:
# movieId'ye göre gruplama ve timestamp'lerin ortalamasını hesaplama
grouped = ratings_df.groupby('movieId')['normalized_timestamp'].mean()

# SeriesGroupBy nesnesini DataFrame'e dönüştürme
normalized_timestamps_df = grouped.reset_index()
len(normalized_timestamps_df)

8152

In [29]:
normalized_timestamps_df.tail()

Unnamed: 0,movieId,normalized_timestamp
8147,193571,0.999073
8148,193573,0.999074
8149,193581,0.999087
8150,193583,0.999088
8151,193587,0.999088


In [30]:
ratings_df.tail()

Unnamed: 0.1,Unnamed: 0,userId,movieId,rating,timestamp,month,normalized_timestamp
100831,100831,610,166534,4.0,1493848402,5,0.938125
100832,100832,610,168248,5.0,1493850091,5,0.938127
100833,100833,610,168250,5.0,1494273047,5,0.938723
100834,100834,610,168252,5.0,1493846352,5,0.938122
100835,100835,610,170875,3.0,1493846415,5,0.938122


In [31]:
movies_df.tail()

Unnamed: 0,Unnamed: 0.1.1,movieId,wikipedia_page_name,movie_plot,token_ids,plot_ids,mean_tokens,title,genres
8163,9734,193571,Silver Spoon (manga),After failing to pass the entrance examination...,"[2044, 7989, 2000, 3413, 1996, 4211, 14912, 20...",[ 2044. 7989. 2000. 3413. 1996. 4211. 149...,3927.59322,Silver Spoon (2014),Comedy|Drama
8164,9735,193573,Love Live! The School Idol Movie,The movie begins with a scene from the second ...,"[1996, 3185, 4269, 2007, 1037, 3496, 2013, 199...",[1996. 3185. 4269. ... 2345. 2836. 1012.],4155.394251,Love Live! The School Idol Movie (2015),Animation
8165,9737,193581,Black Butler: Book of the Atlantic,The story is based on the sinking of the Titan...,"[1996, 2466, 2003, 2241, 2006, 1996, 10186, 19...",[ 1996. 2466. 2003. 2241. 2006. 1996. 101...,5494.832941,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
8166,9738,193583,No Game No Life: Zero,"In the present, Izuna and Tet play a game of c...","[1999, 1996, 2556, 1010, 1045, 9759, 2532, 199...",[ 1999. 1996. 2556. 1010. 1045. 9759. 25...,4360.566018,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
8167,9740,193587,Bungo Stray Dogs: Dead Apple,"Two years before the Dark Era, the most violen...","[2048, 2086, 2077, 1996, 2601, 3690, 1010, 199...",[ 2048. 2086. 2077. 1996. 2601. 3690. 10...,5319.952886,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [32]:
# movies = new_movies
# # movies = movies.drop(columns=["Unnamed: 0"])
movies_df.set_index("movieId")

# movies["movies_id"] = movies["movie_id"] +1
# movies.to_csv(movies_path)

Unnamed: 0_level_0,Unnamed: 0.1.1,wikipedia_page_name,movie_plot,token_ids,plot_ids,mean_tokens,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,Toy Story,"A group of sentient toys, who pretend to be li...","[1037, 2177, 1997, 2741, 11638, 10899, 1010, 2...",[ 1037. 2177. 1997. 2741. 11638. 10899. 10...,4984.423077,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,1,Jumanji,"In 1969, Alan Parrish lives in Brantford, New ...","[1999, 3440, 1010, 5070, 11968, 18774, 3268, 1...",[ 1999. 3440. 1010. 5070. 11968. 18774. 32...,4644.158750,Jumanji (1995),Adventure|Children|Fantasy
3,2,Grumpier Old Men,The feud between Max and John has cooled and t...,"[1996, 13552, 2090, 4098, 1998, 2198, 2038, 12...",[ 1996. 13552. 2090. 4098. 1998. 2198. 20...,4044.667638,Grumpier Old Men (1995),Comedy|Romance
4,3,Waiting to Exhale,"Four friends (Savannah, Robin, Bernadine, and ...","[2176, 2814, 1006, 10891, 1010, 5863, 1010, 16...",[ 2176. 2814. 1006. 10891. 1010. 5863. 10...,4013.604545,Waiting to Exhale (1995),Comedy|Drama|Romance
5,4,Father of the Bride Part II,"Four years after the events of the first film,...","[2176, 2086, 2044, 1996, 2824, 1997, 1996, 203...",[ 2176. 2086. 2044. 1996. 2824. 1997. 19...,4053.462801,Father of the Bride Part II (1995),Comedy
...,...,...,...,...,...,...,...,...
193571,9734,Silver Spoon (manga),After failing to pass the entrance examination...,"[2044, 7989, 2000, 3413, 1996, 4211, 14912, 20...",[ 2044. 7989. 2000. 3413. 1996. 4211. 149...,3927.593220,Silver Spoon (2014),Comedy|Drama
193573,9735,Love Live! The School Idol Movie,The movie begins with a scene from the second ...,"[1996, 3185, 4269, 2007, 1037, 3496, 2013, 199...",[1996. 3185. 4269. ... 2345. 2836. 1012.],4155.394251,Love Live! The School Idol Movie (2015),Animation
193581,9737,Black Butler: Book of the Atlantic,The story is based on the sinking of the Titan...,"[1996, 2466, 2003, 2241, 2006, 1996, 10186, 19...",[ 1996. 2466. 2003. 2241. 2006. 1996. 101...,5494.832941,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
193583,9738,No Game No Life: Zero,"In the present, Izuna and Tet play a game of c...","[1999, 1996, 2556, 1010, 1045, 9759, 2532, 199...",[ 1999. 1996. 2556. 1010. 1045. 9759. 25...,4360.566018,No Game No Life: Zero (2017),Animation|Comedy|Fantasy


In [33]:
print('movies.csv:')
print('===========')

print(movies_df[["movieId", "genres"]].head())
print()
print('ratings.csv:')
print('============')

print(ratings_df[["userId", "movieId"]].head())

movies.csv:
   movieId                                       genres
0        1  Adventure|Animation|Children|Comedy|Fantasy
1        2                   Adventure|Children|Fantasy
2        3                               Comedy|Romance
3        4                         Comedy|Drama|Romance
4        5                                       Comedy

ratings.csv:
   userId  movieId
0       1        1
1       1        3
2       1        6
3       1       47
4       1       50


In [34]:
# Split genres and convert into indicator variables:
genres = movies_df['genres'].str.get_dummies('|')
print(genres[["Action", "Adventure", "Drama", "Horror"]].head())

# Use genres as movie input features:
movie_feat_1 = torch.from_numpy(genres.values).to(torch.float)
# assert movie_feat_1.size() == (9742, 20)  # 20 genres in total.

   Action  Adventure  Drama  Horror
0       0          1      0       0
1       0          1      0       0
2       0          0      0       0
3       0          0      1       0
4       0          0      0       0


In [35]:
# def convert_strings_to_floats(input_array):
#     output_array = []
#     for element in input_array:
#         converted_float = float(element)
#         output_array.append(converted_float)
#     return output_array
# movies_df['token_ids'] = output_array

In [36]:
# token_ids = movies_df['token_ids'].str.strip("[]").str.split(", ").map(np.array)

movie_feat_2 = torch.from_numpy(movies_df['mean_tokens'].values).to(torch.float)
movie_feat_2 = normalize(movie_feat_2,p=1.0, dim = 0)
movie_feat_3 = torch.from_numpy(normalized_timestamps_df['normalized_timestamp'].values).to(torch.float)
# movie_feat_3 = normalize(movie_feat_2,p=1.0, dim = 0)
# movie_feat_3 = torch.from_numpy(movies_df['month'].values).to(torch.float)
# movie_feat_3 = normalize(movie_feat_3,p=1.0, dim = 0)
# movie_feat = torch.column_stack([movie_feat_1, movie_feat_2, movie_feat_3])
movie_feat = torch.column_stack([movie_feat_1, movie_feat_2, movie_feat_3])

In [37]:
movie_feat.shape


torch.Size([8152, 22])

In [38]:
# movie_feat = torch.column_stack([movie_feat_1, movie_feat_2, movie_feat_3])


In [39]:
movie_feat = torch.nan_to_num(movie_feat, nan=2.0, posinf=1.0)
movie_feat.isnan().any()

tensor(False)

In [40]:
unique_movie_id = ratings_df['movieId'].unique()
len(unique_movie_id)
len(movies_df)

8152

In [41]:
movies_df = movies_df.drop(columns=['Unnamed: 0.1.1'])
movies_df.head()

Unnamed: 0,movieId,wikipedia_page_name,movie_plot,token_ids,plot_ids,mean_tokens,title,genres
0,1,Toy Story,"A group of sentient toys, who pretend to be li...","[1037, 2177, 1997, 2741, 11638, 10899, 1010, 2...",[ 1037. 2177. 1997. 2741. 11638. 10899. 10...,4984.423077,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji,"In 1969, Alan Parrish lives in Brantford, New ...","[1999, 3440, 1010, 5070, 11968, 18774, 3268, 1...",[ 1999. 3440. 1010. 5070. 11968. 18774. 32...,4644.15875,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men,The feud between Max and John has cooled and t...,"[1996, 13552, 2090, 4098, 1998, 2198, 2038, 12...",[ 1996. 13552. 2090. 4098. 1998. 2198. 20...,4044.667638,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale,"Four friends (Savannah, Robin, Bernadine, and ...","[2176, 2814, 1006, 10891, 1010, 5863, 1010, 16...",[ 2176. 2814. 1006. 10891. 1010. 5863. 10...,4013.604545,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II,"Four years after the events of the first film,...","[2176, 2086, 2044, 1996, 2824, 1997, 1996, 203...",[ 2176. 2086. 2044. 1996. 2824. 1997. 19...,4053.462801,Father of the Bride Part II (1995),Comedy


In [42]:
ratings_df = ratings_df.drop(columns=['Unnamed: 0'])
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,month,normalized_timestamp
0,1,1,4.0,964982703,7,0.192858
1,1,3,4.0,964981247,7,0.192856
2,1,6,4.0,964982224,7,0.192857
3,1,47,5.0,964983815,7,0.192859
4,1,50,5.0,964982931,7,0.192858


In [43]:
import numpy as np

def get_positional_encoding(d, position):
    """
    Generate positional encoding for a given position.
    """
    pe = np.zeros(d)
    for i in range(d):
        if i % 2 == 0:
            pe[i] = np.sin(position / (10000 ** (i / d)))
        else:
            pe[i] = np.cos(position / (10000 ** (i / d)))
    return pe

# Let's assume 16-dimensional positional encoding
d = 16

# Apply positional encoding to movie and user ids
movies_df['positional_encoding'] = movies_df['movieId'].apply(lambda x: get_positional_encoding(d, x))
ratings_df['positional_encoding'] = ratings_df['userId'].apply(lambda x: get_positional_encoding(d, x))

# Convert these into tensor form
movie_pe = torch.tensor(np.vstack(movies_df['positional_encoding'].values), dtype=torch.float)
user_pe = torch.tensor(np.vstack(ratings_df['positional_encoding'].values), dtype=torch.float)

# Now we can include these in the final feature vector


In [44]:
import networkx as nx

# Öncelikle grafı oluşturalım
G = nx.Graph()

# Kenarları ekleyelim
edges = list(zip(ratings_df['userId'], ratings_df['movieId']))
G.add_edges_from(edges)

# Topolojik özellikleri hesaplayalım
degree_centrality = nx.degree_centrality(G)
closeness_centrality = nx.closeness_centrality(G)
betweenness_centrality = nx.betweenness_centrality(G)

# Bunları DataFrame'e ekleyelim
movies_df['degree_centrality'] = movies_df['movieId'].map(degree_centrality)
movies_df['closeness_centrality'] = movies_df['movieId'].map(closeness_centrality)
movies_df['betweenness_centrality'] = movies_df['movieId'].map(betweenness_centrality)

# Eksik değerleri sıfırla doldur
movies_df.fillna(0, inplace=True)

# Bu özellikleri tensor olarak ekleyelim
topological_features = torch.tensor(movies_df[['degree_centrality', 'closeness_centrality', 'betweenness_centrality']].values, dtype=torch.float)




In [45]:
# # Rating değerlerini ikili hale getirme
mean_rating = ratings_df['rating'].mean()
ratings_df['binary_rating'] = (ratings_df['rating'] >= mean_rating).astype(int)

## Burası yeni _____ exponential weight decay ile yakın zamanlı değerlendirmelerin ağırlıkları fazla olacak
ratings_df['timestamp'] = (ratings_df['timestamp'] - ratings_df['timestamp'].min()) / \
                        (ratings_df['timestamp'].max() - ratings_df['timestamp'].min())

# Zaman faktörünü exponential decay ile ağırlık olarak kullanacağız
ratings_df['weight'] = np.exp(-ratings_df['timestamp'])


In [46]:
# normalizasyon gerekli mi kontrol ediyoruz, sonuç gerekli değil
print(movies_df['degree_centrality'].min(), movies_df['degree_centrality'].max())
print(movies_df['closeness_centrality'].min(), movies_df['closeness_centrality'].max())
print(movies_df['betweenness_centrality'].min(), movies_df['betweenness_centrality'].max())

0.00012038040207054291 0.29782111472252315
0.2480812304016724 0.5383668178872326
0.0 0.13506609853040205


In [47]:
# Nihai film özelliklerine ekleyelim
movie_features = torch.column_stack([movie_feat, movie_pe, topological_features])

print(movie_features.shape)

torch.Size([8152, 41])


In [48]:
# Load the entire ratings data frame into memory:
# ratings_df = pd.read_csv(ratings_path)

# Create a mapping from unique user indices to range [0, num_user_nodes):
# Create a mapping from unique user indices to range [0, num_user_nodes):
unique_user_id = ratings_df['userId'].unique()
unique_user_id = pd.DataFrame(data={
    'userId': unique_user_id,
    'mappedID': pd.RangeIndex(len(unique_user_id)),
})
print("Mapping of user IDs to consecutive values:")
print("==========================================")
print(unique_user_id.head())
print()
# Create a mapping from unique movie indices to range [0, num_movie_nodes):
unique_movie_id = ratings_df['movieId'].unique()
unique_movie_id = pd.DataFrame(data={
    'movieId': unique_movie_id,
    'mappedID': pd.RangeIndex(len(movies_df)),
})
print("Mapping of movie IDs to consecutive values:")
print("===========================================")
print(unique_movie_id.head())

# Perform merge to obtain the edges from users and movies:
ratings_user_id = pd.merge(ratings_df['userId'], unique_user_id,
                            left_on='userId', right_on='userId', how='left')
ratings_user_id = torch.from_numpy(ratings_user_id['mappedID'].values)
ratings_movie_id = pd.merge(ratings_df['movieId'], unique_movie_id,
                            left_on='movieId', right_on='movieId', how='left')
ratings_movie_id = torch.from_numpy(ratings_movie_id['mappedID'].values)

# With this, we are ready to construct our `edge_index` in COO format
# following PyG semantics:
edge_index_user_to_movie = torch.stack([ratings_user_id, ratings_movie_id], dim=0)
# assert edge_index_user_to_movie.size() == (2, 100836)

print()
print("Final edge indices pointing from users to movies:")
print("=================================================")
print(edge_index_user_to_movie)



Mapping of user IDs to consecutive values:
   userId  mappedID
0       1         0
1       2         1
2       3         2
3       4         3
4       5         4

Mapping of movie IDs to consecutive values:
   movieId  mappedID
0        1         0
1        3         1
2        6         2
3       47         3
4       50         4

Final edge indices pointing from users to movies:
tensor([[   0,    0,    0,  ...,  609,  609,  609],
        [   0,    1,    2,  ..., 2827, 1290, 2605]])


In [49]:
torch.isnan(edge_index_user_to_movie).any()

tensor(False)

In [50]:
len(unique_movie_id) 
# len(movies_df)

8152

In [51]:
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T

data = HeteroData()

# Save node indices:
data["user"].node_id = torch.arange(len(unique_user_id))
data["movie"].node_id = torch.arange(len(unique_movie_id))
weight = torch.tensor(ratings_df['weight'].values, dtype=torch.float)
# Add the node features and edge indices:
data["movie"].x = movie_features
data["user", "rates", "movie"].edge_index = edge_index_user_to_movie.long()
# data["user", "rates", "movie"].edge_attr = ratings_month.long()
data['user', 'rates', 'movie'].edge_attr = weight

# We also need to make sure to add the reverse edges from movies to users
# in order to let a GNN be able to pass messages in both directions.
# We can leverage the `T.ToUndirected()` transform for this from PyG:
data = T.ToUndirected()(data)

print(data)

HeteroData(
  user={ node_id=[610] },
  movie={
    node_id=[8152],
    x=[8152, 41],
  },
  (user, rates, movie)={
    edge_index=[2, 93589],
    edge_attr=[93589],
  },
  (movie, rev_rates, user)={
    edge_index=[2, 93589],
    edge_attr=[93589],
  }
)


In [52]:
# For this, we first split the set of edges into
# training (80%), validation (10%), and testing edges (10%).
# Across the training edges, we use 70% of edges for message passing,
# and 30% of edges for supervision.
# We further want to generate fixed negative edges for evaluation with a ratio of 2:1.
# Negative edges during training will be generated on-the-fly.
# We can leverage the `RandomLinkSplit()` transform for this from PyG:
transform = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    disjoint_train_ratio=0.3,
    neg_sampling_ratio=2.0,
    add_negative_train_samples=False,
    edge_types=("user", "rates", "movie"),
    rev_edge_types=("movie", "rev_rates", "user"),
)

train_data, val_data, test_data = transform(data)
print("Training data:")
print("==============")
print(train_data)
print()
print("Validation data:")
print("================")
print(val_data)

# assert train_data["user", "rates", "movie"].num_edges == 56469
# assert train_data["user", "rates", "movie"].edge_label_index.size(1) == 24201
# assert train_data["movie", "rev_rates", "user"].num_edges == 56469
# # No negative edges added:
# assert train_data["user", "rates", "movie"].edge_label.min() == 1
# assert train_data["user", "rates", "movie"].edge_label.max() == 1

# assert val_data["user", "rates", "movie"].num_edges == 80670
# assert val_data["user", "rates", "movie"].edge_label_index.size(1) == 30249
# assert val_data["movie", "rev_rates", "user"].num_edges == 80670
# # Negative edges with ratio 2:1:
# assert val_data["user", "rates", "movie"].edge_label.long().bincount().tolist() == [20166, 10083]

Training data:
HeteroData(
  user={ node_id=[610] },
  movie={
    node_id=[8152],
    x=[8152, 41],
  },
  (user, rates, movie)={
    edge_index=[2, 52412],
    edge_attr=[52412],
    edge_label=[22461],
    edge_label_index=[2, 22461],
  },
  (movie, rev_rates, user)={
    edge_index=[2, 52412],
    edge_attr=[52412],
  }
)

Validation data:
HeteroData(
  user={ node_id=[610] },
  movie={
    node_id=[8152],
    x=[8152, 41],
  },
  (user, rates, movie)={
    edge_index=[2, 74873],
    edge_attr=[74873],
    edge_label=[28074],
    edge_label_index=[2, 28074],
  },
  (movie, rev_rates, user)={
    edge_index=[2, 74873],
    edge_attr=[74873],
  }
)


In [53]:
from torch_geometric.data import Data
edge_label_index = train_data["user", "rates", "movie"].edge_label_index

In [54]:
edge_label_index.shape

torch.Size([2, 22461])

### Mini-batch Loaders


In [55]:
# In the first hop, we sample at most 20 neighbors.
# In the second hop, we sample at most 10 neighbors.
# In addition, during training, we want to sample negative edges on-the-fly with
# a ratio of 2:1.
# We can make use of the `loader.LinkNeighborLoader` from PyG:
from torch_geometric.loader import LinkNeighborLoader, NeighborLoader, DataLoader

# Define seed edges:
edge_label_index = train_data["user", "rates", "movie"].edge_label_index

edge_label = train_data["user", "rates", "movie"].edge_label

if edge_label_index.dtype != torch.long:
    edge_label_index = edge_label_index.long()

if edge_label.dtype != torch.long:
    edge_label = edge_label.long()
    
train_loader = LinkNeighborLoader(
    data=train_data,
    num_neighbors=[22, 10],
    neg_sampling_ratio=2.0,
    edge_label_index=(("user", "rates", "movie"), edge_label_index),
    edge_label=edge_label,
    batch_size=64,
    shuffle=True,
)

# Inspect a sample:



# assert sampled_data["user", "rates", "movie"].edge_label_index.size(1) == 3 * 128
# assert sampled_data["user", "rates", "movie"].edge_label.min() == 0
# assert sampled_data["user", "rates", "movie"].edge_label.max() == 1

In [56]:
sampled_data = next(iter(train_loader))
print("Sampled mini-batch:")
print("===================")
print(sampled_data)

Sampled mini-batch:
HeteroData(
  user={
    node_id=[599],
    n_id=[599],
    num_sampled_nodes=[3],
  },
  movie={
    node_id=[2252],
    x=[2252, 41],
    n_id=[2252],
    num_sampled_nodes=[3],
  },
  (user, rates, movie)={
    edge_index=[2, 12886],
    edge_attr=[12886],
    edge_label=[192],
    edge_label_index=[2, 192],
    e_id=[12886],
    num_sampled_edges=[2],
    input_id=[64],
  },
  (movie, rev_rates, user)={
    edge_index=[2, 5970],
    edge_attr=[5970],
    e_id=[5970],
    num_sampled_edges=[2],
  }
)


### Heterogeneous Link-level GNN


In [57]:
from torch_geometric.nn import SAGEConv, to_hetero
import torch.nn.functional as F
import coord_transforms
from torch_geometric_temporal.nn.recurrent import DCRNN
class GNN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
         
        self.conv1 = SAGEConv(hidden_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)
        
         

    def forward(self, x: Tensor, edge_index: Tensor) -> Tensor:
        x = F.relu(self.conv1(x, edge_index))
        x = self.conv2(x, edge_index)
        return x

# Our final classifier applies the dot-product between source and destination
# node embeddings to derive edge-level predictions:
class Classifier(torch.nn.Module):
    def forward(self, x_user: Tensor, x_movie: Tensor, edge_label_index: Tensor) -> Tensor:
        # Convert node embeddings to edge-level representations:
        edge_feat_user = x_user[edge_label_index[0]]
        edge_feat_movie = x_movie[edge_label_index[1]]

        # Apply dot-product to get a prediction per supervision edge:
        return (edge_feat_user * edge_feat_movie).sum(dim=-1)


class Model(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        # Since the dataset does not come with rich features, we also learn two
        # embedding matrices for users and movies:
        self.movie_lin = torch.nn.Linear(41, hidden_channels)
        self.user_emb = torch.nn.Embedding(data["user"].num_nodes, hidden_channels)
        self.movie_emb = torch.nn.Embedding(data["movie"].num_nodes, hidden_channels)

        # Instantiate homogeneous GNN:
        self.gnn = GNN(hidden_channels)

        # Convert GNN model into a heterogeneous variant:
        self.gnn = to_hetero(self.gnn, metadata=data.metadata())

        self.classifier = Classifier()

    def forward(self, data: HeteroData) -> Tensor:
        x_dict = {
          "user": self.user_emb(data["user"].node_id),
          "movie": self.movie_lin(data["movie"].x) + self.movie_emb(data["movie"].node_id),
        }

        # `x_dict` holds feature matrices of all node types
        # `edge_index_dict` holds all edge indices of all edge types
        x_dict = self.gnn(x_dict, data.edge_index_dict)
        pred = self.classifier(
            x_dict["user"],
            x_dict["movie"],
            data["user", "rates", "movie"].edge_label_index,
        )

        return pred


model = Model(hidden_channels=128)

print(model)

Model(
  (movie_lin): Linear(in_features=41, out_features=128, bias=True)
  (user_emb): Embedding(610, 128)
  (movie_emb): Embedding(8152, 128)
  (gnn): GraphModule(
    (conv1): ModuleDict(
      (user__rates__movie): SAGEConv(128, 128, aggr=mean)
      (movie__rev_rates__user): SAGEConv(128, 128, aggr=mean)
    )
    (conv2): ModuleDict(
      (user__rates__movie): SAGEConv(128, 128, aggr=mean)
      (movie__rev_rates__user): SAGEConv(128, 128, aggr=mean)
    )
  )
  (classifier): Classifier()
)


In [58]:
model.gnn.conv1.user__rates__movie.load_state_dict(encoder_state_dict['conv1'])
model.gnn.conv2.user__rates__movie.load_state_dict(encoder_state_dict['conv2'])


<All keys matched successfully>

### Training GNN


In [59]:
import tqdm
import torch.nn.functional as F

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: '{device}'")

model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(1, 50): # 50, 100, 200 dene
    total_loss = total_examples = 0
    for sampled_data in tqdm.tqdm(train_loader):
        optimizer.zero_grad()

        sampled_data.to(device)
        pred = model(sampled_data)

        ground_truth = sampled_data["user", "rates", "movie"].edge_label
        ground_truth = ground_truth.float()
        loss = F.binary_cross_entropy_with_logits(pred, ground_truth)

        loss.backward()
        optimizer.step()
        total_loss += float(loss) * pred.numel()
        total_examples += pred.numel()
    print(f"Epoch: {epoch:03d}, Loss: {total_loss / total_examples:.4f}")

Device: 'cuda'


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:13<00:00, 25.41it/s]


Epoch: 001, Loss: 0.4680


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:12<00:00, 27.23it/s]


Epoch: 002, Loss: 0.3739


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:12<00:00, 27.03it/s]


Epoch: 003, Loss: 0.3569


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:13<00:00, 26.97it/s]


Epoch: 004, Loss: 0.3429


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:13<00:00, 26.86it/s]


Epoch: 005, Loss: 0.3360


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:13<00:00, 26.67it/s]


Epoch: 006, Loss: 0.3333


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:13<00:00, 26.09it/s]


Epoch: 007, Loss: 0.3327


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:13<00:00, 26.84it/s]


Epoch: 008, Loss: 0.3302


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:13<00:00, 26.04it/s]


Epoch: 009, Loss: 0.3249


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:13<00:00, 26.80it/s]


Epoch: 010, Loss: 0.3236


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:13<00:00, 26.32it/s]


Epoch: 011, Loss: 0.3208


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:13<00:00, 26.74it/s]


Epoch: 012, Loss: 0.3205


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:13<00:00, 26.71it/s]


Epoch: 013, Loss: 0.3177


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:13<00:00, 26.77it/s]


Epoch: 014, Loss: 0.3231


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:13<00:00, 25.69it/s]


Epoch: 015, Loss: 0.3173


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:13<00:00, 26.05it/s]


Epoch: 016, Loss: 0.3128


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:13<00:00, 26.14it/s]


Epoch: 017, Loss: 0.3157


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:14<00:00, 24.22it/s]


Epoch: 018, Loss: 0.3137


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:14<00:00, 24.90it/s]


Epoch: 019, Loss: 0.3144


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:13<00:00, 25.26it/s]


Epoch: 020, Loss: 0.3107


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:14<00:00, 24.45it/s]


Epoch: 021, Loss: 0.3100


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:14<00:00, 24.11it/s]


Epoch: 022, Loss: 0.3095


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:14<00:00, 23.63it/s]


Epoch: 023, Loss: 0.3036


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:13<00:00, 25.87it/s]


Epoch: 024, Loss: 0.3093


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:13<00:00, 25.75it/s]


Epoch: 025, Loss: 0.3077


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:13<00:00, 25.84it/s]


Epoch: 026, Loss: 0.3076


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:13<00:00, 25.81it/s]


Epoch: 027, Loss: 0.3096


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:13<00:00, 25.99it/s]


Epoch: 028, Loss: 0.3040


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:13<00:00, 26.13it/s]


Epoch: 029, Loss: 0.3066


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:13<00:00, 25.98it/s]


Epoch: 030, Loss: 0.3073


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:13<00:00, 25.88it/s]


Epoch: 031, Loss: 0.3047


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:13<00:00, 26.13it/s]


Epoch: 032, Loss: 0.3047


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:13<00:00, 26.04it/s]


Epoch: 033, Loss: 0.3054


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:13<00:00, 26.14it/s]


Epoch: 034, Loss: 0.3017


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:13<00:00, 25.72it/s]


Epoch: 035, Loss: 0.3016


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:13<00:00, 25.49it/s]


Epoch: 036, Loss: 0.2965


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:14<00:00, 23.83it/s]


Epoch: 037, Loss: 0.3057


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:15<00:00, 23.37it/s]


Epoch: 038, Loss: 0.3000


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:14<00:00, 24.99it/s]


Epoch: 039, Loss: 0.3010


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:14<00:00, 24.74it/s]


Epoch: 040, Loss: 0.2999


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:14<00:00, 25.04it/s]


Epoch: 041, Loss: 0.3010


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:15<00:00, 21.99it/s]


Epoch: 042, Loss: 0.3011


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:14<00:00, 24.73it/s]


Epoch: 043, Loss: 0.2995


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:13<00:00, 25.09it/s]


Epoch: 044, Loss: 0.2994


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:17<00:00, 20.45it/s]


Epoch: 045, Loss: 0.2984


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:17<00:00, 20.10it/s]


Epoch: 046, Loss: 0.2994


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:16<00:00, 21.71it/s]


Epoch: 047, Loss: 0.2965


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:15<00:00, 23.16it/s]


Epoch: 048, Loss: 0.2942


100%|█████████████████████████████████████████████████████████████████████████████████| 351/351 [00:16<00:00, 20.98it/s]

Epoch: 049, Loss: 0.2946





### Evaluating a Heterogeneous Link-level GNN


In [60]:
# Define the validation seed edges:
edge_label_index = val_data["user", "rates", "movie"].edge_label_index
edge_label = val_data["user", "rates", "movie"].edge_label

val_loader = LinkNeighborLoader(
    data=val_data,
    num_neighbors=[22, 10],
    edge_label_index=(("user", "rates", "movie"), edge_label_index),
    edge_label=edge_label,
    batch_size=3 * 128,
    shuffle=False,
)

sampled_data = next(iter(val_loader))

print("Sampled mini-batch:")
print("===================")
print(sampled_data)

# assert sampled_data["user", "rates", "movie"].edge_label_index.size(1) == 3 * 128
# assert sampled_data["user", "rates", "movie"].edge_label.min() >= 0
# assert sampled_data["user", "rates", "movie"].edge_label.max() <= 1

Sampled mini-batch:
HeteroData(
  user={
    node_id=[607],
    n_id=[607],
    num_sampled_nodes=[3],
  },
  movie={
    node_id=[2656],
    x=[2656, 41],
    n_id=[2656],
    num_sampled_nodes=[3],
  },
  (user, rates, movie)={
    edge_index=[2, 19753],
    edge_attr=[19753],
    edge_label=[384],
    edge_label_index=[2, 384],
    e_id=[19753],
    num_sampled_edges=[2],
    input_id=[384],
  },
  (movie, rev_rates, user)={
    edge_index=[2, 8208],
    edge_attr=[8208],
    e_id=[8208],
    num_sampled_edges=[2],
  }
)


In [61]:
from sklearn.metrics import roc_auc_score

preds = []
ground_truths = []
for sampled_data in tqdm.tqdm(val_loader):
    with torch.no_grad():
        sampled_data.to(device)
        preds.append(model(sampled_data))
        ground_truths.append(sampled_data["user", "rates", "movie"].edge_label)

pred = torch.cat(preds, dim=0).cpu().numpy()
ground_truth = torch.cat(ground_truths, dim=0).cpu().numpy()
auc = roc_auc_score(ground_truth, pred)
print()
print(f"Validation AUC: {auc:.4f}")

100%|███████████████████████████████████████████████████████████████████████████████████| 74/74 [00:01<00:00, 44.90it/s]


Validation AUC: 0.9212



