In [1]:
import pickle
import argparse
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchtext.legacy import data
import dgl
import tqdm

import layers
import sampler as sampler_module
import evaluation
from model import PinSAGEModel

import optuna

In [2]:
from dataclasses import dataclass

@dataclass
class TrainArgs:
    """Bundle of run settings shared by the cells of this notebook.

    Only ``output_model_path`` is required; every other field has a default and
    can be overridden by keyword when the instance is created.
    """
    # Required. Not read by any cell visible in this notebook.
    output_model_path: str
    # The three random-walk settings are forwarded to sampler_module.NeighborSampler.
    random_walk_length: int = 2
    random_walk_restart_prob: float = 0.5
    num_random_walks: int = 10
    # Not read by the objective functions here: they pass an Optuna-suggested
    # 'num_neighbors' to the neighbor sampler instead.
    num_neighbors: int = 5
    # Not read by the objective functions here: they use the Optuna-suggested 'n_layers'.
    num_layers: int = 2
    # Not read by any cell visible in this notebook.
    num_heads: int = 2
    # Not read by the objective functions here: they use the Optuna-suggested 'hidden_dims'.
    hidden_dims: int = 16
    # Passed to ItemToItemBatchSampler, to the test DataLoader and to evaluation.evaluate_nn.
    batch_size: int = 64
    # Device string handed to torch.device(...) and to model.to(...).
    device: str = 'cpu'
    # Not consulted by the objective functions in this notebook (they train for 2 epochs).
    num_epochs: int = 1
    # Number of training iterations drawn from the training dataloader in each epoch.
    batches_per_epoch: int = 20000
    # Forwarded as num_workers to both DataLoader instances.
    num_workers: int = 0
    # Not read by the objective functions here: they use the Optuna-suggested learning rate.
    lr: float = 3e-5
    # k and n_latest_items are forwarded to evaluation.evaluate_nn; their exact
    # meaning is defined in the evaluation module, which is not shown here.
    k: int = 10
    n_latest_items: int = 10

In [3]:
# Run-wide settings for every experiment below.
# Prefer the GPU, but fall back to the CPU when CUDA is unavailable so the
# notebook still runs (slowly) on machines without a GPU instead of failing
# on the first `.to(device)` call.
args = TrainArgs(
    output_model_path='abc',
    num_epochs=5,
    batches_per_epoch=10000,
    device='cuda' if torch.cuda.is_available() else 'cpu',
)

MovieLens (ML)

In [4]:
# Read the pre-built dataset bundle from disk.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open('data/data_ml.pkl', 'rb') as dataset_file:
    dataset = pickle.load(dataset_file)

In [5]:
# Unpack the parts of the dataset bundle used by the rest of the notebook.
g = dataset['train-graph']
val_matrix = dataset['val-matrix'].tocsr()
test_matrix = dataset['test-matrix'].tocsr()
item_texts = dataset['item-texts']
user_ntype, item_ntype = dataset['user-type'], dataset['item-type']
user_to_item_etype = dataset['user-to-item-type']
timestamp = dataset['timestamp-edge-column']
device = torch.device(args.device)

# Store a consecutive integer ID on every user and movie node; the IDs serve as
# features so that an individual trainable embedding can be learned per entity.
for ntype in (user_ntype, item_ntype):
    g.nodes[ntype].data['id'] = torch.arange(g.number_of_nodes(ntype))

# Build the torchtext dataset and vocabularies for the textual item features.
if item_texts is None:
    textset = None
else:
    text_keys = list(item_texts.keys())
    # One text field per textual item attribute.
    fields = {
        key: data.Field(include_lengths=True, lower=True, batch_first=True)
        for key in text_keys
    }
    field_pairs = [(key, fields[key]) for key in text_keys]
    # One example per item node, in node-ID order.
    examples = [
        data.Example.fromlist([item_texts[key][i] for key in text_keys], field_pairs)
        for i in range(g.number_of_nodes(item_ntype))
    ]
    textset = data.Dataset(examples, fields)
    # Vocabularies are built from the item texts themselves. Passing
    # vectors='fasttext.simple.300d' to build_vocab is a disabled alternative.
    for key, field in fields.items():
        field.build_vocab(getattr(textset, key))

In [6]:
def objective(trial, num_epochs=2):
    """Optuna objective: train PinSAGE with sampled hyperparameters and score it.

    Reads the notebook-level globals ``g``, ``user_ntype``, ``item_ntype``,
    ``textset``, ``dataset``, ``device`` and ``args`` that the preceding cells
    set up.

    Args:
        trial: The Optuna trial used to sample ``n_layers``, ``hidden_dims``,
            ``learning_rate_init`` and ``num_neighbors``.
        num_epochs: Number of train/evaluate rounds. Defaults to 2, the value
            that was previously hard-coded. ``args.num_epochs`` is not consulted.

    Returns:
        ``metrics[0][2]`` of the ``evaluation.evaluate_nn`` result computed
        after the final epoch. The study maximizes this value; what the entry
        measures is defined in the ``evaluation`` module, not shown here.

    Raises:
        ValueError: If ``num_epochs`` is smaller than 1, since no evaluation
            would be run and there would be nothing to return.
    """
    if num_epochs < 1:
        raise ValueError('num_epochs must be at least 1')

    # 2. Suggest values of the hyperparameters using a trial object.
    n_layers = trial.suggest_int('n_layers', 1, 4)
    hidden_dims = trial.suggest_int('hidden_dims', 32, 128)
    learning_rate = trial.suggest_float("learning_rate_init", 1e-5, 1e-3)
    num_neighbors = trial.suggest_int('num_neighbors', 1, 15)

    # Samplers and dataloaders depend on the sampled num_neighbors / n_layers,
    # so they are rebuilt for every trial.
    batch_sampler = sampler_module.ItemToItemBatchSampler(
        g, user_ntype, item_ntype, args.batch_size)
    neighbor_sampler = sampler_module.NeighborSampler(
        g, user_ntype, item_ntype, args.random_walk_length,
        args.random_walk_restart_prob, args.num_random_walks, num_neighbors,
        n_layers)
    collator = sampler_module.PinSAGECollator(neighbor_sampler, g, item_ntype, textset)
    dataloader = DataLoader(
        batch_sampler,
        collate_fn=collator.collate_train,
        num_workers=args.num_workers)
    dataloader_test = DataLoader(
        torch.arange(g.number_of_nodes(item_ntype)),
        batch_size=args.batch_size,
        collate_fn=collator.collate_test,
        num_workers=args.num_workers)
    # A single iterator is shared by all epochs; each epoch pulls
    # args.batches_per_epoch batches from it.
    dataloader_it = iter(dataloader)

    # Use the same `device` object for the model as for the batches below.
    model = PinSAGEModel(g, item_ntype, textset, hidden_dims, n_layers).to(device)
    opt = torch.optim.Adam(model.parameters(), lr=learning_rate)

    for epoch_id in range(num_epochs):
        model.train()
        for batch_id in tqdm.trange(args.batches_per_epoch):
            pos_graph, neg_graph, blocks = next(dataloader_it)
            # Copy the batch to the training device.
            blocks = [block.to(device) for block in blocks]
            pos_graph = pos_graph.to(device)
            neg_graph = neg_graph.to(device)

            loss = model(pos_graph, neg_graph, blocks).mean()
            opt.zero_grad()
            loss.backward()
            opt.step()

        # Evaluate after every epoch; only the last epoch's metrics are returned.
        model.eval()
        with torch.no_grad():
            h_item_batches = []
            for blocks in dataloader_test:
                blocks = [block.to(device) for block in blocks]
                h_item_batches.append(model.get_repr(blocks))
            # Concatenate the per-batch outputs along dim 0 into one tensor.
            h_item = torch.cat(h_item_batches, 0)
            metrics = evaluation.evaluate_nn(dataset, h_item, args.k, args.batch_size, args.n_latest_items)

    return metrics[0][2]

# 3. Set up a study that maximizes the objective's return value, then run it.
# The search is limited to a single trial and a 10-second timeout; in the
# logged run below the trial itself took several minutes and still completed.
study = optuna.create_study(direction='maximize')
study.optimize(
    objective,
    n_trials=1,
    timeout=10,
)

[32m[I 2022-05-29 13:47:19,886][0m A new study created in memory with name: no-name-8a439f5a-eda2-4fe2-9027-e635a207fa0a[0m
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [02:50<00:00, 58.69it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [02:45<00:00, 60.46it/s]
[32m[I 2022-05-29 13:53:22,921][0m Trial 0 finished with value: 0.23542091285041467 and parameters: {'n_layers': 1, 'hidden_dims': 50, 'learning_rate_init': 0.0008057495643996801, 'num_neighbors': 6}. Best is trial 0 with value: 0.23542091285041467.[0m


Ta Feng

In [7]:
# Read the pre-built Ta Feng dataset bundle from disk.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open('data/tafeng.pkl', 'rb') as dataset_file:
    dataset = pickle.load(dataset_file)

# Unpack the parts of the dataset bundle used by the rest of the notebook.
g = dataset['train-graph']
val_matrix = dataset['val-matrix'].tocsr()
test_matrix = dataset['test-matrix'].tocsr()
item_texts = dataset['item-texts']
user_ntype, item_ntype = dataset['user-type'], dataset['item-type']
user_to_item_etype = dataset['user-to-item-type']
timestamp = dataset['timestamp-edge-column']
device = torch.device(args.device)

# Store a consecutive integer ID on every user and item node; the IDs serve as
# features so that an individual trainable embedding can be learned per entity.
for ntype in (user_ntype, item_ntype):
    g.nodes[ntype].data['id'] = torch.arange(g.number_of_nodes(ntype))

# Build the torchtext dataset and vocabularies for the textual item features.
if item_texts is None:
    textset = None
else:
    text_keys = list(item_texts.keys())
    # One text field per textual item attribute.
    fields = {
        key: data.Field(include_lengths=True, lower=True, batch_first=True)
        for key in text_keys
    }
    field_pairs = [(key, fields[key]) for key in text_keys]
    # One example per item node, in node-ID order.
    examples = [
        data.Example.fromlist([item_texts[key][i] for key in text_keys], field_pairs)
        for i in range(g.number_of_nodes(item_ntype))
    ]
    textset = data.Dataset(examples, fields)
    # Vocabularies are built from the item texts themselves. Passing
    # vectors='fasttext.simple.300d' to build_vocab is a disabled alternative.
    for key, field in fields.items():
        field.build_vocab(getattr(textset, key))

In [8]:
def objective(trial, num_epochs=2):
    """Optuna objective: train PinSAGE with sampled hyperparameters and score it.

    Reads the notebook-level globals ``g``, ``user_ntype``, ``item_ntype``,
    ``textset``, ``dataset``, ``device`` and ``args`` that the preceding cell
    set up.

    Args:
        trial: The Optuna trial used to sample ``n_layers``, ``hidden_dims``,
            ``learning_rate_init`` and ``num_neighbors``.
        num_epochs: Number of train/evaluate rounds. Defaults to 2, the value
            that was previously hard-coded. ``args.num_epochs`` is not consulted.

    Returns:
        ``metrics[0][2]`` of the ``evaluation.evaluate_nn`` result computed
        after the final epoch. The study maximizes this value; what the entry
        measures is defined in the ``evaluation`` module, not shown here.

    Raises:
        ValueError: If ``num_epochs`` is smaller than 1, since no evaluation
            would be run and there would be nothing to return.
    """
    if num_epochs < 1:
        raise ValueError('num_epochs must be at least 1')

    # 2. Suggest values of the hyperparameters using a trial object.
    n_layers = trial.suggest_int('n_layers', 1, 4)
    hidden_dims = trial.suggest_int('hidden_dims', 16, 128)
    learning_rate = trial.suggest_float("learning_rate_init", 1e-5, 1e-3)
    num_neighbors = trial.suggest_int('num_neighbors', 1, 15)

    # Samplers and dataloaders depend on the sampled num_neighbors / n_layers,
    # so they are rebuilt for every trial.
    batch_sampler = sampler_module.ItemToItemBatchSampler(
        g, user_ntype, item_ntype, args.batch_size)
    neighbor_sampler = sampler_module.NeighborSampler(
        g, user_ntype, item_ntype, args.random_walk_length,
        args.random_walk_restart_prob, args.num_random_walks, num_neighbors,
        n_layers)
    collator = sampler_module.PinSAGECollator(neighbor_sampler, g, item_ntype, textset)
    dataloader = DataLoader(
        batch_sampler,
        collate_fn=collator.collate_train,
        num_workers=args.num_workers)
    dataloader_test = DataLoader(
        torch.arange(g.number_of_nodes(item_ntype)),
        batch_size=args.batch_size,
        collate_fn=collator.collate_test,
        num_workers=args.num_workers)
    # A single iterator is shared by all epochs; each epoch pulls
    # args.batches_per_epoch batches from it.
    dataloader_it = iter(dataloader)

    # Use the same `device` object for the model as for the batches below.
    model = PinSAGEModel(g, item_ntype, textset, hidden_dims, n_layers).to(device)
    opt = torch.optim.Adam(model.parameters(), lr=learning_rate)

    for epoch_id in range(num_epochs):
        model.train()
        for batch_id in tqdm.trange(args.batches_per_epoch):
            pos_graph, neg_graph, blocks = next(dataloader_it)
            # Copy the batch to the training device.
            blocks = [block.to(device) for block in blocks]
            pos_graph = pos_graph.to(device)
            neg_graph = neg_graph.to(device)

            loss = model(pos_graph, neg_graph, blocks).mean()
            opt.zero_grad()
            loss.backward()
            opt.step()

        # Evaluate after every epoch; only the last epoch's metrics are returned.
        model.eval()
        with torch.no_grad():
            h_item_batches = []
            for blocks in dataloader_test:
                blocks = [block.to(device) for block in blocks]
                h_item_batches.append(model.get_repr(blocks))
            # Concatenate the per-batch outputs along dim 0 into one tensor.
            h_item = torch.cat(h_item_batches, 0)
            metrics = evaluation.evaluate_nn(dataset, h_item, args.k, args.batch_size, args.n_latest_items)

    return metrics[0][2]

# 3. Set up a study that maximizes the objective's return value, then run it.
# The search is limited to a single trial and a 10-second timeout; in the
# logged run below the trial itself took several minutes and still completed.
study = optuna.create_study(direction='maximize')
study.optimize(
    objective,
    n_trials=1,
    timeout=10,
)

[32m[I 2022-05-29 13:53:31,444][0m A new study created in memory with name: no-name-09121703-9026-4cf0-96a3-4593680c7dac[0m
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [09:59<00:00, 16.67it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [10:01<00:00, 16.64it/s]
[32m[I 2022-05-29 14:17:49,076][0m Trial 0 finished with value: 0.005602185466707845 and parameters: {'n_layers': 3, 'hidden_dims': 68, 'learning_rate_init': 0.0001351776812325878, 'num_neighbors': 6}. Best is trial 0 with value: 0.005602185466707845.[0m


Amazon

In [9]:
# Read the pre-built Amazon dataset bundle from disk.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open('data/amazon.pkl', 'rb') as dataset_file:
    dataset = pickle.load(dataset_file)

# Unpack the parts of the dataset bundle used by the rest of the notebook.
g = dataset['train-graph']
val_matrix = dataset['val-matrix'].tocsr()
test_matrix = dataset['test-matrix'].tocsr()
item_texts = dataset['item-texts']
user_ntype, item_ntype = dataset['user-type'], dataset['item-type']
user_to_item_etype = dataset['user-to-item-type']
timestamp = dataset['timestamp-edge-column']
device = torch.device(args.device)

# Store a consecutive integer ID on every user and item node; the IDs serve as
# features so that an individual trainable embedding can be learned per entity.
for ntype in (user_ntype, item_ntype):
    g.nodes[ntype].data['id'] = torch.arange(g.number_of_nodes(ntype))

# Build the torchtext dataset and vocabularies for the textual item features.
if item_texts is None:
    textset = None
else:
    text_keys = list(item_texts.keys())
    # One text field per textual item attribute.
    fields = {
        key: data.Field(include_lengths=True, lower=True, batch_first=True)
        for key in text_keys
    }
    field_pairs = [(key, fields[key]) for key in text_keys]
    # One example per item node, in node-ID order.
    examples = [
        data.Example.fromlist([item_texts[key][i] for key in text_keys], field_pairs)
        for i in range(g.number_of_nodes(item_ntype))
    ]
    textset = data.Dataset(examples, fields)
    # Vocabularies are built from the item texts themselves. Passing
    # vectors='fasttext.simple.300d' to build_vocab is a disabled alternative.
    for key, field in fields.items():
        field.build_vocab(getattr(textset, key))

In [10]:
def objective(trial, num_epochs=2):
    """Optuna objective: train PinSAGE with sampled hyperparameters and score it.

    Reads the notebook-level globals ``g``, ``user_ntype``, ``item_ntype``,
    ``textset``, ``dataset``, ``device`` and ``args`` that the preceding cell
    set up.

    Args:
        trial: The Optuna trial used to sample ``n_layers``, ``hidden_dims``,
            ``learning_rate_init`` and ``num_neighbors``.
        num_epochs: Number of train/evaluate rounds. Defaults to 2, the value
            that was previously hard-coded. ``args.num_epochs`` is not consulted.

    Returns:
        ``metrics[0][2]`` of the ``evaluation.evaluate_nn`` result computed
        after the final epoch. The study maximizes this value; what the entry
        measures is defined in the ``evaluation`` module, not shown here.

    Raises:
        ValueError: If ``num_epochs`` is smaller than 1, since no evaluation
            would be run and there would be nothing to return.
    """
    if num_epochs < 1:
        raise ValueError('num_epochs must be at least 1')

    # 2. Suggest values of the hyperparameters using a trial object.
    n_layers = trial.suggest_int('n_layers', 1, 4)
    hidden_dims = trial.suggest_int('hidden_dims', 64, 256)
    learning_rate = trial.suggest_float("learning_rate_init", 1e-5, 1e-3)
    num_neighbors = trial.suggest_int('num_neighbors', 1, 15)

    # Samplers and dataloaders depend on the sampled num_neighbors / n_layers,
    # so they are rebuilt for every trial.
    batch_sampler = sampler_module.ItemToItemBatchSampler(
        g, user_ntype, item_ntype, args.batch_size)
    neighbor_sampler = sampler_module.NeighborSampler(
        g, user_ntype, item_ntype, args.random_walk_length,
        args.random_walk_restart_prob, args.num_random_walks, num_neighbors,
        n_layers)
    collator = sampler_module.PinSAGECollator(neighbor_sampler, g, item_ntype, textset)
    dataloader = DataLoader(
        batch_sampler,
        collate_fn=collator.collate_train,
        num_workers=args.num_workers)
    dataloader_test = DataLoader(
        torch.arange(g.number_of_nodes(item_ntype)),
        batch_size=args.batch_size,
        collate_fn=collator.collate_test,
        num_workers=args.num_workers)
    # A single iterator is shared by all epochs; each epoch pulls
    # args.batches_per_epoch batches from it.
    dataloader_it = iter(dataloader)

    # Use the same `device` object for the model as for the batches below.
    model = PinSAGEModel(g, item_ntype, textset, hidden_dims, n_layers).to(device)
    opt = torch.optim.Adam(model.parameters(), lr=learning_rate)

    for epoch_id in range(num_epochs):
        model.train()
        for batch_id in tqdm.trange(args.batches_per_epoch):
            pos_graph, neg_graph, blocks = next(dataloader_it)
            # Copy the batch to the training device.
            blocks = [block.to(device) for block in blocks]
            pos_graph = pos_graph.to(device)
            neg_graph = neg_graph.to(device)

            loss = model(pos_graph, neg_graph, blocks).mean()
            opt.zero_grad()
            loss.backward()
            opt.step()

        # Evaluate after every epoch; only the last epoch's metrics are returned.
        model.eval()
        with torch.no_grad():
            h_item_batches = []
            for blocks in dataloader_test:
                blocks = [block.to(device) for block in blocks]
                h_item_batches.append(model.get_repr(blocks))
            # Concatenate the per-batch outputs along dim 0 into one tensor.
            h_item = torch.cat(h_item_batches, 0)
            metrics = evaluation.evaluate_nn(dataset, h_item, args.k, args.batch_size, args.n_latest_items)

    return metrics[0][2]

# 3. Set up a study that maximizes the objective's return value, then run it.
# The search is limited to a single trial and a 10-second timeout; in the
# logged run below the trial itself took several minutes and still completed.
study = optuna.create_study(direction='maximize')
study.optimize(
    objective,
    n_trials=1,
    timeout=10,
)

[32m[I 2022-05-29 14:18:33,209][0m A new study created in memory with name: no-name-43bfc550-c131-4a93-a5a0-633d35ca9ab3[0m
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [15:54<00:00, 10.47it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [18:07<00:00,  9.20it/s]
[32m[I 2022-05-29 14:56:07,691][0m Trial 0 finished with value: 0.004244697693350659 and parameters: {'n_layers': 3, 'hidden_dims': 131, 'learning_rate_init': 0.0006658629757644586, 'num_neighbors': 11}. Best is trial 0 with value: 0.004244697693350659.[0m
