In [1]:

import numpy as np
import json 
import pandas as pd 
from scipy.special import gamma, kv
from tqdm import tqdm
import torch
import time
import os
import gc

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [3]:
# Parameters 
instance_id = 5

# kernel_type = 'matern'
# kernel_params = {'length_scale': 1.0, 'nu':2.0, 'sigma': 1.0}

# kernel_type = 'gaussian'
# kernel_params = {'length_scale': 1.0, 'sigma': 1.0}


# kernel_type = 'sigmoid'
# kernel_params = {'alpha': 0.5, 'c': 1.0, 'sigma':1.0}


# kernel_type = 'laplace'
# kernel_params = {'length_scale': 1.0}

kernel_type = 'poly'
kernel_params = {'c': 1, 'degree': 2}

# loss_type = 'mse'
loss_type = 'cross_entrophy'

kernel_params_str = "_".join([f"{key}={value}" for key, value in kernel_params.items()])

# 1. Read Data, Convert to DataFrame

In [4]:
def convert_json_to_df(json_file_path:str):
    with open(json_file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    in_sample_transactions = data["transactions"]["in_sample_transactions"]
    out_sample_transactions = data["transactions"]["out_of_sample_transactions"]
    product_labels = data['product_labels']

    in_sample_transactions = pd.DataFrame(in_sample_transactions)
    out_sample_transactions = pd.DataFrame(out_sample_transactions)
    
    # rename 'prodcut' to 'choice' 
    in_sample_transactions.rename(columns={'product':'choice'}, inplace=True)
    out_sample_transactions.rename(columns={'product':'choice'}, inplace=True)
    product_labels = pd.DataFrame(
        list(product_labels.items()), columns=["product_id", "product_name"]
    )
    return in_sample_transactions, out_sample_transactions,product_labels

In [5]:
def convert_list_to_one_hot(transaction:list,d):
    one_hot = np.zeros(d)
    for item in transaction:
        one_hot[item] = 1
    return one_hot

def convert_to_one_hot(transactions:pd.DataFrame,d):
    transactions["offered_product_one_hot"] = transactions['offered_products'].apply(lambda x : convert_list_to_one_hot(x,d))
    transactions['choice_one_hot'] = transactions['choice'].apply(lambda x: convert_list_to_one_hot([x],d))
    return transactions



In [6]:
in_sample_transactions, out_sample_transactions ,items= convert_json_to_df(f"hotel_json/instance_{instance_id}.json")

d = len(items) + 1 # consider the 0 as no-purchase

train_datasize = len(in_sample_transactions)
test_datasize = len(out_sample_transactions)
in_sample_transactions = convert_to_one_hot(in_sample_transactions,d)
out_sample_transactions = convert_to_one_hot(out_sample_transactions,d)

in_sample_transactions

Unnamed: 0,offered_products,choice,offered_product_one_hot,choice_one_hot
0,"[0, 1, 2, 3, 4, 5, 6]",5,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]"
1,"[0, 1, 2, 3, 4, 5, 6]",0,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
2,"[0, 1, 2, 3, 4, 5, 6]",0,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
3,"[0, 1, 2, 3, 4, 5, 6]",0,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
4,"[0, 1, 2, 3, 4, 5, 6]",0,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
...,...,...,...,...
995,"[0, 2, 4, 5]",2,"[1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0]","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]"
996,"[0, 2, 4, 5]",0,"[1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0]","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
997,"[0, 2, 4, 5]",0,"[1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0]","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
998,"[0, 2, 4, 5]",0,"[1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0]","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"


In [7]:
S_train = torch.stack([torch.tensor(sample, dtype=torch.float16,device=device) for sample in in_sample_transactions["offered_product_one_hot"]])
S_test = torch.stack([torch.tensor(sample, dtype=torch.float16,device=device) for sample in out_sample_transactions["offered_product_one_hot"]])
y_train = torch.stack([torch.tensor(sample, dtype=torch.float16,device=device) for sample in in_sample_transactions["choice_one_hot"]])
y_test = torch.stack([torch.tensor(sample, dtype=torch.float16,device=device) for sample in out_sample_transactions["choice_one_hot"]])

# 2. Kernel Implementation





Matrix-valued **Matern kernel** $\tilde{\boldsymbol{\mathsf{k}}}$ can be 

$$
\boldsymbol{\mathsf{k}}(\boldsymbol{e}_{S},\boldsymbol{e}_{S'}) = \boldsymbol{K} \otimes \mathsf{k}_{m}(\boldsymbol{e}_{S}, \boldsymbol{e}_{S'})
$$
where $\boldsymbol{K}$ is a positive semi-definite matrix. Then 

$$
\tilde{\mathsf{k}}^{ij}(\boldsymbol{e}_{S},\boldsymbol{e}_{S'}) = K_{ij} \times\sigma^{2} \frac{2^{1-\nu}}{\Gamma(\nu)} \left( \sqrt{ 2\nu }  \frac{\left\| \boldsymbol{e}_{S} - \boldsymbol{e}_{S'} \right\|_{2}  }{\ell} \right) K_{\nu} \left( \sqrt{ 2\nu } \frac{\left\| \boldsymbol{\boldsymbol{e}_{S}-\boldsymbol{e}_{S'}} \right\|_{2}  }{\ell} \right)
$$
Add constraint, 

$$
\mathsf{k}^{ij}(S, S') = \mathbb{1}(i \in \boldsymbol{e}_{S})\cdot \mathbb{1}(j \in \boldsymbol{e}_{S'}) \cdot   \tilde{\mathsf{k}}^{ij}(\boldsymbol{e}_{S}, \boldsymbol{e}_{S'})
$$


Same with Gaussian kernel.



In [8]:
K = torch.eye(d).to(torch.float16) # for the moment we consider the identity matrix as the covariance matrix
K = K.to(device)

# 3. Solve

## 3.1. Precalcualte Kernel Tensor

In [9]:
def polynomial_kernel(S1_set:torch.Tensor,S2_set:torch.Tensor,c:float=1.0, degree:int=1):
    
    K_g = torch.mm(S1_set, S2_set.T)
    K_g += c

    K_g = torch.pow(K_g, degree)

    K_g /= torch.max(K_g)
    return K_g

def gaussian_kernel(S1_set:torch.Tensor,S2_set:torch.Tensor, length_scale:float=1.0, sigma:float=1.0):
    sq_dist = torch.cdist(S1_set, S2_set, p=2)**2 
    K_g = sigma**2 * torch.exp(-sq_dist / (2 * length_scale**2))

    return K_g


def matern_kernel(S1_set:torch.Tensor,S2_set:torch.Tensor, length_scale:float, nu:float, sigma:float):
    dist = torch.cdist(S1_set, S2_set,p=2)
    dist = torch.where(dist == 0, torch.tensor(1e-6, device=S1_set.device), dist)
    scaled_dist = np.sqrt(2 * nu) * dist / length_scale
    factor = (2 **(1-nu)) / gamma(nu)
    K_g = sigma**2 * factor * (scaled_dist**nu) * torch.tensor(kv(nu, scaled_dist.cpu().numpy()), device=S1_set.device)
    
    return K_g

def sigmoid_kernel(S1_set:torch.Tensor, S2_set:torch.Tensor, alpha:float,c:float, sigma:float):
    inner_product = torch.mm(S1_set, S2_set.T)
    K_g = sigma**2 * torch.tanh(alpha * inner_product + c)
    return K_g
    

def laplace_kernel(S1_set:torch.Tensor, S2_set:torch.Tensor, length_scale:float=1.0):
    dist = torch.cdist(S1_set, S2_set,p=1)
    K_g = torch.exp(-dist / length_scale)
    return K_g


    

In [10]:
Kg_train_train = None
Kg_test_train = None
if kernel_type == 'matern':
    Kg_train_train = matern_kernel(S_train,S_train,kernel_params['length_scale'], kernel_params['nu'], kernel_params['sigma'])
    Kg_test_train = matern_kernel(S_test,S_train,kernel_params['length_scale'], kernel_params['nu'], kernel_params['sigma'])
elif kernel_type == 'gaussian':
    Kg_train_train = gaussian_kernel(S_train, S_train,kernel_params['length_scale'], kernel_params['sigma'])
    Kg_test_train = gaussian_kernel(S_test, S_train,kernel_params['length_scale'], kernel_params['sigma'])
elif kernel_type == 'sigmoid':
    Kg_train_train = sigmoid_kernel(S_train,S_train, kernel_params['alpha'], kernel_params['c'], kernel_params['sigma'])
    Kg_test_train = sigmoid_kernel(S_test,S_train, kernel_params['alpha'], kernel_params['c'], kernel_params['sigma'])
elif kernel_type == 'laplace':
    Kg_train_train = laplace_kernel(S_train,S_train, kernel_params['length_scale'])
    Kg_test_train = laplace_kernel(S_test,S_train, kernel_params['length_scale'])
elif kernel_type == 'poly':
    Kg_train_train = polynomial_kernel(S_train,S_train,kernel_params['c'],kernel_params['degree'])
    Kg_test_train = polynomial_kernel(S_test,S_train,kernel_params['c'],kernel_params['degree'])



In [11]:
# def cal_kernel_tensor_old(K_g:torch.Tensor, S_1:torch.Tensor, S_2:torch.Tensor):
#     print(K_g.shape)
#     K_kernel = torch.zeros((K_g.shape[0],K_g.shape[1], d, d),dtype = torch.float16)
#     print(K_kernel.shape)
#     K_g = K_g.to('cpu') 
#     total_iterations = K_g.shape[0] * K_g.shape[1]
#     with tqdm(total=total_iterations, desc="Kernel Tensor Calculating...", unit="iteration") as pbar:
#         for i in range(K_g.shape[0]):
#             mask_x1 = (S_train[i]!=0).view(-1, 1).to('cpu')
#             for j in range(K_g.shape[1]):
#                 mask_x2 = (S_train[j]!=0).view(1, -1).to('cpu')
#                 mask = mask_x1*mask_x2
#                 K_kernel[i, j] = K_g[i,j]  * K * mask
#                 pbar.update(1)
#     return K_kernel 

In [12]:
def cal_kernel_tensor_new(Kg:torch.Tensor, S_1:torch.Tensor,S_2:torch.Tensor):
    mask1 = (S_1 != 0).unsqueeze(1).unsqueeze(3)
    mask2 = (S_2 != 0).unsqueeze(0).unsqueeze(2)
    mask = (mask1 & mask2 )
    Kg_expanded = Kg.unsqueeze(-1).unsqueeze(-1)
    gc.collect()
    return (Kg_expanded * K * mask)
    
    

In [13]:
train_file = f"results/feature_free/hotel_{instance_id}/kernel_data/train_train_{kernel_type}({kernel_params_str})_kernel_data.pt"
test_file = f"results/feature_free/hotel_{instance_id}/kernel_data/test_train_{kernel_type}({kernel_params_str})_kernel_data.pt"

precompute_time  = 0
if os.path.exists(train_file) and os.path.exists(test_file):
    K_kernel_train_train = torch.load(train_file,map_location=device)
    K_kernel_test_train = torch.load(test_file,map_location=device)
else:
    time1 = time.time()
    K_kernel_train_train = cal_kernel_tensor_new(Kg_train_train,S_train, S_train)
    time2 = time.time()
    precompute_time = time2-time1
    
    K_kernel_train_train = K_kernel_train_train.to(dtype = torch.float16,device=device)
    
    K_kernel_test_train = cal_kernel_tensor_new(Kg_test_train, S_test, S_train)
    K_kernel_test_train = K_kernel_test_train.to(dtype =torch.float16,device=device)
    
    os.makedirs(os.path.dirname(train_file), exist_ok=True)
    torch.save(
        K_kernel_train_train, train_file
    )

    os.makedirs(os.path.dirname(test_file), exist_ok=True)
    torch.save(
        K_kernel_test_train,
        test_file,
    )


## 3.2 Solve Using `torch.optimin.Adam`

In [14]:
torch.cuda.empty_cache()
torch.cuda.synchronize()
allocated_memory = torch.cuda.memory_allocated(device) / (1024 ** 3)  
reserved_memory = torch.cuda.memory_reserved(device) / (1024 ** 3)  
print(f" Allocated Memory: {allocated_memory:.2f} GB | Reserved Memory: {reserved_memory:.2f} GB")

 Allocated Memory: 0.13 GB | Reserved Memory: 0.21 GB


In [15]:
alphaset = torch.rand((train_datasize, d), dtype=torch.float32, device=device) 
alphaset = (alphaset * 0.01).detach().requires_grad_(True)
lambda_ = 1e-4
grad_clip_threshold = 2.0
patience = 25
best_loss = float('inf')  
epochs_since_improvement = 0
best_alphaset = None
grad_norm_threshold = 1e-2
# grad_norm_threshold = 1e-6

scaler = torch.amp.GradScaler()


def objective(alphaset: torch.Tensor):
    # U = torch.zeros((datasize, d), dtype=torch.float32, device=device)
    
    # batch_size=64
    # for start in range(0, datasize, batch_size):
    #     end = min(start + batch_size, datasize)
    #     U[start:end] = torch.einsum("jikl, il -> jk", K_kernel[start:end], alphaset)
    #     torch.cuda.empty_cache()
    
    U = torch.einsum("jikl, il -> jk", K_kernel_train_train, alphaset)

        
    if torch.any(torch.isnan(U)):
        print("NaN detected in U")

    l = cross_entrophy_loss(U)
    # print('l=',l)
    r = reg(alphaset)
    # print('r=',r) 
     
    del U  
    gc.collect()
    return l + lambda_ * r



def cross_entrophy_loss(U):
    U_max = torch.max(U, dim=1, keepdim=True).values
    U_stable = U - U_max
    exp_utility = torch.exp(U_stable)
    
    if torch.any(torch.isnan(exp_utility)):
        print("NaN detected in exp(U)",epoch)
    
    exp_utility_masked = exp_utility * S_train

    sum_exp_utility = torch.sum(exp_utility_masked, dim=1, keepdim=True)

    p_matrix = exp_utility_masked / (sum_exp_utility)
    if torch.any(torch.isnan(p_matrix)):
        print("NaN detected in p",epoch)
        
    log_p = torch.log(p_matrix + 1e-30)
    if torch.any(torch.isnan(log_p)):
        print("NaN detected in log_p",epoch)
    loss_matrix = -y_train * log_p
    if torch.any(torch.isnan(loss_matrix)):
        print("NaN detected in loss matrix",epoch)
    loss_value = torch.sum(loss_matrix) / train_datasize
    return loss_value




def squared_loss(p_vec: torch.Tensor, y_vec: torch.Tensor):
    return torch.sum((p_vec - y_vec) ** 2)


def reg(alphaset: torch.Tensor):


    return torch.einsum("id,ijdl,jl->", alphaset, K_kernel_train_train, alphaset)

optimizer = torch.optim.Adam([alphaset], lr=1e-3)

time3 = time.time()
for epoch in range(300):
    optimizer.zero_grad(set_to_none=True)  
    
    with torch.amp.autocast(device_type='cuda'):
        loss_value = objective(alphaset)


    
    if loss_value.item() < best_loss:
        best_loss = loss_value.item()
        best_alphaset = alphaset.clone().detach()

        
 
    scaler.scale(loss_value).backward()  

    
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Best Loss: {best_loss}")
    
    scaler.unscale_(optimizer)
    scaler.step(optimizer)
    torch.nn.utils.clip_grad_norm_([alphaset], max_norm=1.0)
    
    grad_norm = torch.norm(alphaset.grad).item()
    # print(grad_norm)
    if grad_norm < grad_norm_threshold:
        print(f"Early stopping at epoch {epoch}, Gradient Norm: {grad_norm}, Best Loss: {best_loss}")
        break 
    
    scaler.update()
    

    torch.cuda.empty_cache()

optimizer.state.clear()
del alphaset
del optimizer
del scaler
gc.collect()
torch.cuda.empty_cache()

torch.cuda.empty_cache()
time4 = time.time()
train_time = time4 - time3

allocated_memory = torch.cuda.memory_allocated(device) / (1024 ** 3)  # 转为GB
reserved_memory = torch.cuda.memory_reserved(device) / (1024 ** 3)  # 转为GB
print(f" Allocated Memory: {allocated_memory:.2f} GB | Reserved Memory: {reserved_memory:.2f} GB")

Epoch 0, Best Loss: 1.649713397026062
Epoch 10, Best Loss: 0.823919415473938
Epoch 20, Best Loss: 0.823919415473938
Epoch 30, Best Loss: 0.7907113432884216
Epoch 40, Best Loss: 0.7822631001472473
Epoch 50, Best Loss: 0.7771490216255188
Epoch 60, Best Loss: 0.7757511138916016
Epoch 70, Best Loss: 0.7730705142021179
Epoch 80, Best Loss: 0.771404504776001
Epoch 90, Best Loss: 0.7700221538543701
Epoch 100, Best Loss: 0.7686238884925842
Epoch 110, Best Loss: 0.7674310803413391
Epoch 120, Best Loss: 0.7662979364395142
Epoch 130, Best Loss: 0.7653632164001465
Epoch 140, Best Loss: 0.7644644975662231
Epoch 150, Best Loss: 0.7636728286743164
Epoch 160, Best Loss: 0.7629429697990417
Epoch 170, Best Loss: 0.7622671723365784
Epoch 180, Best Loss: 0.7616795301437378
Epoch 190, Best Loss: 0.7611636519432068
Epoch 200, Best Loss: 0.7606747150421143
Epoch 210, Best Loss: 0.7602311372756958
Epoch 220, Best Loss: 0.7598104476928711
Epoch 230, Best Loss: 0.759435772895813
Epoch 240, Best Loss: 0.75909113

# 4. Performance

In [16]:
import gc
gc.collect()
torch.cuda.empty_cache()
torch.cuda.synchronize()

def find_large_tensors():
    total_mem = 0
    for obj in gc.get_objects():
        try:
            if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)):
                if obj.is_cuda:
                    tensor_mem = obj.numel() * obj.element_size() / 1024**2  # MB
                    total_mem += tensor_mem
                    print(f"Tensor: {type(obj)}, Shape: {obj.shape}, Memory: {tensor_mem:.2f} MB")
        except Exception:
            pass
    print(f"Total GPU memory used by tensors: {total_mem:.2f} MB")

find_large_tensors()
allocated_memory = torch.cuda.memory_allocated(device) / (1024 ** 3)  # 转为GB
reserved_memory = torch.cuda.memory_reserved(device) / (1024 ** 3)  # 转为GB
print(f" Allocated Memory: {allocated_memory:.2f} GB | Reserved Memory: {reserved_memory:.2f} GB")


Tensor: <class 'torch.Tensor'>, Shape: torch.Size([1000, 1000]), Memory: 1.91 MB
Tensor: <class 'torch.Tensor'>, Shape: torch.Size([255, 1000]), Memory: 0.49 MB
Tensor: <class 'torch.Tensor'>, Shape: torch.Size([1000, 7]), Memory: 0.01 MB
Tensor: <class 'torch.Tensor'>, Shape: torch.Size([255, 7]), Memory: 0.00 MB
Tensor: <class 'torch.Tensor'>, Shape: torch.Size([1000, 7]), Memory: 0.01 MB
Tensor: <class 'torch.Tensor'>, Shape: torch.Size([255, 7]), Memory: 0.00 MB
Tensor: <class 'torch.Tensor'>, Shape: torch.Size([7, 7]), Memory: 0.00 MB
Tensor: <class 'torch.Tensor'>, Shape: torch.Size([1000, 1000, 7, 7]), Memory: 93.46 MB
Tensor: <class 'torch.Tensor'>, Shape: torch.Size([255, 1000, 7, 7]), Memory: 23.83 MB
Tensor: <class 'torch.Tensor'>, Shape: torch.Size([1000, 7]), Memory: 0.03 MB
Tensor: <class 'torch.Tensor'>, Shape: torch.Size([]), Memory: 0.00 MB
Tensor: <class 'torch.Tensor'>, Shape: torch.Size([1000, 7]), Memory: 0.03 MB
Tensor: <class 'torch.Tensor'>, Shape: torch.Size([1

  return isinstance(obj, torch.Tensor)
  if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)):


In [17]:
def get_prob_matrix(U:torch.Tensor,S:torch.Tensor):
    exp_utility = torch.exp(U)
    exp_utility_masked = exp_utility * S
    sum_exp_utility = torch.sum(exp_utility_masked, dim=1, keepdim=True)
    p_matrix = exp_utility_masked / sum_exp_utility
    return p_matrix

In [18]:
print(K_kernel_test_train.dtype)
torch.cuda.empty_cache()
torch.cuda.synchronize()
U_test = torch.einsum("jikl,il->jk", K_kernel_test_train, best_alphaset.to(torch.float16))
P_test = get_prob_matrix(U_test,S_test)

U_train = torch.einsum("jikl,il->jk", K_kernel_train_train, best_alphaset.to(torch.float16))
P_train = get_prob_matrix(U_train,S_train)

torch.float16


In [19]:
test_mse = torch.mean((P_test- y_test) ** 2).item()
test_rmse = np.sqrt(test_mse)
train_mse = torch.mean((P_train- y_train) ** 2).item()
train_rmse = np.sqrt(train_mse)
test_rmse,train_rmse

(0.22432896899001922, 0.2230328503690656)

In [20]:
epsilon  = 1e-7
test_nnl = -torch.sum(y_test * torch.log(P_test + epsilon)).item() / y_test.shape[0]
train_nnl = -torch.sum(y_train * torch.log(P_train + epsilon)).item() / y_train.shape[0]
test_nnl, train_nnl

(0.7955882352941176, 0.7545)

In [21]:
import csv,os
results = [
    [instance_id, test_rmse, train_rmse, test_nnl, train_nnl, lambda_,kernel_type , kernel_params_str,loss_type,precompute_time, train_time]
]
file_path = f'results/feature_free/results.csv'
file_exists = os.path.exists(file_path)
with open(file_path, mode="a", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    if not file_exists or os.stat(file_path).st_size == 0:
        writer.writerow(["instance_id", "test_rmse", "train_rmse","test_nnl","train_nnl", "lambda_", "kernel_type", "kernel_params","loss_type","precompute_time","train_time"])
    writer.writerows(results)