In [1]:
import numpy as np 
import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
# import os
# import re
# import math
# import plotly.express as px
# import plotly.graph_objects as go
# from plotly.subplots import make_subplots

# from lightgbm import LGBMRegressor, LGBMClassifier
# from xgboost import XGBRegressor, XGBClassifier
# from catboost import CatBoostRegressor, CatBoostClassifier
# import lightgbm as lgb
# import xgboost as xgb
# import catboost as cab

# from sklearn.preprocessing import LabelEncoder
# from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, GridSearchCV
# from sklearn.decomposition import LatentDirichletAllocation, NMF, TruncatedSVD
# from sklearn.linear_model import LogisticRegression, SGDClassifier
# from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor, StackingRegressor
# from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, StackingClassifier
# from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
# from sklearn import metrics
# from sklearn.svm import SVC
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.preprocessing import PolynomialFeatures
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
# from collections import defaultdict, Counter
import warnings
import json 
import pickle
warnings.filterwarnings('ignore')

import pickle
import random
from tqdm import tqdm
# import sentence_transformers 
# from sklearn.preprocessing import KBinsDiscretizer
# from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
# from gensim.models import Word2Vec

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import random_split, DataLoader
from torch.utils.data import Dataset

from utils import set_random_seed, get_logger, ensure_dir, save_model_with_epoch, load_model_with_epoch
from data import NNDataset, NNDatasetV2
from model import MatchModel, BaseModel, MatchModelV2

"""
相比于trainDeeep.py，加入一些手动聚合的序列特征，例如历史序列的平均价格，历史序列的不同类别数之类的
SeqFeatureEmbedding现在使用2层全连接，可以换多层！
"""

emb_dim = 16
dense_bins = 10
hid_dim = 256
dropout = 0.3
layers = 4
bidirectional = False
seq_emb_factor = 4  # 人工序列特征的嵌入是emb_dim的几倍

batch_size = 1024
epochs = 100
len_candidate_set = 100
Fold = 0
device = torch.device('cuda:2')

train = True
load_init = False
load_epoch = 28
load_exp_id = 11088

# TODO: 调整batch-size， 调整hidden-size
# TODO: 跑Fold 4
learning_rate = 0.001
weight_decay = 0.00001
early_stop_lr = 1e-6
lr_patience = 5
lr_decay_ratio = 0.1
clip = 5
log_every = 100
early_stop = True
patience = 10
kfold = 5
attn_match = True 

w2v_window = 3
w2v_min_count = 1
w2v_epochs = 500
w2v_vector_size = 128

seed = 2023
set_random_seed(seed)

model_name = 'MatchModelV2withATTMatchFold{}'.format(Fold)
loc2id = {'DE': 0, 'JP': 1, 'UK': 2, 'ES': 3, 'FR': 4, 'IT': 5}

config = locals()

In [2]:
# 加载必要的数据

exp_id = config.get('exp_id', None)
if exp_id is None:
    exp_id = int(random.SystemRandom().random() * 100000)
    config['exp_id'] = exp_id

logger = get_logger(config)
logger.info('Exp_id {}'.format(exp_id))
logger.info(config)

logger.info('read data')

titles_embedding = np.load('./data/titles_embedding.npy')
descs_embedding = np.load('./data/descs_embedding.npy')
logger.info('titles_embedding: {}'.format(titles_embedding.shape))
logger.info('descs_embedding: {}'.format(descs_embedding.shape))

product2id = json.load(open('data/product2id.json', 'r'))
id2product = json.load(open('data/id2product.json', 'r'))
id2product = {int(k): v for k, v in id2product.items()}
logger.info('product2id: {}'.format(len(product2id)))
logger.info('id2product: {}'.format(len(id2product)))

word2vec_embedding = np.load('./data/word2vec_embedding.npy')
logger.info('word2vec_embedding: {}'.format(word2vec_embedding.shape))

top200 = pickle.load(open('data/top200.pkl', 'rb'))

df_train_encoded = pd.read_csv('data/df_train_encoded.csv')
df_test_encoded = pd.read_csv('data/df_test_encoded.csv')
products_encoded = pd.read_csv('./data/products_encoded.csv')
logger.info('df_train_encoded: {}'.format(df_train_encoded.shape))
logger.info('df_test_encoded: {}'.format(df_test_encoded.shape))
logger.info('products_encoded: {}'.format(products_encoded.shape))

logger.info('MinMaxScaler Norm products_num_feas')
mms = MinMaxScaler(feature_range=(0,1))
num_features = ['price', 'len_title', 'len_desc']
products_encoded[num_features] = mms.fit_transform(products_encoded[num_features])
for fe in num_features:
    products_encoded[fe] = products_encoded[fe].astype('float32')

logger.info('Load Hand-made Seq Features')
df_train_seqs_feas_all = pd.read_csv('data/df_train_seqs_feas_all.csv')  # 29维特征
df_test_seqs_feas_all = pd.read_csv('data/df_test_seqs_feas_all.csv')
logger.info('df_train_seqs_feas_all: {}'.format(df_train_seqs_feas_all.shape))
logger.info('df_test_seqs_feas_all: {}'.format(df_test_seqs_feas_all.shape))
seqs_cat_feas = [f for f in df_train_seqs_feas_all.columns if 'NUNIQUE' in f or 'COUNT' in f]
seqs_num_feas = [f for f in df_train_seqs_feas_all.columns if f not in seqs_cat_feas]
logger.info('seqs_cat_feas: {}'.format(seqs_cat_feas))
logger.info('seqs_num_feas: {}'.format(seqs_num_feas))

logger.info('MinMaxScaler Norm seqs_num_feas')
mms = MinMaxScaler(feature_range=(0,1))
df_train_seqs_feas_all[seqs_num_feas] = mms.fit_transform(df_train_seqs_feas_all[seqs_num_feas])
df_test_seqs_feas_all[seqs_num_feas] = mms.fit_transform(df_test_seqs_feas_all[seqs_num_feas])
for fe in seqs_num_feas:
    df_train_seqs_feas_all[fe] = df_train_seqs_feas_all[fe].astype('float32')
    df_test_seqs_feas_all[fe] = df_test_seqs_feas_all[fe].astype('float32')

df_train_seqs_cat_feas = df_train_seqs_feas_all[seqs_cat_feas]
df_train_seqs_num_feas = df_train_seqs_feas_all[seqs_num_feas]
df_test_seqs_cat_feas = df_test_seqs_feas_all[seqs_cat_feas]
df_test_seqs_num_feas = df_test_seqs_feas_all[seqs_num_feas]
logger.info('df_train_seqs_cat_feas: {}'.format(df_train_seqs_cat_feas.shape))
logger.info('df_train_seqs_num_feas: {}'.format(df_train_seqs_num_feas.shape))
logger.info('df_test_seqs_cat_feas: {}'.format(df_test_seqs_cat_feas.shape))
logger.info('df_test_seqs_num_feas: {}'.format(df_test_seqs_num_feas.shape))

id_count = products_encoded.shape[0]

train_preds_encoded = pickle.load(open('./data/train_preds_all_encoded.pkl', 'rb'))  # (len_train, 100)
test_preds_encoded = pickle.load(open('./data/test_preds_all_encoded.pkl', 'rb'))  # (len_test, 100)
test_preds = pickle.load(open('./data/test_preds_all.pkl', 'rb'))
logger.info('train_preds_encoded: {}'.format(len(train_preds_encoded)))
logger.info('test_preds_encoded: {}'.format(len(test_preds_encoded)))
logger.info('test_preds: {}'.format(len(test_preds)))

logger.info('Cutting the candidate_set to {}'.format(len_candidate_set))
cut_train_preds_encoded = [lst[:len_candidate_set] for lst in tqdm(train_preds_encoded, total=len(train_preds_encoded))]
df_train_encoded['recall'] = cut_train_preds_encoded
cut_test_preds_encoded = [lst[:len_candidate_set] for lst in tqdm(test_preds_encoded, total=len(test_preds_encoded))]
df_test_encoded['recall'] = cut_test_preds_encoded

logger.info('Eval the prev_items')
df_train_encoded['prev_items'] = df_train_encoded['prev_items'].apply(eval)
df_test_encoded['prev_items'] = df_test_encoded['prev_items'].apply(eval)

df_test = pd.read_csv('data/sessions_test_task1.csv')
logger.info('df_test: {}'.format(df_test.shape))

tmp = pd.concat([df_train_seqs_feas_all[seqs_cat_feas], df_test_seqs_feas_all[seqs_cat_feas]])
tmp_nunique = (tmp.max() + 1).to_dict()  # 不是nunique，因为这个是计数特征，不是连续的0~n-1

data_feature = {}
data_feature['len_encode_brand'] = products_encoded['encode_brand'].nunique()
data_feature['len_encode_color'] = products_encoded['encode_color'].nunique()
data_feature['len_encode_size'] = products_encoded['encode_size'].nunique()
data_feature['len_encode_model'] = products_encoded['encode_model'].nunique()
data_feature['len_encode_material'] = products_encoded['encode_material'].nunique()
data_feature['len_encode_author'] = products_encoded['encode_author'].nunique()
data_feature['len_locale'] = len(loc2id)
data_feature['dense_bins'] = dense_bins
data_feature['id_count'] = id_count
data_feature['len_features'] = products_encoded.shape[1] - 1
data_feature['len_emb_features'] = 3
data_feature['len_candidate_set'] = len_candidate_set
data_feature['w2v_vector_size'] = w2v_vector_size
data_feature['sentence_vector_size'] = 384
data_feature['len_seqs_cat_feas'] = len(seqs_cat_feas)
data_feature['len_seqs_num_feas'] = len(seqs_num_feas)
data_feature['seq_emb_factor'] = seq_emb_factor
data_feature.update(tmp_nunique)
logger.info('data_feature:')
logger.info(data_feature)

del tmp

2023-05-15 03:28:43,691 - INFO - Log directory: ./log
2023-05-15 03:28:43,694 - INFO - Exp_id 61323
2023-05-15 03:28:43,699 - INFO - read data
2023-05-15 03:29:08,912 - INFO - titles_embedding: (1410675, 384)
2023-05-15 03:29:08,918 - INFO - descs_embedding: (1410675, 384)
2023-05-15 03:29:16,415 - INFO - product2id: 1410675
2023-05-15 03:29:16,417 - INFO - id2product: 1410675
2023-05-15 03:29:20,999 - INFO - word2vec_embedding: (1410675, 128)
2023-05-15 03:29:32,881 - INFO - df_train_encoded: (3606249, 4)
2023-05-15 03:29:32,883 - INFO - df_test_encoded: (316971, 4)
2023-05-15 03:29:32,884 - INFO - products_encoded: (1410675, 14)
2023-05-15 03:29:32,885 - INFO - MinMaxScaler Norm products_num_feas
2023-05-15 03:29:34,923 - INFO - Load Hand-made Seq Features
2023-05-15 03:30:07,024 - INFO - df_train_seqs_feas_all: (3606249, 29)
2023-05-15 03:30:07,028 - INFO - df_test_seqs_feas_all: (316971, 29)
2023-05-15 03:30:07,035 - INFO - seqs_cat_feas: ['idNUNIQUE', 'idCOUNT', 'brandNUNIQUE', 'b

100%|██████████| 3606249/3606249 [01:17<00:00, 46743.71it/s] 
100%|██████████| 316971/316971 [00:03<00:00, 90874.01it/s] 


2023-05-15 03:34:41,414 - INFO - Eval the prev_items
2023-05-15 03:37:57,542 - INFO - df_test: (316971, 2)
2023-05-15 03:37:59,096 - INFO - data_feature:
2023-05-15 03:37:59,098 - INFO - {'len_encode_brand': 177190, 'len_encode_color': 203261, 'len_encode_size': 218061, 'len_encode_model': 524102, 'len_encode_material': 45569, 'len_encode_author': 30836, 'len_locale': 6, 'dense_bins': 10, 'id_count': 1410675, 'len_features': 13, 'len_emb_features': 3, 'len_candidate_set': 100, 'w2v_vector_size': 128, 'sentence_vector_size': 384, 'len_seqs_cat_feas': 14, 'len_seqs_num_feas': 15, 'seq_emb_factor': 4, 'idNUNIQUE': 133, 'idCOUNT': 475, 'brandNUNIQUE': 39, 'brandCOUNT': 475, 'colorNUNIQUE': 53, 'colorCOUNT': 269, 'sizeNUNIQUE': 115, 'sizeCOUNT': 475, 'modelNUNIQUE': 59, 'modelCOUNT': 475, 'materialNUNIQUE': 25, 'materialCOUNT': 475, 'authorNUNIQUE': 65, 'authorCOUNT': 190}


In [51]:
logger.info('Load Hand-made Seq Features')
df_train_seqs_feas_all = pd.read_csv('data/df_train_seqs_feas_all.csv')  # 29维特征
df_test_seqs_feas_all = pd.read_csv('data/df_test_seqs_feas_all.csv')
logger.info('df_train_seqs_feas_all: {}'.format(df_train_seqs_feas_all.shape))
logger.info('df_test_seqs_feas_all: {}'.format(df_test_seqs_feas_all.shape))
seqs_cat_feas = [f for f in df_train_seqs_feas_all.columns if 'NUNIQUE' in f or 'COUNT' in f]
seqs_num_feas = [f for f in df_train_seqs_feas_all.columns if f not in seqs_cat_feas]
logger.info('seqs_cat_feas: {}'.format(seqs_cat_feas))
logger.info('seqs_num_feas: {}'.format(seqs_num_feas))

logger.info('MinMaxScaler Norm seqs_num_feas')
mms = MinMaxScaler(feature_range=(0,1))
df_train_seqs_feas_all[seqs_num_feas] = mms.fit_transform(df_train_seqs_feas_all[seqs_num_feas])
df_test_seqs_feas_all[seqs_num_feas] = mms.fit_transform(df_test_seqs_feas_all[seqs_num_feas])
for fe in seqs_num_feas:
    df_train_seqs_feas_all[fe] = df_train_seqs_feas_all[fe].astype('float32')
    df_test_seqs_feas_all[fe] = df_test_seqs_feas_all[fe].astype('float32')

df_train_seqs_cat_feas = df_train_seqs_feas_all[seqs_cat_feas]
df_train_seqs_num_feas = df_train_seqs_feas_all[seqs_num_feas]
df_test_seqs_cat_feas = df_test_seqs_feas_all[seqs_cat_feas]
df_test_seqs_num_feas = df_test_seqs_feas_all[seqs_num_feas]
logger.info('df_train_seqs_cat_feas: {}'.format(df_train_seqs_cat_feas.shape))
logger.info('df_train_seqs_num_feas: {}'.format(df_train_seqs_num_feas.shape))
logger.info('df_test_seqs_cat_feas: {}'.format(df_test_seqs_cat_feas.shape))
logger.info('df_test_seqs_num_feas: {}'.format(df_test_seqs_num_feas.shape))

2023-05-15 04:17:29,954 - INFO - Load Hand-made Seq Features
2023-05-15 04:18:01,584 - INFO - df_train_seqs_feas_all: (3606249, 29)
2023-05-15 04:18:01,586 - INFO - df_test_seqs_feas_all: (316971, 29)
2023-05-15 04:18:01,587 - INFO - seqs_cat_feas: ['idNUNIQUE', 'idCOUNT', 'brandNUNIQUE', 'brandCOUNT', 'colorNUNIQUE', 'colorCOUNT', 'sizeNUNIQUE', 'sizeCOUNT', 'modelNUNIQUE', 'modelCOUNT', 'materialNUNIQUE', 'materialCOUNT', 'authorNUNIQUE', 'authorCOUNT']
2023-05-15 04:18:01,588 - INFO - seqs_num_feas: ['priceMEAN', 'priceSTD', 'priceMIN', 'priceMAX', 'priceSUM', 'len_titleMEAN', 'len_titleSTD', 'len_titleMIN', 'len_titleMAX', 'len_titleSUM', 'len_descMEAN', 'len_descSTD', 'len_descMIN', 'len_descMAX', 'len_descSUM']
2023-05-15 04:18:01,589 - INFO - MinMaxScaler Norm seqs_num_feas
2023-05-15 04:18:25,931 - INFO - df_train_seqs_cat_feas: (3606249, 14)
2023-05-15 04:18:25,934 - INFO - df_train_seqs_num_feas: (3606249, 15)
2023-05-15 04:18:25,935 - INFO - df_test_seqs_cat_feas: (316971, 1

In [47]:
df_test_seqs_num_feas.isna().any()

priceMEAN         True
priceSTD          True
priceMIN          True
priceMAX          True
priceSUM         False
len_titleMEAN     True
len_titleSTD      True
len_titleMIN      True
len_titleMAX      True
len_titleSUM     False
len_descMEAN      True
len_descSTD       True
len_descMIN       True
len_descMAX       True
len_descSUM      False
dtype: bool

In [48]:
df_with_nan = df_test_seqs_num_feas[df_test_seqs_num_feas.isna().any(axis=1)]

In [50]:
df_test_seqs_cat_feas[df_test_seqs_num_feas.isna().any(axis=1)]

Unnamed: 0,idNUNIQUE,idCOUNT,brandNUNIQUE,brandCOUNT,colorNUNIQUE,colorCOUNT,sizeNUNIQUE,sizeCOUNT,modelNUNIQUE,modelCOUNT,materialNUNIQUE,materialCOUNT,authorNUNIQUE,authorCOUNT
104606,3,3,0,0,0,0,0,0,0,0,0,0,0,0
104701,2,4,0,0,0,0,0,0,0,0,0,0,0,0
104978,2,2,1,1,1,1,0,0,0,0,1,1,0,0
105063,2,2,0,0,0,0,0,0,0,0,0,0,0,0
105116,2,3,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
316942,2,2,1,1,1,1,0,0,1,1,1,1,0,0
316951,3,3,0,0,0,0,0,0,0,0,0,0,0,0
316955,2,3,1,1,1,1,0,0,0,0,0,0,0,0
316968,4,6,0,0,0,0,0,0,0,0,0,0,0,0


In [49]:
df_with_nan

Unnamed: 0,priceMEAN,priceSTD,priceMIN,priceMAX,priceSUM,len_titleMEAN,len_titleSTD,len_titleMIN,len_titleMAX,len_titleSUM,len_descMEAN,len_descSTD,len_descMIN,len_descMAX,len_descSUM
104606,,,,,0.000000e+00,,,,,0.000000,,,,,0.000000
104701,,,,,0.000000e+00,,,,,0.000000,,,,,0.000000
104978,7.450000e-05,,7.450000e-05,7.450000e-05,5.730769e-06,0.400411,,0.401639,0.390782,0.019957,0.190381,,0.190381,0.190,0.012773
105063,,,,,0.000000e+00,,,,,0.000000,,,,,0.000000
105116,,,,,0.000000e+00,,,,,0.000000,,,,,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
316942,2.247500e-07,,2.247500e-07,2.247500e-07,1.728846e-08,0.396304,,0.397541,0.386774,0.019754,0.188377,,0.188377,0.188,0.012639
316951,,,,,0.000000e+00,,,,,0.000000,,,,,0.000000
316955,4.212500e-07,,4.212500e-07,4.212500e-07,3.240385e-08,0.318275,,0.319672,0.310621,0.015884,0.201403,,0.201403,0.201,0.013513
316968,,,,,0.000000e+00,,,,,0.000000,,,,,0.000000


In [41]:
df_train_seqs_num_feas[df_train_seqs_num_feas.isna()]

Unnamed: 0,priceMEAN,priceSTD,priceMIN,priceMAX,priceSUM,len_titleMEAN,len_titleSTD,len_titleMIN,len_titleMAX,len_titleSUM,len_descMEAN,len_descSTD,len_descMIN,len_descMAX,len_descSUM
0,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3606244,,,,,,,,,,,,,,,
3606245,,,,,,,,,,,,,,,
3606246,,,,,,,,,,,,,,,
3606247,,,,,,,,,,,,,,,


In [52]:
# 加载模型等

logger.info('create model')

products_input = {name: torch.tensor(products_encoded[name].values).to(device) for name in products_encoded.columns}

if 'BaseModel' in model_name:
    model = BaseModel(config, data_feature, products_input, word2vec_embedding, titles_embedding, descs_embedding).to(device)
elif 'MatchModel' in model_name:
    model = MatchModelV2(config, data_feature, products_input, word2vec_embedding, titles_embedding, descs_embedding).to(device)
else:
    raise ValueError('Error model name {}'.format(model_name))
logger.info(model)

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, mode='max', patience=lr_patience, factor=lr_decay_ratio)

for name, param in model.named_parameters():
    logger.info(str(name) + '\t' + str(param.shape) + '\t' +
                              str(param.device) + '\t' + str(param.requires_grad))
total_num = sum([param.nelement() for param in model.parameters()])
logger.info('Total parameter numbers: {}'.format(total_num))


# 数据集DataLoader

trn_idx_list = np.load('data/5fold_trn_idx_list.npy', allow_pickle=True)
val_idx_list = np.load('data/5fold_val_idx_list.npy', allow_pickle=True)
logger.info('Fold {}: trn_idx {}'.format(Fold, len(trn_idx_list[Fold])))
logger.info('Fold {}: val_idx {}'.format(Fold, len(val_idx_list[Fold])))

train_set = NNDatasetV2(df_train_encoded.iloc[trn_idx_list[Fold]], 
                        df_train_seqs_cat_feas.iloc[trn_idx_list[Fold]], 
                        df_train_seqs_num_feas.iloc[trn_idx_list[Fold]])
val_set = NNDatasetV2(df_train_encoded.iloc[val_idx_list[Fold]], 
                      df_train_seqs_cat_feas.iloc[val_idx_list[Fold]], 
                      df_train_seqs_num_feas.iloc[val_idx_list[Fold]])
test_set = NNDatasetV2(df_test_encoded, df_test_seqs_cat_feas, 
                       df_test_seqs_num_feas)
logger.info('train_set: {}'.format(len(train_set)))
logger.info('val_set: {}'.format(len(val_set)))
logger.info('test_set: {}'.format(len(test_set)))


def collate_fn(indices):
    batch_prev_items = []
    batch_locale = []
    batch_candidate_set = []
    batch_len = []
    batch_mask = []
    batch_label = []
    batch_label_index = []  # 交叉熵需要的是label在候选集中的index
    batch_seq_cat = []
    batch_seq_num = []
    for item in indices:
        batch_len.append(len(item[0]))  # prev_items
    max_len = max(batch_len)
    for item in indices:
        l = len(item[0])
        batch_mask.append([1] * (l) + [0] * (max_len - l))  # 0代表padding的位置，需要mask
    for item in indices:
        # ['prev_items', 'locale', 'recall', 'next_item', 'seqs_cat_feas', 'seqs_num_feas']
        prev_items = item[0].copy()
        while (len(prev_items) < max_len):
            prev_items.append(id_count)  # embdding的时候id_count+1，把id_count作为padding了
        batch_prev_items.append(prev_items)
        batch_locale.append(item[1])
        batch_candidate_set.append(item[2].copy())
        batch_label.append(item[3])
        if item[3] in item[2]:
            batch_label_index.append(item[2].index(item[3]))
        else:
            batch_label_index.append(len(item[2]))
        batch_seq_cat.append(item[4])
        batch_seq_num.append(item[5])
    return [torch.LongTensor(batch_prev_items).to(device), torch.LongTensor(batch_locale).to(device), 
            torch.LongTensor(batch_candidate_set).to(device),
            torch.LongTensor(batch_len).to(device), torch.LongTensor(batch_mask).to(device), 
            torch.LongTensor(batch_label).to(device), torch.LongTensor(batch_label_index).to(device),
            torch.LongTensor(batch_seq_cat).to(device), torch.FloatTensor(batch_seq_num).to(device)]


train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
logger.info('train_loader: {}'.format(len(train_loader)))
logger.info('val_loader: {}'.format(len(val_loader)))
logger.info('test_loader: {}'.format(len(test_loader)))


output_dir = 'ckpt/{}'.format(exp_id)
ensure_dir(output_dir)

if load_init:
    load_dir = 'ckpt/{}'.format(load_exp_id)
    load_path = '{}/{}_{}_{}.pt'.format(load_dir, load_exp_id, model_name, load_epoch)
    logger.info('Load Init model from {}'.format(load_path))
    model.load_state_dict(torch.load(load_path, map_location='cpu'))
    # print(model.device)

2023-05-15 04:22:54,317 - INFO - create model
2023-05-15 04:23:32,272 - INFO - MatchModelV2(
  (product_emb): ProductEmbedding(
    (product_fea): Product(
      (locale_emb): Embedding(6, 16)
      (price_emb): Linear(in_features=1, out_features=16, bias=True)
      (len_title_emb): Linear(in_features=1, out_features=16, bias=True)
      (len_desc_emb): Linear(in_features=1, out_features=16, bias=True)
      (encode_brand_emb): Embedding(177190, 16)
      (encode_color_emb): Embedding(203261, 16)
      (encode_size_emb): Embedding(218061, 16)
      (encode_model_emb): Embedding(524102, 16)
      (encode_material_emb): Embedding(45569, 16)
      (encode_author_emb): Embedding(30836, 16)
      (encode_price_emb): Embedding(10, 16)
      (encode_len_title_emb): Embedding(10, 16)
      (encode_len_desc_emb): Embedding(10, 16)
    )
    (title_emb): Embedding(1410676, 384, padding_idx=1410675)
    (title_linear): Linear(in_features=384, out_features=32, bias=True)
    (desc_emb): Embedding

In [53]:
for batch_prev_items, batch_locale, batch_candidate_set, batch_len, batch_mask, \
                batch_label, batch_label_index, batch_seq_cat, batch_seq_num in tqdm(train_loader, desc='train model {}'.format(exp_id), total=len(train_loader)):
    print(batch_seq_cat.shape, batch_seq_num.shape)
    print(torch.isnan(batch_seq_num).any())
    break

train model 61323:   0%|          | 0/2818 [00:00<?, ?it/s]

torch.Size([1024, 14]) torch.Size([1024, 15])
tensor(False, device='cuda:2')





In [36]:
for batch_prev_items, batch_locale, batch_candidate_set, batch_len, batch_mask, \
                batch_label, batch_label_index, batch_seq_cat, batch_seq_num in tqdm(test_loader, desc='train model {}'.format(exp_id), total=len(test_loader)):
    print(batch_seq_cat.shape, batch_seq_num.shape)
    print(torch.isnan(batch_seq_num).any())
    break

train model 61323:   0%|          | 0/310 [00:00<?, ?it/s]

torch.Size([1024, 14]) torch.Size([1024, 15])
tensor(False, device='cuda:2')





In [54]:
score, loss = model.predict(batch_prev_items=batch_prev_items, batch_locale=batch_locale, 
                                        batch_candidate_set=batch_candidate_set, batch_len=batch_len, 
                                        batch_label=batch_label_index, batch_mask=batch_mask,
                                        batch_seq_cat=batch_seq_cat, batch_seq_num=batch_seq_num)

1 tensor(False, device='cuda:2') tensor(False, device='cuda:2')
3 tensor(False, device='cuda:2') tensor(False, device='cuda:2')
4 tensor(False, device='cuda:2') tensor(False, device='cuda:2')
5 tensor(False, device='cuda:2') tensor(False, device='cuda:2')
2 tensor(False, device='cuda:2') tensor(False, device='cuda:2')


In [55]:
origin_score = model.forward(batch_prev_items=batch_prev_items, batch_locale=batch_locale, 
                                        batch_candidate_set=batch_candidate_set, batch_len=batch_len, 
                                        batch_label=batch_label_index, batch_mask=batch_mask,
                                        batch_seq_cat=batch_seq_cat, batch_seq_num=batch_seq_num)

1 tensor(False, device='cuda:2') tensor(False, device='cuda:2')
3 tensor(False, device='cuda:2') tensor(False, device='cuda:2')
4 tensor(False, device='cuda:2') tensor(False, device='cuda:2')
5 tensor(False, device='cuda:2') tensor(False, device='cuda:2')
2 tensor(False, device='cuda:2') tensor(False, device='cuda:2')


In [9]:
seq_fea = model.seq_fea_emb.forward(batch_seq_cat, batch_seq_num)

3 tensor(False, device='cuda:2') tensor(False, device='cuda:2')
4 tensor(False, device='cuda:2') tensor(False, device='cuda:2')
5 tensor(True, device='cuda:2') tensor(False, device='cuda:2')


In [27]:
a = model.seq_fea_emb.seq_num_emb[0].forward(batch_seq_num)

In [None]:
a = model.seq_fea_emb.seq_num_emb.forward(batch_seq_num)

In [30]:
batch_seq_num.shape 

torch.Size([1024, 15])

In [33]:
torch.isnan(batch_seq_num).any()

tensor(True, device='cuda:2')

In [32]:
torch.isnan(batch_seq_cat).any()

tensor(False, device='cuda:2')

In [29]:
model.seq_fea_emb.seq_num_emb[0]

Linear(in_features=15, out_features=256, bias=True)

In [28]:
torch.isnan(a).any()

tensor(True, device='cuda:2')

In [14]:
torch.isnan(model.seq_fea_emb.seq_num_emb[0].weight).any()

tensor(False, device='cuda:2')

In [25]:
torch.isnan(model.seq_fea_emb.seq_num_emb[0].bias).any()

tensor(False, device='cuda:2')

In [16]:
torch.isnan(model.seq_fea_emb.seq_num_emb[2].weight).any()

tensor(False, device='cuda:2')

In [26]:
torch.isnan(model.seq_fea_emb.seq_num_emb[2].bias).any()

tensor(False, device='cuda:2')

In [13]:
model.seq_fea_emb.seq_num_emb[0].weight

Parameter containing:
tensor([[-0.1698, -0.1382, -0.0299,  ..., -0.2223, -0.1431,  0.1554],
        [ 0.2277, -0.0768,  0.2571,  ...,  0.0304, -0.0268, -0.2199],
        [-0.0926,  0.0366,  0.0841,  ..., -0.1207, -0.1244, -0.0577],
        ...,
        [ 0.1187,  0.1778,  0.0615,  ...,  0.2291, -0.1480, -0.0150],
        [ 0.2459,  0.0303,  0.1273,  ...,  0.0031, -0.1471,  0.0767],
        [ 0.2048, -0.2471, -0.0897,  ..., -0.1482, -0.0821,  0.0180]],
       device='cuda:2', requires_grad=True)

In [18]:
model.seq_fea_emb.seq_num_emb[2].weight

Parameter containing:
tensor([[ 0.0464, -0.0045,  0.0083,  ..., -0.0614,  0.0260, -0.0051],
        [ 0.0320,  0.0081, -0.0036,  ..., -0.0488, -0.0134, -0.0376],
        [ 0.0469,  0.0510, -0.0363,  ...,  0.0507,  0.0555,  0.0120],
        ...,
        [-0.0187,  0.0309, -0.0306,  ...,  0.0426,  0.0534, -0.0376],
        [-0.0186,  0.0345, -0.0244,  ...,  0.0400, -0.0067,  0.0009],
        [-0.0617,  0.0360,  0.0541,  ..., -0.0453,  0.0325,  0.0001]],
       device='cuda:2', requires_grad=True)

In [19]:
model.seq_fea_emb.seq_cat_emb[0].weight

Parameter containing:
tensor([[-0.0290,  0.0571,  0.0445,  ..., -0.0549,  0.0114, -0.0176],
        [-0.0340,  0.0471,  0.0197,  ...,  0.0143, -0.0160, -0.0311],
        [ 0.0572, -0.0158,  0.0650,  ..., -0.0228, -0.0290, -0.0478],
        ...,
        [ 0.0175, -0.0576,  0.0565,  ...,  0.0112,  0.0423,  0.0325],
        [ 0.0220,  0.0642, -0.0455,  ..., -0.0289,  0.0323,  0.0132],
        [-0.0546, -0.0563,  0.0144,  ..., -0.0452,  0.0218, -0.0116]],
       device='cuda:2', requires_grad=True)

In [20]:
model.seq_fea_emb.seq_cat_emb[2].weight

Parameter containing:
tensor([[ 0.0277, -0.0287, -0.0604,  ..., -0.0460,  0.0555,  0.0345],
        [-0.0079,  0.0150,  0.0315,  ...,  0.0191,  0.0371, -0.0398],
        [-0.0033,  0.0423,  0.0566,  ...,  0.0389,  0.0315,  0.0210],
        ...,
        [-0.0579, -0.0562,  0.0390,  ..., -0.0594,  0.0487, -0.0598],
        [-0.0621,  0.0214, -0.0616,  ..., -0.0214, -0.0563, -0.0581],
        [-0.0032, -0.0371, -0.0409,  ...,  0.0337,  0.0503,  0.0557]],
       device='cuda:2', requires_grad=True)

In [10]:
print(4, torch.isnan(seq_fea).any(), torch.isinf(seq_fea).any())

4 tensor(True, device='cuda:2') tensor(False, device='cuda:2')


In [8]:
torch.isnan(origin_score).sum()

tensor(11700, device='cuda:2')

In [28]:
(torch.softmax(origin_score, dim=1) == score).any()

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [16]:
score

tensor([[0.0085, 0.0097, 0.0083,  ..., 0.0136, 0.0088, 0.0105],
        [0.0099, 0.0118, 0.0120,  ..., 0.0104, 0.0091, 0.0104],
        [0.0110, 0.0120, 0.0075,  ..., 0.0088, 0.0083, 0.0115],
        ...,
        [0.0095, 0.0076, 0.0087,  ..., 0.0092, 0.0077, 0.0088],
        [0.0097, 0.0113, 0.0110,  ..., 0.0105, 0.0104, 0.0127],
        [0.0073, 0.0078, 0.0098,  ..., 0.0130, 0.0101, 0.0119]],
       device='cuda:3', grad_fn=<SoftmaxBackward0>)

In [30]:
origin_score.cpu().numpy()

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [29]:
torch.save(origin_score, './origin_score.pt')
torch.save(score, './score.pt')
torch.save(batch_label_index, './batch_label_index.pt')

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [19]:
origin_score

tensor([[-0.2161, -0.0138, -0.1616,  ...,  0.0532,  0.1819,  0.1372],
        [-0.0826,  0.0503,  0.2164,  ...,  0.1613, -0.0115, -0.0652],
        [-0.2265,  0.0434, -0.0520,  ..., -0.0257, -0.0259,  0.1452],
        ...,
        [ 0.2151, -0.3285, -0.0392,  ..., -0.0140, -0.1214,  0.0703],
        [ 0.0691,  0.1465,  0.0234,  ...,  0.1110,  0.2160,  0.3192],
        [-0.2882, -0.1319, -0.1053,  ..., -0.1123, -0.0745,  0.1891]],
       device='cuda:3', grad_fn=<SqueezeBackward1>)

In [23]:
origin_score.shape, batch_label_index.shape

(torch.Size([1024, 100]), torch.Size([1024]))

In [20]:
batch_label_index

tensor([36,  2,  7,  ...,  2, 28, 53], device='cuda:3')

In [24]:
loss_func = nn.CrossEntropyLoss(ignore_index=data_feature['len_candidate_set']).to(device) 

In [26]:
loss_func

CrossEntropyLoss()

In [27]:
data_feature['len_candidate_set']

100

In [25]:
loss_func(origin_score, batch_label)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [17]:
loss

tensor(nan, device='cuda:3', grad_fn=<NllLossBackward0>)

In [7]:
batch_seq_cat

tensor([[4, 5, 4,  ..., 4, 0, 0],
        [8, 9, 4,  ..., 3, 0, 0],
        [2, 2, 1,  ..., 1, 0, 0],
        ...,
        [2, 3, 1,  ..., 3, 0, 0],
        [2, 2, 1,  ..., 0, 0, 0],
        [1, 2, 1,  ..., 2, 0, 0]], device='cuda:3')

In [8]:
batch_seq_num

tensor([[2.8975e-07, 2.6173e-07, 1.7475e-07,  ..., 2.2122e-01, 4.8800e-01,
         3.8864e-02],
        [5.7172e-05, 2.7616e-05, 3.8225e-05,  ..., 1.3013e-02, 4.7000e-02,
         4.5832e-03],
        [4.2475e-05, 0.0000e+00, 4.2475e-05,  ..., 1.8318e-01, 1.8300e-01,
         7.5902e-03],
        ...,
        [9.6917e-05, 2.5311e-05, 7.6250e-05,  ..., 6.2062e-02, 6.7000e-02,
         4.0647e-03],
        [1.8125e-05, 6.2500e-06, 1.5000e-05,  ..., 1.4815e-01, 1.4800e-01,
         6.1385e-03],
        [4.4975e-07, 0.0000e+00, 4.4975e-07,  ..., 3.4134e-01, 3.4100e-01,
         1.4144e-02]], device='cuda:3')

In [9]:
products_encoded

Unnamed: 0,id,locale,price,len_title,len_desc,encode_brand,encode_color,encode_size,encode_model,encode_material,encode_author,encode_price,encode_len_title,encode_len_desc
0,0,0,30.950001,96.0,121.0,112134,203260,218060,426630,45568,30835,0,2,2
1,1,0,17.900000,186.0,330.0,124505,203260,128007,524101,45568,30835,0,4,5
2,2,0,68.889999,181.0,95.0,122979,114264,170270,145013,15566,30835,0,4,1
3,3,0,18.990000,101.0,191.0,9834,46931,218060,67408,29357,30835,0,2,3
4,4,0,7.170000,45.0,15.0,105135,117844,170305,174527,23064,30835,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1410670,1410670,5,578.979980,124.0,250.0,116789,55013,109396,524101,45568,30835,0,2,4
1410671,1410671,5,43.490002,195.0,479.0,121227,80328,143053,524101,19188,30835,0,4,7
1410672,1410672,5,8.410000,144.0,57.0,52556,39304,6470,257247,45568,30835,0,3,1
1410673,1410673,5,100.000000,59.0,85.0,23342,203260,218060,57350,45568,30835,0,1,1


In [10]:
products_input

{'id': tensor([      0,       1,       2,  ..., 1410672, 1410673, 1410674],
        device='cuda:3'),
 'locale': tensor([0, 0, 0,  ..., 5, 5, 5], device='cuda:3'),
 'price': tensor([ 30.9500,  17.9000,  68.8900,  ...,   8.4100, 100.0000,  18.3500],
        device='cuda:3'),
 'len_title': tensor([ 96., 186., 181.,  ..., 144.,  59., 113.], device='cuda:3'),
 'len_desc': tensor([121., 330.,  95.,  ...,  57.,  85.,  24.], device='cuda:3'),
 'encode_brand': tensor([112134, 124505, 122979,  ...,  52556,  23342,  89982], device='cuda:3'),
 'encode_color': tensor([203260, 203260, 114264,  ...,  39304, 203260, 170638], device='cuda:3'),
 'encode_size': tensor([218060, 128007, 170270,  ...,   6470, 218060, 218060], device='cuda:3'),
 'encode_model': tensor([426630, 524101, 145013,  ..., 257247,  57350, 524101], device='cuda:3'),
 'encode_material': tensor([45568, 45568, 15566,  ..., 45568, 45568, 18060], device='cuda:3'),
 'encode_author': tensor([30835, 30835, 30835,  ..., 30835, 30835, 30835],

In [None]:
mms = MinMaxScaler(feature_range=(0,1))
data[dense_features] = mms.fit_transform(data[dense_features])

In [11]:
model.seq_fea_emb(batch_seq_cat, batch_seq_num)

tensor([[-5.1054e-01, -3.6703e-01,  4.0416e-01,  ..., -6.3685e+01,
         -6.2647e+01,  3.2512e+01],
        [-3.1322e-01, -3.3288e-01,  9.6511e-02,  ...,  3.2139e+02,
         -1.0420e+02,  8.0631e+02],
        [-7.0365e-02, -2.7373e-01, -4.4755e-02,  ...,  2.0069e+00,
          6.9152e+01,  2.6755e+02],
        ...,
        [-1.9821e-01, -3.8917e-01,  1.4973e-01,  ...,  1.8869e+02,
          1.3707e+02,  6.3639e+02],
        [-2.9664e-01, -4.2047e-01,  1.1163e-01,  ..., -7.9712e+00,
          1.4268e+01,  9.6694e+01],
        [-4.3668e-02, -2.9515e-01, -3.6954e-02,  ..., -1.9614e+01,
         -2.9803e+01,  3.8246e+01]], device='cuda:3', grad_fn=<CatBackward0>)