In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import sys
sys.path.append('../../../../')
from sentence_transformer_custom import SentenceTransformerCustom
import config



#### Step 1 - Load data

In [2]:
# Product catalogue that search results are looked up in.
product_collection_file = '../PChome_datasets/search/pchome_test_collection/round0/product_collection/product_collection_lg.parquet'
item_df = pd.read_parquet(product_collection_file)

# Alternative query sets, kept as toggles.
# NOTE(review): both commented lines below point at the same round1 file even though
# one is labelled "round0" — confirm the intended round0 path before enabling it.
# test_query_df = pd.read_csv('../PChome_datasets/search/pchome_test_collection/round1/test_query/test_query_250.csv') # round0 test query
# test_query_df = pd.read_csv('../PChome_datasets/search/pchome_test_collection/round1/test_query/test_query_250.csv') # round1 test query

# Active query set: the validation queries (stored under the name `test_query_df`,
# which the later cells read).
query_file = '../PChome_datasets/search/pchome_test_collection/round1/valid_query/valid_query_200.csv'
test_query_df = pd.read_csv(query_file)

In [8]:
# -------------------------------------------------------
#   Get product embeddings
# -------------------------------------------------------
# Produces `product_embeddings`, the array that `search` scores queries against.
#   'sm': encode the `name` column of the small product collection with the model.
#   'lg': load a precomputed embedding array from an .npy file under the model directory.
# NOTE(review): `search` looks results up in `item_df` (the lg collection loaded in
# Step 1); in 'sm' mode the embedding rows come from a different frame — confirm
# that this is intended before using 'sm'.
print('Get product embeddings ...')
product_collection_size = 'lg' # 'sm' or 'lg'

if product_collection_size == 'sm':
    # The encoder is otherwise only created in the "Build model" cell further down.
    # Create it here when it does not exist yet so this cell also works when the
    # notebook is run top to bottom on a fresh kernel.
    if 'model' not in globals():
        model = SentenceTransformerCustom(config.save_model_path)
    product_collection_sm_df = pd.read_parquet(config.product_collection_sm_path)
    name_sentences = product_collection_sm_df['name'].to_list()
    product_embeddings = model.encode(
        sentences=name_sentences,
        batch_size=128,
        normalize_embeddings=True,
        show_progress_bar=True,
    )
elif product_collection_size == 'lg':
    product_collection_lg_df = pd.read_parquet(config.product_collection_lg_path)
    # Precomputed embeddings stored as a numpy array file.
    product_embeddings = np.load(config.save_model_path + '/eval_reports/test_collection_lg/product_embeddings.npy')
else:
    # Fail loudly instead of leaving `product_embeddings` undefined (or stale from
    # an earlier run of this cell).
    raise ValueError(
        "product_collection_size must be 'sm' or 'lg', got %r" % (product_collection_size,)
    )

Get product embeddings ...


#### Step 2 - Build model

In [10]:
# -------------------------------------------------------
#   Model
# -------------------------------------------------------
# Show which experiment the config points at, then construct the encoder
# from `config.save_model_path`.
print(f'Experiment name: {config.exp_name}')
model = SentenceTransformerCustom(config.save_model_path)

Experiment name: bert_soft-margin-triplet-loss_train-lg_intent-based-neg-2


#### Step 3 - Search function

In [11]:
def search(query, top_k=50, encoder=None, embeddings=None, items=None):
    """Return the `top_k` highest-scoring products for a single query string.

    The query is encoded, scored against every product embedding with a dot
    product, and the best-scoring rows are returned in descending score order.

    Parameters
    ----------
    query : str
        The search text.
    top_k : int, default 50
        Maximum number of rows to return (capped at the number of embeddings).
    encoder : object, optional
        Anything with an ``encode(sentences=..., batch_size=..., normalize_embeddings=...)``
        method. Defaults to the notebook-level ``model``.
    embeddings : numpy.ndarray, optional
        2-D array with one row per product. Defaults to the notebook-level
        ``product_embeddings``.
    items : pandas.DataFrame, optional
        Product frame whose i-th row corresponds to the i-th embedding row.
        Defaults to the notebook-level ``item_df``.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``items``, best match first, original index kept.
    """
    # Fall back to the notebook globals so existing `search(query)` calls are unchanged.
    encoder = model if encoder is None else encoder
    embeddings = product_embeddings if embeddings is None else embeddings
    items = item_df if items is None else items

    # Encode the single query; the result has one row.
    query_embeddings = encoder.encode(
        sentences=[query],
        batch_size=128,
        normalize_embeddings=True,
    )

    # One score per product.
    # NOTE(review): this is a cosine similarity only if the product embeddings
    # were normalized too — confirm for the precomputed .npy file.
    scores = np.dot(query_embeddings, embeddings.T)[0]
    top_k = min(top_k, scores.shape[0])
    top_indices = np.argsort(-scores)[:top_k]

    # argsort yields row *positions*, so select by position (.iloc). Label-based
    # .loc only happens to work when the frame index is exactly 0..n-1.
    return items.iloc[top_indices]

In [12]:
# Example call: run one query through `search`; the returned DataFrame is the cell output.
search("vga轉hdmi")

Unnamed: 0,item_id,name
1056473,DCACYC-A900B0DKC,HDMI TO VGA轉接線
335168,DCACRX-A90099ESR,HDMI轉VGA轉接線-1.8米 HDMI(公) TO VGA(公)
576210,DCACYC-A900BEGL4,HDMI轉VGA接頭 黑色
193532,DRAD5V-A90066HU3,VGA(公)轉HDMI(母)影音轉接線
789845,CAAQ12-A74419917,VGA轉HDMI轉換器
709574,DCACYC-A900BCSA1,HDMI轉VGA接頭 白色
2009483,DCACRJ-A9008MNCX,HDMI(公) to VGA(母) 視頻轉接線 轉接器-黑色
464535,DMAA2K-A9008WYMY,VGA(公)轉HDMI(母) 訊號影音傳輸轉接器
1913648,DCACRJ-A9008MNCK,HDMI(公) to VGA(母) 視頻轉接線 轉接器-白色
455184,DMAA2K-A900AEJDO,(5年保固) 嚴選 HDMI TO VGA視頻轉接線(黑)


#### Step 4 - Output search results for test queries

In [13]:
# Run every query through `search` and write one parquet file per query for the
# pooling step. Files are named by the query_id, zero-padded to three digits.
# (Original note: "id start from 250".)
# Iterating the two columns directly avoids building a Series per row, and passing
# `total` lets tqdm draw a real progress bar with an ETA.
query_pairs = zip(test_query_df['query_id'], test_query_df['query'])
for query_id, query in tqdm(query_pairs, total=len(test_query_df)):
    results = search(query)
    results.to_parquet('./dataset_process/test_collection_round1/runs/triplet_bert/results/results_round1_valid_query/result_'+str(query_id).zfill(3)+'.parquet')

200it [02:16,  1.47it/s]


#### sandbox

In [6]:
# Inspect the saved results for the N-th query (0-based row position in test_query_df).
N = 0
sample_row = test_query_df.iloc[N]
print("query: ", sample_row['query'])
# The export loop names each file by the row's query_id, so take the id from the
# same row as the printed query. That keeps the two in step even when query_id
# values do not start at 0.
result_df = pd.read_parquet('./dataset_process/test_collection_round1/runs/triplet_bert/results/results_round1_valid_query/result_'+str(sample_row['query_id']).zfill(3)+'.parquet')
result_df

query:  文具


Unnamed: 0,item_id,name
1506439,DJAV0S-A90080KTK,最新便利文具用品特選專集
562018,DJAH0H-A75182816,文具病：愛文具、玩文具、品文具，以文具傳達生活態度的25種可能
1300558,DJAH2O-A81683547,日本最新實用便利個性文具商品特選2014
622818,DJAH2O-A9007YQWI,趣味文具大集合 VOL.41：文具贈禮
1668611,DJAH0H-A9008A35F,文具的品格：全球經典文具的深度巡禮
1248965,DEAHFY-A9006XKAO,磁鐵書籤 - 文具系列 【STUDIO】
1512382,DJAP4E-A900AI84T,實用文具用品完全精選圖鑑 最新版
1582321,DJAH2O-A9005YQ1O,文具用品便利生活特選專集
648737,DJAH2O-A9005N6QJ,趣味文具大集合 VOL.30：文具的快樂
118533,DEAHFY-A9006XK9Y,磁鐵書籤 - 文具系列 【SCHOOL】


In [61]:
import random
# Draw a random value for N from 0..99 (inclusive) and show it.
N = random.randrange(100)
print("N: ", N)

N:  97
