In [1]:
import os
import re

import numpy as np
import pandas as pd
from gensim.summarization import bm25
from tqdm import tqdm

#### Step 1 - Load data

In [2]:
# Root of the PChome test collection, relative to this notebook.
DATA_DIR = '../../../../../PChome_datasets/search/pchome_test_collection/'

# Round-1 query sets. This run uses the validation set; pass TEST_QUERY_PATH
# to pd.read_csv below to run on the test queries instead.
TEST_QUERY_PATH = DATA_DIR + 'round1/test_query/test_query_250.csv'
VALID_QUERY_PATH = DATA_DIR + 'round1/valid_query/valid_query_200.csv'

# NOTE: the frame is called `test_query_df` even though it currently holds the
# validation queries, because the later cells refer to it by that name.
test_query_df = pd.read_csv(VALID_QUERY_PATH)

# product item
item_df = pd.read_parquet(DATA_DIR + 'round0/product_collection/product_collection_lg.parquet')

#### Step 2 - Build model

In [3]:
# 1. method 2: chinese char and english word
def ch_char_en_word_tokenizer(x):
    """Split text into single CJK characters and alphanumeric word tokens.

    The text is lower-cased first. Every character in the range U+4E00-U+9FFF
    becomes a token of its own. Every run of ASCII letters, digits or the
    degree sign becomes one token, and hyphens that follow such a run (together
    with any further letters/digits/degree signs) stay attached to it, so
    'EF-1268BA' yields the single token 'ef-1268ba'. All other characters act
    as separators and are dropped.

    Args:
        x: The string to tokenize.

    Returns:
        A list of token strings in order of appearance.
    """
    token_re = re.compile(r'([\u4E00-\u9FFF]|[A-Za-z0-9\°]+(\-*[A-Za-z0-9\°]*)*)')
    lowered = x.lower()
    # group(0) is the whole match, so the inner helper group never leaks into
    # the returned tokens.
    return [match.group(0) for match in token_re.finditer(lowered)]

# english word digit tokenizer
def en_word_digit_tokenizer(x):
    """Split text into separate runs of ASCII letters and runs of digits.

    The text is lower-cased first. Letters and digits never share a token, so
    'EF-1268BA' yields ['ef', '1268', 'ba']. Every other character (including
    CJK characters and hyphens) is a separator and is dropped.

    Args:
        x: The string to tokenize.

    Returns:
        A list of token strings in order of appearance.
    """
    return re.findall(r'([A-Za-z]+|[0-9]+)', x.lower())

# Tokenize every product name, keeping the row order of item_df so that
# position i in this list corresponds to row i of item_df.
tokenized_items = [ch_char_en_word_tokenizer(name) for name in item_df['name']]

In [4]:
# 2. bm25 model
# Build the BM25 index over the tokenized product names. Document i of the
# model is tokenized_items[i], i.e. the i-th row of item_df.
bm25Model = bm25.BM25(tokenized_items)

#### Step 3 - Search function

In [5]:
def search(query, top_k=50):
    """Return the positions of the best BM25 matches for a query.

    The query is tokenized with ch_char_en_word_tokenizer and scored against
    every document in bm25Model. If no document receives a non-zero score, the
    query is printed (as a diagnostic) and re-scored with the coarser
    en_word_digit_tokenizer, which splits letter and digit runs apart.

    Args:
        query: The raw query string.
        top_k: How many positions to return (default 50).

    Returns:
        A NumPy integer array of at most top_k positions into the tokenized
        corpus, ordered from highest to lowest score. When neither tokenizer
        produces a non-zero score, all scores tie and the returned positions
        carry no relevance signal.
    """
    scores = np.array(bm25Model.get_scores(ch_char_en_word_tokenizer(query)))

    # "Nothing matched" means every score is exactly zero. Testing the sum
    # would also trigger when positive and negative scores happen to cancel.
    if not scores.any():
        print(query)
        scores = np.array(bm25Model.get_scores(en_word_digit_tokenizer(query)))

    # Negate so that argsort's ascending order ranks the highest score first.
    return np.argsort(-scores)[:top_k]

#### Step 4 - Output search results for test queries

In [6]:
# Directory that receives one parquet file per query for the pooling step.
RESULT_DIR = './results/results_round1_valid_query/'
# Create it up front so a fresh checkout can run this cell without manual setup.
os.makedirs(RESULT_DIR, exist_ok=True)

# iterrows() yields each query row as a Series; passing total lets tqdm show
# real progress instead of a bare counter.
for row_label, r in tqdm(test_query_df.iterrows(), total=len(test_query_df)):
    top_50_indices = search(r['query'])

    # search() returns row positions in the tokenized corpus, which was built
    # in item_df row order, so select by position and keep the ranking order.
    results = item_df.iloc[top_50_indices].reset_index(drop=True)

    # save result files for pooling step, named by zero-padded query_id
    # NOTE(review): the previous comment said "id start from 250"; that
    # presumably described a different query set - confirm.
    results.to_parquet(RESULT_DIR + 'result_' + str(r['query_id']).zfill(3) + '.parquet')

125it [01:59,  1.01s/it]

5300u


189it [03:07,  1.06s/it]

lr6egt


197it [03:17,  1.10s/it]

ef-1268ba


200it [03:21,  1.01s/it]


#### Sandbox

In [7]:
# search for assigned query (first row of the query frame, by position)
query = test_query_df['query'].iloc[0]
top_50_indices = search(query)
# search() returns row positions in the tokenized corpus, which follows
# item_df row order, so look the rows up by position rather than by label.
item_df.iloc[top_50_indices]

Unnamed: 0,item_id,name
562018,DJAH0H-A75182816,文具病：愛文具、玩文具、品文具，以文具傳達生活態度的25種可能
1489649,DJAP0T-A42169844,故宮文物寶藏文具篇：文具的故事
2093540,DJAH0H-A82260802,文具手帖Season04：好色文具強迫症！
622818,DJAH2O-A9007YQWI,趣味文具大集合 VOL.41：文具贈禮
712380,DJAH0H-A9007LDDB,文具手帖：台灣文具屋散策（熄燈號）
648737,DJAH2O-A9005N6QJ,趣味文具大集合 VOL.30：文具的快樂
1874143,DJAH0H-A900A9T3K,文房具解剖圖鑑：23款經典文具「進化論」，圖解文具的構造、形狀、演變歷程，譜出浪漫文具文化史！
203575,DJAP06-A9007KZYS,文具手帖（1～4）文具好朋友勸敗套書
1177395,DJAP06-A9007L011,文具手帖（5～8）文具偏執狂必藏套書
438940,DJAM0H-A68469142,戀：文具物語


In [5]:
N = 0
print("query: ", test_query_df['query'][N])
result_df = pd.read_parquet('./results/results_round1_valid_query/result_'+str(N).zfill(3)+'.parquet')
result_df

query:  文具


Unnamed: 0,item_id,name
0,DJAH0H-A75182816,文具病：愛文具、玩文具、品文具，以文具傳達生活態度的25種可能
1,DJAP0T-A42169844,故宮文物寶藏文具篇：文具的故事
2,DJAH0H-A82260802,文具手帖Season04：好色文具強迫症！
3,DJAH2O-A9007YQWI,趣味文具大集合 VOL.41：文具贈禮
4,DJAH0H-A9007LDDB,文具手帖：台灣文具屋散策（熄燈號）
5,DJAH2O-A9005N6QJ,趣味文具大集合 VOL.30：文具的快樂
6,DJAH0H-A900A9T3K,文房具解剖圖鑑：23款經典文具「進化論」，圖解文具的構造、形狀、演變歷程，譜出浪漫文具文化史！
7,DJAP06-A9007KZYS,文具手帖（1～4）文具好朋友勸敗套書
8,DJAP06-A9007L011,文具手帖（5～8）文具偏執狂必藏套書
9,DJAM0H-A68469142,戀：文具物語
