In [1]:
from tqdm import tqdm
import numpy as np
import pandas as pd
import jieba
import re
import import_ipynb
from cilin_lexicon_v2 import OutPutLexicon

importing Jupyter notebook from cilin_lexicon_v2.ipynb


#### Step 1 - Load data

In [2]:
# Root of the PChome test collection, relative to this notebook. Change it here
# if the datasets live somewhere else; every path below is derived from it.
COLLECTION_DIR = '../../../../../PChome_datasets/search/pchome_test_collection'

# Query set to run. Swap the two lines to use the test queries instead of the
# validation queries.
# test_query_df = pd.read_csv(COLLECTION_DIR + '/round1/test_query/test_query_250.csv')
test_query_df = pd.read_csv(COLLECTION_DIR + '/round1/valid_query/valid_query_200.csv')

# Product collection that is indexed and searched.
item_df = pd.read_parquet(COLLECTION_DIR + '/round0/product_collection/product_collection_lg.parquet')

#### load cilin lexicon

In [4]:
# One-off lexicon export, kept for reference (uncomment to run).
# NOTE(review): presumably this writes the per-class lexicon .txt files and
# returns the list of class names -- confirm in cilin_lexicon_v2.

# oplex = OutPutLexicon('cilin_labeled_data_.json')
# class_name = oplex.start()

In [5]:
# Add each custom lexicon file to jieba's dictionary, in list order.
# Earlier variant that also loaded a 'name' lexicon:
# class_name = ['brand','name','type','p-other']
class_name = ['type', 'brand', 'p-other']
for lexicon in class_name:
    jieba.load_userdict('./Lexicon_merge/{}.txt'.format(lexicon))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\EE303\AppData\Local\Temp\jieba.cache
Loading model cost 0.311 seconds.
Prefix dict has been built successfully.


#### Step 2 - Build model

#### model one

In [6]:
# Punctuation / blank tokens that are dropped after segmentation. Built once at
# definition time instead of on every call, and stored as a frozenset so the
# membership test is O(1); the tokenizer runs once per product name.
_JIEBA_STOP_TOKENS = frozenset(['【','】','/','~','＊','、','（','）','+','‧',' ',''])


def jieba_tokenizer(x):
    """Segment text with jieba and drop punctuation / blank tokens.

    Parameters
    ----------
    x : str
        Text to segment.

    Returns
    -------
    list of str
        The tokens produced by ``jieba.lcut(x, cut_all=False)``, in their
        original order, without any token listed in ``_JIEBA_STOP_TOKENS``.
    """
    return [token for token in jieba.lcut(x, cut_all=False)
            if token not in _JIEBA_STOP_TOKENS]

In [7]:
# Sanity check: tokenize a purely numeric query.
jieba_tokenizer('3090')

['3090']

In [8]:
# Import TfidfVectorizer from the scikit-learn library
from sklearn.feature_extraction.text import TfidfVectorizer

# Model one: word-level TF-IDF over jieba tokens, using unigrams and bigrams.
# A custom `tokenizer` replaces scikit-learn's regex tokenization, so
# `token_pattern` is never used; it is set to None explicitly so scikit-learn
# does not warn about an unused parameter. No stop-word list is configured on
# the vectorizer: punctuation filtering happens inside jieba_tokenizer.
tfidf = TfidfVectorizer(tokenizer=jieba_tokenizer, token_pattern=None, ngram_range=(1,2))

# Replace NaN with an empty string so every product name is a str
item_df['name'] = item_df['name'].fillna('')



In [9]:
# Fit the word-level TF-IDF model on the product names and build the sparse
# document-term matrix (one row per product name).
tfidf_matrix = tfidf.fit_transform(item_df['name'])
# Display the matrix shape: (number of product names, number of features)
tfidf_matrix.shape

(2252390, 5197395)

#### model two

In [10]:
# Model two: character-level TF-IDF, used by search() as a fallback when the
# word-level model finds too little similarity.
# `token_pattern` only applies when analyzer == 'word', so it is not passed here;
# no stop-word list is configured either.
tfidf_char = TfidfVectorizer(analyzer='char')

In [11]:
# Fit the character-level TF-IDF model on the product names and build the
# sparse document-term matrix (one row per product name).
tfidf_matrix_char = tfidf_char.fit_transform(item_df['name'])
# Display the matrix shape: (number of product names, number of features)
tfidf_matrix_char.shape

(2252390, 8156)

#### Step 3 - Search function

In [12]:
from sklearn.metrics.pairwise import cosine_similarity
def search(query, top_k=50, fallback_threshold=10):
    """Rank the indexed product names against ``query`` by TF-IDF cosine similarity.

    The word-level model (``tfidf`` / ``tfidf_matrix``) is tried first. When the
    total similarity it finds over the whole collection is below
    ``fallback_threshold``, the query is scored again with the character-level
    model (``tfidf_char`` / ``tfidf_matrix_char``) and that result is returned
    instead.

    Parameters
    ----------
    query : str
        Raw query text; the fitted vectorizers tokenize it themselves.
    top_k : int, default 50
        Number of best-scoring rows to return.
    fallback_threshold : float, default 10
        Minimum total similarity the word-level model must reach; below it the
        character-level model is used.

    Returns
    -------
    sum_of_score : numpy.float64
        Sum of the cosine similarities between the query and every indexed
        row, taken from whichever model produced the ranking.
    top_indices : numpy.ndarray
        Row positions in the TF-IDF matrix of the ``top_k`` highest-scoring
        rows, best first.
    """
    def _rank(vectorizer, matrix):
        # cosine_similarity returns a (1, n_rows) array for a single query.
        row = cosine_similarity(vectorizer.transform([query]), matrix)[0]
        # ndarray.sum() replaces Python's builtin sum(), which iterates the
        # scores one by one; totals may differ in the last floating-point digits.
        return row.sum(), np.argsort(-row)[:top_k]

    sum_of_score, top_indices = _rank(tfidf, tfidf_matrix)
    if sum_of_score < fallback_threshold:
        sum_of_score, top_indices = _rank(tfidf_char, tfidf_matrix_char)
    return sum_of_score, top_indices

In [13]:
# Ad-hoc query for a manual spot check. Uncomment one of the lines below to
# try a query taken from test_query_df instead.
# query = test_query_df.iloc[112]['query']
# query = test_query_df.iloc[225]['query']
query = '禮物'

In [14]:
sum_of_score, top_50_indices = search(query)
print("query: ", query)
print("sum_of_score: ", sum_of_score)
# search() returns row positions in the TF-IDF matrix, so rows are selected by
# position (.iloc). Label-based .loc only gives the same rows when item_df
# carries a default 0..n-1 index.
item_df.iloc[top_50_indices]

query:  禮物
sum_of_score:  297.5381864694536


Unnamed: 0,item_id,name
339451,DJAM1W-A900B2KBD,禮物
122351,DJAF0D-A79759704,交換禮物
1443509,QFCD6S-D9009ON3L,禮物（電子書）
1111037,DNAA3W-A60228716,禮物 DVD
667754,DJAF06-A900AEN3X,禮物(精裝)
825342,DJAM07-A26949851,天使的禮物
893188,DJAM05-A9005CN84,天使禮物
948980,DJAF0D-A9007OYIU,最後的禮物
705705,DJAM0H-A90094LUA,先生的禮物
2209387,DJAH0H-A9007TZ5G,永恆的禮物


#### Step 4 - Output search results for test queries

In [15]:
# Run every query and save its top results for the pooling step.
# Files are named result_<query_id zero-padded to 3 digits>.parquet.
# NOTE(review): the earlier comment said ids start from 250 -- confirm for this query set.
# iterrows() with an explicit total lets tqdm show a real progress bar and ETA.
for _, row in tqdm(test_query_df.iterrows(), total=len(test_query_df)):
    query = row['query']
    sum_of_score, top_50_indices = search(query)
    # search() returns row positions in the TF-IDF matrix, so select by
    # position (.iloc) rather than by index label (.loc).
    results = item_df.iloc[top_50_indices]

    results.to_parquet('./results/results_round1_valid_query/result_'+str(row['query_id']).zfill(3)+'.parquet')

200it [06:28,  1.94s/it]


#### Sandbox

In [7]:
# Row position in test_query_df of the query to inspect.
N = 0
row = test_query_df.iloc[N]
print("query: ",row['query'])
# Result files are named after the query's query_id, which need not equal the
# row position N, so the file name is built from query_id.
top_50 = pd.read_parquet('./results/results_round1_valid_query/result_'+str(row['query_id']).zfill(3)+'.parquet')
top_50

query:  文具


Unnamed: 0,item_id,name
562018,DJAH0H-A75182816,文具病：愛文具、玩文具、品文具，以文具傳達生活態度的25種可能
1274449,DEAHM3-A9007H88Q,文具置物盤
762168,DJAF0D-A76754434,文具精靈國
712380,DJAH0H-A9007LDDB,文具手帖：台灣文具屋散策（熄燈號）
648737,DJAH2O-A9005N6QJ,趣味文具大集合 VOL.30：文具的快樂
622818,DJAH2O-A9007YQWI,趣味文具大集合 VOL.41：文具贈禮
1306687,DJAH2O-A9005511R,文具設計造型完全圖鑑 NO.10：文具物語
1875355,DSACSD-A9007FT7F,文具膠帶3/4
203575,DJAP06-A9007KZYS,文具手帖（1～4）文具好朋友勸敗套書
488678,QFCD25-D900915XV,文具手帖（熄燈號）：臺灣文具屋散策（電子書）


In [None]:
# Report the total similarity for the first 50 queries and flag any query for
# which search() finds nothing at all.
for i in range(50):
    # Pass the raw query string: search() hands it to the fitted vectorizers,
    # which tokenize it themselves. A pre-tokenized list fails in the
    # vectorizer's preprocessing step, which expects a str.
    query = test_query_df.iloc[i]['query']
    sum_of_score, top_50_indices = search(query)
    print(i, sum_of_score)
    if sum_of_score == 0:
        print(i," Document Not Found Error")

In [None]:
# Collect every distinct character that appears in any product name.
# Updating one set in place avoids rebuilding a list and a set on every
# iteration; the element order of the final list is arbitrary, as before.
unique_chars = set()
for name in tqdm(item_df['name']):
    unique_chars.update(name)
merge = list(unique_chars)

In [None]:
# Print the first ten queries of the loaded query set.
for position in range(10):
    print(test_query_df['query'].iloc[position])