In [252]:
import pandas as pd
import numpy as np
import nltk
import json
import re

from sentence_transformers import SentenceTransformer
from itertools import islice, cycle
from pynndescent import NNDescent
from collections import Counter
from functools import reduce

In [247]:
# Fetch the NLTK resources used further down: the stopword corpora (read via
# nltk.corpus.stopwords) and the 'punkt' tokenizer data.
# NOTE(review): presumably 'punkt' is what nltk.word_tokenize relies on — confirm.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/atgmello/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/atgmello/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
# Load the item table; later cells read its 'domain_id' and 'title' columns.
item_data_filename = '../data/interim/item_data.parquet'
df_item = pd.read_parquet(item_data_filename)
# Optional subsetting of the item table, currently disabled.
#df_item = df_item.loc[:1_000]

In [4]:
# Load the training sessions and keep only a leading slice of them.
raw_filename = '../data/interim/train_dataset.parquet'
df_raw = pd.read_parquet(raw_filename)
# .loc slices by label and includes the end label, so this keeps every row up
# to and including label 1000 (1001 rows if the index is the default 0..N
# range — TODO confirm that exactly 1000 rows were not intended).
df_raw = df_raw.loc[:1_000]

In [5]:
def take(n, iterable):
    """Collect at most the first ``n`` elements of ``iterable`` into a list."""
    leading = islice(iterable, n)
    return [element for element in leading]

In [8]:
def preproc_user_history(s:str)->list:
    """Decode a stringified user history into Python objects.

    Every single quote is swapped for a double quote so the text can be read
    as JSON, the whole text is lower-cased, and the result is decoded with
    ``json.loads``.
    """
    json_text = s.replace("'", '"')
    normalized = json_text.lower()
    return json.loads(normalized)

In [9]:
# Replace each stringified history with its parsed list of event dicts.
# The column is overwritten in place, so re-running this cell on the already
# parsed values fails (preproc_user_history calls .replace, which lists lack).
df_raw['user_history'] = df_raw['user_history'].apply(preproc_user_history)

# Feature

## Most searched terms

### Most searched word

In [10]:
# Show the first rows of df_raw after user_history has been parsed.
df_raw.head()

Unnamed: 0,user_history,item_bought
0,"[{'event_info': 1786148, 'event_timestamp': '2...",1748830
1,"[{'event_info': 643652, 'event_timestamp': '20...",228737
2,"[{'event_info': 248595, 'event_timestamp': '20...",1909110
3,"[{'event_info': 'radioboss', 'event_timestamp'...",1197370
4,"[{'event_info': 'amazfit bip', 'event_timestam...",2049207


In [11]:
# Inspect one parsed history (row label 3): a list of event dicts with
# 'event_info', 'event_timestamp' and 'event_type' keys.
df_raw.loc[3, 'user_history']

[{'event_info': 'radioboss',
  'event_timestamp': '2019-10-08t16:23:51.630-0400',
  'event_type': 'search'},
 {'event_info': 505541,
  'event_timestamp': '2019-10-08t16:24:06.220-0400',
  'event_type': 'view'},
 {'event_info': 505541,
  'event_timestamp': '2019-10-08t16:25:56.141-0400',
  'event_type': 'view'},
 {'event_info': 505541,
  'event_timestamp': '2019-10-08t16:26:15.505-0400',
  'event_type': 'view'},
 {'event_info': 505541,
  'event_timestamp': '2019-10-08t16:26:46.149-0400',
  'event_type': 'view'},
 {'event_info': 505541,
  'event_timestamp': '2019-10-08t16:36:11.769-0400',
  'event_type': 'view'},
 {'event_info': 'radioboss',
  'event_timestamp': '2019-10-08t22:32:32.256-0400',
  'event_type': 'search'},
 {'event_info': 'sound forge',
  'event_timestamp': '2019-10-10t14:32:56.589-0400',
  'event_type': 'search'},
 {'event_info': 1230082,
  'event_timestamp': '2019-10-12t09:56:36.964-0400',
  'event_type': 'view'},
 {'event_info': 'sound forge plugins',
  'event_timestamp'

In [85]:
def get_most_searched_words(hist:list, n:int=3)->list:
    """Return the ``n`` most frequent search words with their counts, flattened.

    Parameters
    ----------
    hist : list
        Event dicts; only those whose ``'event_type'`` equals ``'search'`` are
        used, and their ``'event_info'`` is taken as the query text.
    n : int
        Number of (word, count) pairs to return.

    Returns
    -------
    list
        ``[word_1, count_1, ..., word_n, count_n]`` with lower-cased words and
        integer counts. When the history has fewer than ``n`` distinct words
        the available pairs are repeated cyclically to fill ``n`` slots. When
        it has no search events the list is empty.
    """
    # Lower-case before counting so case variants share one count, and split
    # on any whitespace so repeated spaces do not produce empty "words".
    searched_words = [word.lower()
                      for event in hist
                      if event['event_type'] == 'search'
                      for word in str(event['event_info']).split()]
    top_words = Counter(searched_words).most_common(n)
    # cycle() over an empty list yields nothing, so no searches -> [].
    padded_pairs = islice(cycle(top_words), n)
    # Flatten the pairs; only the word is text, the count stays an int.
    return [item for pair in padded_pairs for item in pair]

In [86]:
# One row per history: each flat [word, count, ...] list returned by
# get_most_searched_words is spread over positional columns 0, 1, 2, ...
pd.DataFrame(list(df_raw['user_history']
                  .apply(get_most_searched_words)))

Unnamed: 0,0,1,2,3,4,5
0,RELOGIO,1.0,SMARTWATCH,1.0,RELOGIO,1.0
1,DESMAMADEIRA,5.0,ELETRICA,5.0,DESMAMADEIRA,5.0
2,,,,,,
3,SOUND,4.0,FORGE,4.0,PLUGINS,3.0
4,AMAZFIT,10.0,BIP,10.0,BIPAMAZFIT,5.0
...,...,...,...,...,...,...
996,PS3,8.0,FIFA,4.0,STREET,4.0
997,APPLE,18.0,WATCH,18.0,S3,13.0
998,ATACADO,27.0,BABY,24.0,DOLL,24.0
999,MICRO,12.0,195,10.0,65,10.0


In [90]:
# Same table, with the positional columns renamed to descriptive
# word / count names. The result is only displayed, not stored.
(pd.DataFrame(list(df_raw['user_history']
                  .apply(get_most_searched_words))) 
             .rename(columns={0:
                              'most_searched_word_1',
                              1:
                              'most_searched_word_count_1',
                              2:
                              'most_searched_word_2',
                              3:
                              'most_searched_word_count_2',
                              4:
                              'most_searched_word_3',
                              5:
                              'most_searched_word_count_3'}))

Unnamed: 0,most_searched_word_1,most_searched_word_count_1,most_searched_word_2,most_searched_word_count_2,most_searched_word_3,most_searched_word_count_3
0,RELOGIO,1.0,SMARTWATCH,1.0,RELOGIO,1.0
1,DESMAMADEIRA,5.0,ELETRICA,5.0,DESMAMADEIRA,5.0
2,,,,,,
3,SOUND,4.0,FORGE,4.0,PLUGINS,3.0
4,AMAZFIT,10.0,BIP,10.0,BIPAMAZFIT,5.0
...,...,...,...,...,...,...
996,PS3,8.0,FIFA,4.0,STREET,4.0
997,APPLE,18.0,WATCH,18.0,S3,13.0
998,ATACADO,27.0,BABY,24.0,DOLL,24.0
999,MICRO,12.0,195,10.0,65,10.0


In [7]:
def preproc_search(s:str)->str:
    """Normalize a search query; for now this only lower-cases it."""
    # TODO: improve search preprocessing
    lowered = s.lower()
    return lowered

### Most searched bi-gram

In [12]:
def token_sliding_window(s:str, size:int):
    """Yield every run of ``size`` consecutive tokens of ``s`` as one string.

    Tokens are obtained by splitting ``s`` on single spaces; each window is
    re-joined with a single space. Nothing is yielded when ``s`` has fewer
    than ``size`` tokens.
    """
    words = s.split(' ')
    window_count = len(words) - size + 1
    yield from (' '.join(words[start:start + size])
                for start in range(window_count))

In [13]:
# Quick check of the sliding window with size 2 (bi-grams).
list(token_sliding_window("oi tudo bem com vc", 2))

['oi tudo', 'tudo bem', 'bem com', 'com vc']

In [14]:
def get_most_searched_ngram(hist:list, n:int=2, m:int=3)->list:
    """Return the ``m`` most frequent search n-grams with counts, flattened.

    The ``'event_info'`` text of every event whose ``'event_type'`` is
    ``'search'`` is cut into ``n``-token windows, the windows are counted, and
    the top ``m`` (n-gram, count) pairs are returned as
    ``[ngram_1, count_1, ..., ngram_m, count_m]`` ordered by descending count.
    With fewer than ``m`` distinct n-grams the available pairs are repeated
    cyclically; with none at all the list is empty.
    """
    queries = [event['event_info']
               for event in hist
               if event['event_type'] == 'search']
    ngrams = [gram
              for query in queries
              for gram in token_sliding_window(query, n)]
    top_pairs = Counter(ngrams).most_common(m)
    # Pad by cycling, then re-sort so repeated pairs sit next to their twins.
    padded_pairs = take(m, cycle(top_pairs))
    padded_pairs.sort(key=lambda pair: pair[1], reverse=True)
    flattened = []
    for gram, count in padded_pairs:
        flattened.extend((gram, count))
    return flattened

In [15]:
# get_most_searched_ngram is called with its default m=3, so every non-empty
# row holds three (ngram, count) pairs = six values; one name per value.
cols_feat = ['most_searched_ngram_1',
             'most_searched_ngram_count_1',
             'most_searched_ngram_2',
             'most_searched_ngram_count_2',
             'most_searched_ngram_3',
             'most_searched_ngram_count_3']
# Build the feature frame on df_raw's own index so the column-wise assignment
# aligns row by row regardless of what the index labels are.
df_raw[cols_feat] = (pd.DataFrame(list(df_raw['user_history']
                                       .apply(get_most_searched_ngram)),
                                  index=df_raw.index))

In [16]:
# Display df_raw with the n-gram feature columns added.
df_raw

Unnamed: 0,user_history,item_bought,most_searched_ngram_1,most_searched_ngram_count_1,most_searched_ngram_2,most_searched_ngram_count_2,most_searched_ngram_3,most_searched_ngram_count_3
0,"[{'event_info': 1786148, 'event_timestamp': '2...",1748830,relogio smartwatch,1.0,relogio smartwatch,1.0,relogio smartwatch,1.0
1,"[{'event_info': 643652, 'event_timestamp': '20...",228737,desmamadeira eletrica,5.0,desmamadeira eletrica,5.0,desmamadeira eletrica,5.0
2,"[{'event_info': 248595, 'event_timestamp': '20...",1909110,,,,,,
3,"[{'event_info': 'radioboss', 'event_timestamp'...",1197370,sound forge,4.0,sound forge,4.0,forge plugins,3.0
4,"[{'event_info': 'amazfit bip', 'event_timestam...",2049207,amazfit bip,5.0,amazfit bipamazfit,5.0,bipamazfit bip,5.0
...,...,...,...,...,...,...,...,...
996,"[{'event_info': 1073520, 'event_timestamp': '2...",17047,fifa street,4.0,street ps3,4.0,ps3 midia,4.0
997,"[{'event_info': 1111331, 'event_timestamp': '2...",1700398,apple watch,18.0,watch s3,13.0,aparadores pelos,6.0
998,"[{'event_info': 84286, 'event_timestamp': '201...",633650,baby doll,24.0,doll bojo,13.0,bojo atacado,13.0
999,"[{'event_info': 'itasca', 'event_timestamp': '...",1200120,195 65,10.0,65 r15,10.0,micro sd,9.0


## Domain embedding

Summarize each domain by the 10 most frequent words across all of its item titles.

In [17]:
# Concatenate all item titles of each domain into one space-separated string.
df_item[['domain_id', 'title']].groupby(by='domain_id').agg(' '.join)['title']

domain_id
MLB-3D_GLASSES              Projetor Holográfico 3d Sistema Ventilador Fy3...
MLB-3D_PENS                 Caneta 3d Impressora Drawing Pen 3d + Filament...
MLB-3D_PRINTERS             Impressora 3d Anycubic I3 Mega S Modelo Novo +...
MLB-3D_PRINTER_FILAMENTS    Filamento Pla De Fibra De Carbono - Gtmax 3d -...
MLB-3D_PRINTER_HEATBEDS     Mesa Aquecida Pcb Mk3 Preta Aluminio 110ºc Mes...
                                                  ...                        
MLM-YARNS                   Estambre De Lana Tamatz Trapillo Para Tejer Es...
MLM-YERBA_MATE              Abarrotes Te Yerba Mate  Con Palo Pipore 500 G...
MLM-YOGA_MATS               Tapete Si-karai Para Crossfit / Gym Master Ros...
MLM-ZAFUS                   Banco De Meditación Yoga Madera Portable Unita...
MLM-ZIPPERS                 Corredera (zipper)(carrito) Para Cierre Del #5...
Name: title, Length: 7893, dtype: object

In [227]:
# Extra tokens to drop on top of the NLTK lists.
custom_stopwords = ['kit', '', '+', '-', 'und', 'unidade', 'unidad']
# Portuguese stopwords first, then Spanish, then the custom additions.
stopwords = [word
             for language in ('portuguese', 'spanish')
             for word in nltk.corpus.stopwords.words(language)]
stopwords += custom_stopwords

In [302]:
def generate_top_title(s:str, stopwords:list=stopwords, n:int=10)->str:
    """Condense a long text into its ``n`` most frequent informative words.

    The text is lower-cased and tokenized with ``nltk.word_tokenize``. A token
    is kept only if it is not in ``stopwords``, contains no digit and is
    longer than two characters.

    Parameters
    ----------
    s : str
        Text to summarize (here: all item titles of a domain joined together).
    stopwords : list
        Tokens to ignore. The default is the module-level ``stopwords`` list
        as it exists when this function is defined.
    n : int
        Maximum number of words in the result.

    Returns
    -------
    str
        The kept words, most frequent first, joined by single spaces.
    """
    # A set makes each membership test constant-time instead of a list scan.
    stopword_set = set(stopwords)
    kept_words = (w for w in nltk.word_tokenize(s.lower())
                  if w not in stopword_set
                  # Raw string: '\d' in a plain literal is an invalid escape.
                  and not re.search(r'\d', w)
                  and len(w) > 2)
    top_words = Counter(kept_words).most_common(n)
    return ' '.join(word for word, _ in top_words)

In [303]:
%%time
# Per domain: join all item titles into one string, then reduce that string to
# its most frequent words with generate_top_title. The result is a one-column
# ('title') frame indexed by domain_id.
df_domain_title = pd.DataFrame(df_item[['domain_id', 'title']]
                               .groupby(by='domain_id')
                               .agg(' '.join)
                               ['title']
                               .apply(generate_top_title))

CPU times: user 4min 55s, sys: 884 ms, total: 4min 56s
Wall time: 4min 58s


In [310]:
# Prepend a readable form of the domain name to each summarized title: the
# domain_id is lower-cased, everything before the first '-' is dropped, and the
# remaining '-' / '_' separators become spaces
# (e.g. 'MLB-3D_GLASSES' -> '3d glasses').
# .values assigns positionally because reset_index() replaced the index.
# This overwrites 'title' with a value derived from 'title', so running the
# cell again prepends the domain name a second time.
df_domain_title['title'] = (df_domain_title
                            .reset_index()
                            [['domain_id','title']]
                            .apply(lambda x: 
                                   (' '.join(' '.join(x['domain_id']
                                                      .lower()
                                                      .split('-')[1:])
                                             .split('_'))
                                    + ' ' 
                                    + x['title']),
                                   axis=1)
                            .values)

In [313]:
# Display the domain summaries with the domain name prepended.
df_domain_title

Unnamed: 0_level_0,title
domain_id,Unnamed: 1_level_1
MLB-3D_GLASSES,3d glasses óculos oculos passivo ativo cinema ...
MLB-3D_PENS,3d pens caneta impressora filamento abs bivolt...
MLB-3D_PRINTERS,3d printers impressora anycubic creality photo...
MLB-3D_PRINTER_FILAMENTS,3d printer filaments pla filamento filamentos ...
MLB-3D_PRINTER_HEATBEDS,3d printer heatbeds mesa aquecida impressora p...
...,...
MLM-YARNS,yarns estambre hilaza madejas algodón gramos l...
MLM-YERBA_MATE,yerba mate mate yerba bombilla matero calabaza...
MLM-YOGA_MATS,yoga mats yoga tapete gym ejercicio pilates pi...
MLM-ZAFUS,zafus yoga meditación zafu banco madera portab...


In [314]:
# Show the first 20 (domain_id, title) pairs in full, without column truncation.
df_domain_title.reset_index()[['domain_id','title']].values[:20]

array([['MLB-3D_GLASSES',
        '3d glasses óculos oculos passivo ativo cinema original sony qualidade samsung yamaha'],
       ['MLB-3D_PENS',
        '3d pens caneta impressora filamento abs bivolt drawing lcd pen profissional cores'],
       ['MLB-3D_PRINTERS',
        '3d printers impressora anycubic creality photon resina ender pronta entrega nivelamento tronxy'],
       ['MLB-3D_PRINTER_FILAMENTS',
        '3d printer filaments pla filamento filamentos pedra impressora cores fibra carbono gtmax maquina'],
       ['MLB-3D_PRINTER_HEATBEDS',
        '3d printer heatbeds mesa aquecida impressora pcb preta aluminio dual power adesivo magnético'],
       ['MLB-3D_PRINTER_HOTENDS',
        '3d printer hotends hotend bico allmetal ptfe direct completo cooler termistor tubo cartucho'],
       ['MLB-3D_PRINTER_NOZZLES',
        '3d printer nozzles nozzle bico impressora volcano hotend vulcano frete peças'],
       ['MLB-3D_PRINTER_THERMISTORS',
        '3d printer thermistors termistor 

In [315]:
# Load the pretrained sentence-embedding model by name; the same embedder
# encodes both the domain summaries and the search n-grams below.
embedder = SentenceTransformer('distilbert-multilingual-nli-stsb-quora-ranking')

In [316]:
%%time
# Encode every domain summary and store one embedding vector per row.
# list(...) splits the encoder output into per-row items so that each cell of
# the new column holds a single vector.
df_domain_title['title_embedding'] = list(embedder.encode(list(df_domain_title['title'])))

CPU times: user 13min 43s, sys: 1.76 s, total: 13min 45s
Wall time: 7min 6s


In [362]:
# Number the domains 0..len-1 following the frame's current row order.
df_domain_title['domain_code'] = list(range(len(df_domain_title)))

In [383]:
# Display the domain table with its embedding and integer code columns.
df_domain_title

Unnamed: 0_level_0,title,title_embedding,domain_code
domain_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MLB-3D_GLASSES,3d glasses óculos oculos passivo ativo cinema ...,"[-0.06968159, 0.25974673, 0.5379761, 0.0724750...",0
MLB-3D_PENS,3d pens caneta impressora filamento abs bivolt...,"[0.26427686, 0.2541827, 0.47321633, 0.29305992...",1
MLB-3D_PRINTERS,3d printers impressora anycubic creality photo...,"[-0.101104945, 0.15050887, 0.7006622, 0.357083...",2
MLB-3D_PRINTER_FILAMENTS,3d printer filaments pla filamento filamentos ...,"[0.17531304, 0.05898144, 0.51190627, 0.3485206...",3
MLB-3D_PRINTER_HEATBEDS,3d printer heatbeds mesa aquecida impressora p...,"[0.051528, 0.21352565, 0.50668174, 0.3358144, ...",4
...,...,...,...
MLM-YARNS,yarns estambre hilaza madejas algodón gramos l...,"[0.06797173, -0.23393154, 0.23260438, 0.425522...",7888
MLM-YERBA_MATE,yerba mate mate yerba bombilla matero calabaza...,"[0.34707746, 0.2361566, 0.23432232, 0.05118732...",7889
MLM-YOGA_MATS,yoga mats yoga tapete gym ejercicio pilates pi...,"[0.22566795, 0.14807013, 0.42122185, 0.3297225...",7890
MLM-ZAFUS,zafus yoga meditación zafu banco madera portab...,"[0.074348785, 0.27690154, 0.36054817, 0.140551...",7891


In [389]:
    # Map every distinct non-null domain_id to its rank in sorted order.
    domain_mapper = {domain: code
                     for code, domain in
                     enumerate(sorted(df_item['domain_id'].dropna().unique()))}


In [392]:
# Spot-check: look up the integer code assigned to one domain.
domain_mapper['MLM-YARNS']

7888

Now use the domain embeddings to find out which domain the searched items come from.

In [317]:
# Temporary solution: histories without any search have a missing top n-gram;
# replace it with the literal string 'None' so the encoder below receives a
# string for every row. Those rows are then embedded as the word 'None', so
# the domains found for them are not meaningful.
df_raw['most_searched_ngram_1'] = df_raw['most_searched_ngram_1'].fillna('None')

In [318]:
%%time
# Encode each row's top search n-gram with the same embedder used for the
# domain summaries, storing one vector per row.
df_raw['most_searched_ngram_1_embedding'] = list(embedder.encode(list(df_raw['most_searched_ngram_1'])))

CPU times: user 18.2 s, sys: 12 µs, total: 18.2 s
Wall time: 9.23 s


In [319]:
%%time
data = np.array([np.array(x) for x in df_domain_title['title_embedding'].values])
index = NNDescent(data, metric='cosine')

CPU times: user 8.3 s, sys: 65.7 ms, total: 8.37 s
Wall time: 3.38 s


In [320]:
%%time
# Stack the query embeddings (one row per df_raw row) and fetch the 5 nearest
# domains for each. Element 0 of the result holds the neighbour row positions.
# NOTE(review): element 1 is presumably the matching distances — confirm
# against the pynndescent documentation.
query_data = np.array([np.array(x) for x in df_raw['most_searched_ngram_1_embedding'].values])
closest_domain = index.query(query_data, k=5)

CPU times: user 3.3 s, sys: 132 ms, total: 3.43 s
Wall time: 7.19 s


In [321]:
# Neighbour positions for every query: one row per df_raw row, 5 per row.
closest_domain[0]

array([[3511, 3513, 3512, 7284, 7283],
       [1476, 1446, 1435, 1469, 5510],
       [7680, 5329, 4261, 4264, 6193],
       ...,
       [1338, 5399, 4549, 1336, 1339],
       [1459, 4982,  640, 2013, 2598],
       [6312, 3268, 2392, 6970, 3142]], dtype=int32)

In [322]:
# Neighbour positions for the first query only.
closest_domain[0][0]

array([3511, 3513, 3512, 7284, 7283], dtype=int32)

In [323]:
# Resolve the first query's neighbour positions to domain rows. Positional
# .iloc is used because the index was built from df_domain_title's rows in order.
df_domain_title.iloc[closest_domain[0][0]]

Unnamed: 0_level_0,title,title_embedding
domain_id,Unnamed: 1_level_1,Unnamed: 2_level_1
MLB-SMARTWATCHES,smartwatches smartwatch relógio xiaomi intelig...,"[-0.09381391, 0.23167917, 0.5604316, 0.0873341..."
MLB-SMARTWATCH_CHARGERS,smartwatch chargers carregador xiaomi usb cabo...,"[-0.0165702, 0.28124207, 0.5039951, 0.19861004..."
MLB-SMARTWATCH_BATTERIES,smartwatch batteries bateria moto snap relógio...,"[0.030472, 0.16015075, 0.5882568, 0.1388443, 0..."
MLM-SMARTWATCH_BATTERIES,smartwatch batteries smartwatch bateria pila r...,"[0.11540755, 0.20662041, 0.53214353, 0.0347781..."
MLM-SMARTWATCHES,smartwatches reloj inteligente smartwatch watc...,"[0.008772849, 0.2652028, 0.49709296, 0.1106698..."


In [359]:
# Pick one example row and show the n-gram used as its query.
idx = 89
df_raw.loc[idx, 'most_searched_ngram_1']

'banco banqueta'

In [360]:
# Domains returned as nearest to the example query chosen via idx.
df_domain_title.iloc[closest_domain[0][idx]]

Unnamed: 0_level_0,title,title_embedding
domain_id,Unnamed: 1_level_1,Unnamed: 2_level_1
MLB-STOOLS,stools banqueta cozinha alta banquetas bar enc...,"[0.22174336, 0.37877455, 0.5859332, 0.01127014..."
MLB-DINING_SETS,dining sets mesa cadeiras jantar conjunto sala...,"[0.09556813, 0.35928696, 0.58406156, 0.1645083..."
MLM-SIDEBOARDS,sideboards credenza consola mueble vitrina mes...,"[0.15594926, 0.29973918, 0.51180124, 0.0653956..."
MLM-STOOLS,stools barra banco bar bancos cocina restauran...,"[0.39710644, 0.55040145, 0.3535078, 0.02175984..."
MLM-DINING_SETS,dining sets comedor sillas mesa madera persona...,"[0.17041424, 0.490705, 0.5158578, 0.017168684,..."
