In [1]:
import json_lines as jl
import pandas as pd

from itertools import islice
from sklearn.linear_model import LogisticRegression 
from collections import Counter

In [2]:
def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))

In [3]:
line_batch_limit = 1000
max_users = 10
train_file = "../data/raw/train_dataset.jl.gz"
df = pd.DataFrame()
train_json = [0]

with jl.open(train_file) as file:
    while len(train_json) > 0 and max_users > 0:
        train_json = take(line_batch_limit, file)
        #item_bought_list.extend([user for user in train_json])
        df = pd.concat([df, pd.DataFrame(train_json, index=range(0, line_batch_limit))])
        max_users -= 1

In [4]:
df = df.reset_index(drop=True)

In [5]:
df

Unnamed: 0,user_history,item_bought
0,"[{'event_info': 1786148, 'event_timestamp': '2...",1748830
1,"[{'event_info': 643652, 'event_timestamp': '20...",228737
2,"[{'event_info': 248595, 'event_timestamp': '20...",1909110
3,"[{'event_info': 'RADIOBOSS', 'event_timestamp'...",1197370
4,"[{'event_info': 'AMAZFIT BIP', 'event_timestam...",2049207
...,...,...
9995,"[{'event_info': 322120, 'event_timestamp': '20...",1715188
9996,"[{'event_info': 'FORMAS BOLO', 'event_timestam...",1623124
9997,"[{'event_info': 'BOMBA CHIMARRAO', 'event_time...",977127
9998,"[{'event_info': 312056, 'event_timestamp': '20...",1206550


In [None]:
limit = 100000
item_file = "../data/raw/item_data.jl.gz"
df_item = pd.DataFrame()
item_temp = [0]

with jl.open(item_file) as file:
    while len(item_temp) > 0:
        item_temp = take(limit, file)
        df_item = pd.concat([df_item, pd.DataFrame(item_temp)])

In [None]:
df_item

# Feature Engineering

In [None]:
df.loc[2, 'user_history']

## Most viewed product

In [None]:
missing_id = df_item['item_id'].max() + 1

In [10]:
def get_most_viewed(hist):
    item_list = []
    for item in hist:
        if item['event_type']=='view':
            item_list.append(item['event_info'])
    try:
        return Counter(item_list).most_common(1)[0][0]
    except IndexError as e:
        return missing_id

In [11]:
def get_most_viewed_functional(hist):
    item_list = [item['event_info'] for item in
                   filter(lambda item: item['event_type']=='view', hist)]
    try:
        most_common = Counter(item_list).most_common(1)[0][0]
    except IndexError as e:
        return missing_id

In [12]:
#%%timeit
#df['user_history'].apply(get_most_viewed)

In [13]:
#%%timeit
#df['user_history'].apply(get_most_viewed_functional)

In [14]:
df['most_viewed'] = df['user_history'].apply(get_most_viewed)

## How many times the most visited item was viewed

In [15]:
def get_number_most_viewed(hist):
    item_list = []
    for item in hist:
        if item['event_type']=='view':
            item_list.append(item['event_info'])
    try:
        return Counter(item_list).most_common(1)[0][1]
    except IndexError as e:
        return 0

In [16]:
df['times_most_viewed'] = df['user_history'].apply(get_number_most_viewed)

In [17]:
df

Unnamed: 0,user_history,item_bought,most_viewed,times_most_viewed
0,"[{'event_info': 1786148, 'event_timestamp': '2...",1748830,1615991,16
1,"[{'event_info': 643652, 'event_timestamp': '20...",228737,228737,3
2,"[{'event_info': 248595, 'event_timestamp': '20...",1909110,248595,2
3,"[{'event_info': 'RADIOBOSS', 'event_timestamp'...",1197370,505541,5
4,"[{'event_info': 'AMAZFIT BIP', 'event_timestam...",2049207,1313192,15
...,...,...,...,...
9995,"[{'event_info': 322120, 'event_timestamp': '20...",1715188,322120,2
9996,"[{'event_info': 'FORMAS BOLO', 'event_timestam...",1623124,2102277,0
9997,"[{'event_info': 'BOMBA CHIMARRAO', 'event_time...",977127,81596,1
9998,"[{'event_info': 312056, 'event_timestamp': '20...",1206550,312056,4


## Last viewed product

In [18]:
def get_last_viewed(hist):
    idx_hist = len(hist) - 1
    item = {'event_type': 'null'}
    while item['event_type'] != 'view' and idx_hist >= 0:
        item = hist[idx_hist]
        idx_hist -= 1
    if item['event_type'] == 'view':
        return item['event_info']
    else:
        return missing_id

In [19]:
df['last_viewed'] = df['user_history'].apply(get_last_viewed)

In [20]:
df

Unnamed: 0,user_history,item_bought,most_viewed,times_most_viewed,last_viewed
0,"[{'event_info': 1786148, 'event_timestamp': '2...",1748830,1615991,16,1615991
1,"[{'event_info': 643652, 'event_timestamp': '20...",228737,228737,3,228737
2,"[{'event_info': 248595, 'event_timestamp': '20...",1909110,248595,2,248595
3,"[{'event_info': 'RADIOBOSS', 'event_timestamp'...",1197370,505541,5,937557
4,"[{'event_info': 'AMAZFIT BIP', 'event_timestam...",2049207,1313192,15,86082
...,...,...,...,...,...
9995,"[{'event_info': 322120, 'event_timestamp': '20...",1715188,322120,2,742153
9996,"[{'event_info': 'FORMAS BOLO', 'event_timestam...",1623124,2102277,0,2102277
9997,"[{'event_info': 'BOMBA CHIMARRAO', 'event_time...",977127,81596,1,81596
9998,"[{'event_info': 312056, 'event_timestamp': '20...",1206550,312056,4,1206550


In [21]:
df.loc[0,'user_history'][-1]

{'event_info': 1615991,
 'event_timestamp': '2019-10-20T19:28:41.646-0400',
 'event_type': 'view'}

## Most viewed and last viewed products info

In [22]:
col = 'most_viewed'

df = (df
      .set_index(col)
      .join(df_item[['condition', 'domain_id', 'price', 'item_id']]
            .add_suffix('_{}'.format(col))
            .set_index('item_id_{}'.format(col)), how='left')
      .reset_index()
      .rename(columns={'index': col}))

In [23]:
col = 'last_viewed'

df = (df
      .set_index(col)
      .join(df_item[['condition', 'domain_id', 'price', 'item_id']]
            .add_suffix('_{}'.format(col))
            .set_index('item_id_{}'.format(col)), how='left')
      .reset_index()
      .rename(columns={'index': col}))

In [24]:
df

Unnamed: 0,last_viewed,most_viewed,user_history,item_bought,times_most_viewed,condition_most_viewed,domain_id_most_viewed,price_most_viewed,condition_last_viewed,domain_id_last_viewed,price_last_viewed
0,15,15,"[{'event_info': 'FAROS LED AUXILIARES', 'event...",492271,7,new,MLM-GAME_CONSOLES_VIDEO_GAMES_AND_ARCADE_MACHINES,140.00,new,MLM-GAME_CONSOLES_VIDEO_GAMES_AND_ARCADE_MACHINES,140.00
1,33,5896,"[{'event_info': 'PISCINA BOLINHA', 'event_time...",33,4,new,MLB-INFLATABLE_BALL_PITS,385.00,new,MLB-INFLATABLE_BALL_PITS,166.92
2,235,235,"[{'event_info': 84323, 'event_timestamp': '201...",235,4,new,MLB-TABLETS,308.98,new,MLB-TABLETS,308.98
3,452,539361,"[{'event_info': 'MEDICAMENTO METOTREXATO', 'ev...",1796243,4,new,MLM-SMARTWATCHES,1599.00,new,MLM-WRISTWATCHES,499.00
4,524,1375452,"[{'event_info': 'RELOGIO FEMININO', 'event_tim...",852053,3,new,MLB-WRISTWATCHES,29.99,new,MLB-SHIRTS,6.50
...,...,...,...,...,...,...,...,...,...,...,...
9995,2102277,2102277,"[{'event_info': 'MAX TITANIUM', 'event_timesta...",594421,0,,,,,,
9996,2102277,2102277,"[{'event_info': 'DISCOS CNCO', 'event_timestam...",1395880,0,,,,,,
9997,2102277,2102277,"[{'event_info': 'ESTRADO CAMA CASAL', 'event_t...",906052,0,,,,,,
9998,2102277,2102277,"[{'event_info': 'SMART MUSCLE TRAINER', 'event...",732618,0,,,,,,


In [25]:
len(df[df['last_viewed']==missing_id])

672

In [26]:
100*(len(df[df['last_viewed']==missing_id])/len(df))

6.72

In [27]:
100*(len(df[df['most_viewed']==missing_id])/len(df))

6.72

In [29]:
df.loc[9999, 'user_history']

[{'event_info': 'FORMAS BOLO',
  'event_timestamp': '2019-10-21T09:39:47.143-0400',
  'event_type': 'search'},
 {'event_info': 'FORMAS BOLO',
  'event_timestamp': '2019-10-21T09:39:53.311-0400',
  'event_type': 'search'},
 {'event_info': 'FORMAS BOLO',
  'event_timestamp': '2019-10-21T09:40:04.331-0400',
  'event_type': 'search'}]

6.7% of dataset (10.000 first lines at least) does not have a valid id.

Cause: user has only searched for products, no product view was registered.

### Categorical values

In [30]:
df_item['domain_id'].nunique()

7893

7893 unique categorical variables.

Way too many categorical variables.

Use NLP + KMeans:

https://www.sbert.net/examples/applications/clustering/README.html

In [101]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering, KMeans
import numpy as np

In [32]:
embedder = SentenceTransformer('xlm-r-distilroberta-base-paraphrase-v1')

In [69]:
#embedder = SentenceTransformer('distilbert-multilingual-nli-stsb-quora-ranking')

100%|██████████| 501M/501M [01:24<00:00, 5.89MB/s]   


In [33]:
def preproc_domain(s:str)->str:
    if not s:
        return 'other'
    domain = s.split('-')[1]
    domain = ' '.join(domain.split('_'))
    domain = domain.lower()
    return domain

In [34]:
df_item['domain_id'].apply(preproc_domain).unique()

array(['individual houses for sale', 'video games', 'skirts', ...,
       'electric cream separators', 'antique audio antennas',
       'rugby helmets'], dtype=object)

In [37]:
df_item['domain_id'].apply(preproc_domain).nunique()

4330

In [35]:
corpus = df_item['domain_id'].apply(preproc_domain).unique()

In [69]:
corpus[:10]

array(['individual houses for sale', 'video games', 'skirts',
       'graphics cards', 'notebooks', 'vehicle accessories',
       'cellphone covers', 'wall and ceiling lights', 'napkin holders',
       'flats'], dtype=object)

In [70]:
corpus[-10:]

array(['wire rope thimbles', 'knife sheaths', 'virtual currencies',
       'skate boot covers', 'advertising inflatables', 'pressure tanks',
       'printer cleaning kits', 'electric cream separators',
       'antique audio antennas', 'rugby helmets'], dtype=object)

In [86]:
%%time
corpus_embeddings = embedder.encode(corpus)

CPU times: user 2min 25s, sys: 187 ms, total: 2min 25s
Wall time: 1min 13s


In [87]:
%%time
# Normalize the embeddings to unit length
corpus_embeddings = corpus_embeddings /  np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)

CPU times: user 14 ms, sys: 110 µs, total: 14.1 ms
Wall time: 11.5 ms


In [92]:
# Perform clustering
clustering_model = AgglomerativeClustering(n_clusters=20) 
                                           #affinity='cosine', 
                                           #linkage='average', 
                                           #distance_threshold=0.8)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

In [93]:
clustered_sentences = {}
for sentence_id, cluster_id in enumerate(cluster_assignment):
    if cluster_id not in clustered_sentences:
        clustered_sentences[cluster_id] = []

    clustered_sentences[cluster_id].append(corpus[sentence_id])

for i, cluster in clustered_sentences.items():
    print("Cluster ", i+1)
    print(cluster)
    print("")

Cluster  1
['individual houses for sale', 'notebooks', 'napkin holders', 'horses and mares', 'construction materials', 'tablets', 'sneakers', 'souvenirs', 'clothing patches', 'pogs', 'horse ranch ropes', 'action figures', 'home appliances', 'cash registers', 'costumes', 'school and office supplies', 'laptop housings', 'balloons', 'decorative paintings', 'books', 'mugs', 'stuffed toys', 'reptile and amphibian terrariums', 'bathtubs', 'styling chairs', 'suspenders', 'dog supplements', 'dog toy bones', 'bodyweight scales', 'toy robots', 'artificial plants', 'sim card trays', 'employee time clocks', 'sports cones', 'green walls and living paintings', 'handbags', 'purebred dogs', 'calculators', 'tablet screens', 'supplements', 'home decor', 'aesthetic treatment tables and chairs', 'dog muzzles', 'apartments for rent', 'sex dolls', 'animal and pet products', 'gun cleaning kits', 'toys and games', 'individual apartments for sale', 'office chairs', 'gym bars', 'gamepads and joysticks', 'projec

In [110]:
def remove_stopwords(s:str)->str:
    stopwords = ['electric', 'supplies', 'sets', 'covers', 'sets']
    domain = filter(lambda w: w not in stopwords, s.split(' '))
    return ' '.join(domain)

In [111]:
def preproc_domain(s:str)->str:
    if not s:
        return 'other'
    domain = s.split('-')[1]
    domain = ' '.join(domain.split('_'))
    domain = domain.lower()
    domain = remove_stopwords(domain)
    return domain

In [112]:
corpus = df_item['domain_id'].apply(preproc_domain).unique()

In [113]:
%%time
corpus_embeddings = embedder.encode(corpus)

CPU times: user 2min 37s, sys: 189 ms, total: 2min 37s
Wall time: 1min 19s


In [114]:
%%time
# Normalize the embeddings to unit length
corpus_embeddings = corpus_embeddings /  np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)

CPU times: user 6.21 ms, sys: 3.06 ms, total: 9.27 ms
Wall time: 7.85 ms


In [115]:
# Perform clustering
clustering_model = AgglomerativeClustering(n_clusters=20) 
                                           #affinity='cosine', 
                                           #linkage='average', 
                                           #distance_threshold=0.8)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

In [116]:
clustered_sentences = {}
for sentence_id, cluster_id in enumerate(cluster_assignment):
    if cluster_id not in clustered_sentences:
        clustered_sentences[cluster_id] = []

    clustered_sentences[cluster_id].append(corpus[sentence_id])

for i, cluster in clustered_sentences.items():
    print("Cluster ", i+1)
    print(cluster)
    print("")

Cluster  20
['individual houses for sale', 'apartments for rent', 'individual apartments for sale', 'houses for rent', 'individual lands for sale', 'rooms for rent', 'development houses for sale', 'retail space for sale', 'offices for rent', 'lands for rent', 'warehouses for rent', 'retail space for rent', 'development apartments for sale', 'houses for vacation rental', 'buildings for sale', 'other properties for rent', 'farm houses for sale', 'farms for sale', 'buildings for rent', 'apartments for vacation rental', 'farm houses for vacation rental', 'other properties for sale', 'farm houses for rent', 'individual offices for sale', 'warehouses for sale', 'development lands for sale']

Cluster  16
['video games', 'game consoles video games and arcade machines', 'game consoles', 'engine control modules', 'video game guitars', 'board games', 'tv remote controls', 'remote control toy vehicles', 'automotive control alarm cases', 'video game controller', 'game console cases and bags', 'game

Testing KMeans

In [117]:
num_clusters = 20
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

In [118]:
clustered_sentences = {}
for sentence_id, cluster_id in enumerate(cluster_assignment):
    if cluster_id not in clustered_sentences:
        clustered_sentences[cluster_id] = []

    clustered_sentences[cluster_id].append(corpus[sentence_id])

for i, cluster in clustered_sentences.items():
    print("Cluster ", i+1)
    print(cluster)
    print("")

Cluster  5
['individual houses for sale', 'cars and vans', 'suitcases', 'cash registers', 'freezers', 'punching bags', 'backpacks', 'keychains', 'crashed cars', 'handbags', 'apartments for rent', 'fanny packs', 'individual apartments for sale', 'coat racks', 'bar code scanners', 'houses for rent', 'trading card games', 'lunchboxes', 'boxing speed bags', 'money boxes', 'collectible military jackets', 'toiletry bags', 'shelf units', 'card payment terminals', 'individual lands for sale', 'sport and bazaar bottles', 'bookmarks', 'diaper bags', 'sleeping bags', 'inflatable bounce houses', 'serving and home trays', 'airbags', 'handicraft boxes', 'briefcases', 'clothes hangers', 'storage drawers', 'item finders', 'dog carriers and carrying bags', 'camera backpacks', 'toy cash registers', 'rubber stamps', 'distribution kits', 'drafting kits', 'breast milk storage bags', 'notebook cases', 'tablet cases', 'file boxes', 'bookcases', 'rolling shopping carts', 'dropping bottles', 'waste baskets', '

### Result

Aglomerative clustering seems to perform better.

In [119]:
# Perform clustering
clustering_model = AgglomerativeClustering(n_clusters=20) 
                                           #affinity='cosine', 
                                           #linkage='average', 
                                           #distance_threshold=0.8)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

In [128]:
clustered_sentences = {}
for sentence_id, cluster_id in enumerate(cluster_assignment):
    if cluster_id not in clustered_sentences:
        clustered_sentences[cluster_id] = []

    clustered_sentences[cluster_id].append(corpus[sentence_id])

for i, cluster in clustered_sentences.items():
    print("Cluster ", i+1)
    print(cluster)
    print("")

Cluster  20
['individual houses for sale', 'apartments for rent', 'individual apartments for sale', 'houses for rent', 'individual lands for sale', 'rooms for rent', 'development houses for sale', 'retail space for sale', 'offices for rent', 'lands for rent', 'warehouses for rent', 'retail space for rent', 'development apartments for sale', 'houses for vacation rental', 'buildings for sale', 'other properties for rent', 'farm houses for sale', 'farms for sale', 'buildings for rent', 'apartments for vacation rental', 'farm houses for vacation rental', 'other properties for sale', 'farm houses for rent', 'individual offices for sale', 'warehouses for sale', 'development lands for sale']

Cluster  16
['video games', 'game consoles video games and arcade machines', 'game consoles', 'engine control modules', 'video game guitars', 'board games', 'tv remote controls', 'remote control toy vehicles', 'automotive control alarm cases', 'video game controller', 'game console cases and bags', 'game

In [122]:
cluster_assignment

array([19, 15,  1, ..., 16, 11,  3])

In [129]:
cluster_mapper = {corpus[sentence_idx]: cluster
                  for (sentence_idx, cluster) in enumerate(cluster_assignment)}

In [130]:
cluster_mapper

{'individual houses for sale': 19,
 'video games': 15,
 'skirts': 1,
 'graphics cards': 2,
 'notebooks': 2,
 'vehicle accessories': 0,
 'cellphone': 11,
 'wall and ceiling lights': 9,
 'napkin holders': 7,
 'flats': 6,
 'esthetic devices': 8,
 'horses and mares': 2,
 'cars and vans': 0,
 'construction materials': 1,
 'beauty and personal care': 3,
 'washing machines': 4,
 'unclassified products': 3,
 'game consoles video games and arcade machines': 15,
 'electrical cables': 11,
 'wristwatches': 9,
 'bicycles': 10,
 'data cables and adapters': 11,
 'automotive speakers': 8,
 'dehumidifiers': 3,
 'kitchen': 4,
 'anti theft studs': 2,
 'eyeshadows': 3,
 'tablets': 8,
 'headphones': 11,
 'sweatshirts and hoodies': 2,
 'resistance bands': 2,
 'boots and booties': 1,
 'tools': 14,
 'baby': 12,
 'blouses': 7,
 'airsoft guns': 1,
 'dresses': 1,
 'dolls': 6,
 'boots': 1,
 'non corrective contact lenses': 8,
 'smartwatches': 9,
 'motorcycles': 0,
 'binoculars': 6,
 'sneakers': 1,
 'guitars': 8,


In [136]:
cluster_mapper['individual houses for sale']

19

In [125]:
df_item['domain_preproc'] = df_item['domain_id'].apply(preproc_domain)

In [139]:
df_item['cluster'] = df_item['domain_preproc'].map(cluster_mapper)

In [140]:
df_item

Unnamed: 0,item_id,title,domain_id,product_id,price,category_id,condition,domain_preproc,cluster
0,111260,Casa Sola En Venta Con Gran Patio Solo Pago De...,MLM-INDIVIDUAL_HOUSES_FOR_SALE,,1150000.00,MLM170527,new,individual houses for sale,19
1,871377,Resident Evil Origins Collection Nintendo Swit...,MLM-VIDEO_GAMES,15270800,1392.83,MLM151595,new,video games,15
2,490232,Falda De Imitación Piel Negra,MLM-SKIRTS,,350.00,MLM7697,new,skirts,1
3,1150706,Powercolor Red Devil Radeon Rx 580 8gb Gddr5,MLM-GRAPHICS_CARDS,,3200.00,MLM9761,used,graphics cards,2
4,934912,Laptop Hp Nx6320 Core Duo Con Puerto Db9 Windo...,MLM-NOTEBOOKS,,1599.00,MLM1652,used,notebooks,2
...,...,...,...,...,...,...,...,...,...
2272,1099649,Carrinho De Bebê Stoke,MLB-BABY_STROLLERS,,1600.00,MLB1386,used,baby strollers,12
2273,1482187,Grelha Para Hambúrguer Preta Com Cabo Em Madei...,MLB-KITCHEN_SUPPLIES,,69.90,MLB193425,new,kitchen,4
2274,1118904,Meia Tam 7/8 Anti Embolia Trombose Antitrombo,MLB-SOCKS,,118.00,MLB108791,new,socks,1
2275,237229,Pano De Boca Cremer Menina Luxo Bordado C/3 Und,MLB-DISPOSABLE_BABY_DIAPERS,,26.90,MLB40629,new,disposable baby diapers,12


## Last searched product

In [156]:
# TODO: improve search preprocessing

def preproc_search(s:str)->str:
    return s.lower()

In [157]:
def get_last_searched(hist):
    idx_hist = len(hist) - 1
    item = {'event_type': 'null'}
    while item['event_type'] != 'search' and idx_hist >= 0:
        item = hist[idx_hist]
        idx_hist -= 1
    if item['event_type'] == 'search':
        return preproc_search(item['event_info'])
    else:
        return ''

In [227]:
df['last_searched'] = df['user_history'].apply(get_last_searched)

### Define cluster

In [161]:
from sklearn.metrics.pairwise import cosine_similarity

In [198]:
cosine_similarity((embedder.encode(["capacete de bicicleta"])),
                  (embedder.encode(["bicycle helmet"])))

array([[0.94372696]], dtype=float32)

In [199]:
domain_preproc = df_item['domain_preproc'].unique()

In [205]:
domain_preproc[:10]

array(['individual houses for sale', 'video games', 'skirts',
       'graphics cards', 'notebooks', 'vehicle accessories', 'cellphone',
       'wall and ceiling lights', 'napkin holders', 'flats'], dtype=object)

In [202]:
%%time
domain_preproc_embeddings = embedder.encode(domain_preproc[:10])
domain_preproc_embeddings = (domain_preproc_embeddings
                             /np.linalg.norm(domain_preproc_embeddings, axis=1, keepdims=True))

CPU times: user 467 ms, sys: 0 ns, total: 467 ms
Wall time: 244 ms


In [211]:
np.argmax([cosine_similarity((embedder.encode(["celular"])),
                             [domain]) for domain in domain_preproc_embeddings])

6

In [212]:
def get_search_cluster(s:str):
    s_embedding = embedder.encode([s])
    s_embedding = s_embedding /  np.linalg.norm(s_embedding, axis=1, keepdims=True)
    idx = np.argmax([cosine_similarity(s_embedding,
                                       [domain]) 
                     for domain in domain_preproc_embeddings])
    return domain_preproc[idx]

In [219]:
get_search_cluster("celular")

'cellphone'

In [221]:
cluster_mapper.get(get_search_cluster("celular"))

11

In [222]:
def get_search_cluster(s:str):
    s_embedding = embedder.encode([s])
    s_embedding = s_embedding /  np.linalg.norm(s_embedding, axis=1, keepdims=True)
    idx = np.argmax([cosine_similarity(s_embedding,
                                       [domain]) 
                     for domain in domain_preproc_embeddings])
    return cluster_mapper.get(domain_preproc[idx])

In [223]:
get_search_cluster("celular")

11

In [231]:
%%time
domain_preproc_embeddings = embedder.encode(domain_preproc)
domain_preproc_embeddings = (domain_preproc_embeddings
                             /np.linalg.norm(domain_preproc_embeddings, axis=1, keepdims=True))

CPU times: user 2min 29s, sys: 190 ms, total: 2min 29s
Wall time: 1min 15s


In [296]:
def get_search_cluster(s:str):
    s_embedding = embedder.encode([s])
    s_embedding = s_embedding /  np.linalg.norm(s_embedding, axis=1, keepdims=True)
    idx = np.argmax([cosine_similarity(s_embedding,
                                       [domain]) 
                     for domain in domain_preproc_embeddings])
    return cluster_mapper.get(domain_preproc[idx])

In [297]:
%%timeit
df['last_searched'].head(10).apply(get_search_cluster)

11.9 s ± 387 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [300]:
(len(df)/10)*12/3600

3.3333333333333335

`get_search_cluster` would take more than 3 hours to finish.

Strategy needs to be improved.

In [298]:
def get_search_cluster(s:str):
    s_embedding = embedder.encode([s])
    s_embedding = s_embedding /  np.linalg.norm(s_embedding, axis=1, keepdims=True)
    idx = np.argmax(cosine_similarity(domain_preproc_embeddings, s_embedding))
    return cluster_mapper.get(domain_preproc[idx])

In [311]:
%%timeit
df['last_searched'].head(10).apply(get_search_cluster)

1.66 s ± 55.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [301]:
(len(df)/10)*2/3600

0.5555555555555556

That's better. Just had to use the API correctly.

It still takes too long, though.

## Second most/last viewed

Consider second most/last viewed as well