# Version 1.0
## Use Bert model to embed the sentences ( without K-Fold )

In [1]:
from pathlib import Path

import numpy as np
from tqdm import tqdm
from collections import defaultdict, Counter
from utils import read_json_lines, JSONLinesWriter, save_args
from normalizer import *
from sklearn.model_selection import KFold

## Preprocessing
### Paths

In [2]:
save_path = Path('./experimental/exp1')
save_path.mkdir(parents=True,exist_ok=True)

root = Path('./data/contest')
# Paths
product_info_path = root.joinpath('products-info_v1.jsonl')
search_data_path = root.joinpath('torob-search-data_v1.jsonl')
test_offline_path = root.joinpath('test-offline-data_v1.jsonl')

### Search Data

In [3]:
print("Aggregating searches based on raw query...")
agg_searches = defaultdict(lambda: dict(results=Counter(), clicks=Counter()))
for search in tqdm(read_json_lines(str(search_data_path))):
    agg_searches[search['raw_query']]['results'].update(search['result'])
    agg_searches[search['raw_query']]['clicks'].update(search['clicked_result'])

print('Writing aggregated searches into file...')
with JSONLinesWriter(save_path.joinpath('aggregate_search_data.jsonl')) as out_file:
    for raw_query, stats in tqdm(agg_searches.items()):
        results, results_count = list(zip(*stats['results'].most_common()))
        clicks, clicks_count = list(zip(*stats['clicks'].most_common()))
        record = {
            'raw_query': raw_query,
            'raw_query_normalized': normalize_text(raw_query),
            'results': results,
            'results_count': results_count,
            'clicks': clicks,
            'clicks_count': clicks_count,
        }
        out_file.write_record(record)

print("Finished aggregating searches.")
print(f'Number of aggregate search records: {len(agg_searches)}')
print(f"The aggregated searches data were stored in '{save_path.joinpath('aggregate_search_data.jsonl')}'.")

Aggregating searches based on raw query...


2499901it [00:49, 50666.98it/s]


Writing aggregated searches into file...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 270099/270099 [00:11<00:00, 23220.27it/s]

Finished aggregating searches.
Number of aggregate search records: 270099
The aggregated searches data were stored in 'experimental/exp1/aggregate_search_data.jsonl'.





### Products

In [4]:
count = 0
with JSONLinesWriter(str(save_path.joinpath('aggregate_product_data.jsonl'))) as out_file:
    for product in tqdm(read_json_lines(product_info_path)):
        titles = product['titles']
        titles_concat_normalized = normalize_text(" ".join(titles))
        titles_words_set = set(titles_concat_normalized.split())
        titles_words_concat = " ".join(titles_words_set)

        record = {
            'id': product['id'],
            'title_normalized': titles_words_concat,
        }
        out_file.write_record(record)
        count += 1
print('Finished preprocessing products.')
print(f'Number of processed products: {count}')
print(f"The processed products data were stored in '{save_path.joinpath('aggregate_product_data.jsonl')}'")

3612277it [01:44, 34666.29it/s]

Finished preprocessing products.
Number of processed products: 3612277
The processed products data were stored in 'experimental/exp1/aggregate_product_data.jsonl'





### Test Queries

In [5]:
print('Preprocessing test queries...')
count = 0
with JSONLinesWriter(str(save_path.joinpath('aggregate_test_query.jsonl'))) as out_file:
    for test_sample in tqdm(read_json_lines(test_offline_path)):
        normalized_query = normalize_text(test_sample['raw_query'])
        record = {
            'raw_query_normalized': normalized_query,
        }
        count += 1
        out_file.write_record(record)
print('Finished preprocessing test queries.')
print(f'Number of processed test queries: {count}')
print(f"The processed test queries were stored in '{save_path.joinpath('aggregate_test_query.jsonl')}'")

Preprocessing test queries...


23140it [00:00, 41941.50it/s]

Finished preprocessing test queries.
Number of processed test queries: 23140
The processed test queries were stored in 'experimental/exp1/aggregate_test_query.jsonl'





In [1]:
%reset -f

In [8]:
from pathlib import Path

import numpy as np
from tqdm import tqdm
import pandas as pd
from collections import defaultdict, Counter
from utils import read_json_lines, JSONLinesWriter, save_args
from transformers import AutoConfig, AutoTokenizer, TFAutoModel,  BertModel, BertTokenizer
import torch

## Normalization + Embeddings
### Paths

In [3]:
save_path = Path('./experimental/exp1')
save_path.mkdir(parents=True,exist_ok=True)

root = Path('./experimental/exp1')
# Paths
product_info_path = root.joinpath('aggregate_product_data.jsonl')
search_data_path = root.joinpath('aggregate_search_data.jsonl')
test_offline_path = root.joinpath('aggregate_test_query.jsonl')

### Parameters

In [4]:
vocab_size = 4096
embedding_dim =256
pre_model = "dbmdz/bert-large-cased-finetuned-conll03-english" 

### Bert

In [5]:
tokenizer = AutoTokenizer.from_pretrained(pre_model)
model = BertModel.from_pretrained(pre_model,
                                  ignore_mismatched_sizes=True,
                                  output_hidden_states = True, # Whether the model returns all hidden-states.
                                  max_position_embeddings=2048
                                  )

Downloading:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
def embedder(tokenizer, model, text):
    tokenized_text = tokenizer.tokenize(text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    segments_ids = [1] * len(tokenized_text)

    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        hidden_states = outputs[2]

    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        hidden_states = outputs[2]

    token_embeddings = torch.stack(hidden_states, dim=0)
    token_embeddings = torch.squeeze(token_embeddings, dim=1)

    token_vecs_cat = []

    for token in token_embeddings:
        cat_vec = torch.cat((token[-1], token[-2], token[-3], token[-4]), dim=0)
        token_vecs_cat.append(cat_vec)

    token_vecs_sum = []

    for token in token_embeddings:
        sum_vec = torch.sum(token[-4:], dim=0)
        token_vecs_sum.append(sum_vec)

    token_vecs = hidden_states[-2][0]
    sentence_embedding = torch.mean(token_vecs, dim=0)

    return sentence_embedding.numpy()

### Get Products Embeddings

In [14]:
products_df = pd.DataFrame(read_json_lines(product_info_path))
products_id_to_idx = dict((p_id, idx) for idx, p_id in enumerate(products_df['id']))

In [None]:
products_embeds = []
for pid, detail in tqdm(enumerate(products_df.itertuples(index=False))):
    p_sentence = detail.title_normalized
    try:
        products_embeds.append(embedder(tokenizer,model,p_sentence))
    except:
        print(p_sentence)

330it [01:39,  1.36it/s]

طرفدار وکیفیت پک لیسانس پشت پوکه پرکننده واقعی موجودی مویی ارایشی والیومه waterproof 12ml ارتفاع 13ml 12 پرمواد رایگان پخش 4 یک میلی اورجینال لاو 7 حجم12 کنندگی l کنید دهنده essense میل مواد مشکیextreme پرپشت بعدی crazy rimel شناسه 619661 love بلند عالی اب محصول درجه عمده مشکی ضداب حجیم ایتالیا ی بسیار asli 13 خاصیت ارسال بهداشت ابی پرفروش کد ظرفیت فروش لیبل تحت لوازم امریکایی کالا های سری زیاد کشور کننده رنگ 10789 volum رخ ای سوراخدار وزارت ضد چرکی خرده تقویت ماندگاری لیتر فرچه دار volume دهندگی ماسکارا اصل valume ا 2 essence org ترین از کپی زیرچشم و i 391 امریکا فرم قوی ته عددی والیوم بدون ضمانت extre بااطمینان بگیرید 3 ایتالیایی برند کیفیت گاش بالا وبلندکننده ilove بارکد اکستریم ولوم 1309 382938 شاپ سیاه مژه کریزی صورتی ریزش واسطه مدل exterme دو mascara مارتا حجم دابل ایوروشه ها با extreme black قهوه extereme سوراخ ریمیل 12میل اکسترم سبز شاهکار اصلی دهندهessence دهند یکessence ریمل 2523 العاده پر ساخت 5 اسنس model کاملا ارایش 619660 r270 فوق چشم حالت کمیاب برچسب اصالت طرح خرید همراه

13665it [1:00:24,  2.73it/s]