In [None]:
# !pip install --upgrade --no-cache-dir gdown

!pip install -q hazm
!pip install -q parsivar

!pip install -q datasets  --no-cache-dir
!pip install -q transformers  --no-cache-dir

In [None]:
# !pip install -q "datasets==2.10.1" # previously 2.9.0

In [1]:
import datasets
datasets.__version__ # '2.9.0'

'2.9.0'

In [2]:
import csv
import json
import sqlite3

from collections import defaultdict, Counter

import hazm
from parsivar import Normalizer

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm_notebook

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import torch
from torch import nn

from datasets import (
    Dataset,
    DatasetDict,
    load_dataset,
    load_metric,
    load_from_disk,
    concatenate_datasets,
)
from transformers import AutoTokenizer, AutoModel
from transformers import AdamW

In [3]:
ls

baseline.ipynb             stop-words.txt
[0m[01;34mdata[0m/                      [01;34mtest_dataset[0m/
data_df.csv                [01;34mtokenizer[0m/
data_df.pkl                [01;34mTorob[0m/
dataset_v3_2.txt           torob_contest.ipynb
data_trob20.csv            torob_contest_pairwise_dataset_v2.ipynb
data_trob_without_sql.csv  [01;31mtorob-data-challenge-2023_datafiles_v1.7z[0m
eda.ipynb                  torob_v3.1_kaggle_run_model.ipynb
graph_torob_contest.ipynb  torob-v3_2-dataset.ipynb
[01;34mLaBSE[0m/                     torob_v3_lab_system_data_generation.ipynb
[01;34mmodel[0m/                     train_data_complete.csv
my_database.db             [01;31mtrain_data_complete.csv.zip[0m
[01;34moutput_data[0m/               [01;34mtrain_dataset[0m/
prediction.txt             [01;31mtrain_test_dataset_torch.zip[0m
sort_by_click.ipynb        Untitled.ipynb


## Normalizer

In [4]:
class MyNormalizer:
    def __init__(self):
        self.parsivar_normalizer = Normalizer(
            statistical_space_correction=True,
            half_space_char=" ",
            pinglish_conversion_needed=True,
        )
        self.hazm_normalizer = hazm.Normalizer(
            remove_extra_spaces=True,
            persian_numbers=True,
            persian_style=True,
            punctuation_spacing=False,
            remove_diacritics=True,
            affix_spacing=False,
            token_based=True,
        )

    def normalize(self, txt):
        return self.hazm_normalizer.normalize(
            self.parsivar_normalizer.normalize(
                txt.replace("\n", " ").replace("\u200c", " ").lower().strip()
            )
        )

In [5]:
normalizer = MyNormalizer()
normalizer.normalize("34.0")

'۳۴ ٫۰'

## Load Data

In [6]:
class JsonFileIterator:
    def __init__(self, path):
        self.path = path
        self.f = open(path, "r")
        self.i = 0
        self.length = self.counter_lines()

    def __iter__(self):
        return self

    def __next__(self):
        line = self.f.readline()
        if not line:
            # End of file
            self.f.close()
            raise StopIteration
        self.i += 1
        return json.loads(line)

    def counter_lines(self):
        with open(self.path, "r") as f1:
            return sum(1 for _ in f1)

    def __len__(self):
        return self.length


In [8]:
normalizer = MyNormalizer()
search_data = JsonFileIterator("./data/torob-search-data_v1.jsonl")
queries = dict()
for search in tqdm_notebook(search_data):
    raw_query = search["raw_query"]
    normalized_query = normalizer.normalize(raw_query)
    if queries.get(normalized_query, -1) == -1:
        queries[normalized_query] = 1
    else:
        queries[normalized_query] += 1

  0%|          | 0/2499901 [00:00<?, ?it/s]

In [9]:
len(queries)

176427

In [11]:
agg_searches = defaultdict(
    lambda: dict(
        results=Counter(),
        clicks=Counter(),
    )
)

search_data = JsonFileIterator("./data/torob-search-data_v1.jsonl")

print("Aggregating searches based on raw query...")

for search in tqdm_notebook(search_data):
    raw_query = search["raw_query"]
    normalized_query = normalizer.normalize(raw_query)

    if queries[normalized_query] >= 30:
        results = search["result"][: np.max(search["clicked_rank"]) + 8]
        clicked_results = search["clicked_result"]
        agg_searches[normalized_query]["results"].update(results)
        agg_searches[normalized_query]["clicks"].update(clicked_results)

Aggregating searches based on raw query...


  0%|          | 0/2499901 [00:00<?, ?it/s]

In [12]:
len(agg_searches)

10435

<!--  -->

In [38]:
stopwords = []
with open('stop-words.txt', encoding='utf-8' ) as f:
    for line in f:
        stopwords.append(normalizer.normalize(line.strip()))

In [39]:
def find_new_words(titles_list, index_high_len):
    highest_len_title_words = titles_list[index_high_len].split()
    total_new_words = []
    for i in range(1, len(titles_list)):
        if i != index_high_len:
            words = titles_list[i].split()
            new_words = [word for word in words if word not in highest_len_title_words]
            new_words = [word for word in new_words if word not in stopwords]
            total_new_words += new_words
    return list(set(total_new_words))

<!--  -->

In [14]:
import pickle

In [18]:
agg_s = list(agg_searches.keys())

In [19]:
with open('./data/torob_list_search', 'wb') as fp:
    pickle.dump(agg_s, fp, protocol=pickle.HIGHEST_PROTOCOL)

<!--  -->

In [20]:
len(agg_s)

10435

In [21]:
len(agg_searches)

10435

In [34]:
agg_s[:10]

['ساعت هوشمند',
 'خط زن',
 'پرده پذیرایی',
 'ایفون ۱۳ پرو',
 'عینک افتابی',
 'لوازم ارایشی',
 'ترازو',
 'گوشیa ۳۰',
 'گوشی ایفون ۱۲',
 'گوشی سامسونگ']

In [35]:
agg_searches[agg_s[0]]

{'results': Counter({2459592: 9946,
          9391819: 10066,
          4229448: 9751,
          7824893: 9946,
          1670767: 6281,
          9901900: 9944,
          8563833: 9985,
          900897: 8435,
          7611444: 9175,
          7451228: 9630,
          1665693: 6760,
          6462973: 5363,
          8258186: 5771,
          4648687: 6373,
          232002: 5569,
          5815030: 4770,
          2342213: 3890,
          None: 6448,
          4111195: 4507,
          5289550: 3939,
          9083224: 4340,
          7231049: 3451,
          1509175: 3933,
          8572726: 4586,
          4943283: 3962,
          2931230: 3700,
          3995256: 3176,
          3065636: 2949,
          9023085: 3282,
          8277631: 2626,
          8476777: 3087,
          1803417: 2931,
          3870044: 2984,
          2262418: 2677,
          2575701: 2332,
          3200289: 7075,
          762244: 2223,
          951916: 4387,
          2404132: 1494,
          8058124: 3

<!--  -->

In [24]:
product_info = JsonFileIterator("./data/products-info_v1.jsonl")

<!--  -->

In [25]:
def read_json_lines(path, n_lines=None):
    """Creates a generator which reads and returns lines of
    a json lines file, one line at a time, each as a dictionary.

    This could be used as a memory-efficient alternative of `pandas.read_json`
    for reading a json lines file.
    """
    with open(path, 'r') as f:
        for i, line in enumerate(f):
            if n_lines == i:
                break
            yield json.loads(line)

In [26]:
product = pd.DataFrame(read_json_lines('./data/products-info_v1.jsonl'))

In [27]:
product = product.set_index("id")

In [28]:
len(product)

3612277

In [29]:
product.loc[1867826]

category_name                                   میکروسکوپ
titles           [میکروسکوپ اپتیکی سلسترون مدل 44121 CGL]
min_price                                       1900000.0
max_price                                       2082000.0
avg_price                                  1928314.393939
min_num_shops                                         1.0
max_num_shops                                         2.0
avg_num_shops                                    1.655303
Name: 1867826, dtype: object

In [30]:
product.loc[1867826]["titles"]

['میکروسکوپ اپتیکی سلسترون مدل 44121 CGL']

In [31]:
product.loc[1867826].min_price

1900000.0

<!--  -->

<!--  -->

<!--  -->

## Prepare data

In [None]:
with open("dataset_v3_2.txt", "w", encoding="utf-8", newline="") as csvfile:
    wrtiter = csv.writer(csvfile)
    wrtiter.writerow(
        [
            "query",
            "product_id",
            "p_des",
            "product_title_lowest_length",
            "category_name",
            "min_num_shops",
            "max_num_shops",
            "avg_num_shops",
            "min_price",
            "max_price",
            "avg_price",
            "mean_min_prices",
            "mean_max_prices",
            "mean_avg_prices",
            "std_min_prices",
            "std_max_prices",
            "std_avg_prices",
            "num_query",
            "res_clicks",
            "click",
            "candidate_score1",
            "candidate_score2",
            "clicks",
            "impressions",
            "ctr",
            "ctr_laplace_normalized",
            "click_per_maxclick",
            "ctr_normalized_multiplied_clicks_normalized",
        ]
    )
    # conn = sqlite3.connect("my_database.db")
    # c = conn.cursor()
    data_list = [] 
    for query in tqdm_notebook(agg_s):
        results = agg_searches[query]
        min_prices = []
        max_prices = []
        avg_prices = []

        for product_id, res_clicks in results["results"].most_common(60):
            if product_id != None:
                result_product = product.loc[product_id]
                
                # c.execute("SELECT * FROM products WHERE id = ?", (product_id,))
                # result_product = c.fetchone()
                
                if result_product[2] != None:
                    if result_product[2] != None:
                        min_prices.append(result_product[2])
                    if result_product[3] != None:
                        max_prices.append(result_product[3])
                    if result_product[4] != None:
                        avg_prices.append(result_product[4])
            mean_min_prices = np.mean(min_prices)
            mean_max_prices = np.mean(max_prices)
            mean_avg_prices = np.mean(avg_prices)

            std_min_prices = np.std(min_prices)
            std_max_prices = np.std(max_prices)
            std_avg_prices = np.std(avg_prices)

        for product_id, res_clicks in results["results"].most_common(60):
            if product_id != None:

                # c.execute("SELECT * FROM products WHERE id = ?", (product_id,))
                # result_product = c.fetchone()

                result_product = product.loc[product_id]
                
                ##################################################
                # print(result_product)
                ##################################################
                
                category_name = result_product[0]
                min_price = result_product[2]
                max_price = result_product[3]
                avg_price = result_product[4]

                titles_list_product = product.loc[product_id]["titles"]

                # titles_list_product = json.loads(result_product[2])

                if len(titles_list_product) > 0:
                    min_num_shops = result_product[5]
                    max_num_shops = result_product[6]
                    avg_num_shops = result_product[7]
                    highest_len_product = max(titles_list_product, key=len)
                    index_highest_len_product = titles_list_product.index(
                        highest_len_product
                    )
                    product_title_new_words = find_new_words(
                        titles_list_product, index_highest_len_product
                    )

                    # lowest length product title
                    product_title = min(titles_list_product, key=len)

                    p_des = " ".join(
                        [highest_len_product, " ".join(product_title_new_words)]
                    ).replace("\u200c", " ")

                    max_clicks = np.max(list(results["clicks"].values()))

                    ##### score 
                    candidate_score = results["clicks"].get(product_id, 0)
                    candidate_score1 = np.log2(candidate_score + 1)
                    candidate_score2 = np.log2(candidate_score + 1) / np.log2(
                        max_clicks + 1
                    )

                    # clicks, impressions, ctr, ctr_laplace_normalized, 
                    clicks = results["clicks"].get(product_id, 0)
                    impressions = results["results"].get(product_id, 0)
                    ctr = clicks / impressions
                    ctr_laplace_normalized = (clicks + 1) / (impressions + len(results["results"]))

                    # 
                    click_per_maxclick = clicks + 1 / max_clicks + 1
                    # 
                    ctr_normalized_multiplied_clicks_normalized = ctr_laplace_normalized * click_per_maxclick

                    wrtiter.writerow(
                        [
                            query,
                            product_id,
                            p_des,
                            product_title,
                            category_name,
                            min_num_shops,
                            max_num_shops,
                            avg_num_shops,
                            min_price,
                            max_price,
                            avg_price,
                            mean_min_prices,
                            mean_max_prices,
                            mean_avg_prices,
                            std_min_prices,
                            std_max_prices,
                            std_avg_prices,
                            queries[query],
                            res_clicks,
                            candidate_score,
                            candidate_score1,
                            candidate_score2,
                            clicks,
                            impressions,
                            ctr,
                            ctr_laplace_normalized,
                            click_per_maxclick,
                            ctr_normalized_multiplied_clicks_normalized
                        ]
                    )
    # conn.close()

df = pd.read_csv("dataset_v3_2.txt", sep=",")
df.head()

  0%|          | 0/10435 [00:00<?, ?it/s]

In [None]:
with open("dataset_v3_2.txt", "w", encoding="utf-8", newline="") as csvfile:
    wrtiter = csv.writer(csvfile)
    wrtiter.writerow(
        [
            "query",
            "product_id",
            "p_des",
            "product_title_lowest_length",
            "category_name",
            "min_num_shops",
            "max_num_shops",
            "avg_num_shops",
            "min_price",
            "max_price",
            "avg_price",
            "mean_min_prices",
            "mean_max_prices",
            "mean_avg_prices",
            "std_min_prices",
            "std_max_prices",
            "std_avg_prices",
            "num_query",
            "res_clicks",
            "click",
            "candidate_score1",
            "candidate_score2",
            "clicks",
            "impressions",
            "ctr",
            "ctr_laplace_normalized",
            "click_per_maxclick",
            "ctr_normalized_multiplied_clicks_normalized",
        ]
    )
    # conn = sqlite3.connect("my_database.db")
    # c = conn.cursor()
    data_list = [] 
    for query in tqdm_notebook(agg_s):
        results = agg_searches[query]
        min_prices = []
        max_prices = []
        avg_prices = []

        for product_id, res_clicks in results["results"].most_common(60):
            if product_id != None:
                result_product = product.loc[product_id]

                if result_product[2] != None:
                    if result_product[2] != None:
                        min_prices.append(result_product[2])
                    if result_product[3] != None:
                        max_prices.append(result_product[3])
                    if result_product[4] != None:
                        avg_prices.append(result_product[4])
            mean_min_prices = np.mean(min_prices)
            mean_max_prices = np.mean(max_prices)
            mean_avg_prices = np.mean(avg_prices)

            std_min_prices = np.std(min_prices)
            std_max_prices = np.std(max_prices)
            std_avg_prices = np.std(avg_prices)

        for product_id, res_clicks in results["results"].most_common(60):
            if product_id != None:

                result_product = product.loc[product_id]
                
                ##################################################
                # print(result_product)
                ##################################################
                
                category_name = result_product[0]
                min_price = result_product[2]
                max_price = result_product[3]
                avg_price = result_product[4]

                titles_list_product = product.loc[product_id]["titles"]

                # titles_list_product = json.loads(result_product[2])

                if len(titles_list_product) > 0:
                    min_num_shops = result_product[5]
                    max_num_shops = result_product[6]
                    avg_num_shops = result_product[7]
                    highest_len_product = max(titles_list_product, key=len)
                    index_highest_len_product = titles_list_product.index(
                        highest_len_product
                    )
                    product_title_new_words = find_new_words(
                        titles_list_product, index_highest_len_product
                    )

                    # lowest length product title
                    product_title = min(titles_list_product, key=len)

                    p_des = " ".join(
                        [highest_len_product, " ".join(product_title_new_words)]
                    ).replace("\u200c", " ")

                    max_clicks = np.max(list(results["clicks"].values()))

                    ##### score 
                    candidate_score = results["clicks"].get(product_id, 0)
                    candidate_score1 = np.log2(candidate_score + 1)
                    candidate_score2 = np.log2(candidate_score + 1) / np.log2(
                        max_clicks + 1
                    )

                    # clicks, impressions, ctr, ctr_laplace_normalized, 
                    clicks = results["clicks"].get(product_id, 0)
                    impressions = results["results"].get(product_id, 0)
                    ctr = clicks / impressions
                    ctr_laplace_normalized = (clicks + 1) / (impressions + len(results["results"]))

                    # 
                    click_per_maxclick = clicks + 1 / max_clicks + 1
                    # 
                    ctr_normalized_multiplied_clicks_normalized = ctr_laplace_normalized * click_per_maxclick

                    wrtiter.writerow(
                        [
                            query,
                            product_id,
                            p_des,
                            product_title,
                            category_name,
                            min_num_shops,
                            max_num_shops,
                            avg_num_shops,
                            min_price,
                            max_price,
                            avg_price,
                            mean_min_prices,
                            mean_max_prices,
                            mean_avg_prices,
                            std_min_prices,
                            std_max_prices,
                            std_avg_prices,
                            queries[query],
                            res_clicks,
                            candidate_score,
                            candidate_score1,
                            candidate_score2,
                            clicks,
                            impressions,
                            ctr,
                            ctr_laplace_normalized,
                            click_per_maxclick,
                            ctr_normalized_multiplied_clicks_normalized
                        ]
                    )


In [None]:
# save with joblib 
joblib.dump(df, "dataset_v3_2_query_more_than_equal_30.pkl")

In [None]:
copy = df.copy(deep=True)
# show 20 random rows
df.sample(20)

In [2]:
import joblib

In [3]:
df = joblib.load("dataset_v3_2_query_more_than_equal_30.pkl")

In [14]:
# show query with minimum number of products
df.groupby("query").size().sort_values(ascending=True).head(20)

query
moripods                    7
هیرویت                      8
کارتخوان af ۷۰             10
وگادول                     12
لرزه گیر ماشین لباسشویی    12
پرینتر ۱۳۵                 12
۱۲۴۰۰ f                    12
ضدافتاب لافارر             13
comfobuds ۲                13
۱۴۹                        13
liberty ۳ پرو              13
note ۱۱ پرو ۱۲۸            14
r ۸۶۰                      14
کپسول هیرویت               14
۱۳ pro ۲۵۶                 14
۰۱۸۰                       14
جیمیلای ۳۶۰۵               14
شیاومی ۱۱ تی پرو           14
t ۱۸                       14
کیسلکت                     14
dtype: int64

In [25]:
# show "moripods" products
df[df["query"] == "moripods"]

Unnamed: 0,query,product_id,p_des,product_title_lowest_length,category_name,min_num_shops,max_num_shops,avg_num_shops,min_price,max_price,...,impressions,ctr,ctr_laplace_normalized,click_per_maxclick,ctr_normalized_multiplied_clicks_normalized,max_ctr_laplace_normalized,ctr_laplace_normalized_by_max_ctr_laplace_normalized,ceil_ctr_laplace_normalized_by_max_ctr_laplace_normalized,ctr_laplace_normalized_by_max_ctr_laplace_normalized_multiplied_click_per_maxclick,ceil_ctr_laplace_normalized_by_max_ctr_laplace_normalized_multiplied_click_per_maxclick
358353,moripods,6921597,هندزفری بلوتوث هایلو مدل MoriPods T33 شیائومی ...,هایلو t33,هدفون، هدست و هندزفری,100.0,116.0,109.095077,530000.0,577000.0,...,63,0.936508,0.84507,60.016949,50.718549,0.84507,10.0,10.0,600.169492,601.0
358354,moripods,4739675,هدفون بی سیم شیائومی هایلو مدل Xiaomi Haylou M...,MoriPods anc,هدفون، هدست و هندزفری,22.0,41.0,32.780703,674000.0,768000.0,...,63,0.095238,0.098592,7.016949,0.691812,0.84507,1.166667,2.0,8.186441,9.0
358355,moripods,1816649,کاور سیلیکونی محافظ هندزفری بلوتوث هایلو Haylo...,کاور هندزفری haylou mori pods,کیف و کاور هدفون و هندزفری,3.0,5.0,4.320515,48000.0,80000.0,...,63,0.031746,0.042254,3.016949,0.127477,0.84507,0.5,1.0,1.508475,2.0
358356,moripods,4507768,کاور سیلیکونی محافظ هدفون هایلو Haylou MoriPods,کاور محافظ سیلیکونی هندزفری هایلو MoriPods T33,کیف و کاور هدفون و هندزفری,1.0,2.0,1.5315,70000.0,78000.0,...,61,0.016393,0.028986,2.016949,0.058462,0.84507,0.342995,1.0,0.691804,1.0
358357,moripods,1623862,کاور سیلیکونی کیس هایلو MoriPods,کاور سیلیکونی کیس هایلو MoriPods,کیف و کاور گوشی,1.0,1.0,1.0,125000.0,125000.0,...,59,0.016949,0.029851,2.016949,0.060207,0.84507,0.353234,1.0,0.712455,1.0
358358,moripods,872273,کاور کد M10 مناسب برای کیس Moripods T33,کاور کد M10 مناسب برای کیس Moripods T33,کیف و کاور گوشی,1.0,1.0,1.0,68040.0,119400.0,...,53,0.018868,0.032787,2.016949,0.066129,0.84507,0.387978,1.0,0.782532,1.0
358359,moripods,8289854,هدفون بلوتوثی هایلو مدل MoriPods Bluetooth V5....,هدفون بلوتوثی هایلو مدل MoriPods Bluetooth V5.2 E,هدفون، هدست و هندزفری,0.0,0.0,0.0,,,...,33,0.0,0.02439,1.016949,0.024804,0.84507,0.288618,1.0,0.29351,1.0


In [16]:
# get all products for query[0]
# df[df["query"] == df["query"][0]]

In [17]:
# add a column for ctr_laplace_normalized/max_ctr_laplace_normalized, note: group by query
max_ctr_laplace_normalized = df.groupby("query")["ctr_laplace_normalized"].max()
df["max_ctr_laplace_normalized"] = df["query"].map(max_ctr_laplace_normalized)

In [40]:
len(df)

603637

In [20]:
import numpy as np

In [None]:
df["ctr_laplace_normalized_by_max_ctr_laplace_normalized"] = ( df["ctr_laplace_normalized"] / df["max_ctr_laplace_normalized"] ) * 10

In [21]:
df["ceil_ctr_laplace_normalized_by_max_ctr_laplace_normalized"] = df["ctr_laplace_normalized_by_max_ctr_laplace_normalized"].apply(np.ceil)

In [22]:
# df["ctr_laplace_normalized"] / df["max_ctr_laplace_normalized"]
df["ctr_laplace_normalized_by_max_ctr_laplace_normalized"].head(20)


0     10.000000
1      4.371409
2      8.252018
3      3.384518
4      2.497263
5      8.804563
6      1.618161
7      1.159291
8      2.071292
9      5.247443
10     1.160673
11     1.127305
12     1.810594
13     0.745930
14     2.121837
15     1.101672
16     1.273197
17     1.689985
18     1.073321
19     3.562143
Name: ctr_laplace_normalized_by_max_ctr_laplace_normalized, dtype: float64

In [24]:
# add a column for ctr_laplace_normalized/max_ctr_laplace_normalized * click_per_maxclick
df["ctr_laplace_normalized_by_max_ctr_laplace_normalized_multiplied_click_per_maxclick"] = df["ctr_laplace_normalized_by_max_ctr_laplace_normalized"] * df["click_per_maxclick"]

df["ceil_ctr_laplace_normalized_by_max_ctr_laplace_normalized_multiplied_click_per_maxclick"] = df["ctr_laplace_normalized_by_max_ctr_laplace_normalized_multiplied_click_per_maxclick"].apply(np.ceil)

In [None]:
data_df["relevance"] = 0
for query in tqdm_notebook(data_df["query"].unique()):
    # get the products for the query
    products = data_df[data_df["query"] == query]["p_id"].unique()
    # get the clicks and impressions for the products
    clicks = data_df[data_df["query"] == query]["clicks"].values
    impressions = data_df[data_df["query"] == query]["impressions"].values
    # calculate ctr
    ctr = clicks / impressions
    # calculate max ctr
    max_ctr = np.max(ctr)
    # calculate relevance score
    relevance = np.ceil(4 * ctr / max_ctr)
    # add relevance score to data_df
    data_df.loc[data_df["query"] == query, "relevance"] = relevance

In [28]:
# create a list of all products which sorted by df["ctr_laplace_normalized_by_max_ctr_laplace_normalized_multiplied_click_per_maxclick"]
products = df.sort_values(by="ctr_laplace_normalized_by_max_ctr_laplace_normalized_multiplied_click_per_maxclick", ascending=False)["product_id"].unique()


In [43]:
products.shape

(313533,)

In [38]:
# what is product == 1856490
df[df["product_id"] == 1856490]

Unnamed: 0,query,product_id,p_des,product_title_lowest_length,category_name,min_num_shops,max_num_shops,avg_num_shops,min_price,max_price,...,impressions,ctr,ctr_laplace_normalized,click_per_maxclick,ctr_normalized_multiplied_clicks_normalized,max_ctr_laplace_normalized,ctr_laplace_normalized_by_max_ctr_laplace_normalized,ceil_ctr_laplace_normalized_by_max_ctr_laplace_normalized,ctr_laplace_normalized_by_max_ctr_laplace_normalized_multiplied_click_per_maxclick,ceil_ctr_laplace_normalized_by_max_ctr_laplace_normalized_multiplied_click_per_maxclick
7226,دوچرخه,1856490,دوچرخه تاشو 26 لندروور شش پره جدید - آبی لندرو...,Landrover,دوچرخه شهری سایز 26 و بالاتر,1.0,3.0,2.475798,5200000.0,5200000.0,...,38041,0.24237,0.218114,9221.000108,2011.231952,0.228833,9.531595,10.0,87890.8363,87891.0
13673,دوچرخه ۲۶ ارزان,1856490,دوچرخه تاشو 26 لندروور شش پره جدید - آبی لندرو...,Landrover,دوچرخه شهری سایز 26 و بالاتر,1.0,3.0,2.475798,5200000.0,5200000.0,...,55,0.054545,0.013201,4.090909,0.054005,0.041522,3.179318,4.0,13.006301,14.0
16146,دوچرخه ۲۶ دنده‌ای,1856490,دوچرخه تاشو 26 لندروور شش پره جدید - آبی لندرو...,Landrover,دوچرخه شهری سایز 26 و بالاتر,1.0,3.0,2.475798,5200000.0,5200000.0,...,407,0.17199,0.078801,71.003279,5.595153,0.247573,3.182956,4.0,226.000295,227.0
45136,دوچرخه ۲۶ دنده,1856490,دوچرخه تاشو 26 لندروور شش پره جدید - آبی لندرو...,Landrover,دوچرخه شهری سایز 26 و بالاتر,1.0,3.0,2.475798,5200000.0,5200000.0,...,113,0.19469,0.053488,23.015873,1.231082,0.116364,4.596657,5.0,105.796073,106.0
98178,دچرخه,1856490,دوچرخه تاشو 26 لندروور شش پره جدید - آبی لندرو...,Landrover,دوچرخه شهری سایز 26 و بالاتر,1.0,3.0,2.475798,5200000.0,5200000.0,...,746,0.327078,0.200655,245.004098,49.161347,0.200655,10.0,10.0,2450.040984,2451.0
104114,دوچرخه سایز ۲۶,1856490,دوچرخه تاشو 26 لندروور شش پره جدید - آبی لندرو...,Landrover,دوچرخه شهری سایز 26 و بالاتر,1.0,3.0,2.475798,5200000.0,5200000.0,...,352,0.221591,0.076625,79.011905,6.054258,0.08134,9.420323,10.0,744.317658,745.0
130233,دوجرخه,1856490,دوچرخه تاشو 26 لندروور شش پره جدید - آبی لندرو...,Landrover,دوچرخه شهری سایز 26 و بالاتر,1.0,3.0,2.475798,5200000.0,5200000.0,...,43,0.255814,0.031008,12.058824,0.373917,0.046512,6.666667,7.0,80.392157,81.0
132093,دوچرخه گالانت,1856490,دوچرخه تاشو 26 لندروور شش پره جدید - آبی لندرو...,Landrover,دوچرخه شهری سایز 26 و بالاتر,1.0,3.0,2.475798,5200000.0,5200000.0,...,37,0.27027,0.027708,11.022727,0.305416,0.098901,2.801567,3.0,30.880912,31.0
133648,دوچرخه ساحلی,1856490,دوچرخه تاشو 26 لندروور شش پره جدید - آبی لندرو...,Landrover,دوچرخه شهری سایز 26 و بالاتر,1.0,3.0,2.475798,5200000.0,5200000.0,...,58,0.12069,0.026059,8.02439,0.209105,0.129231,2.016442,3.0,16.180716,17.0
148274,دوچرخ,1856490,دوچرخه تاشو 26 لندروور شش پره جدید - آبی لندرو...,Landrover,دوچرخه شهری سایز 26 و بالاتر,1.0,3.0,2.475798,5200000.0,5200000.0,...,221,0.276018,0.105983,62.016393,6.572678,0.105983,10.0,10.0,620.163934,621.0


In [32]:
l =  np.unique(products, return_counts=True)

In [36]:
l[0][l[1] >= 1]

array([     13,      38,     115, ..., 9999898, 9999926, 9999983],
      dtype=int64)

In [44]:
df

Unnamed: 0,query,product_id,p_des,product_title_lowest_length,category_name,min_num_shops,max_num_shops,avg_num_shops,min_price,max_price,...,impressions,ctr,ctr_laplace_normalized,click_per_maxclick,ctr_normalized_multiplied_clicks_normalized,max_ctr_laplace_normalized,ctr_laplace_normalized_by_max_ctr_laplace_normalized,ceil_ctr_laplace_normalized_by_max_ctr_laplace_normalized,ctr_laplace_normalized_by_max_ctr_laplace_normalized_multiplied_click_per_maxclick,ceil_ctr_laplace_normalized_by_max_ctr_laplace_normalized_multiplied_click_per_maxclick
0,ساعت هوشمند,9391819,ساعت هوشمند مدل T55 WATCH به همراه یک عدد بند...,واچ T55,ساعت و مچ بند هوشمند,3.0,9.0,5.329130,245000.0,400000.0,...,10066,0.277667,0.204640,2796.000358,572.174266,0.204640,10.000000,10.0,27960.003578,27961.0
1,ساعت هوشمند,8563833,ساعت هوشمند شیائومی هایلو مدل Haylou LS02 Glob...,/,ساعت و مچ بند هوشمند,107.0,136.0,122.587388,527000.0,555000.0,...,9985,0.121582,0.089457,1215.000358,108.689842,0.204640,4.371409,5.0,5311.263633,5312.0
2,ساعت هوشمند,2459592,ساعت هوشمند شیائومی مدل Mibro Lite XPAW004 ا X...,mibro lite,ساعت و مچ بند هوشمند,114.0,141.0,127.041878,145000.0,930000.0,...,9946,0.229841,0.168870,2287.000358,386.204668,0.204640,8.252018,9.0,18872.369021,18873.0
3,ساعت هوشمند,7824893,سیلیکون Mi Smart Band 6 - 1.56 اینچ با صفحه نم...,/,ساعت و مچ بند هوشمند,149.0,178.0,163.859612,426000.0,859000.0,...,9946,0.094209,0.069261,938.000358,64.966723,0.204640,3.384518,4.0,3174.679336,3175.0
4,ساعت هوشمند,9901900,ساعت هوشمند شیائومی مدل Mibro X1 – نسخه گلوبال...,/,ساعت و مچ بند هوشمند,71.0,96.0,83.494704,969000.0,1125000.0,...,9944,0.069489,0.051104,692.000358,35.364024,0.204640,2.497263,3.0,1728.106790,1729.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
603632,galaxy watch ۵ پرو,4191284,ساعت هوشمند سامسونگ Samsung Galaxy Watch5 40mm...,Samsung Galaxy Watch5 40mm,ساعت و مچ بند هوشمند,0.0,31.0,19.313275,6745000.0,9700000.0,...,1,1.000000,0.071429,2.023810,0.144558,0.175439,4.071429,5.0,8.239796,9.0
603633,galaxy watch ۵ پرو,2073176,ساعت هوشمند سامسونگ مدل Galaxy Watch 5 Pro,ساعت هوشمند سامسونگ مدل Galaxy Watch 5 Pro,ساعت و مچ بند هوشمند,0.0,0.0,0.000000,,,...,1,0.000000,0.035714,1.023810,0.036565,0.175439,2.035714,3.0,2.084184,3.0
603634,galaxy watch ۵ پرو,6471449,ساعت هوشمند سامسونگ Galaxy Watch5 Pro 45mm مدل...,ساعت هوشمند سامسونگ Galaxy Watch5 Pro 45mm مدل...,ساعت و مچ بند هوشمند,0.0,0.0,0.000000,,,...,1,0.000000,0.035714,1.023810,0.036565,0.175439,2.035714,3.0,2.084184,3.0
603635,galaxy watch ۵ پرو,7175758,ساعت هوشمند گلکسی سامسونگ Samsung Galaxy Watch...,/,ساعت و مچ بند هوشمند,1.0,17.0,11.175274,3282200.0,4900000.0,...,1,0.000000,0.035714,1.023810,0.036565,0.175439,2.035714,3.0,2.084184,3.0


## Preprocess

In [41]:
c_model = "HooshvareLab/bert-fa-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(c_model)

NameError: name 'AutoTokenizer' is not defined

In [45]:
def preprocess(record):
    query = record["query"]
    p_des = normalizer.normalize(record["p_des"])
    category = record["category_name"]
    max_shop = int(np.log2(int(record["max_num_shops"]) + 1))
    popularity = "popularity is " + normalizer.normalize(str(max_shop))

    min_price = None
    # get avg of available prices
    if any([record["min_price"], record["max_price"], record["avg_price"]]):
        # get min price
        min_price = min(
            [
                record["min_price"],
                record["max_price"],
                record["avg_price"],
            ]
        )
    if min_price is not None:
        price = int(np.log2(int(min_price) + 1))
    else:
        price = int(np.log2(0 + 1))

    price_level = f"price level is {price}"


    encoded_text = tokenizer(
        query,
        category + " " + popularity + " " + " " + price_level + " " + p_des,
        truncation=True,
        max_length=512,
    )

    # label = record["candidate_score2"] * 10
    label = record["ctr_laplace_normalized_by_max_ctr_laplace_normalized"]

    return {
        "input_ids": encoded_text["input_ids"],
        "attention_mask": encoded_text["attention_mask"],
        "token_type_ids": encoded_text["token_type_ids"],
        "label": label,
    }


In [46]:
len(df)

603637

In [47]:
train_dataset = df.sample(frac=0.9, random_state=42)
test_dataset = df.drop(train_dataset.index)

train_dataset.shape, test_dataset.shape

((543273, 33), (60364, 33))

In [48]:
train_dataset = Dataset.from_pandas(train_dataset)
test_dataset = Dataset.from_pandas(test_dataset)

NameError: name 'Dataset' is not defined

In [None]:
train_dataset, test_dataset

: 

In [None]:
train_dataset = train_dataset.map(preprocess, remove_columns=train_dataset.column_names)
test_dataset = test_dataset.map(preprocess, remove_columns=test_dataset.column_names)

In [None]:
# save to disk
train_dataset.save_to_disk("train_dataset_v3_2_query_more_than_equal_30")
test_dataset.save_to_disk("test_dataset_v3_2_query_more_than_equal_30")
