In [None]:
!pip install --upgrade --no-cache-dir gdown

!pip install -q hazm
!pip install -q parsivar

!pip install -q datasets  --no-cache-dir
!pip install -q transformers  --no-cache-dir

In [None]:
# !pip install -q "datasets==2.10.1" # previously 2.9.0

In [1]:
import datasets
datasets.__version__ # '2.9.0'

'2.9.0'

In [2]:
import csv
import json
import pickle
import joblib
import sqlite3

from collections import defaultdict, Counter

import hazm
from parsivar import Normalizer

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm
from tqdm.notebook import tqdm_notebook

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import torch
from torch import nn

from datasets import (
    Dataset,
    DatasetDict,
    load_dataset,
    load_metric,
    load_from_disk,
    concatenate_datasets,
)
from transformers import AutoTokenizer, AutoModel
from transformers import AdamW

## Normalizer

In [4]:
class MyNormalizer:
    """Text normalizer that chains the parsivar normalizer and the hazm normalizer.

    The input is first cleaned (newlines and zero-width non-joiners become spaces,
    text is lower-cased and stripped), then passed through parsivar, and the
    parsivar output is finally passed through hazm.
    """

    def __init__(self):
        # Options are kept in dicts so both configurations can be read at a glance.
        parsivar_options = dict(
            statistical_space_correction=True,
            half_space_char=" ",
            pinglish_conversion_needed=True,
        )
        hazm_options = dict(
            remove_extra_spaces=True,
            persian_numbers=True,
            persian_style=True,
            punctuation_spacing=False,
            remove_diacritics=True,
            affix_spacing=False,
            token_based=True,
        )
        self.parsivar_normalizer = Normalizer(**parsivar_options)
        self.hazm_normalizer = hazm.Normalizer(**hazm_options)

    def normalize(self, txt):
        """Return `txt` cleaned and normalized by parsivar, then by hazm."""
        cleaned = txt.replace("\n", " ")
        cleaned = cleaned.replace("\u200c", " ")  # zero-width non-joiner -> plain space
        cleaned = cleaned.lower().strip()
        after_parsivar = self.parsivar_normalizer.normalize(cleaned)
        return self.hazm_normalizer.normalize(after_parsivar)


In [5]:
normalizer = MyNormalizer()
normalizer.normalize("34.0")

'۳۴ ٫۰'

## Load Data

In [6]:
class JsonFileIterator:
    """Single-pass iterator over the records of a JSON Lines file, with a known length.

    Each non-blank line of the file is parsed with `json.loads` and returned as one item.
    `len(iterator)` is the number of non-blank lines, counted once at construction time
    (this reads the whole file once), which lets progress bars show a total.

    The file is closed automatically when the last record has been consumed; use
    `close()` or a `with` block to release it earlier.

    Args:
        path: Path of the JSON Lines file.
        encoding: Text encoding used to read the file (UTF-8 by default, so the result
            does not depend on the platform's default encoding).
    """

    def __init__(self, path, encoding="utf-8"):
        self.path = path
        self.encoding = encoding
        self.i = 0  # number of records returned so far
        # Count before opening the iteration handle so a failure here leaks nothing.
        self.length = self.counter_lines()
        self.f = open(path, "r", encoding=encoding)

    def __iter__(self):
        return self

    def __next__(self):
        # Once exhausted (or closed), keep raising StopIteration as the iterator
        # protocol requires, instead of failing on the closed file.
        if self.f.closed:
            raise StopIteration
        for line in iter(self.f.readline, ""):
            if not line.strip():
                continue  # blank lines carry no record
            record = json.loads(line)
            self.i += 1
            return record
        # End of file
        self.close()
        raise StopIteration

    def close(self):
        """Close the underlying file; safe to call more than once."""
        self.f.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def counter_lines(self):
        """Return the number of non-blank lines in the file."""
        with open(self.path, "r", encoding=self.encoding) as f1:
            return sum(1 for line in f1 if line.strip())

    def __len__(self):
        return self.length


In [51]:
search_data = JsonFileIterator("./data/torob-search-data_v1.jsonl")

In [52]:
search_df = pd.DataFrame(search_data)

In [53]:
search_df.iloc[0]

raw_query                                           لوستر سقفی برنز
result            [7151290, 6462477, 7385791, 8451497, None, 269...
clicked_result                                            [9457219]
clicked_rank                                                   [16]
timestamp                          2022-07-24T09:21:58.752000+00:00
Name: 0, dtype: object

In [54]:
# NOTE: joblib.load unpickles the file, which can execute arbitrary code — only load a trusted file.
correction = joblib.load("spell_check-corrections.pkl")

In [55]:
len(correction) # 1775 in the recorded run (an older note here said 75895)

1775

In [None]:
# `correction_complete` is not defined anywhere in this notebook; the mapping loaded above is `correction`.
# NOTE(review): assumes `correction` is the mapping this check was meant for — confirm.
"دوچرخ" in correction

In [None]:
# Replace every raw query that has a known spelling correction; other queries are kept as they are.
# The rows of `search_df` itself are scanned (not another frame), so each lookup and each write
# refer to the same row.
# NOTE(review): assumes `correction` maps a misspelled query string to its corrected form — confirm.
search_df["raw_query"] = [
    correction[raw_query] if raw_query in correction else raw_query
    for raw_query in tqdm(search_df["raw_query"], total=len(search_df))
]

In [49]:
# search_df[search_df["raw_query"] == "دوچرخه"].shape (37035, 5)

(37035, 5)

In [None]:
# Number of searches per normalized query.
# NOTE(review): this cell referenced an undefined `search_query`; `search_df` is the frame that
# holds the `raw_query` column in this notebook — confirm it is the intended source.
queries = dict()
for query in tqdm(search_df["raw_query"], total=len(search_df)):
    normalized_query = normalizer.normalize(query)
    queries[normalized_query] = queries.get(normalized_query, 0) + 1

In [46]:
normalizer = MyNormalizer()
queries = dict()  # normalized query -> number of searches

# Count each distinct raw query first, then normalize only the distinct strings: the same raw
# query occurs many times, and normalizing it once per occurrence (and building a row Series per
# search via iterrows) is needlessly slow. Distinct raw queries that normalize to the same text
# are summed, so the totals are the same as counting row by row.
# NOTE(review): assumes normalizer.normalize returns the same output for the same input — confirm.
raw_query_counts = search_df["raw_query"].value_counts()

for raw_query, n_searches in tqdm_notebook(raw_query_counts.items(), total=len(raw_query_counts)):
    normalized_query = normalizer.normalize(raw_query)
    queries[normalized_query] = queries.get(normalized_query, 0) + int(n_searches)

0it [00:00, ?it/s]

In [None]:
queries

In [36]:
# search_df.reset_index(inplace=True)

In [43]:
search_df['raw_query']

0           لوستر سقفی برنز
1            قسیمت هلیکوپتر
2              سساعت هوشمند
3               تفلشون مایع
4                    خط زنن
                 ...       
2499896    مبل راحتی 9 نفره
2499897                 اپل
2499898           مبل چسبتر
2499899      ششلوار مردانهه
2499900    ساندویچ ساز میگل
Name: raw_query, Length: 2499901, dtype: object

In [None]:
def _empty_query_stats():
    """Return a fresh pair of independent counters for one query."""
    return {"results": Counter(), "clicks": Counter()}


# key -> {"results": Counter, "clicks": Counter}; an entry is created on first access.
agg_searches = defaultdict(_empty_query_stats)

In [30]:
print("Aggregating searches based on raw query...")

for _, row in tqdm_notebook(search_df.iterrows(), total=len(search_df)):
    key = normalizer.normalize(row["raw_query"])

    # Queries counted fewer than 30 times in `queries` are not aggregated.
    if queries[key] < 30:
        continue

    # Keep only the first max(clicked_rank) + 8 entries of the result list.
    cutoff = np.max(row["clicked_rank"]) + 8
    per_query = agg_searches[key]
    per_query["results"].update(row["result"][:cutoff])
    per_query["clicks"].update(row["clicked_result"])

Aggregating searches based on raw query...


  0%|          | 0/2499901 [00:00<?, ?it/s]

In [31]:
len(agg_searches) # 10435

10435

<!--  -->

In [32]:
# Stop words, one per line of the file, each stripped and passed through the normalizer.
stopwords = []
with open('stop-words.txt', encoding='utf-8' ) as stop_words_file:
    stopwords.extend(
        normalizer.normalize(raw_line.strip()) for raw_line in stop_words_file
    )

In [33]:
def find_new_words(titles_list, index_high_len, stop_words=None):
    """Collect the words of the other titles that the reference title does not contain.

    Every title except the one at `index_high_len` is split on whitespace; a word is kept
    when it is neither one of the reference title's words nor a stop word.

    Args:
        titles_list: List of title strings of one product.
        index_high_len: Index of the reference title (the caller passes the longest one).
        stop_words: Container of words to ignore. Defaults to the notebook-level
            `stopwords` list.

    Returns:
        The kept words without duplicates, in order of first appearance (so the result
        is the same on every run).
    """
    if stop_words is None:
        stop_words = stopwords

    reference_words = set(titles_list[index_high_len].split())

    # A dict is used as an ordered set: unique words, first-seen order.
    new_words = {}
    # Start at index 0: every title other than the reference one contributes, including
    # the first title of the list.
    for i, title in enumerate(titles_list):
        if i == index_high_len:
            continue
        for word in title.split():
            if word not in reference_words and word not in stop_words:
                new_words.setdefault(word, None)
    return list(new_words)

<!--  -->

In [34]:
agg_s = list(agg_searches.keys())

In [19]:
# with open('./data/torob_list_search', 'wb') as fp:
#     pickle.dump(agg_s, fp, protocol=pickle.HIGHEST_PROTOCOL)

<!--  -->

In [35]:
len(agg_s)

10435

In [36]:
len(agg_searches)

10435

In [37]:
agg_s[:10]

['ساعت هوشمند',
 'خط زن',
 'پرده پذیرایی',
 'ایفون ۱۳ پرو',
 'عینک افتابی',
 'لوازم ارایشی',
 'ترازو',
 'گوشیa ۳۰',
 'گوشی ایفون ۱۲',
 'گوشی سامسونگ']

In [None]:
agg_searches[agg_s[0]]

<!--  -->

In [39]:
product_info = JsonFileIterator("./data/products-info_v1.jsonl")

<!--  -->

In [40]:
def read_json_lines(path, n_lines=None, encoding="utf-8"):
    """Creates a generator which reads and returns lines of
    a json lines file, one line at a time, each as a dictionary.

    This could be used as a memory-efficient alternative of `pandas.read_json`
    for reading a json lines file.

    Args:
        path: Path of the json lines file.
        n_lines: Maximum number of records to yield; `None` reads the whole file.
        encoding: Text encoding used to read the file (UTF-8 by default, so the result
            does not depend on the platform's default encoding).

    Yields:
        The parsed content of each non-blank line. Blank lines are skipped and do not
        count towards `n_lines`.
    """
    with open(path, 'r', encoding=encoding) as f:
        n_yielded = 0
        for line in f:
            if n_lines == n_yielded:
                break
            if not line.strip():
                # A blank line (e.g. a trailing one) holds no record.
                continue
            yield json.loads(line)
            n_yielded += 1

In [41]:
product = pd.DataFrame(read_json_lines('./data/products-info_v1.jsonl'))

In [42]:
product = product.set_index("id")

In [43]:
len(product)

3612277

In [44]:
product.loc[1867826]

category_name                                   میکروسکوپ
titles           [میکروسکوپ اپتیکی سلسترون مدل 44121 CGL]
min_price                                       1900000.0
max_price                                       2082000.0
avg_price                                  1928314.393939
min_num_shops                                         1.0
max_num_shops                                         2.0
avg_num_shops                                    1.655303
Name: 1867826, dtype: object

In [45]:
product.loc[1867826]["titles"]

['میکروسکوپ اپتیکی سلسترون مدل 44121 CGL']

In [None]:
product.loc[1867826].min_price

<!--  -->

## Prepare data

In [None]:
def _mean_and_std(values):
    """Return (mean, population std) of `values`, or (nan, nan) when `values` is empty."""
    if not values:
        return np.nan, np.nan
    return np.mean(values), np.std(values)


# Column order of the generated CSV; the header and every data row follow it.
dataset_columns = [
    "query",
    "product_id",
    "p_des",
    "product_title",
    "category_name",
    "min_num_shops",
    "max_num_shops",
    "avg_num_shops",
    "min_price",
    "max_price",
    "avg_price",
    "mean_min_prices",
    "mean_max_prices",
    "mean_avg_prices",
    "std_min_prices",
    "std_max_prices",
    "std_avg_prices",
    "num_query",
    "impression",
    "candidate_score1",
    "candidate_score2",
    "clicks",
    "max_clicks",
    "len_results",
    "impressions",
    "ctr",
    "max_shop_processed",
    "popularity",
]

with open("dataset_v4_2.txt", "w", encoding="utf-8", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(dataset_columns)

    for query in tqdm_notebook(agg_s):
        results = agg_searches[query]

        # The 60 most frequently shown products of this query with their impression counts.
        # `None` ids are skipped. Each product row is looked up once and reused below.
        candidates = [
            (product_id, impressions, product.loc[product_id])
            for product_id, impressions in results["results"].most_common(60)
            if product_id is not None
        ]

        # Price statistics over the candidates of this query, computed once per query.
        # pd.notna rejects both None and NaN; a plain `!= None` test lets NaN through, which
        # would turn the mean/std of the whole query into NaN.
        min_prices, max_prices, avg_prices = [], [], []
        for _, _, info in candidates:
            if pd.notna(info["min_price"]):
                min_prices.append(info["min_price"])
                if pd.notna(info["max_price"]):
                    max_prices.append(info["max_price"])
                if pd.notna(info["avg_price"]):
                    avg_prices.append(info["avg_price"])

        mean_min_prices, std_min_prices = _mean_and_std(min_prices)
        mean_max_prices, std_max_prices = _mean_and_std(max_prices)
        mean_avg_prices, std_avg_prices = _mean_and_std(avg_prices)

        # Per-query values that do not depend on the product.
        clicks_counter = results["clicks"]
        max_clicks = max(clicks_counter.values(), default=0)
        log_max_clicks = np.log2(max_clicks + 1)
        len_result = len(results["results"])  # number of distinct results of the query

        for product_id, impressions, info in candidates:
            titles = info["titles"]
            if len(titles) == 0:
                # Products without any title are not written.
                continue

            # Product description: the longest title followed by the words that only
            # appear in the other titles.
            product_title = max(titles, key=len)
            extra_words = find_new_words(titles, titles.index(product_title))
            p_des = " ".join([product_title, " ".join(extra_words)]).replace("\u200c", " ")

            clicks = clicks_counter.get(product_id, 0)

            # Log-scaled click scores; the second one is relative to the most clicked product.
            candidate_score1 = np.log2(clicks + 1)
            candidate_score2 = candidate_score1 / log_max_clicks if log_max_clicks else 0.0

            # Raw click-through rate (not Laplace-smoothed).
            ctr = clicks / impressions

            max_num_shops = info["max_num_shops"]
            max_shop_processed = normalizer.normalize(
                str(int(np.log2(int(max_num_shops) + 1)))
            )
            popularity = "popularity is " + max_shop_processed

            writer.writerow(
                [
                    query,
                    product_id,
                    p_des,
                    product_title,
                    info["category_name"],
                    info["min_num_shops"],
                    max_num_shops,
                    info["avg_num_shops"],
                    info["min_price"],
                    info["max_price"],
                    info["avg_price"],
                    mean_min_prices,
                    mean_max_prices,
                    mean_avg_prices,
                    std_min_prices,
                    std_max_prices,
                    std_avg_prices,
                    queries[query],
                    impressions,  # "impression" column
                    candidate_score1,
                    candidate_score2,
                    clicks,
                    max_clicks,
                    len_result,
                    impressions,  # "impressions" column (same count)
                    ctr,
                    max_shop_processed,
                    popularity,
                ]
            )

df = pd.read_csv("dataset_v4_2.txt", sep=",")
df.head()

In [105]:
df.iloc[0]

query                                                       ساعت هوشمند
product_id                                                      9391819
p_des                 ساعت هوشمند مدل T55 WATCH به همراه  یک عدد بند...
product_title         ساعت هوشمند مدل T55 WATCH به همراه‌ یک عدد بند...
category_name                                      ساعت و مچ بند هوشمند
min_num_shops                                                       3.0
max_num_shops                                                       9.0
avg_num_shops                                                   5.32913
min_price                                                      245000.0
max_price                                                      400000.0
avg_price                                                 345821.466588
mean_min_prices                                          1709868.338983
mean_max_prices                                          2879725.423729
mean_avg_prices                                          2728835

In [68]:
joblib.dump(df, "dataset_v4_2.pkl") # len is 603637

['dataset_v3_2.pkl']

In [6]:
df = joblib.load("dataset_v4_2.pkl")

In [78]:
result_product = product.loc[7861059]
result_product

category_name                                      کیف و کاور گوشی
titles           [Green iPhone 13 Pro Max Hard Hibrido Shield C...
min_price                                                 328000.0
max_price                                                 345000.0
avg_price                                            344958.435208
min_num_shops                                                  4.0
max_num_shops                                                  5.0
avg_num_shops                                              4.97555
Name: 7861059, dtype: object

In [None]:
results = agg_searches["ساعت هوشمند"]
agg_searches["ساعت هوشمند"]

In [98]:
max(results["results"].values())

10066

In [106]:
copy = df.copy(deep=True)
# df.iloc[226]

In [None]:
cols_to_shift = copy.columns[copy.columns.get_loc('click'):]

In [None]:
copy[cols_to_shift] = copy[cols_to_shift].shift(1, axis=1)

In [None]:
copy.drop('click', axis=1, inplace=True)

In [None]:
copy

In [None]:
temp_df = copy.copy(deep=True)

In [None]:
temp_df

In [None]:
# Price and price-statistic columns whose missing values are replaced by 0.
cols_to_fill = [
    'min_price', 'max_price', 'avg_price',
    'mean_min_prices', 'mean_max_prices', 'mean_avg_prices',
    'std_min_prices', 'std_max_prices', 'std_avg_prices',
]
temp_df[cols_to_fill] = temp_df[cols_to_fill].fillna(0)


In [None]:
temp_df

In [None]:
temp_df["clicks_by_max_clicks"] = (temp_df['clicks'] + 1) / (temp_df['max_clicks'] + 1)

In [None]:
temp_df

In [None]:
temp_df["ctr_normalized_by_click_normalized"] = ((temp_df['clicks'] + 1) / (temp_df['impressions'] + temp_df['len_results'])) * (temp_df['clicks_by_max_clicks'])

In [None]:
temp_df

In [None]:
temp_df["ctr_normalized"] = ((temp_df['clicks'] + 1) / (temp_df['impressions'] + temp_df['len_results']))

In [None]:
temp_df

In [None]:
grouped = temp_df.set_index("product_id").groupby("query")

In [None]:
grouped.get_group("دوچرخه")

In [None]:
# # group by "product_id" and "query" columns and calculate max of "ctr_normalized" for each group
# grouped = temp_df.groupby(["product_id", "query"]).agg({"ctr_normalized": "max"})

# # reset index to turn the groupby result into a DataFrame
# grouped = grouped.reset_index()

# # merge the grouped DataFrame with the original temp_df DataFrame
# temp_df = temp_df.merge(grouped, on=["product_id", "query"], how="left")

In [None]:
# Highest ctr_normalized within each query, repeated on every row of that query.
# The string alias "max" selects the built-in groupby reduction; passing the NumPy callable
# (np.max) is deprecated in recent pandas versions.
grouped = temp_df.groupby("query")
max_ctr = grouped["ctr_normalized"].transform("max")

In [None]:
temp_df = temp_df.reset_index(drop=True)

In [None]:
temp_df["max_ctr"] = max_ctr
temp_df

Unnamed: 0,query,product_id,p_des,product_title,category_name,min_num_shops,max_num_shops,avg_num_shops,min_price,max_price,...,max_clicks,len_results,impressions,ctr,max_shop_processed,popularity,clicks_by_max_clicks,ctr_normalized_by_click_normalized,ctr_normalized,max_ctr
0,ساعت هوشمند,9391819,ساعت هوشمند مدل T55 WATCH به همراه یک عدد بند...,ساعت هوشمند مدل T55 WATCH به همراه‌ یک عدد بند...,ساعت و مچ بند هوشمند,3.0,9.0,5.329130,245000.0,400000.0,...,2795,3597,10066,0.277667,۳,popularity is ۳,1.000000,0.204640,0.204640,0.204640
1,ساعت هوشمند,8563833,ساعت هوشمند شیائومی هایلو مدل Haylou LS02 Glob...,ساعت هوشمند شیائومی هایلو مدل Haylou LS02 Glob...,ساعت و مچ بند هوشمند,107.0,136.0,122.587388,527000.0,555000.0,...,2795,3597,9985,0.121582,۷,popularity is ۷,0.434549,0.038873,0.089457,0.204640
2,ساعت هوشمند,2459592,ساعت هوشمند شیائومی مدل Mibro Lite XPAW004 ا X...,ساعت هوشمند شیائومی مدل Mibro Lite XPAW004 ا X...,ساعت و مچ بند هوشمند,114.0,141.0,127.041878,145000.0,930000.0,...,2795,3597,9946,0.229841,۷,popularity is ۷,0.817954,0.138128,0.168870,0.204640
3,ساعت هوشمند,7824893,سیلیکون Mi Smart Band 6 - 1.56 اینچ با صفحه نم...,سیلیکون Mi Smart Band 6 - 1.56 اینچ با صفحه نم...,ساعت و مچ بند هوشمند,149.0,178.0,163.859612,426000.0,859000.0,...,2795,3597,9946,0.094209,۷,popularity is ۷,0.335479,0.023236,0.069261,0.204640
4,ساعت هوشمند,9901900,ساعت هوشمند شیائومی مدل Mibro X1 – نسخه گلوبال...,ساعت هوشمند شیائومی مدل Mibro X1 – نسخه گلوبال...,ساعت و مچ بند هوشمند,71.0,96.0,83.494704,969000.0,1125000.0,...,2795,3597,9944,0.069489,۶,popularity is ۶,0.247496,0.012648,0.051104,0.204640
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
603632,galaxy watch ۵ پرو,4191284,ساعت هوشمند سامسونگ Samsung Galaxy Watch5 40mm...,ساعت هوشمند سامسونگ Samsung Galaxy Watch5 40mm...,ساعت و مچ بند هوشمند,0.0,31.0,19.313275,6745000.0,9700000.0,...,42,27,1,1.000000,۵,popularity is ۵,0.046512,0.003322,0.071429,0.175439
603633,galaxy watch ۵ پرو,2073176,ساعت هوشمند سامسونگ مدل Galaxy Watch 5 Pro,ساعت هوشمند سامسونگ مدل Galaxy Watch 5 Pro,ساعت و مچ بند هوشمند,0.0,0.0,0.000000,0.0,0.0,...,42,27,1,0.000000,۰,popularity is ۰,0.023256,0.000831,0.035714,0.175439
603634,galaxy watch ۵ پرو,6471449,ساعت هوشمند سامسونگ Galaxy Watch5 Pro 45mm مدل...,ساعت هوشمند سامسونگ Galaxy Watch5 Pro 45mm مدل...,ساعت و مچ بند هوشمند,0.0,0.0,0.000000,0.0,0.0,...,42,27,1,0.000000,۰,popularity is ۰,0.023256,0.000831,0.035714,0.175439
603635,galaxy watch ۵ پرو,7175758,ساعت هوشمند گلکسی سامسونگ Samsung Galaxy Watch...,ساعت هوشمند گلکسی سامسونگ Samsung Galaxy Watch...,ساعت و مچ بند هوشمند,1.0,17.0,11.175274,3282200.0,4900000.0,...,42,27,1,0.000000,۴,popularity is ۴,0.023256,0.000831,0.035714,0.175439


In [None]:
temp_df['ctr_by_max_ctr'] = temp_df['ctr_normalized'] / temp_df['max_ctr']

In [None]:
temp_df.iloc[0]

query                                                                       ساعت هوشمند
product_id                                                                      9391819
p_des                                 ساعت هوشمند مدل T55 WATCH به همراه  یک عدد بند...
product_title                         ساعت هوشمند مدل T55 WATCH به همراه‌ یک عدد بند...
category_name                                                      ساعت و مچ بند هوشمند
min_num_shops                                                                       3.0
max_num_shops                                                                       9.0
avg_num_shops                                                                   5.32913
min_price                                                                      245000.0
max_price                                                                      400000.0
avg_price                                                                 345821.466588
mean_min_prices                 

In [None]:
temp_df['graded_ctr_norm'] = temp_df['ctr_by_max_ctr'] * 10

In [None]:
# temp_df.iloc[0]
temp_df

Unnamed: 0,query,product_id,p_des,product_title,category_name,min_num_shops,max_num_shops,avg_num_shops,min_price,max_price,...,impressions,ctr,max_shop_processed,popularity,clicks_by_max_clicks,ctr_normalized_by_click_normalized,ctr_normalized,max_ctr,ctr_by_max_ctr,graded_ctr_norm
0,ساعت هوشمند,9391819,ساعت هوشمند مدل T55 WATCH به همراه یک عدد بند...,ساعت هوشمند مدل T55 WATCH به همراه‌ یک عدد بند...,ساعت و مچ بند هوشمند,3.0,9.0,5.329130,245000.0,400000.0,...,10066,0.277667,۳,popularity is ۳,1.000000,0.204640,0.204640,0.204640,1.000000,10.000000
1,ساعت هوشمند,8563833,ساعت هوشمند شیائومی هایلو مدل Haylou LS02 Glob...,ساعت هوشمند شیائومی هایلو مدل Haylou LS02 Glob...,ساعت و مچ بند هوشمند,107.0,136.0,122.587388,527000.0,555000.0,...,9985,0.121582,۷,popularity is ۷,0.434549,0.038873,0.089457,0.204640,0.437141,4.371409
2,ساعت هوشمند,2459592,ساعت هوشمند شیائومی مدل Mibro Lite XPAW004 ا X...,ساعت هوشمند شیائومی مدل Mibro Lite XPAW004 ا X...,ساعت و مچ بند هوشمند,114.0,141.0,127.041878,145000.0,930000.0,...,9946,0.229841,۷,popularity is ۷,0.817954,0.138128,0.168870,0.204640,0.825202,8.252018
3,ساعت هوشمند,7824893,سیلیکون Mi Smart Band 6 - 1.56 اینچ با صفحه نم...,سیلیکون Mi Smart Band 6 - 1.56 اینچ با صفحه نم...,ساعت و مچ بند هوشمند,149.0,178.0,163.859612,426000.0,859000.0,...,9946,0.094209,۷,popularity is ۷,0.335479,0.023236,0.069261,0.204640,0.338452,3.384518
4,ساعت هوشمند,9901900,ساعت هوشمند شیائومی مدل Mibro X1 – نسخه گلوبال...,ساعت هوشمند شیائومی مدل Mibro X1 – نسخه گلوبال...,ساعت و مچ بند هوشمند,71.0,96.0,83.494704,969000.0,1125000.0,...,9944,0.069489,۶,popularity is ۶,0.247496,0.012648,0.051104,0.204640,0.249726,2.497263
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
603632,galaxy watch ۵ پرو,4191284,ساعت هوشمند سامسونگ Samsung Galaxy Watch5 40mm...,ساعت هوشمند سامسونگ Samsung Galaxy Watch5 40mm...,ساعت و مچ بند هوشمند,0.0,31.0,19.313275,6745000.0,9700000.0,...,1,1.000000,۵,popularity is ۵,0.046512,0.003322,0.071429,0.175439,0.407143,4.071429
603633,galaxy watch ۵ پرو,2073176,ساعت هوشمند سامسونگ مدل Galaxy Watch 5 Pro,ساعت هوشمند سامسونگ مدل Galaxy Watch 5 Pro,ساعت و مچ بند هوشمند,0.0,0.0,0.000000,0.0,0.0,...,1,0.000000,۰,popularity is ۰,0.023256,0.000831,0.035714,0.175439,0.203571,2.035714
603634,galaxy watch ۵ پرو,6471449,ساعت هوشمند سامسونگ Galaxy Watch5 Pro 45mm مدل...,ساعت هوشمند سامسونگ Galaxy Watch5 Pro 45mm مدل...,ساعت و مچ بند هوشمند,0.0,0.0,0.000000,0.0,0.0,...,1,0.000000,۰,popularity is ۰,0.023256,0.000831,0.035714,0.175439,0.203571,2.035714
603635,galaxy watch ۵ پرو,7175758,ساعت هوشمند گلکسی سامسونگ Samsung Galaxy Watch...,ساعت هوشمند گلکسی سامسونگ Samsung Galaxy Watch...,ساعت و مچ بند هوشمند,1.0,17.0,11.175274,3282200.0,4900000.0,...,1,0.000000,۴,popularity is ۴,0.023256,0.000831,0.035714,0.175439,0.203571,2.035714


In [None]:
temp_df['ctr_norm_by_max_click_norm'] = temp_df['graded_ctr_norm'] * temp_df['clicks_by_max_clicks']

In [None]:
temp_df

Unnamed: 0,query,product_id,p_des,product_title,category_name,min_num_shops,max_num_shops,avg_num_shops,min_price,max_price,...,ctr,max_shop_processed,popularity,clicks_by_max_clicks,ctr_normalized_by_click_normalized,ctr_normalized,max_ctr,ctr_by_max_ctr,graded_ctr_norm,ctr_norm_by_max_click_norm
0,ساعت هوشمند,9391819,ساعت هوشمند مدل T55 WATCH به همراه یک عدد بند...,ساعت هوشمند مدل T55 WATCH به همراه‌ یک عدد بند...,ساعت و مچ بند هوشمند,3.0,9.0,5.329130,245000.0,400000.0,...,0.277667,۳,popularity is ۳,1.000000,0.204640,0.204640,0.204640,1.000000,10.000000,10.000000
1,ساعت هوشمند,8563833,ساعت هوشمند شیائومی هایلو مدل Haylou LS02 Glob...,ساعت هوشمند شیائومی هایلو مدل Haylou LS02 Glob...,ساعت و مچ بند هوشمند,107.0,136.0,122.587388,527000.0,555000.0,...,0.121582,۷,popularity is ۷,0.434549,0.038873,0.089457,0.204640,0.437141,4.371409,1.899593
2,ساعت هوشمند,2459592,ساعت هوشمند شیائومی مدل Mibro Lite XPAW004 ا X...,ساعت هوشمند شیائومی مدل Mibro Lite XPAW004 ا X...,ساعت و مچ بند هوشمند,114.0,141.0,127.041878,145000.0,930000.0,...,0.229841,۷,popularity is ۷,0.817954,0.138128,0.168870,0.204640,0.825202,8.252018,6.749773
3,ساعت هوشمند,7824893,سیلیکون Mi Smart Band 6 - 1.56 اینچ با صفحه نم...,سیلیکون Mi Smart Band 6 - 1.56 اینچ با صفحه نم...,ساعت و مچ بند هوشمند,149.0,178.0,163.859612,426000.0,859000.0,...,0.094209,۷,popularity is ۷,0.335479,0.023236,0.069261,0.204640,0.338452,3.384518,1.135436
4,ساعت هوشمند,9901900,ساعت هوشمند شیائومی مدل Mibro X1 – نسخه گلوبال...,ساعت هوشمند شیائومی مدل Mibro X1 – نسخه گلوبال...,ساعت و مچ بند هوشمند,71.0,96.0,83.494704,969000.0,1125000.0,...,0.069489,۶,popularity is ۶,0.247496,0.012648,0.051104,0.204640,0.249726,2.497263,0.618064
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
603632,galaxy watch ۵ پرو,4191284,ساعت هوشمند سامسونگ Samsung Galaxy Watch5 40mm...,ساعت هوشمند سامسونگ Samsung Galaxy Watch5 40mm...,ساعت و مچ بند هوشمند,0.0,31.0,19.313275,6745000.0,9700000.0,...,1.000000,۵,popularity is ۵,0.046512,0.003322,0.071429,0.175439,0.407143,4.071429,0.189369
603633,galaxy watch ۵ پرو,2073176,ساعت هوشمند سامسونگ مدل Galaxy Watch 5 Pro,ساعت هوشمند سامسونگ مدل Galaxy Watch 5 Pro,ساعت و مچ بند هوشمند,0.0,0.0,0.000000,0.0,0.0,...,0.000000,۰,popularity is ۰,0.023256,0.000831,0.035714,0.175439,0.203571,2.035714,0.047342
603634,galaxy watch ۵ پرو,6471449,ساعت هوشمند سامسونگ Galaxy Watch5 Pro 45mm مدل...,ساعت هوشمند سامسونگ Galaxy Watch5 Pro 45mm مدل...,ساعت و مچ بند هوشمند,0.0,0.0,0.000000,0.0,0.0,...,0.000000,۰,popularity is ۰,0.023256,0.000831,0.035714,0.175439,0.203571,2.035714,0.047342
603635,galaxy watch ۵ پرو,7175758,ساعت هوشمند گلکسی سامسونگ Samsung Galaxy Watch...,ساعت هوشمند گلکسی سامسونگ Samsung Galaxy Watch...,ساعت و مچ بند هوشمند,1.0,17.0,11.175274,3282200.0,4900000.0,...,0.000000,۴,popularity is ۴,0.023256,0.000831,0.035714,0.175439,0.203571,2.035714,0.047342


In [None]:
# grouped[grouped.apply(lambda x: np.max(x['ctr_normalized'])) == 'دوچرخه']

In [None]:
# df = pd.read_csv("dataset_v3_2.txt", sep=",")
# df

In [None]:
# # ctr_laplace_normalized = (clicks + 1) / (impressions + len(results["results"]))
# df["ctr_laplace_normalized_2"] = (df["clicks"] + 1) / (df["max_clicks"])

In [None]:
# df["click_per_maxclick"] = df["clicks"] / df["max_clicks"]

In [None]:
# df.shape

(585, 31)

In [None]:
# df[df["clicks"]==0].shape

(61, 31)

# len(copy)

In [None]:
# show query with minimum number of products
df.groupby("query").size().sort_values(ascending=True).head(20)

query
گوشی سامسونگ     51
ایفون ۱۳ پرو     59
ترازو            59
خط زن            59
ساعت هوشمند      59
عینک افتابی      59
لوازم ارایشی     59
پرده پذیرایی     60
گوشی ایفون ۱۲    60
گوشیa ۳۰         60
dtype: int64

In [None]:
# show "moripods" products
# df[df["query"] == "moripods"]

In [None]:
# add a column for ctr_laplace_normalized/max_ctr_laplace_normalized, note: group by query
# max_ctr_laplace_normalized = df.groupby("query")["ctr_laplace_normalized"].max()
# df["max_ctr_laplace_normalized"] = df["query"].map(max_ctr_laplace_normalized)

In [None]:
# len(df)

In [None]:
# df["ctr_laplace_normalized_by_max_ctr_laplace_normalized"] = ( df["ctr_laplace_normalized"] / df["max_ctr_laplace_normalized"] ) * 10

In [None]:
# df["ceil_ctr_laplace_normalized_by_max_ctr_laplace_normalized"] = df["ctr_laplace_normalized_by_max_ctr_laplace_normalized"].apply(np.ceil)

In [None]:
# df["ctr_laplace_normalized_by_max_ctr_laplace_normalized"].head(20)


0     10.000000
1      4.371409
2      8.252018
3      3.384518
4      2.497263
5      8.804563
6      1.618161
7      1.159291
8      2.071292
9      5.247443
10     1.160673
11     1.127305
12     1.810594
13     0.745930
14     2.121837
15     1.101672
16     1.273197
17     1.689985
18     1.073321
19     3.562143
Name: ctr_laplace_normalized_by_max_ctr_laplace_normalized, dtype: float64

In [None]:
# df["ctr_laplace_normalized_by_max_ctr_laplace_normalized_multiplied_click_per_maxclick"] = df["ctr_laplace_normalized_by_max_ctr_laplace_normalized"] * df["click_per_maxclick"]

# df["ceil_ctr_laplace_normalized_by_max_ctr_laplace_normalized_multiplied_click_per_maxclick"] = df["ctr_laplace_normalized_by_max_ctr_laplace_normalized_multiplied_click_per_maxclick"].apply(np.ceil)

In [None]:
df = temp_df.copy(deep=True)

## Preprocess

In [None]:
c_model = "HooshvareLab/bert-fa-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(c_model)

In [None]:
def preprocess(record, label_column="ctr_norm_by_max_click_norm"):
    """Encode one (query, product) row into tokenizer outputs plus a label.

    The query is passed to the tokenizer as the first text; the second text is
    built from the category name, the popularity string, a coarse price bucket
    and the normalized product description.

    Args:
        record: Mapping for a single row. Reads the keys "query", "p_des",
            "category_name", "popularity", "avg_price", "std_avg_prices",
            "mean_avg_prices" and ``label_column``.
        label_column: Key of the value returned under "label". Defaults to
            "ctr_norm_by_max_click_norm", the column this cell originally used.

    Returns:
        dict with "input_ids", "attention_mask" and "token_type_ids" taken
        from the tokenizer output, and "label".
    """
    query = record["query"]
    p_des = normalizer.normalize(record["p_des"])
    category = record["category_name"]
    popularity = record["popularity"]
    avg_price = record["avg_price"]
    std_avg_prices = record["std_avg_prices"]
    mean_avg_prices = record["mean_avg_prices"]

    # Fall back to "none" unless a usable bucket can be computed. Any of the
    # three price statistics may be missing (None or NaN), not only avg_price;
    # without these guards a missing std/mean would raise on the division or
    # inside int().
    price_level = "price is none"
    stats_present = all(
        value is not None for value in (avg_price, mean_avg_prices, std_avg_prices)
    )
    if stats_present and std_avg_prices != 0:
        # Shift the z-score by +2 and scale by 5 (z = -2 -> 0, z = 0 -> 10);
        # int() truncates toward zero.
        scaled = (((avg_price - mean_avg_prices) / std_avg_prices) + 2) * 5
        if np.isfinite(scaled):
            price_level = "price is " + normalizer.normalize(str(int(scaled)))

    # The doubled space between popularity and price_level is kept exactly as
    # in the original concatenation.
    product_text = category + " " + popularity + " " + " " + price_level + " " + p_des

    encoded_text = tokenizer(
        query,
        product_text,
        truncation=True,
        max_length=512,
    )

    return {
        "input_ids": encoded_text["input_ids"],
        "attention_mask": encoded_text["attention_mask"],
        "token_type_ids": encoded_text["token_type_ids"],
        "label": record[label_column],
    }

In [None]:
# df = temp_df.copy(deep=True) 

In [None]:
# Row count of `df` right before it is split into train/test.
len(df) # 603637

603637

In [None]:
# Seeded 90/10 split: draw 90% of the rows for training and keep every row
# whose index label was not drawn as the test set.
train_dataset = df.sample(frac=0.9, random_state=42)
test_dataset = df.drop(index=train_dataset.index)

(train_dataset.shape, test_dataset.shape)  # ((543273, 35), (60364, 35))

((543273, 35), (60364, 35))

In [None]:
# Convert both pandas splits into Hugging Face `Dataset` objects (the names are rebound in place).
train_dataset = Dataset.from_pandas(df=train_dataset)
test_dataset = Dataset.from_pandas(df=test_dataset)

In [None]:
# Display both converted datasets (feature names and row counts).
train_dataset, test_dataset

(Dataset({
     features: ['query', 'product_id', 'p_des', 'product_title', 'category_name', 'min_num_shops', 'max_num_shops', 'avg_num_shops', 'min_price', 'max_price', 'avg_price', 'mean_min_prices', 'mean_max_prices', 'mean_avg_prices', 'std_min_prices', 'std_max_prices', 'std_avg_prices', 'num_query', 'impression', 'candidate_score1', 'candidate_score2', 'clicks', 'max_clicks', 'len_results', 'impressions', 'ctr', 'max_shop_processed', 'popularity', 'clicks_by_max_clicks', 'ctr_normalized_by_click_normalized', 'ctr_normalized', 'max_ctr', 'ctr_by_max_ctr', 'graded_ctr_norm', 'ctr_norm_by_max_click_norm', '__index_level_0__'],
     num_rows: 543273
 }),
 Dataset({
     features: ['query', 'product_id', 'p_des', 'product_title', 'category_name', 'min_num_shops', 'max_num_shops', 'avg_num_shops', 'min_price', 'max_price', 'avg_price', 'mean_min_prices', 'mean_max_prices', 'mean_avg_prices', 'std_min_prices', 'std_max_prices', 'std_avg_prices', 'num_query', 'impression', 'candidate_sco

In [None]:
# Run `preprocess` over every row; all original columns are removed, so only
# the keys returned by `preprocess` remain in each split.
train_dataset = train_dataset.map(
    function=preprocess,
    remove_columns=train_dataset.column_names,
)
test_dataset = test_dataset.map(
    function=preprocess,
    remove_columns=test_dataset.column_names,
)

  0%|          | 0/543273 [00:00<?, ?ex/s]

  0%|          | 0/60364 [00:00<?, ?ex/s]

In [None]:
# save to disk
train_dataset.save_to_disk("train_dataset_v4")
test_dataset.save_to_disk("test_dataset_v4")

Saving the dataset (0/1 shards):   0%|          | 0/543273 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/60364 [00:00<?, ? examples/s]

In [None]:
def preprocess(record, label_column="clicks_by_max_clicks"):
    """Encode one (query, product) row into tokenizer outputs plus a label.

    NOTE: this cell redefines (shadows) the `preprocess` defined in an earlier
    cell; this version labels rows with "clicks_by_max_clicks" by default.

    The query is passed to the tokenizer as the first text; the second text is
    built from the category name, the popularity string, a coarse price bucket
    and the normalized product description.

    Args:
        record: Mapping for a single row. Reads the keys "query", "p_des",
            "category_name", "popularity", "avg_price", "std_avg_prices",
            "mean_avg_prices" and ``label_column``.
        label_column: Key of the value returned under "label".

    Returns:
        dict with "input_ids", "attention_mask" and "token_type_ids" taken
        from the tokenizer output, and "label".
    """
    query = record["query"]
    p_des = normalizer.normalize(record["p_des"])
    category = record["category_name"]
    popularity = record["popularity"]

    avg_price = record["avg_price"]
    std_avg_prices = record["std_avg_prices"]
    mean_avg_prices = record["mean_avg_prices"]

    # Fall back to "none" unless a usable bucket can be computed. Any of the
    # three price statistics may be missing (None or NaN), not only avg_price;
    # without these guards a missing std/mean would raise on the division or
    # inside int().
    price_level = "price is none"
    stats_present = all(
        value is not None for value in (avg_price, mean_avg_prices, std_avg_prices)
    )
    if stats_present and std_avg_prices != 0:
        # Shift the z-score by +2 and scale by 5 (z = -2 -> 0, z = 0 -> 10);
        # int() truncates toward zero.
        scaled = (((avg_price - mean_avg_prices) / std_avg_prices) + 2) * 5
        if np.isfinite(scaled):
            price_level = "price is " + normalizer.normalize(str(int(scaled)))

    # The doubled space between popularity and price_level is kept exactly as
    # in the original concatenation.
    product_text = category + " " + popularity + " " + " " + price_level + " " + p_des

    encoded_text = tokenizer(
        query,
        product_text,
        truncation=True,
        max_length=512,
    )

    return {
        "input_ids": encoded_text["input_ids"],
        "attention_mask": encoded_text["attention_mask"],
        "token_type_ids": encoded_text["token_type_ids"],
        "label": record[label_column],
    }

In [None]:
# Row count of `df` right before it is split into train/test.
len(df)

603637

In [None]:
# Seeded 90/10 split. The seed makes the split reproducible across kernel
# restarts and matches the random_state=42 used by the earlier split cell, so
# (as long as `df` is unchanged in between) both saved dataset variants hold
# out the same rows.
train_dataset = df.sample(frac=0.9, random_state=42)
test_dataset = df.drop(train_dataset.index)

train_dataset.shape, test_dataset.shape

((543273, 35), (60364, 35))

In [None]:
# Convert both pandas splits into Hugging Face `Dataset` objects (the names are rebound in place).
train_dataset = Dataset.from_pandas(df=train_dataset)
test_dataset = Dataset.from_pandas(df=test_dataset)

In [None]:
# Display both converted datasets (feature names and row counts).
train_dataset, test_dataset

(Dataset({
     features: ['query', 'product_id', 'p_des', 'product_title', 'category_name', 'min_num_shops', 'max_num_shops', 'avg_num_shops', 'min_price', 'max_price', 'avg_price', 'mean_min_prices', 'mean_max_prices', 'mean_avg_prices', 'std_min_prices', 'std_max_prices', 'std_avg_prices', 'num_query', 'impression', 'candidate_score1', 'candidate_score2', 'clicks', 'max_clicks', 'len_results', 'impressions', 'ctr', 'max_shop_processed', 'popularity', 'clicks_by_max_clicks', 'ctr_normalized_by_click_normalized', 'ctr_normalized', 'max_ctr', 'ctr_by_max_ctr', 'graded_ctr_norm', 'ctr_norm_by_max_click_norm', '__index_level_0__'],
     num_rows: 543273
 }),
 Dataset({
     features: ['query', 'product_id', 'p_des', 'product_title', 'category_name', 'min_num_shops', 'max_num_shops', 'avg_num_shops', 'min_price', 'max_price', 'avg_price', 'mean_min_prices', 'mean_max_prices', 'mean_avg_prices', 'std_min_prices', 'std_max_prices', 'std_avg_prices', 'num_query', 'impression', 'candidate_sco

In [None]:
# Run the currently defined `preprocess` over every row; all original columns
# are removed, so only the keys returned by `preprocess` remain in each split.
train_dataset = train_dataset.map(
    function=preprocess,
    remove_columns=train_dataset.column_names,
)
test_dataset = test_dataset.map(
    function=preprocess,
    remove_columns=test_dataset.column_names,
)

  0%|          | 0/543273 [00:00<?, ?ex/s]

  0%|          | 0/60364 [00:00<?, ?ex/s]

In [None]:
# Write both tokenized splits to local directories.
train_dataset.save_to_disk(dataset_path="train_dataset_v4_model_3")
test_dataset.save_to_disk(dataset_path="test_dataset_v4_model_3")

Saving the dataset (0/1 shards):   0%|          | 0/543273 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/60364 [00:00<?, ? examples/s]

In [None]:
# Clear the kernel namespace; -f skips the confirmation prompt.
%reset -f