In [None]:
!pip install --upgrade --no-cache-dir gdown

!pip install -q hazm
!pip install -q parsivar

!pip install -q datasets  --no-cache-dir
!pip install -q transformers  --no-cache-dir

In [None]:
# !pip install -q "datasets==2.10.1" # previously 2.9.0

In [None]:
import datasets
datasets.__version__ # '2.9.0'

In [None]:
import csv
import json
import pickle
import joblib
import sqlite3

from collections import defaultdict, Counter

import hazm
from parsivar import Normalizer

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm
from tqdm.notebook import tqdm_notebook

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import torch
from torch import nn

from datasets import (
    Dataset,
    DatasetDict,
    load_dataset,
    load_metric,
    load_from_disk,
    concatenate_datasets,
)
from transformers import AutoTokenizer, AutoModel
from transformers import AdamW

## Normalizer

In [None]:
class MyNormalizer:
    """Text normalizer that chains a parsivar normalizer with a hazm normalizer."""

    def __init__(self):
        """Build both underlying normalizers with fixed options."""
        parsivar_options = dict(
            statistical_space_correction=True,
            half_space_char=" ",
            pinglish_conversion_needed=True,
        )
        hazm_options = dict(
            remove_extra_spaces=True,
            persian_numbers=True,
            persian_style=True,
            punctuation_spacing=False,
            remove_diacritics=True,
            affix_spacing=False,
            token_based=True,
        )
        self.parsivar_normalizer = Normalizer(**parsivar_options)
        self.hazm_normalizer = hazm.Normalizer(**hazm_options)

    def normalize(self, txt):
        """Return ``txt`` after basic cleanup, parsivar and then hazm normalization.

        Cleanup replaces newlines and zero-width non-joiners (U+200C) with a space,
        lower-cases the text and strips surrounding whitespace.
        """
        cleaned = txt.replace("\n", " ")
        cleaned = cleaned.replace("\u200c", " ")
        cleaned = cleaned.lower().strip()
        after_parsivar = self.parsivar_normalizer.normalize(cleaned)
        return self.hazm_normalizer.normalize(after_parsivar)


In [None]:
normalizer = MyNormalizer()
normalizer.normalize("34.0")

## Load Data

In [None]:
class JsonFileIterator:
    """Single-pass iterator over the records of a JSON Lines file, with a known length.

    Each non-blank line is parsed with ``json.loads`` and returned as one item.
    ``len()`` reports the number of non-blank lines, counted once at construction,
    so the object can be handed to progress bars or ``pd.DataFrame``.

    The file is read as UTF-8. The handle is closed when the end of the file is
    reached, when ``close()`` is called, or when the object leaves a ``with`` block.
    """

    def __init__(self, path):
        """Count the records in ``path`` and open it for iteration."""
        self.path = path
        self.i = 0  # number of records returned so far
        self.length = self.counter_lines()
        self.f = open(path, "r", encoding="utf-8")

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next record; raise ``StopIteration`` once the file is exhausted.

        Calling ``next()`` again after exhaustion (or after ``close()``) keeps
        raising ``StopIteration`` instead of failing on the closed file.
        """
        while not self.f.closed:
            line = self.f.readline()
            if not line:
                # End of file
                self.f.close()
                break
            if line.strip():
                self.i += 1
                return json.loads(line)
            # Whitespace-only line: nothing to parse, keep reading
        raise StopIteration

    def counter_lines(self):
        """Return the number of non-blank lines in the file."""
        with open(self.path, "r", encoding="utf-8") as f1:
            return sum(1 for line in f1 if line.strip())

    def __len__(self):
        return self.length

    def close(self):
        """Close the underlying file handle; safe to call more than once."""
        if not self.f.closed:
            self.f.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False


<!--  -->

In [None]:
test_data = JsonFileIterator("./data/test-offline-data_v1.jsonl")
test_df = pd.DataFrame(test_data)

In [None]:
correction = joblib.load(f"./retrive/test_query/test_spell_checked.pkl")

In [None]:
# Swap every raw query for its spell-checked form when `correction` has one;
# queries without an entry are kept unchanged. One vectorised `map` replaces the
# row-by-row `iterrows()` / `.at` loop, which is very slow on large frames.
test_df["raw_query"] = test_df["raw_query"].map(
    lambda raw_query: correction[raw_query] if raw_query in correction else raw_query
)

In [None]:
test_df.to_csv('test-offline-data_v1-torob_v4_2.csv')

<!--  -->

In [None]:
search_data = JsonFileIterator("./data/torob-search-data_v1.jsonl")

In [None]:
search_df = pd.DataFrame(search_data)

In [None]:
search_df.iloc[0]

In [None]:
del correction

In [None]:
correction = joblib.load(f"spell_check-corrections.pkl")

In [None]:
len(correction) #75895

In [None]:
print("دوچرخ" in correction)

In [None]:
# Swap every raw query for its spell-checked form when `correction` has one;
# queries without an entry are kept unchanged. One vectorised `map` replaces the
# row-by-row `iterrows()` / `.at` loop, which is very slow on large frames.
search_df["raw_query"] = search_df["raw_query"].map(
    lambda raw_query: correction[raw_query] if raw_query in correction else raw_query
)

In [None]:
search_df[search_df["raw_query"] == "دوچرخه"].shape # (37035, 5)

In [None]:
search_df.to_csv('torob_v4_2-dataset.csv')

In [None]:
# Count how many searches map to each normalized query string.
normalizer = MyNormalizer()
queries = {}
for search in tqdm(search_df.to_dict('records')):
    key = normalizer.normalize(search["raw_query"])
    queries[key] = queries.get(key, 0) + 1

In [None]:
queries

In [None]:
# search_df.reset_index(inplace=True)

In [None]:
search_df['raw_query']

In [None]:
search_df

In [None]:
# List the available column labels. Indexing with an empty label, as this cell did
# before, raises KeyError (unless a column is literally named "") and stops Run All.
search_df.columns.tolist()

In [None]:
def _empty_aggregate():
    """Return fresh, empty counters for one query's shown results and clicks."""
    return {"results": Counter(), "clicks": Counter()}


# Maps a query to its counters; an entry is created on first access.
agg_searches = defaultdict(_empty_aggregate)

In [None]:
print("Aggregating searches based on raw query...")

# For every search whose normalized query is frequent enough, add its shown
# results and its clicked results to that query's counters in `agg_searches`.
for i, search in tqdm_notebook(search_df.iterrows(), total=len(search_df)):
    # These two statements were fused into one invalid line; restored here.
    raw_query = search["raw_query"]
    normalized_query = normalizer.normalize(raw_query)

    # Skip queries seen fewer than 10 times in the whole log
    if queries[normalized_query] >= 10:
        # Keep the result list only up to index max(clicked_rank) + 8
        results = search["result"][: np.max(search["clicked_rank"]) + 8]
        clicked_results = search["clicked_result"]
        agg_searches[normalized_query]["results"].update(results)
        agg_searches[normalized_query]["clicks"].update(clicked_results)

In [None]:
len(agg_searches) # 10435 # 33254

<!--  -->

In [None]:
# Normalized stop words, one per line of stop-words.txt.
stopwords = []
with open('stop-words.txt', encoding='utf-8') as f:
    stopwords.extend(normalizer.normalize(raw_line.strip()) for raw_line in f)

In [None]:
def find_new_words(titles_list, index_high_len):
    """Collect words from the other titles that the reference title does not contain.

    Args:
        titles_list: list of title strings; words are split on whitespace.
        index_high_len: position in ``titles_list`` of the reference title
            (the caller passes the index of the longest title).

    Returns:
        A list of unique words, in first-seen order, that occur in any title other
        than the reference one, are absent from the reference title, and are not
        in the module-level ``stopwords`` collection.
    """
    reference_words = set(titles_list[index_high_len].split())
    # A dict keeps insertion order, so the result is deterministic across runs
    # (the previous list(set(...)) order depended on string hashing).
    new_words = {}
    # Iterate from index 0: the previous range(1, ...) silently ignored the first
    # title whenever it was not the reference title.
    for i, title in enumerate(titles_list):
        if i == index_high_len:
            continue
        for word in title.split():
            if word not in reference_words and word not in stopwords:
                new_words[word] = None
    return list(new_words)

<!--  -->

In [None]:
agg_s = list(agg_searches.keys())

In [None]:
# with open('./data/torob_list_search', 'wb') as fp:
#     pickle.dump(agg_s, fp, protocol=pickle.HIGHEST_PROTOCOL)

<!--  -->

In [None]:
len(agg_s)

In [None]:
len(agg_searches)

In [None]:
agg_s[:10]

In [None]:
agg_searches[agg_s[0]]

<!--  -->

In [None]:
# NOTE(review): `product_info` is not referenced again in the visible notebook; the
# products file is loaded through `read_json_lines` a few cells further down.
# Constructing the iterator opens the file and counts its lines.
product_info = JsonFileIterator("./data/products-info_v1.jsonl")

<!--  -->

In [None]:
def read_json_lines(path, n_lines=None):
    """Lazily yield the records of a JSON Lines file, one dict per line.

    This is a memory-efficient alternative to ``pandas.read_json`` for a JSON
    Lines file: only one line is parsed at a time.

    Args:
        path: path of the JSON Lines file; it is read as UTF-8.
        n_lines: stop after this many lines; ``None`` (the default) reads the
            whole file.

    Yields:
        The object decoded by ``json.loads`` for each line.
    """
    # Explicit UTF-8: the platform default encoding is not UTF-8 everywhere, and
    # the notebook's other text files are opened with encoding="utf-8" too.
    with open(path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if n_lines == i:
                break
            yield json.loads(line)

In [None]:
product = pd.DataFrame(read_json_lines('./data/products-info_v1.jsonl'))

In [None]:
product = product.set_index("id")

In [None]:
len(product)

In [None]:
len(product.category_name.unique()) # 3569

In [None]:
product.loc[1867826]

In [None]:
product.loc[1867826]["titles"]

In [None]:
product.loc[1867826].min_price

<!--  -->

## Prepare data

In [None]:
def _mean_and_std(values):
    """Return ``(np.mean(values), np.std(values))``; ``(nan, nan)`` for an empty list.

    The explicit empty case yields the same NaN numpy would return, without the
    RuntimeWarning it emits for empty input.
    """
    if len(values) == 0:
        return np.nan, np.nan
    return np.mean(values), np.std(values)


# Write one CSV row per (query, product) pair for the 90 most frequently shown
# products of every aggregated query, then load the file back into `df`.
#
# Fields of a `product` row are read by position, exactly as before:
# 0 -> category_name, 2/3/4 -> min/max/avg price, 5/6/7 -> min/max/avg number of
# shops. This relies on the column order of `product`. `.iloc` is used because
# plain `series[int]` positional access on a labelled Series is deprecated.
with open("dataset_v4_2.txt", "w", encoding="utf-8", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(
        [
            "query",
            "product_id",
            "p_des",
            "product_title",
            "category_name",
            "min_num_shops",
            "max_num_shops",
            "avg_num_shops",
            "min_price",
            "max_price",
            "avg_price",
            "mean_min_prices",
            "mean_max_prices",
            "mean_avg_prices",
            "std_min_prices",
            "std_max_prices",
            "std_avg_prices",
            "num_query",
            "impression",
            "candidate_score1",
            "candidate_score2",
            "clicks",
            "max_clicks",
            "len_results",
            "impressions",
            "ctr",
            "max_shop_processed",
            "popularity",
        ]
    )

    for query in tqdm_notebook(agg_s):
        results = agg_searches[query]
        impression_counts = results["results"]
        click_counts = results["clicks"]

        # Look every candidate product up once and reuse the row in both passes.
        candidates = [
            (product_id, shown, product.loc[product_id])
            for product_id, shown in impression_counts.most_common(90)
            if product_id is not None
        ]

        # ---- pass 1: price statistics over all candidates of this query ----
        # NOTE(review): a missing price that was loaded as NaN is not None, so it
        # passes these checks and makes the query's mean/std NaN (same as before).
        min_prices = []
        max_prices = []
        avg_prices = []
        for product_id, shown, result_product in candidates:
            if result_product.iloc[2] is not None:
                min_prices.append(result_product.iloc[2])
                if result_product.iloc[3] is not None:
                    max_prices.append(result_product.iloc[3])
                if result_product.iloc[4] is not None:
                    avg_prices.append(result_product.iloc[4])

        # Computed once per query (previously recomputed on every loop iteration).
        mean_min_prices, std_min_prices = _mean_and_std(min_prices)
        mean_max_prices, std_max_prices = _mean_and_std(max_prices)
        mean_avg_prices, std_avg_prices = _mean_and_std(avg_prices)

        # Per-query constants shared by all rows of this query.
        max_clicks = max(click_counts.values(), default=0)
        len_result = len(impression_counts)  # number of distinct products shown

        # ---- pass 2: one output row per candidate that has at least one title ----
        for product_id, shown, result_product in candidates:
            titles_list_product = result_product["titles"]
            if len(titles_list_product) == 0:
                continue

            category_name = result_product.iloc[0]
            min_price = result_product.iloc[2]
            max_price = result_product.iloc[3]
            avg_price = result_product.iloc[4]
            min_num_shops = result_product.iloc[5]
            max_num_shops = result_product.iloc[6]
            avg_num_shops = result_product.iloc[7]

            # Longest title is the product title; words that only occur in the
            # other titles are appended to form the product description.
            product_title = max(titles_list_product, key=len)
            product_title_new_words = find_new_words(
                titles_list_product, titles_list_product.index(product_title)
            )
            p_des = " ".join(
                [product_title, " ".join(product_title_new_words)]
            ).replace("\u200c", " ")

            clicks = click_counts.get(product_id, 0)
            impressions = shown  # how often the product appeared in the kept results

            # Log-scaled clicks, raw and relative to the most-clicked product.
            candidate_score1 = np.log2(clicks + 1)
            if max_clicks:
                candidate_score2 = candidate_score1 / np.log2(max_clicks + 1)
            else:
                # No clicks recorded for this query: avoid 0/0
                candidate_score2 = 0.0

            ctr = clicks / impressions  # raw click-through rate, not smoothed

            max_shop_processed = normalizer.normalize(
                str(int(np.log2(int(max_num_shops) + 1)))
            )
            popularity = "popularity is " + max_shop_processed

            writer.writerow(
                [
                    query,
                    product_id,
                    p_des,
                    product_title,
                    category_name,
                    min_num_shops,
                    max_num_shops,
                    avg_num_shops,
                    min_price,
                    max_price,
                    avg_price,
                    mean_min_prices,
                    mean_max_prices,
                    mean_avg_prices,
                    std_min_prices,
                    std_max_prices,
                    std_avg_prices,
                    queries[query],
                    shown,  # "impression" column
                    candidate_score1,
                    candidate_score2,
                    clicks,
                    max_clicks,
                    len_result,  # number of products for the query
                    impressions,  # "impressions" column (same value as "impression")
                    ctr,
                    max_shop_processed,
                    popularity,
                ]
            )

df = pd.read_csv("dataset_v4_2.txt", sep=",")
df.head()

In [None]:
df.iloc[0]

In [None]:
df

In [None]:
joblib.dump(df, "dataset_v4_2.pkl") # len is 603637

In [None]:
# df = joblib.load("dataset_v4_2.pkl")

In [None]:
result_product = product.loc[7861059]
result_product

In [None]:
results = agg_searches["ساعت هوشمند"]
agg_searches["ساعت هوشمند"]

In [None]:
max(results["results"].values())

In [None]:
copy = df.copy(deep=True)
# df.iloc[226]

In [None]:
temp_df = copy.copy(deep=True)

In [None]:
temp_df

In [None]:
# Replace missing values with 0 in the per-product prices and the per-query price statistics.
cols_to_fill = [
    'min_price',
    'max_price',
    'avg_price',
    'mean_min_prices',
    'mean_max_prices',
    'mean_avg_prices',
    'std_min_prices',
    'std_max_prices',
    'std_avg_prices',
]
temp_df[cols_to_fill] = temp_df[cols_to_fill].fillna(0)


In [None]:
temp_df

In [None]:
# Add-one smoothed ratio of a product's clicks to the `max_clicks` value of its row:
# (clicks + 1) / (max_clicks + 1).
temp_df["clicks_by_max_clicks"] = (temp_df['clicks'] + 1) / (temp_df['max_clicks'] + 1)

In [None]:
temp_df

In [None]:
# Smoothed CTR, (clicks + 1) / (impressions + len_results), multiplied by the
# smoothed click share stored in `clicks_by_max_clicks`.
temp_df["ctr_normalized_by_click_normalized"] = ((temp_df['clicks'] + 1) / (temp_df['impressions'] + temp_df['len_results'])) * (temp_df['clicks_by_max_clicks'])

In [None]:
temp_df

In [None]:
# Smoothed CTR on its own: (clicks + 1) / (impressions + len_results).
temp_df["ctr_normalized"] = ((temp_df['clicks'] + 1) / (temp_df['impressions'] + temp_df['len_results']))

In [None]:
temp_df

In [None]:
grouped = temp_df.set_index("product_id").groupby("query")

In [None]:
grouped.get_group("دوچرخه")

In [None]:
# grouped = temp_df.groupby(["product_id", "query"]).agg({"ctr_normalized": "max"})

# grouped = grouped.reset_index()

# temp_df = temp_df.merge(grouped, on=["product_id", "query"], how="left")

In [None]:
# Largest smoothed CTR within each query, broadcast back to every row of that query.
# The string alias "max" selects pandas' built-in group maximum; passing the NumPy
# function `np.max` instead is deprecated in recent pandas (FutureWarning).
grouped = temp_df.groupby("query")
max_ctr = grouped["ctr_normalized"].transform("max")

In [None]:
temp_df = temp_df.reset_index(drop=True)

In [None]:
temp_df["max_ctr"] = max_ctr
temp_df

In [None]:
# Smoothed CTR divided by the `max_ctr` value of the row.
temp_df['ctr_by_max_ctr'] = temp_df['ctr_normalized'] / temp_df['max_ctr']

In [None]:
temp_df.iloc[0]

In [None]:
# Scale the relative CTR by a factor of 4.
temp_df['graded_ctr_norm'] = temp_df['ctr_by_max_ctr'] * 4

In [None]:
# temp_df.iloc[0]
temp_df

In [None]:
# Scaled CTR weighted by the smoothed click share; `preprocess` reads this column as the label.
temp_df['ctr_norm_by_max_click_norm'] = temp_df['graded_ctr_norm'] * temp_df['clicks_by_max_clicks']

In [None]:
temp_df

In [None]:
# # ctr_laplace_normalized = (clicks + 1) / (impressions + len(results["results"]))
# df["ctr_laplace_normalized_2"] = (df["clicks"] + 1) / (df["max_clicks"])

In [None]:
# df["click_per_maxclick"] = df["clicks"] / df["max_clicks"]

In [None]:
# df.shape

In [None]:
# df[df["clicks"]==0].shape

# len(copy)

In [None]:
# show query with minimum number of products
temp_df.groupby("query").size().sort_values(ascending=True).head(20)

In [None]:
# show "moripods" products
# df[df["query"] == "moripods"]

In [None]:
df = temp_df.copy(deep=True)

## Preprocess

In [None]:
c_model = "HooshvareLab/bert-fa-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(c_model)

In [None]:
def preprocess(record):
    """Tokenize one dataset row into a (query, product text) pair with its label.

    The second segment is built from the category name, the popularity phrase,
    a price-level phrase and the normalized product description. The price level
    is ``int(((avg_price - mean_avg_prices) / std_avg_prices + 2) * 5)``; when the
    average price is missing/NaN or the standard deviation is 0, the phrase is
    "price is none".

    Args:
        record: mapping with the keys "query", "p_des", "category_name",
            "popularity", "avg_price", "std_avg_prices", "mean_avg_prices" and
            "ctr_norm_by_max_click_norm".

    Returns:
        dict with "input_ids", "attention_mask", "token_type_ids" taken from the
        tokenizer output, and "label" taken from "ctr_norm_by_max_click_norm".
    """
    query = record["query"]
    description = normalizer.normalize(record["p_des"])
    category = record["category_name"]
    popularity = record["popularity"]
    price = record["avg_price"]
    price_std = record["std_avg_prices"]
    price_mean = record["mean_avg_prices"]

    has_price_level = (
        (price is not None) and (not np.isnan(price)) and (price_std != 0)
    )
    if not has_price_level:
        price_level = "price is none"
    else:
        z_score = (price - price_mean) / price_std
        bucket = int((z_score + 2) * 5)
        price_level = "price is " + normalizer.normalize(str(bucket))

    # The empty element reproduces the double space between popularity and price level.
    product_text = " ".join([category, popularity, "", price_level, description])

    encoded_text = tokenizer(
        query,
        product_text,
        truncation=True,
        max_length=512,
    )

    features = {
        key: encoded_text[key]
        for key in ("input_ids", "attention_mask", "token_type_ids")
    }
    features["label"] = record["ctr_norm_by_max_click_norm"]
    return features

In [None]:
# df = temp_df.copy(deep=True) 

In [None]:
len(df) # 603637

In [None]:
train_dataset = df.sample(frac=0.9, random_state=42)
test_dataset = df.drop(train_dataset.index)

train_dataset.shape, test_dataset.shape # ((543273, 35), (60364, 35))

In [None]:
train_dataset = Dataset.from_pandas(train_dataset)
test_dataset = Dataset.from_pandas(test_dataset)

In [None]:
train_dataset, test_dataset

In [None]:
train_dataset = train_dataset.map(preprocess, remove_columns=train_dataset.column_names)
test_dataset = test_dataset.map(preprocess, remove_columns=test_dataset.column_names)

In [None]:
# save to disk
train_dataset.save_to_disk("train_dataset_v4_2")
test_dataset.save_to_disk("test_dataset_v4_2")

In [None]:
%reset -f