In [1]:
from collections import Counter, defaultdict

import warnings

import csv
import json
import sqlite3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from parsivar import Normalizer
import hazm

from tqdm.notebook import tqdm_notebook

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import torch
from torch import nn

from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModel


In [2]:
class MyNormalizer:
    """Two-stage text normalizer that chains parsivar and hazm.

    The input is first flattened with plain string operations, then passed
    through the parsivar normalizer, and the result is finally passed through
    the hazm normalizer.
    """

    def __init__(self):
        # Options are kept in dicts so each library's configuration can be
        # read at a glance; they are forwarded unchanged as keyword arguments.
        parsivar_options = {
            "statistical_space_correction": True,
            "half_space_char": " ",
            "pinglish_conversion_needed": True,
        }
        hazm_options = {
            "remove_extra_spaces": True,
            "persian_numbers": True,
            "persian_style": True,
            "punctuation_spacing": False,
            "remove_diacritics": True,
            "affix_spacing": False,
            "token_based": True,
        }
        self.parsivar_normalizer = Normalizer(**parsivar_options)
        self.hazm_normalizer = hazm.Normalizer(**hazm_options)

    def normalize(self, txt):
        """Return the normalized form of ``txt``.

        Newlines and zero-width non-joiners (U+200C) are replaced by a plain
        space, the text is lower-cased and stripped, and the result is run
        through parsivar and then hazm.
        """
        flattened = txt.replace("\n", " ").replace("\u200c", " ")
        prepared = flattened.lower().strip()
        after_parsivar = self.parsivar_normalizer.normalize(prepared)
        return self.hazm_normalizer.normalize(after_parsivar)


In [3]:
class JsonFileIterator:
    """Stream a JSON Lines file one record at a time, with a known length.

    Each non-blank line of the file is parsed with ``json.loads`` and returned
    from ``__next__``. ``len(iterator)`` reports the number of records, which
    lets progress bars show a total.

    Attributes:
        path: path of the JSON Lines file.
        f: the file handle currently used for streaming.
        i: number of records returned during the current pass.
        length: number of non-blank lines in the file.
    """

    def __init__(self, path):
        self.path = path
        # Count before opening the streaming handle so that a failure while
        # counting does not leave an open handle behind.
        self.length = self.counter_lines()
        # JSON text is UTF-8; do not rely on the platform default encoding.
        self.f = open(path, "r", encoding="utf-8")
        self.i = 0

    def __iter__(self):
        # Once a pass has finished (or close() was called) the handle is
        # closed. Start over from the first line so that re-running a notebook
        # cell that loops over this object works instead of failing on a
        # closed file. A partially consumed pass is continued, not restarted.
        if self.f.closed:
            self.f = open(self.path, "r", encoding="utf-8")
            self.i = 0
        return self

    def __next__(self):
        # Iterator protocol: after exhaustion every further call must keep
        # raising StopIteration (reading a closed file would raise ValueError).
        if self.f.closed:
            raise StopIteration
        # readline() returns "" only at end of file.
        for line in iter(self.f.readline, ""):
            if line.strip():
                self.i += 1
                return json.loads(line)
            # Blank lines (e.g. a trailing newline) carry no record; skip them.
        self.f.close()
        raise StopIteration

    def counter_lines(self):
        """Return the number of non-blank lines (records) in the file."""
        with open(self.path, "r", encoding="utf-8") as f1:
            return sum(1 for line in f1 if line.strip())

    def close(self):
        """Release the underlying file handle; safe to call more than once."""
        self.f.close()

    def __len__(self):
        return self.length

In [4]:
def find_new_words(titles_list, index_high_len, stop_words=None):
    """Collect words from the other titles that the reference title lacks.

    Args:
        titles_list: list of title strings for one product.
        index_high_len: index in ``titles_list`` of the reference title
            (callers pass the longest title).
        stop_words: optional iterable of words to ignore. When omitted, the
            module-level ``stopwords`` list is used.

    Returns:
        A list of distinct words that occur in some title other than the
        reference one, are not in the reference title and are not stop words.
        Words are ordered by first appearance, so the result is reproducible
        from run to run.
    """
    if stop_words is None:
        # Resolved at call time, so the stop-word cell only has to run before
        # the first call, not before this definition.
        stop_words = stopwords

    # Sets give O(1) membership tests instead of scanning lists per word.
    excluded = set(titles_list[index_high_len].split())
    excluded.update(stop_words)

    # A dict keeps insertion order and de-duplicates at the same time.
    new_words = {}
    # Every title is visited, including index 0: the reference title is
    # excluded by the explicit index comparison, not by the loop's start.
    for position, title in enumerate(titles_list):
        if position == index_high_len:
            continue
        for word in title.split():
            if word not in excluded:
                new_words.setdefault(word, None)
    return list(new_words)

In [5]:
# Build the stop-word list with the same normalizer that is applied to the
# queries, so both sides are compared in normalized form.
stopwords = []
normalizer = MyNormalizer()
with open("stop-words.txt", encoding="utf-8") as stopword_file:
    # One stop word per line; surrounding whitespace is stripped before
    # normalizing.
    stopwords.extend(
        normalizer.normalize(raw_line.strip()) for raw_line in stopword_file
    )

In [None]:
# Line-by-line iterator over the product catalogue file.
# NOTE(review): constructing JsonFileIterator opens the file immediately and
# scans it once to count lines; `product_info` is not referenced by any later
# cell visible in this notebook, so this cell may be removable -- confirm.
product_info = JsonFileIterator("./data/products-info_v1.jsonl")

In [None]:
def read_json_lines(path, n_lines=None, encoding="utf-8"):
    """Lazily read a JSON Lines file, yielding one parsed record per line.

    This could be used as a memory-efficient alternative of `pandas.read_json`
    for reading a json lines file.

    Args:
        path: path of the JSON Lines file.
        n_lines: stop after this many records; ``None`` reads the whole file.
        encoding: text encoding of the file. Defaults to UTF-8 rather than the
            platform default, which is not UTF-8 everywhere.

    Yields:
        The object decoded by ``json.loads`` for each non-blank line.
    """
    with open(path, "r", encoding=encoding) as f:
        yielded = 0
        for line in f:
            if n_lines == yielded:
                break
            if not line.strip():
                # Blank lines (e.g. a trailing newline) are not records and
                # would make json.loads raise.
                continue
            yield json.loads(line)
            yielded += 1

In [None]:
# Load the whole product catalogue into memory: read_json_lines yields one
# dict per line of the file and pandas turns each dict into a row.
product = pd.DataFrame(read_json_lines('./data/products-info_v1.jsonl'))

In [None]:
# Promote the product id to the index so `product.loc[product_id]` is a direct
# label lookup. The guard makes the cell idempotent: once "id" is the index it
# is no longer a column, and calling set_index("id") again raises KeyError.
if "id" in product.columns:
    product = product.set_index("id")
elif product.index.name != "id":
    # Neither a column nor the index: fail here rather than later in `.loc`.
    raise KeyError("product has no 'id' column to use as index")

<!--  -->

In [None]:
# Streams the offline test records one JSON line at a time. The export loop
# further down iterates over it, and its __len__ gives the progress bar a total.
test_data = JsonFileIterator("./data/test-offline-data_v1.jsonl")

In [None]:
def get_min_price(record):
    """Return the smallest available price of a product record.

    Args:
        record: mapping-like object (dict or pandas Series) with the keys
            ``"min_price"``, ``"max_price"`` and ``"avg_price"``.

    Returns:
        The minimum of the price fields that are present, or ``None`` when
        every field is missing or zero.
    """
    candidates = (record["min_price"], record["max_price"], record["avg_price"])
    # Missing values must be dropped before calling min(): None cannot be
    # compared with numbers, and NaN (how pandas stores a missing number) is
    # truthy and makes the result of min() depend on argument order.
    available = [price for price in candidates if not pd.isna(price)]
    # Same rule as before for "nothing usable": all remaining values are falsy.
    if not any(available):
        return None
    return min(available)

In [8]:
# Header of the exported file; each data row below follows this order.
OUTPUT_COLUMNS = [
    "row_number",
    "query",
    "product_id",
    "p_des",
    "category_name",
    "min_num_shops",
    "max_num_shops",
    "avg_num_shops",
    "min_price",
    "max_price",
    "avg_price",
    "mean_min_prices",
    "mean_max_prices",
    "mean_avg_prices",
    "std_min_prices",
    "std_max_prices",
    "std_avg_prices",
    "price_level",
]


def _mean_and_std(values):
    """Return ``(mean, std)`` of ``values``, or ``(nan, nan)`` for an empty list.

    The explicit empty case gives the same NaN result as NumPy but without its
    empty-slice RuntimeWarnings.
    """
    if not values:
        return np.nan, np.nan
    return np.mean(values), np.std(values)


# Export one CSV row per (query, candidate product) pair of the test set.
# For every test record the price statistics are computed over all of its
# candidates first, then one row is written for each candidate that has at
# least one title.
with open("test_dataset_torob_v3_2.txt", "w", encoding="utf-8", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(OUTPUT_COLUMNS)

    for idx, test_rec in enumerate(tqdm_notebook(test_data)):
        # The normalized query is identical for every candidate of this
        # record, so it is computed once instead of once per written row.
        normalized_query = normalizer.normalize(test_rec["raw_query"])

        # Look every candidate up once and reuse the row in both passes.
        candidates = [
            (product_id, product.loc[product_id])
            for product_id in test_rec["result_not_ranked"]
            if product_id is not None
        ]

        # Pass 1: gather the prices of all candidates of this query.
        # Fields are read by position (.iloc) exactly as before: 0 is used as
        # the category name, 2/3/4 as min/max/avg price and 5/6/7 as
        # min/max/avg shop counts -- this depends on the column order of
        # `product`.
        min_prices, max_prices, avg_prices = [], [], []
        for _, row in candidates:
            # pd.notna treats both None and NaN as missing; a plain
            # `!= None` test lets NaN through and turns every statistic of
            # the query into NaN.
            if pd.notna(row.iloc[2]):
                min_prices.append(row.iloc[2])
                if pd.notna(row.iloc[3]):
                    max_prices.append(row.iloc[3])
                if pd.notna(row.iloc[4]):
                    avg_prices.append(row.iloc[4])

        # Computed once per query, after all prices have been collected.
        mean_min_prices, std_min_prices = _mean_and_std(min_prices)
        mean_max_prices, std_max_prices = _mean_and_std(max_prices)
        mean_avg_prices, std_avg_prices = _mean_and_std(avg_prices)

        # Pass 2: write one row per candidate that has titles.
        for product_id, row in candidates:
            titles = row["titles"]
            if len(titles) == 0:
                continue

            # Product description = longest title followed by the words that
            # only occur in the other titles.
            longest_title = max(titles, key=len)
            extra_words = find_new_words(titles, titles.index(longest_title))
            p_des = " ".join([longest_title, " ".join(extra_words)]).replace(
                "\u200c", " "
            )

            # Price level = floor(log2(price + 1)); 0 when no positive price
            # is available.
            cheapest = get_min_price(row)
            if cheapest is not None and cheapest > 0:
                price_level = int(np.log2(int(cheapest) + 1))
            else:
                price_level = 0

            writer.writerow(
                [
                    idx,
                    normalized_query,
                    product_id,
                    p_des,
                    row.iloc[0],  # category_name
                    row.iloc[5],  # min_num_shops
                    row.iloc[6],  # max_num_shops
                    row.iloc[7],  # avg_num_shops
                    row.iloc[2],  # min_price
                    row.iloc[3],  # max_price
                    row.iloc[4],  # avg_price
                    mean_min_prices,
                    mean_max_prices,
                    mean_avg_prices,
                    std_min_prices,
                    std_max_prices,
                    std_avg_prices,
                    price_level,
                ]
            )

  0%|          | 0/23140 [00:00<?, ?it/s]

In [None]:
# Read the exported dataset back for inspection. The file is comma-separated
# (it is written with csv.writer) despite its .txt extension.
df = pd.read_csv("test_dataset_torob_v3_2.txt")

In [None]:
# 

In [None]:
# Download the weights file `torobv3_bert_epoch-3_1st.w` into the current directory.
# NOTE(review): the URL embeds a long opaque token in its path -- this looks
# like a session credential of a Kaggle notebook proxy. If so it should not be
# committed, and it is presumably short-lived, which would make this cell
# non-reproducible. Confirm and move the URL to configuration.
!wget https://kkb-production.jupyter-proxy.kaggle.net/k/121749410/eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2IiwidHlwIjoiSldUIn0..29XB1I5T3_TLKwAe_WqY7Q.ZYbku6W-CUIjUa4EdNr59IoVeaAG1K5-G3q4YcfDzWz_s9E5ys6pXOnxSAiWKMpS4v6RO-UvQWT-KHlTsFqgTbSZU9lKSogRXgVjzVOvksfyk55_zeK6pizUv0hh6rxUooYHlPsymAqz2R4BRVm9jMqHZd_4Tm4ien3sQ73MQFDWiOY_xoV69R-LZNnlfqplstml3Cy9VkZfWRAbag9A6w.Mbb_mG6vQ0U4maSp3wdGOQ/proxy/files/torobv3_bert_epoch-3_1st.w

# Mount Google Drive at /content/drive.
# NOTE(review): `google.colab` is imported here rather than in the import cell
# at the top; presumably it is only available when running on Colab -- confirm.
from google.colab import drive
drive.mount('/content/drive')

# Copy the downloaded weights to the Drive root. The source path assumes the
# download above ran with /content as the working directory -- TODO confirm.
!rsync -rhPu --info=progress2 "/content/torobv3_bert_epoch-3_1st.w" "/content/drive/MyDrive/"