In [35]:
import pandas as pd
import numpy as np
import json
from parsivar import Normalizer
import hazm
from collections import defaultdict, Counter
from tqdm.notebook import tqdm_notebook
from transformers import AutoTokenizer, AutoModel
from datasets import Dataset, DatasetDict
from collections import Counter, defaultdict
from sklearn.model_selection import train_test_split
import torch
from torch import nn
from sklearn.metrics import accuracy_score
import sqlite3
import csv
import matplotlib.pyplot as plt

In [36]:
class MyNormalizer:
    def __init__(self):
        self.parsivar_normalizer = Normalizer(
            statistical_space_correction=True,
            half_space_char=" ",
            pinglish_conversion_needed=True,
        )
        self.hazm_normalizer = hazm.Normalizer(
            remove_extra_spaces=True,
            persian_numbers=True,
            persian_style=True,
            punctuation_spacing=False,
            remove_diacritics=True,
            affix_spacing=False,
            token_based=True,
        )

    def normilize(self, txt):
        return self.hazm_normalizer.normalize(
            self.parsivar_normalizer.normalize(
                txt.replace("\n", " ").replace("\u200c", " ").lower().strip()
            )
        )


In [37]:
class JsonFileIterator:
    def __init__(self, path):
        self.path = path
        self.f = open(path, "r")
        self.i = 0
        self.length = self.counter_lines()

    def __iter__(self):
        return self

    def __next__(self):
        line = self.f.readline()
        if not line:
            # End of file
            self.f.close()
            raise StopIteration
        self.i += 1
        return json.loads(line)

    def counter_lines(self):
        with open(self.path, "r") as f1:
            return sum(1 for _ in f1)

    def __len__(self):
        return self.length


In [38]:
normalizer = MyNormalizer()
search_data = JsonFileIterator("./data/torob-search-data_v1.jsonl")
product_res = dict()
clicked_result = dict()
queries = dict()

for search in tqdm_notebook(search_data):
    for pid in search["result"]:
        if product_res.get(pid, -1) == -1:
            product_res[pid] = 1
        else:
            product_res[pid] += 1
    for pid in search["clicked_result"]:
        if clicked_result.get(pid, -1) == -1:
            clicked_result[pid] = 1
        else:
            clicked_result[pid] += 1
    raw_query = search["raw_query"]
    normalized_query = normalizer.normilize(raw_query)
    if queries.get(normalized_query, -1) == -1:
        queries[normalized_query] = 1
    else:
        queries[normalized_query] += 1

  0%|          | 0/2499901 [00:00<?, ?it/s]

In [None]:
# average of clicked result
print("average of clicked result: ", sum(clicked_result.values()) / len(clicked_result))

In [None]:
agg_searches = defaultdict(
    lambda: dict(
        results=Counter(),
        clicks=Counter(),
    )
)

search_data = JsonFileIterator("./torob-search-data_v1.jsonl")
print("Aggregating searches based on raw query...")

for search in tqdm_notebook(search_data):
    raw_query = search['raw_query']
    normalized_query = normalizer.normilize(raw_query)

    if queries[normalized_query] > 50:
        results = search['result'][:np.max(search['clicked_rank']) + 8]
        clicked_results = search['clicked_result']
        agg_searches[normalized_query]['results'].update(results)
        agg_searches[normalized_query]['clicks'].update(clicked_results)

In [42]:
len(queries)

176427

In [None]:
len()

In [None]:
# average click
avg_click = 0
for query in agg_searches:
    avg_click += sum(agg_searches[query]['clicks'].values())

In [43]:
test_data = JsonFileIterator("./data/test-offline-data_v1.jsonl")
with open('predictions.txt', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    for idx , row in enumerate(tqdm_notebook(test_data)):
        result_not_ranked = row["result_not_ranked"]
        result_list = []
        for res in result_not_ranked:
            # score is (click + 1 / impression + avg(clicks)) * (click)
            score = (clicked_result.get(res , 0) + 1) / (product_res.get(res , 0) + avg_click) * clicked_result.get(res , 0)
            item = (res ,score)
            result_list.append(item)
        sorted_list = sorted(result_list, key=lambda x: x[1] , reverse=True)
        writer.writerow([x[0] for x in sorted_list])

  0%|          | 0/23140 [00:00<?, ?it/s]

In [None]:
# get difference of two files
def get_diff(file1, file2):
    '''get difference of two files'''
    

In [14]:
normalizer = MyNormalizer()
search_data = JsonFileIterator("./data/torob-search-data_v1.jsonl")
product_res = dict()
clicked_result = dict()
queries = dict()
num_products = 0  # number of products in each search
num_searches = 0  # number of searches
num_clicks = 0  # number of clicks
num_impressions = 0  # number of impressions

for search in tqdm_notebook(search_data):
    num_products += len(search["result"])
    num_searches += 1
    
    for pid in search["result"]:
        if product_res.get(pid, -1) == -1:
            product_res[pid] = 1
        else:
            product_res[pid] += 1
    
    for pid in search["clicked_result"]:
        if clicked_result.get(pid, -1) == -1:
            clicked_result[pid] = 1
        else:
            clicked_result[pid] += 1
        num_clicks += 1
    
    raw_query = search["raw_query"]
    normalized_query = normalizer.normilize(raw_query)
    if queries.get(normalized_query, -1) == -1:
        queries[normalized_query] = 1
    else:
        queries[normalized_query] += 1
    
    num_impressions += len(search["result"])
    
agg_ctr = (num_clicks + 1) / (num_impressions + num_products)

print(f"Aggregated Clicks: {num_clicks}")
print(f"Aggregated Impressions: {num_impressions}")
print(f"Aggregated CTR: {agg_ctr}")


  0%|          | 0/2499901 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [20]:
normalizer = MyNormalizer()
search_data = JsonFileIterator("./data/torob-search-data_v1.jsonl")
product_res = dict()l
clicked_result = dict()
queries = dict()
impressions = dict()
ctr = dict()

for search in tqdm_notebook(search_data):
    # update product_res dictionary
    for pid in search["result"]:
        if pid is not None:
            product_res[pid] = product_res.get(pid, 0) + 1
            
    # update clicked_result dictionary
    for pid in search["clicked_result"]:
        if pid is not None:
            clicked_result[pid] = clicked_result.get(pid, 0) + 1
    
    # update queries dictionary
    raw_query = search["raw_query"]
    normalized_query = normalizer.normilize(raw_query)
    queries[normalized_query] = queries.get(normalized_query, 0) + 1
    
    # update impressions dictionary
    for pid in search["result"]:
        if pid is not None:
            if pid not in impressions:
                impressions[pid] = {"count": 1, "query_count": {normalized_query: 1}}
            else:
                impressions[pid]["count"] += 1
                if normalized_query not in impressions[pid]["query_count"]:
                    impressions[pid]["query_count"][normalized_query] = 1
                else:
                    impressions[pid]["query_count"][normalized_query] += 1
    
    # update ctr dictionary
    for pid in search["clicked_result"]:
        if pid is not None:
            if pid not in ctr:
                ctr[pid] = {"click_count": 1, "impression_count": 1 + len(search["result"])}
            else:
                ctr[pid]["click_count"] += 1
                ctr[pid]["impression_count"] += 1

# calculate aggregated ctr
agg_ctr = dict()
for pid in impressions:
    agg_ctr[pid] = (clicked_result.get(pid, 0) + 1) / (impressions[pid]["count"] + len(queries))


  0%|          | 0/2499901 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [26]:
# product_res = dict()
# product_res
# clicked_result = dict()
# clicked_result
# queries = dict()
# queries
# impressions = dict()
# impressions
# ctr = dict()
# ctr

{9457219: {'click_count': 1, 'impression_count': 31},
 3147253: {'click_count': 12, 'impression_count': 32},
 7135387: {'click_count': 2, 'impression_count': 22},
 900897: {'click_count': 19, 'impression_count': 89},
 2931230: {'click_count': 10, 'impression_count': 80},
 31302: {'click_count': 4, 'impression_count': 74},
 7660686: {'click_count': 1, 'impression_count': 71},
 5901997: {'click_count': 1, 'impression_count': 71},
 2376830: {'click_count': 1, 'impression_count': 71},
 2383125: {'click_count': 1, 'impression_count': 71},
 3852139: {'click_count': 3, 'impression_count': 75},
 9614943: {'click_count': 4, 'impression_count': 24},
 7088803: {'click_count': 28, 'impression_count': 48},
 2912003: {'click_count': 55, 'impression_count': 95},
 6666498: {'click_count': 8, 'impression_count': 118},
 1913648: {'click_count': 3, 'impression_count': 143},
 2498914: {'click_count': 3, 'impression_count': 23},
 707374: {'click_count': 194, 'impression_count': 214},
 9280877: {'click_coun

In [13]:
agg_searches = defaultdict(
    lambda: dict(
        results=Counter(),
        clicks=Counter(),
        impressions=Counter(),
        ctr=Counter(),
    )
)

search_data = JsonFileIterator("./data/torob-search-data_v1.jsonl")
print("Aggregating searches based on raw query...")

for search in tqdm_notebook(search_data):
    print(search)
    raw_query = search["raw_query"]
    normalized_query = normalizer.normilize(raw_query)
    print(len(queries))
    if queries[normalized_query] > 50:
        results = search["result"][: np.max(search["clicked_rank"]) + 8]
        clicked_results = search["clicked_result"]
        impressions = search["impressions"]
        max_clicked = max(clicked_result.values())

        # calculate CTR and add to the counter
        for pid in clicked_results:
            ctr = (clicked_result[pid] + 1) / (impressions.get(pid, 0) + len(product_res))
            agg_searches[normalized_query]["ctr"][pid] += ctr

        # update impression counter
        agg_searches[normalized_query]["impressions"].update(impressions)

        # update result and click counter
        agg_searches[normalized_query]["results"].update(results)
        agg_searches[normalized_query]["clicks"].update(clicked_results)

# multiply clicked_normalized by CTR and update the dictionary
for query in agg_searches:
    for pid in agg_searches[query]["clicks"]:
        clicked_normalized = clicked_result[pid] / max_clicked
        ctr = agg_searches[query]["ctr"][pid]
        agg_searches[query]["clicks"][pid] *= clicked_normalized * ctr


Aggregating searches based on raw query...


  0%|          | 0/2499901 [00:00<?, ?it/s]

{'raw_query': 'لوستر سقفی برنز', 'result': [7151290, 6462477, 7385791, 8451497, None, 269453, 144158, 9839958, 2116610, 994641, 4384068, 6954861, 245867, 6283712, 3654237, 5506230, 9457219, 3775536, 1663988, 6946932, 4456008, 489939, 9930140, 9994201, 2819798, 1919015, 7121158, 1677646, 6458804, 5377684], 'clicked_result': [9457219], 'clicked_rank': [16], 'timestamp': '2022-07-24T09:21:58.752000+00:00'}
176427
{'raw_query': 'قیمت هلیکوپتر', 'result': [363737, 3147253, 8720128, 9796388, 1420685, 5680879, 1268796, 7135387, 5854510, 8107623, 1349969, 2403444, 9728269, 8900752, 3316100, 2435225, 3218936, 977920, 3258142, 4887809], 'clicked_result': [3147253, 7135387], 'clicked_rank': [1, 7], 'timestamp': '2022-07-24T07:32:12.261000+00:00'}
176427
{'raw_query': 'ساعت هوشمند', 'result': [2459592, 9391819, 4229448, 7824893, 1670767, 9901900, 8563833, 900897, 7611444, 7451228, 1665693, 6462973, 8258186, 4648687, 232002, 5815030, 2342213, None, 4111195, 5289550, 9083224, 7231049, 1509175, 85727

KeyError: 'impressions'

In [9]:
test_data = JsonFileIterator("./data/test-offline-data_v1.jsonl")
with open('prediction.txt', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    for idx , row in enumerate(tqdm_notebook(test_data)):
        result_not_ranked = row["result_not_ranked"]
        result_list = []
        for res in result_not_ranked:
            item = (res ,np.ceil(clicked_result.get(res , 0)) )
            result_list.append(item)
        sorted_list = sorted(result_list, key=lambda x: x[1] , reverse=True)
        writer.writerow([x[0] for x in sorted_list])
        

  0%|          | 0/23140 [00:00<?, ?it/s]

In [29]:
len(product_res)

511942

In [28]:
test_data = JsonFileIterator("./data/test-offline-data_v1.jsonl")
with open('prediction.txt', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    for idx , row in enumerate(tqdm_notebook(test_data)):
        result_not_ranked = row["result_not_ranked"]
        result_list = []
        for res in result_not_ranked:
            # item = (res ,np.ceil(clicked_result.get(res , 0)) )
            score = (clicked_result.get(res , 0) + 1 / product_res.get(res, 0) + len(product_res)) *  clicked_result.get(res , 0)
            item = (res ,score)
            result_list.append(item)
        sorted_list = sorted(result_list, key=lambda x: x[1] , reverse=True)
        writer.writerow([x[0] for x in sorted_list])

  0%|          | 0/23140 [00:00<?, ?it/s]

ZeroDivisionError: division by zero