In [None]:
# Install the third-party packages this notebook imports: `parsivar` (its Normalizer is used
# for text normalization) and the RoboEpics client, taken straight from its GitHub repository.
!pip install parsivar
!pip install git+https://github.com/RoboEpics/roboepics-client.git

In [None]:
from roboepics_client.roboepics_client import RoboEpicsClient

# RoboEpics identifiers. They are not referenced anywhere else in the visible notebook;
# presumably they are consumed by a RoboEpicsClient submission step — TODO confirm.
problem_id = 4
problem_enter_id = 2003

In [None]:
# Fetch the dataset archive with gdown (by file id), unpack it with 7z and list the
# working directory so the extracted files can be checked.
!gdown --id 1uYwzBe8nLhOQ2Q3rCScEXvljLkQqJrHc
!7z x data.7z
!ls

In [None]:
from __future__ import unicode_literals

import collections
import gc
import json
import re
import os

import numpy as np
import pandas as pd
import parsivar
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

In [None]:
def read_json(path, n_lines_to_read=None):
    """Lazily yield the decoded JSON value of each line of a JSON-lines file.

    Args:
        path: file holding one JSON document per line.
        n_lines_to_read: stop after this many lines; ``None`` reads the whole file.

    Yields:
        The object returned by ``json.loads`` for each line, in file order.
    """
    # Decode explicitly as UTF-8: the data contains non-ASCII text and the
    # locale-dependent default encoding is not UTF-8 on every platform.
    with open(path, encoding="utf-8") as f:
        for i, line in enumerate(tqdm(f)):
            if n_lines_to_read == i:
                break
            yield json.loads(line)

In [None]:
# Shared parsivar normalizer (statistical space correction enabled); `_normalize_text` calls
# its `normalize` method as the first normalization step.
parsivar_normalizer = parsivar.Normalizer(statistical_space_correction=True)

# Character-level substitution table used by `_normalize_text`: every key is a single
# character and the value is the string that replaces it (an empty string drops the character).
# NOTE(review): the key "٥" is written twice in this literal (first entry and again among the
# Arabic-Indic digits); both map to "5", so the duplicate is harmless but redundant.
char_mappings = {
    "٥": "5",
    # Cyrillic letters mapped to similar-looking Latin letters
    "А": "a",
    "В": "b",
    "Е": "e",
    "Н": "h",
    "Р": "P",
    "С": "C",
    "Т": "T",
    "а": "a",
    "г": "r",
    "е": "e",
    "к": "k",
    "м": "m",
    "о": "o",
    "р": "p",
    "ڈ": "د",
    "ڇ": "چ",
    # digits written in other scripts -> ASCII digits
    "۰": "0",
    "۱": "1",
    "۲": "2",
    "۳": "3",
    "۴": "4",
    "۵": "5",
    "۶": "6",
    "۷": "7",
    "۸": "8",
    "۹": "9",
    ".": ".",
    "٠": "0",
    "١": "1",
    "٢": "2",
    "٣": "3",
    "٤": "4",
    "٥": "5",
    "٦": "6",
    "٧": "7",
    "٨": "8",
    "٩": "9",
    # letter variants unified to one base letter
    "ك": "ک",
    "ى": "ی",
    "ي": "ی",
    "ؤ": "و",
    "ئ": "ی",
    "إ": "ا",
    "أ": "ا",
    "آ": "ا",
    "ة": "ه",
    "ء": "ی",
    # accented Latin letters -> unaccented
    "à": "a",
    "ä": "a",
    "ç": "c",
    "é": "e",
    "è": "e",
    "ê": "e",
    "ë": "e",
    "î": "i",
    "ï": "i",
    "ô": "o",
    "ù": "u",
    "û": "u",
    "ü": "u",
    ",": ".",
    "&": " and ",
    # marks and joiners that are dropped or turned into a space
    # (several keys here are combining or invisible characters — verify code points before editing)
    "ّ": "", 
    "َ": "", 
    "ِ": "", 
    "ُ": "", 
    "ـ": "",
    "‍": "",
    "‌": " ",
    # positional/presentation glyph variants -> base letter
    "ﭐ": "ا",
    "ﭑ": "ا",
    "ﭖ": "پ",
    "ﭗ": "پ",
    "ﭘ": "پ",
    "ﭙ": "پ",
    "ﭞ": "ت",
    "ﭟ": "ت",
    "ﭠ": "ت",
    "ﭡ": "ت",
    "ﭺ": "چ",
    "ﭻ": "چ",
    "ﭼ": "چ",
    "ﭽ": "چ",
    "ﮊ": "ژ",
    "ﮋ": "ژ",
    "ﮎ": "ک",
    "ﮏ": "ک",
    "ﮐ": "ک",
    "ﮑ": "ک",
    "ﮒ": "گ",
    "ﮓ": "گ",
    "ﮔ": "گ",
    "ﮕ": "گ",
    "ﮤ": "ه",
    "ﮥ": "ه",
    "ﮦ": "ه",
    "ﮪ": "ه",
    "ﮫ": "ه",
    "ﮬ": "ه",
    "ﮭ": "ه",
    "ﮮ": "ی",
    "ﮯ": "ی",
    "ﮰ": "ی",
    "ﮱ": "ی",
    "ﯼ": "ی",
    "ﯽ": "ی",
    "ﯾ": "ی",
    "ﯿ": "ی",
    "ﹰ": "",
    "ﹱ": "",
    "ﹲ": "",
    "ﹳ": "",
    "ﹴ": "",
    "﹵": "",
    "ﹶ": "",
    "ﹷ": "",
    "ﹸ": "",
    "ﹹ": "",
    "ﹺ": "",
    "ﹻ": "",
    "ﹼ": "",
    "ﹽ": "",
    "ﹾ": "",
    "ﹿ": "",
    
    "ﺀ": "ی",
    "ﺁ": "ا",
    "ﺂ": "ا",
    "ﺃ": "ا",
    "ﺄ": "ا",
    "ﺅ": "و",
    "ﺆ": "و",
    "ﺇ": "ا",
    "ﺈ": "ا",
    "ﺉ": "ی",
    "ﺊ": "ی",
    "ﺋ": "ی",
    "ﺌ": "ی",
    "ﺍ": "ا",
    "ﺎ": "ا",
    "ﺏ": "ب",
    "ﺐ": "ب",
    "ﺑ": "ب",
    "ﺒ": "ب",
    "ﺓ": "ه",
    "ﺔ": "ه",
    "ﺕ": "ت",
    "ﺖ": "ت",
    "ﺗ": "ت",
    "ﺘ": "ت",
    "ﺙ": "ث",
    "ﺚ": "ث",
    "ﺛ": "ث",
    "ﺜ": "ث",
    "ﺝ": "ج",
    "ﺞ": "ج",
    "ﺟ": "ج",
    "ﺠ": "ج",
    "ﺡ": "ح",
    "ﺢ": "ح",
    "ﺣ": "ح",
    "ﺤ": "ح",
    "ﺥ": "خ",
    "ﺦ": "خ",
    "ﺧ": "خ",
    "ﺨ": "خ",
    "ﺩ": "د",
    "ﺪ": "د",
    "ﺫ": "ذ",
    "ﺬ": "ذ",
    "ﺭ": "ر",
    "ﺮ": "ر",
    "ﺯ": "ز",
    "ﺰ": "ز",
    "ﺱ": "س",
    "ﺲ": "س",
    "ﺳ": "س",
    "ﺴ": "س",
    "ﺵ": "ش",
    "ﺶ": "ش",
    "ﺷ": "ش",
    "ﺸ": "ش",
    "ﺹ": "ص",
    "ﺺ": "ص",
    "ﺻ": "ص",
    "ﺼ": "ص",
    "ﺽ": "ض",
    "ﺾ": "ض",
    "ﺿ": "ض",
    "ﻀ": "ض",
    "ﻁ": "ط",
    "ﻂ": "ط",
    "ﻃ": "ط",
    "ﻄ": "ط",
    "ﻅ": "ظ",
    "ﻆ": "ظ",
    "ﻇ": "ظ",
    "ﻈ": "ظ",
    "ﻉ": "ع",
    "ﻊ": "ع",
    "ﻋ": "ع",
    "ﻌ": "ع",
    "ﻍ": "غ",
    "ﻎ": "غ",
    "ﻏ": "غ",
    "ﻐ": "غ",
    "ﻑ": "ف",
    "ﻒ": "ف",
    "ﻓ": "ف",
    "ﻔ": "ف",
    "ﻕ": "ق",
    "ﻖ": "ق",
    "ﻗ": "ق",
    "ﻘ": "ق",
    "ﻙ": "ک",
    "ﻚ": "ک",
    "ﻛ": "ک",
    "ﻜ": "ک",
    "ﻝ": "ل",
    "ﻞ": "ل",
    "ﻟ": "ل",
    "ﻠ": "ل",
    "ﻡ": "م",
    "ﻢ": "م",
    "ﻣ": "م",
    "ﻤ": "م",
    "ﻥ": "ن",
    "ﻦ": "ن",
    "ﻧ": "ن",
    "ﻨ": "ن",
    "ﻩ": "ه",
    "ﻪ": "ه",
    "ﻫ": "ه",
    "ﻬ": "ه",
    "ﻭ": "و",
    "ﻮ": "و",
    "ﻯ": "ی",
    "ﻰ": "ی",
    "ﻱ": "ی",
    "ﻲ": "ی",
    "ﻳ": "ی",
    "ﻴ": "ی",
    # ligature glyphs expand to two letters
    "ﻵ": "لا",
    "ﻶ": "لا",
    "ﻷ": "لا",
    "ﻸ": "لا",
    "ﻹ": "لا",
    "ﻺ": "لا",
    "ﻻ": "لا",
    "ﻼ": "لا",
}

# Whitelist of characters that survive `_normalize_text`; any other character is replaced
# with a space there. Order: space, ASCII digits, upper-case Latin, lower-case Latin,
# then the Persian letters.
valid_chars = (
    [" "]
    + list("0123456789")
    + list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
    + list("abcdefghijklmnopqrstuvwxyz")
    + ["آ", "ئ", "ا", "ب", "ت", "ث", "ج", "ح", "خ", "د", "ذ", "ر"]
    + ["ز", "س", "ش", "ص", "ض", "ط", "ظ", "ع", "غ", "ف", "ق", "ل"]
    + ["م", "ن", "ه", "و", "پ", "چ", "ژ", "ک", "گ", "ی"]
)


def _replace_rep(t):
    """Collapse character elongations: three or more identical characters become one.

    ``ccc -> c``; a run of exactly two characters is kept as is. Digit runs are
    deliberately left alone so numeric tokens keep their value (``1000`` must not
    turn into ``10``).

    Args:
        t: text to clean.

    Returns:
        The text with every run of 3+ identical non-space, non-digit characters
        reduced to a single occurrence.
    """
    # `[^\s\d]` = any character that is neither whitespace nor a digit.
    return re.sub(r"([^\s\d])\1{2,}", r"\1", t)


def _replace_wrep(t):
    """Collapse word repetitions: ``word word word -> word``.

    A word that occurs three or more times in a row (separated by non-word
    characters) is reduced to a single occurrence. Two occurrences are kept.

    The repetition is matched on the word itself rather than on "word + separator",
    so the last repetition no longer needs a trailing separator; previously a
    repeated word at the very end of the text was never collapsed.

    Args:
        t: text to clean.

    Returns:
        The text with runs of 3+ identical consecutive words reduced to one word.
    """
    # `\1\b` makes sure a repetition is the whole word, not a prefix of a longer one.
    return re.sub(r"\b(\w+)(?:\W+\1\b){2,}", r"\1", t)


def _normalize_text(x):
    """Normalize a single sentence.

    Steps, in order: parsivar normalization, half-space/new-line removal, character
    substitution via `char_mappings`, lower-casing, whitelist filtering via
    `valid_chars`, spacing around Latin words and numbers, whitespace squeezing and
    finally removal of character and word repetitions.

    Args:
        x: value to normalize; it is converted with ``str`` first.

    Returns:
        The normalized, stripped string.
    """

    x = str(x)
    x = parsivar_normalizer.normalize(x)  # apply `parsivar` normalizations
    x = re.sub(r"[\u200c\r\n]", " ", x)  # remove half space and new line characters
    # Substitute before lower-casing: `char_mappings` has upper-case keys that can
    # never match once the text has been lower-cased.
    x = "".join(char_mappings.get(xx, xx) for xx in x)
    x = x.lower()
    # Substitute again: lower-casing can produce characters that are mapped only in
    # their lower-case form (accented Latin letters, for instance).
    x = "".join(char_mappings.get(xx, xx) for xx in x)
    x = re.sub(
        r"[^{}]".format("".join(valid_chars)), " ", x
    )  # just keep valid characters and substitute others with space
    x = re.sub(r"[a-z]+", r" \g<0> ", x)  # put space around latin words
    x = re.sub(r"[0-9]+", r" \g<0> ", x)  # put space around numbers
    x = re.sub(r"\s+", " ", x)  # squeeze runs of white space into one space
    x = _replace_rep(x)
    x = _replace_wrep(x)
    return x.strip()


def normalize_texts(X, use_tqdm=False):
    """Run `_normalize_text` over every sentence in ``X``.

    Args:
        X: iterable of sentences.
        use_tqdm: wrap the iteration in a ``tqdm`` progress bar when true.

    Returns:
        A list with the normalized sentences, in input order.
    """

    sentences = tqdm(X) if use_tqdm else X
    return [_normalize_text(sentence) for sentence in sentences]

In [None]:
class JSONListWriter:
    """
    Auxiliary class that writes a sequence of objects into a JSON-lines file,
    one JSON document per line (no trailing new line after the last item).

    Intended to be used as a context manager::

        with JSONListWriter(path) as writer:
            writer.write_item({"a": 1})
    """

    def __init__(self, file_path):
        self.fd = None  # open file handle, or None while the writer is closed
        self.file_path = file_path
        self.delimiter = "\n"
        # Defined here as well so the attribute exists before `open` is called.
        self.first_item_written = False

    def open(self):
        """Open (truncate) the target file and return the writer itself."""
        # Explicit UTF-8 keeps the output independent of the platform's default encoding.
        self.fd = open(self.file_path, "w", encoding="utf-8")
        self.first_item_written = False
        return self

    def close(self):
        """Close the file if it is open; calling it on a closed writer is a no-op."""
        if self.fd is not None:
            self.fd.close()
            self.fd = None

    def write_item(self, obj):
        """Serialize ``obj`` with ``json.dumps`` and append it as a new line.

        Raises:
            ValueError: if the writer has not been opened.
        """
        if self.fd is None:
            raise ValueError("JSONListWriter is not open; call open() or use it in a `with` block")
        if self.first_item_written:
            self.fd.write(self.delimiter)
        self.fd.write(json.dumps(obj))
        self.first_item_written = True

    def __enter__(self):
        return self.open()

    def __exit__(self, exc_type, exc_value, exc_traceback):
        self.close()

# Preprocess

In this section we preprocess the data. 
It has the following steps:
- set and extract a product name for each base product
- normalize the product name
- identify and exclude invalid products that have no sellers
- aggregate clicks based on search_id
- normalize raw_query
- aggregate searches based on raw_query
- aggregate results, clicks and page views for each aggregated search
- normalize offline test queries

In [None]:
# Root directory of the unpacked dataset; every derived file is written next to the raw ones.
data_folder = "./data"

# Product catalogue: raw input and the normalized copy produced by `make_base_product_names`.
products_path = os.path.join(data_folder, "base_products.json")
products_normalized_path = os.path.join(data_folder, "base_products_normalized.json")

# Raw training logs.
search_log_train_path = os.path.join(data_folder, "search_log_train.json")
click_log_train_path = os.path.join(data_folder, "click_log_train.json")

# Offline test queries: raw input and normalized output.
queries_test_offline_path = os.path.join(data_folder, "queries_test_offline.json")
queries_test_offline_normalized_path = os.path.join(
    data_folder, "queries_test_offline_normalized.json"
)

# Intermediate training files: searches joined with clicks, then merged per raw query.
search_clicks_file_path = os.path.join(data_folder, "searches_clicks_joined_train.json")
search_click_merged_path = os.path.join(data_folder, "searches_merged_train.json")

In [None]:
def make_base_product_names(products_path: str, products_normalized_path: str):
    """Give every base product a name and write the named products to a new file.

    The name is built from the distinct words of all seller titles (``name1`` and
    ``name2`` of every entry in ``product["sellers"]``), kept in first-seen order.
    Products for which no word is found (for example, no sellers) are skipped.

    Each written product carries two extra keys: ``product_name`` and
    ``product_name_normalized`` (the name passed through `_normalize_text`).

    Args:
        products_path: JSON-lines file with the raw base products.
        products_normalized_path: JSON-lines file to create.
    """

    with JSONListWriter(products_normalized_path) as file:
        for product in read_json(products_path):
            pr_name = ""
            for seller in product["sellers"]:
                pr_name += " " + seller["name1"] + " " + seller["name2"]

            # `dict.fromkeys` de-duplicates while keeping the first-seen word order.
            # A `set` would order the words by string hash, which changes between
            # interpreter runs and made the generated names non-reproducible.
            words = dict.fromkeys(pr_name.split())
            pr_name = " ".join(words)

            if pr_name == "":
                continue  # nothing to name the product with

            product["product_name"] = pr_name
            product["product_name_normalized"] = _normalize_text(pr_name)

            file.write_item(product)


def aggregate_clicks(search_path, click_path, tag, valid_base_ids):
    """Join every search with its clicks and write the valid searches to disk.

    For each search the result list is restricted to ``valid_base_ids`` and the
    clicks are restricted to products that are still in that result list. Searches
    left without any click are dropped. Kept searches get a stripped ``raw_query``,
    a ``raw_query_normalized`` and a ``clicks`` list, and are written as JSON lines
    to ``<data_folder>/searches_clicks_joined_<tag>.json`` — the file that
    `aggregate_searches(tag)` reads.

    Args:
        search_path: JSON-lines search log (``_id``, ``raw_query``, ``result`` ...).
        click_path: JSON-lines click log (``search_log_id``, ``base_product_id``).
        tag: data split name used in the output file name (e.g. ``"train"``).
        valid_base_ids: container of product ids that may appear in results.
    """

    # search id -> clicked product ids, in click-log order
    search_clicks_dict = collections.defaultdict(list)
    for click_row in read_json(click_path):
        search_clicks_dict[click_row["search_log_id"]].append(
            click_row["base_product_id"]
        )

    invalid_results, invalid_clicks, invalid_searches = 0, 0, 0

    # Derive the output from `tag` instead of a hard-coded "train" path, so the
    # argument is honoured and the file name matches what `aggregate_searches` opens.
    output_path = os.path.join(data_folder, f"searches_clicks_joined_{tag}.json")

    with JSONListWriter(output_path) as file:
        for search_row in read_json(search_path):
            search_id = search_row["_id"]
            search_results = search_row["result"]

            # keep only results that are valid products
            results = [r for r in search_results if r in valid_base_ids]
            results_set = set(results)

            # `.get` so that a lookup does not add empty entries to the defaultdict
            all_clicks = search_clicks_dict.get(search_id, [])
            # keep only clicks on products that are among the kept results
            clicks = [c for c in all_clicks if c in results_set]

            invalid_results += len(search_results) - len(results)
            invalid_clicks += len(all_clicks) - len(clicks)

            if len(clicks) == 0:
                invalid_searches += 1
                continue

            search_row["raw_query"] = search_row["raw_query"].strip()
            search_row["raw_query_normalized"] = _normalize_text(
                search_row["raw_query"]
            )
            search_row["result"] = results
            search_row["clicks"] = clicks

            file.write_item(search_row)

    print(
        f"invalid searches: {invalid_searches}, "
        + f"invalid results: {invalid_results}, "
        + f"invalid clicks: {invalid_clicks}"
    )


def aggregate_searches(tag):
    """Aggregate the joined searches of one data split by raw query.

    Reads ``<data_folder>/searches_clicks_joined_<tag>.json`` and, for every
    distinct ``raw_query``, counts how often each result, page and clicked product
    occurs. One record per query is written as JSON lines to
    ``<data_folder>/searches_merged_<tag>.json``; ids and their counts are stored in
    parallel lists ordered from most to least frequent.

    Args:
        tag: data split name used in both file names (e.g. ``"train"``).
    """

    search_clicks_path = os.path.join(data_folder, f"searches_clicks_joined_{tag}.json")
    # Follows `tag` like the input path does; for "train" this is the same file as
    # the module-level `search_click_merged_path`.
    merged_path = os.path.join(data_folder, f"searches_merged_{tag}.json")

    groups = {}  # raw query -> {"results" | "pages" | "clicks": Counter}
    normalized_query_mapping = {}  # raw query -> normalized query

    # The loop header was missing, which left the body as an unexpected indent.
    for search in read_json(search_clicks_path):
        raw_query = search["raw_query"]
        normalized_query_mapping[raw_query] = search["raw_query_normalized"]

        if raw_query not in groups:
            groups[raw_query] = {
                "results": collections.Counter(),
                "pages": collections.Counter(),
                "clicks": collections.Counter(),
            }
        counters = groups[raw_query]

        counters["results"].update(search["result"])
        counters["pages"].update([search["page"]])
        counters["clicks"].update(search["clicks"])

    new_df = []
    for raw_query, counters in tqdm(groups.items()):
        # (key, count) pairs, most frequent first
        results_counter = counters["results"].most_common()
        pages_counter = counters["pages"].most_common()
        clicks_counter = counters["clicks"].most_common()

        new_df.append(
            {
                "raw_query": raw_query,
                "raw_query_normalized": normalized_query_mapping[raw_query],
                "results": [k for k, v in results_counter],
                "result_counts": [v for k, v in results_counter],
                "pages": [k for k, v in pages_counter],
                "page_counts": [v for k, v in pages_counter],
                "clicks": [k for k, v in clicks_counter],
                "click_counts": [v for k, v in clicks_counter],
            }
        )
    print("Number of unique queries after merge:", len(new_df))

    pd.DataFrame(new_df).to_json(
        merged_path,
        orient="records",
        lines=True,
    )


def normalize_test_queries(queries_test_path, queries_test_normalized_path):
    """Normalize every test query and store the results as JSON lines.

    Args:
        queries_test_path: JSON-lines file with one query per line.
        queries_test_normalized_path: JSON-lines file to create; line *i* holds the
            `_normalize_text` output for query *i*.
    """
    with JSONListWriter(queries_test_normalized_path) as writer:
        for raw_query in read_json(queries_test_path):
            writer.write_item(_normalize_text(raw_query))

In [None]:
# Build the named + normalized product catalogue from the raw base products.
make_base_product_names(
    products_path=products_path,
    products_normalized_path=products_normalized_path,
)
print(f"\nProduct names created and saved in: {products_normalized_path}")

In [None]:
# Ids of the products that were written to the normalized catalogue.
valid_base_ids = {
    product["_id"] for product in read_json(products_normalized_path)
}
print("\nList of valid products created")

In [None]:
# Join the training search log with the training click log, keeping valid products only.
aggregate_clicks(
    search_log_train_path,
    click_log_train_path,
    "train",
    valid_base_ids,
)
print("\nSearches and clicks in the training set are merged")

In [None]:
aggregate_searches("train")
print("\nSearches are aggregated wrt the raw query")

# The feature-extraction cells read `queries_test_offline_normalized.json`, but nothing in
# the notebook ever called `normalize_test_queries`. Create the file here when it is missing;
# an existing file is left untouched.
if not os.path.exists(queries_test_offline_normalized_path):
    normalize_test_queries(queries_test_offline_path, queries_test_offline_normalized_path)
    print("\nOffline test queries are normalized")


## Extracting features

In this section we extract features for each product and each searched query.
At the end, a `dat` file is created which is the training data of LambdaMart model.

Here, as a baseline, we embed a sentence as follows:
- extract the tf-idf vector of the sentence
- project the tf-idf vector into a lower-dimensional space with a random projection

So, here is an overview of the steps in this section:
- embed product names into low-dimensional vectors
- embed train raw_queries into low-dimensional vectors
- embed offline test raw_queries into low-dimensional vectors
- create the `dat` file for model training
- store the embedded product names and queries in files

In [None]:
import gc
import json
import os

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm, trange


def read_json(path, n_lines_to_read=None):
    """Lazily yield the decoded JSON value of each line of a JSON-lines file.

    Args:
        path: file holding one JSON document per line.
        n_lines_to_read: stop after this many lines; ``None`` reads the whole file.

    Yields:
        The object returned by ``json.loads`` for each line, in file order.
    """
    # Decode explicitly as UTF-8: the data contains non-ASCII text and the
    # locale-dependent default encoding is not UTF-8 on every platform.
    with open(path, encoding="utf-8") as f:
        for i, line in enumerate(tqdm(f)):
            if n_lines_to_read == i:
                break
            yield json.loads(line)

In [None]:
# Root directory of the dataset and of every file produced in this section.
data_folder = "./data"

# Inputs produced by the preprocessing section.
merged_searches_path_train = os.path.join(data_folder, "searches_merged_train.json")
products_path = os.path.join(data_folder, "base_products.json")
products_normalized_path = os.path.join(data_folder, "base_products_normalized.json")
queries_path_test = os.path.join(data_folder, "queries_test_offline.json")
queries_normalized_path_test = os.path.join(
    data_folder, "queries_test_offline_normalized.json"
)

# Outputs: projection matrix and projected features (NumPy .npz archives).
random_projection_path = os.path.join(data_folder, "random_projection.npz")
product_features_path = os.path.join(data_folder, "product_features.npz")
query_train_features_path = os.path.join(data_folder, "query_train_features.npz")
query_test_features_path = os.path.join(data_folder, "query_test_features.npz")

# Output: training file written by `make_dat_file`.
train_dat_path = os.path.join(data_folder, "train_emb.dat")

In [None]:
# Upper bound on the tf-idf vocabulary (`max_features` of the vectorizer) and number of
# rows of the random projection matrix.
vocab_size = 4000 
# Number of columns of the random projection matrix, i.e. the size of a projected vector.
emb_dim = 256

# Number of lines read from the merged train searches / the test query files
# (passed to `read_json` as `n_lines_to_read`); None reads the whole file.
sample_num_train = 10000
sample_num_test = None

In [None]:
print("read base products!")
# Load the normalized catalogue; the seller details are not needed from here on.
products_df = pd.read_json(
    products_normalized_path, orient="records", lines=True
).drop("sellers", axis=1)
# product id -> row position in `products_df` (and in the product feature matrix)
product_id_dict = {
    product_id: position for position, product_id in enumerate(products_df["_id"])
}
product_names = products_df["product_name"]

In [None]:
print("read merged searches!")
# Merged train searches (at most `sample_num_train` rows) as a DataFrame.
merged_searches_train_df = pd.DataFrame(
    list(read_json(merged_searches_path_train, sample_num_train))
)
queries_train = merged_searches_train_df["raw_query"]
# Raw offline test queries (at most `sample_num_test`; None = all).
queries_test = list(read_json(queries_path_test, sample_num_test))

In [None]:
# Normalized texts that the tf-idf vectorizer is fitted on / applied to.
product_names_normalized = products_df["product_name_normalized"]
queries_train_normalized = merged_searches_train_df["raw_query_normalized"]
queries_test_normalized = list(
    read_json(queries_normalized_path_test, sample_num_test)
)

# The full product frame is no longer needed; drop the name and collect garbage.
del products_df
gc.collect()

In [None]:
# Fixed seed so that the projection, and every feature derived from it, is reproducible.
RANDOM_SEED = 42

vectorizer = TfidfVectorizer(max_features=vocab_size, lowercase=True, use_idf=True)  # tfidf vectorizer
vectorizer.fit(product_names_normalized)  # fit vectorizer on the product names only

# `max_features` is only an upper bound: size the projection from the fitted vocabulary so
# the matrix product below is well defined even when fewer than `vocab_size` terms exist.
n_features = len(vectorizer.vocabulary_)
# random projection matrix, uniform in [0, 1)
random_projection_mat = np.random.default_rng(RANDOM_SEED).random((n_features, emb_dim))


def _project_texts(texts):
    """Return the tf-idf vectors of ``texts`` projected with `random_projection_mat`."""
    # The sparse tf-idf matrix is a temporary and is released when the function returns.
    return vectorizer.transform(texts).dot(random_projection_mat)


products_projected = _project_texts(product_names_normalized)
queries_train_projected = _project_texts(queries_train_normalized)
queries_test_projected = _project_texts(queries_test_normalized)
gc.collect()

In [None]:
def make_dat_file(
    dat_file_path,
    merged_searches,
    query_features,
    product_features,
    n_candidates=None,
):
    """Write the ranking training data in LibSVM text format with ``qid`` groups.

    One line is written per (query, candidate product) pair::

        <score> qid:<query index> 0:<f0> 1:<f1> ...

    where the score is ``log2(click_count + 1)`` and the features are the product
    vector followed by the query vector, rounded to 3 decimals.

    Args:
        dat_file_path: file to create.
        merged_searches: DataFrame with ``results``, ``clicks`` and ``click_counts``
            columns; the row position is used as the query id.
        query_features: array indexed by row position of ``merged_searches``.
        product_features: array indexed through the module-level `product_id_dict`.
        n_candidates: maximum number of results written per query; ``None`` writes
            every result.

    more information:
     - https://xgboost.readthedocs.io/en/latest/tutorials/input_format.html#embedding-additional-information-inside-libsvm-file
     - https://www.cs.cornell.edu/people/tj/svm_light/svm_rank.html
    """

    with open(dat_file_path, "w") as file:
        for qid, (_, merged_search) in enumerate(tqdm(merged_searches.iterrows())):
            results = merged_search["results"]
            # Recomputed for every query. Previously `limit` was only assigned when
            # `n_candidates` was None, so passing a number raised UnboundLocalError.
            if n_candidates is None:
                limit = len(results)
            else:
                limit = min(n_candidates, len(results))
            # product id -> number of clicks for this query
            clicks = dict(zip(merged_search["clicks"], merged_search["click_counts"]))

            for candidate_product_id in results[:limit]:
                candidate_score = clicks.get(candidate_product_id, 0)
                candidate_score = np.log2(candidate_score + 1)

                p_idx = product_id_dict[candidate_product_id]
                feature = np.concatenate((product_features[p_idx], query_features[qid]))
                feature = np.around(feature, 3)

                file.write(
                    f"{candidate_score} qid:{qid} "
                    + " ".join([f"{i}:{s}" for i, s in enumerate(feature)])
                    + "\n"
                )

In [None]:
# Write the training `dat` file from the merged searches and the projected features.
make_dat_file(
    dat_file_path=train_dat_path,
    merged_searches=merged_searches_train_df,
    query_features=queries_train_projected,
    product_features=products_projected,
)

In [None]:
# Persist the projection matrix and the projected features, one .npz archive each.
for npz_path, array in (
    (random_projection_path, random_projection_mat),
    (product_features_path, products_projected),
    (query_train_features_path, queries_train_projected),
    (query_test_features_path, queries_test_projected),
):
    np.savez(npz_path, array)

In [None]:
import os
import xgboost as xgb

In [None]:
# Root directory of the dataset and of the files produced by the notebook.
data_folder = "./data"

# Training data written by `make_dat_file`, and the file the trained ranker is saved to.
train_dat_path = os.path.join(data_folder, "train_emb.dat")
model_path = os.path.join(data_folder, "ranker.json")

In [None]:
# Load the LibSVM-formatted training file (the `qid:` fields define the query groups).
# The text format is named explicitly in the URI: recent XGBoost releases no longer guess it
# from the file, and older releases accept the same `?format=` suffix.
train_data = xgb.DMatrix(train_dat_path + "?format=libsvm")

In [None]:
# Booster parameters handed to `xgb.train` below.
param = {
    "max_depth": 20,
    "eta": 0.3,
    "objective": "rank:ndcg",  # ranking objective
    "verbosity": 1,
    "num_parallel_tree": 1,
    # NOTE(review): presumably needs a GPU-enabled XGBoost build/runtime — confirm.
    "tree_method": "gpu_hist",
    "eval_metric": ["ndcg", "ndcg@10"],
}

# Only the training set is evaluated, so the reported metrics are training metrics.
eval_list = [(train_data, "train")]

# Train for 200 boosting rounds, reporting the metrics on `eval_list`.
model = xgb.train(
    param,
    train_data,
    num_boost_round=200,
    evals=eval_list,
)

# Persist the trained ranker to `model_path`.
model.save_model(model_path)