In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np
import os
import datetime
from catboost import CatBoostRanker, Pool
from tqdm import tqdm
from collections import Counter
from typing import Tuple

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

PATH_DATA = "/kaggle/input/ef-msu-2024-comp-3/archive/"

df_train = pd.read_parquet(PATH_DATA + "train.parquet")
df_test = pd.read_parquet(PATH_DATA + "test.parquet")
print(df_train.head(15))

/kaggle/input/ef-msu-2024-comp-3/sample-sub.ipynb
/kaggle/input/ef-msu-2024-comp-3/archive/train.parquet
/kaggle/input/ef-msu-2024-comp-3/archive/test.parquet
                 event_time  product_id    brand    price    user_id  \
0   2019-11-01 00:06:33 UTC     1801881  samsung   488.80  455871375   
1   2019-11-01 00:09:30 UTC     5100816   xiaomi    29.58  435497056   
2   2019-11-01 00:10:12 UTC     1801881  samsung   488.80  455871375   
3   2019-11-01 00:10:47 UTC     1005124    apple  1583.48  460335599   
4   2019-11-01 00:11:15 UTC     1004856  samsung   128.42  411770392   
5   2019-11-01 00:11:48 UTC     4100129     sony   462.79  414551692   
6   2019-11-01 00:12:13 UTC     1801881  samsung   488.80  455871375   
7   2019-11-01 00:13:10 UTC     1005129    apple  1337.23  416392109   
8   2019-11-01 00:13:33 UTC     1002532    apple   532.57  464406759   
9   2019-11-01 00:14:48 UTC     1801881  samsung   488.80  455871375   
10  2019-11-01 00:15:10 UTC     1801881  samsung 

In [2]:
def get_date_k_weeks_before(cur_date: str, k: int = 2) -> str:
    """
    Return date k weeks before cur_date in format YYYY-MM-DD
    """
    cur_date = datetime.datetime.strptime(cur_date, "%Y-%m-%d")
    return (cur_date - datetime.timedelta(weeks=k)).strftime("%Y-%m-%d")


def remove_duplicates_order(x: list) -> list:
    """
    Removes duplicates from a list while preserving the order
    """
    return list(dict.fromkeys(x))

In [3]:
def get_validation_split(df: pd.DataFrame, timestamp_to_split: str) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Splits the dataframe into train and validation sets
    WARNING: Something might be missed here...
    """
    data_train = df[df["timestamp"] < timestamp_to_split]
    val = df[df["timestamp"] >= timestamp_to_split]

    gt_val = val.groupby("user_id")[["product_id"]].agg(list).reset_index()
    gt_val["product_id"] = gt_val["product_id"].apply(remove_duplicates_order)

    timestamp_to_val_split = get_date_k_weeks_before(timestamp_to_split, k=2)
    data_val = data_train[data_train["timestamp"] >= timestamp_to_val_split]
    data_train = data_train[data_train["timestamp"] < timestamp_to_val_split].reset_index(drop=True)

    gt_val = gt_val[gt_val["user_id"].isin(set(data_val["user_id"]))].reset_index(drop=True)
    data_val = data_val[data_val["user_id"].isin(set(gt_val["user_id"]))].reset_index(drop=True)
    return data_train, data_val, gt_val

In [4]:
%%time
data_train, data_val, gt_val = get_validation_split(df=df_train, timestamp_to_split="2020-02-16")
print(data_train.shape, data_val.shape, gt_val.shape)
print(data_train)
print(data_val)

(3090372, 17) (247187, 17) (44916, 2)
                      event_time  product_id    brand    price    user_id  \
0        2019-11-01 00:06:33 UTC     1801881  samsung   488.80  455871375   
1        2019-11-01 00:09:30 UTC     5100816   xiaomi    29.58  435497056   
2        2019-11-01 00:10:12 UTC     1801881  samsung   488.80  455871375   
3        2019-11-01 00:10:47 UTC     1005124    apple  1583.48  460335599   
4        2019-11-01 00:11:15 UTC     1004856  samsung   128.42  411770392   
...                          ...         ...      ...      ...        ...   
3090367  2020-01-31 23:58:08 UTC   100013136       NA   305.03  412827824   
3090368  2020-01-31 23:58:35 UTC     1004210  samsung    79.54  469886562   
3090369  2020-01-31 23:58:39 UTC     5100562    apple   294.34  416974332   
3090370  2020-01-31 23:59:39 UTC     1005186  samsung   720.46  498008011   
3090371  2020-01-31 23:59:51 UTC     1004839     oppo   178.82  411474749   

                                 user

In [5]:
def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
        A list of elements that are to be predicted (order doesn't matter)
    predicted : list
        A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
        The average precision at k over the input lists
    """
    if actual is None or len(actual) == 0:
        return 0.0

    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        # first condition checks whether it is valid prediction
        # second condition checks if prediction is not repeated
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    return score / min(len(actual), k)


In [6]:
def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)])

In [7]:
data_train['price'] = pd.to_numeric(data_train['price'])
df_test['price'] = pd.to_numeric(df_test['price'])

popular_from_train = data_train.groupby(["product_id"])[["event_time"]].count().reset_index().sort_values(by="event_time", ascending=False).rename(columns={"event_time": "count"})
name = 'timestamp'
popular_from_test = df_test.groupby(["product_id"])[[name]].count().reset_index().sort_values(by=name, ascending=False).rename(columns={name: "count"})
pop_brand = pd.merge(popular_from_test, df_test[['product_id', 'brand', 'price', 'cat_1']], on='product_id', how='left').drop_duplicates(subset='product_id')
np.array(Counter(df_test[['user_id', 'brand']].values[:, 1]).most_common(3))[:, 0]
users_brands = pd.pivot_table(df_test, 
               index=['user_id'],
                values=['brand'],
                aggfunc=lambda x: np.array(Counter(x).most_common(3,))[:, 0])
user_brands = {key:value for key, value in users_brands.reset_index().values}
users_price = pd.pivot_table(df_test, 
               index=['user_id'],
                values=['price'],
                aggfunc='mean')
users_price = {key:value for key, value in users_price.reset_index().values}
df_user_brands = df_test.groupby(["user_id"])[[name]].count().reset_index().sort_values(by=name, ascending=False).rename(columns={name: "count"})
item_brands = {key:value for key, value in pop_brand[['product_id', 'brand']].values}
item_price = {key:float(value) for key, value in pop_brand[['product_id', 'price']].values}

mapk(actual=gt_val["product_id"].to_list(), predicted=[popular_from_train.head(10)["product_id"].to_list() for _ in range(len(gt_val))], k=10)


dict_bought = {}
test_users = df_test[["user_id"]].drop_duplicates()['user_id'].values
df_bought = df_test[['user_id', 'product_id']]
for user in tqdm(test_users):
    dict_bought[user] = np.unique(df_bought[df_bought['user_id']==user]['product_id'].values)

ids = np.array([])
for user in tqdm(test_users):
    ids = np.append(ids, [item for item in popular_from_test['product_id'].values[:60] if item not in dict_bought[user]][:10])
ids = ids.reshape(-1, 10).astype(int)
pers_top_3 = np.array([])
i = 0
for user in tqdm(test_users):
    pers_brand = [item for item in ids[i] if ((item_brands[item] in user_brands[user]) & (item_price[item] > 0.35*users_price[user]) & (item_price[item] < 2.4*users_price[user])).all()  ] [:3]
    rec = np.append(pers_brand, popular_from_test['product_id'].values[:3])[:3]
    pers_top_3 = np.append(pers_top_3, rec)
    if i == 1:
        print(pers_brand)
        print(rec)
    i += 1

pers_top_3 = pers_top_3.reshape(-1, 3)
pers_top_3.astype(int)

df_test[df_test['user_id']==test_users[1]]
pop_brand[pop_brand['brand']==user_brands[test_users[0]][0]]
pop_brand.iloc[:100, :]



100%|██████████| 36169/36169 [00:18<00:00, 1952.96it/s]
100%|██████████| 36169/36169 [00:17<00:00, 2035.24it/s]
  4%|▍         | 1388/36169 [00:00<00:05, 6930.94it/s]

[1005100, 1004767, 100068488]
[  1005100   1004767 100068488]


100%|██████████| 36169/36169 [00:06<00:00, 5631.69it/s]


Unnamed: 0,product_id,count,brand,price,cat_1
0,1005115,3849,apple,872.09,tools
3849,1005100,3205,samsung,146.01,tools
7054,1004767,3087,samsung,241.83,tools
10141,1002544,2790,apple,397.10,tools
12931,100068488,2532,samsung,293.06,tools
...,...,...,...,...,...
57208,1002525,166,apple,591.78,tools
57374,1005203,163,xiaomi,110.43,tools
57537,5100503,162,xiaomi,22.49,shoes
57699,1005121,161,apple,872.61,tools


In [8]:
df_sub = pd.DataFrame(test_users, columns=['user_id'])
df_sub["product_id"] = list(map(lambda x: " ".join(map(str, x)), pers_top_3.astype(int)))
df_sub.to_csv("submission.csv", index=False)

#df_sub = df_test[["user_id"]].drop_duplicates()
#df_sub["product_id"] = [popular_from_train.head(10)["product_id"].to_list() for _ in range(len(df_sub))]
#df_sub["product_id"] = df_sub["product_id"].apply(lambda x: " ".join(map(str, x)))
#df_sub
#df_sub.to_csv("submission.csv", index=False)