# Dunnhumby (The Complete Journey)

## Import packages and define constants

In [None]:
import numpy as np
import pandas as pd
import os
import sys
sys.path.append("../")
sys.path.append("../../")
import random
from pandas.core.frame import DataFrame
import sklearn
from utils.unigramTable import UnigramTable


# Canonical column names shared by the DataFrames built in this notebook.
DEFAULT_USER_COL = "user_ids"
DEFAULT_ITEM_COL = "item_ids"
DEFAULT_ORDER_COL = "order_ids"
DEFAULT_RATING_COL = "ratings"
DEFAULT_LABEL_COL = "label"
DEFAULT_TIMESTAMP_COL = "timestamp"
DEFAULT_PREDICTION_COL = "prediction"
# Column holding the split name of a row ("train" / "validate" / "test").
DEFAULT_FLAG_COL = "flag"
# Initial dataset directory; later cells reassign this name before data is read.
data_base_dir = "../datasets/tafeng/"

# Presumably the number of negative items sampled per user -- TODO confirm usage.
negative_size = 100

## Load the full data into a sequence DataFrame

In [None]:
def data_prepro_to_seq(data_base_dir):
    """Load the raw transaction CSV and return it as an interaction DataFrame.

    Reads ``transaction_data.csv`` from ``data_base_dir``, derives one integer
    timestamp per row from the ``DAY`` and ``TRANS_TIME`` columns, sorts the
    rows from newest to oldest and renames the columns to the notebook's
    default column names.

    Args:
        data_base_dir: Directory containing ``transaction_data.csv`` (with or
            without a trailing path separator).

    Returns:
        A DataFrame whose columns are, in order, the order, user, item, flag,
        rating and timestamp columns. Every row is flagged ``"train"`` and
        has rating ``1``.
    """
    transaction_data = os.path.join(data_base_dir, "transaction_data.csv")
    prior_transaction = pd.read_csv(
        transaction_data,
        usecols=["BASKET_ID", "household_key", "PRODUCT_ID", "DAY", "TRANS_TIME"],
    )

    day = prior_transaction["DAY"].astype(str)
    trans_time = prior_transaction["TRANS_TIME"].astype(str)
    # TRANS_TIME is parsed as a number, so values shorter than the widest one
    # have lost their leading zeros (presumably an HHMM clock value -- confirm
    # against the dataset description). Concatenating the unpadded strings
    # would rank day 1 at "1631" (11631) after day 2 at "5" (25), so pad every
    # time to a common width before building the sortable integer.
    width = int(trans_time.str.len().max()) if len(trans_time) else 0
    prior_transaction["time"] = (day + trans_time.str.zfill(width)).astype("int64")

    # Newest first; a stable sort keeps the file order among equal timestamps.
    prior_transaction = prior_transaction.sort_values(
        by="time", ascending=False, kind="mergesort"
    )

    # Keep only the needed columns (this also discards DAY / TRANS_TIME).
    prior_transaction = prior_transaction[
        ["BASKET_ID", "household_key", "PRODUCT_ID", "time"]
    ].rename(
        columns={
            "BASKET_ID": DEFAULT_ORDER_COL,
            "household_key": DEFAULT_USER_COL,
            "PRODUCT_ID": DEFAULT_ITEM_COL,
            "time": DEFAULT_TIMESTAMP_COL,
        }
    )
    # Every interaction starts in the training split with an implicit rating of 1.
    prior_transaction.insert(3, DEFAULT_FLAG_COL, "train")
    prior_transaction.insert(4, DEFAULT_RATING_COL, 1)

    print("loading raw data completed")
    return prior_transaction

### raw data address: http://www.dunnhumby.com/careers/engineering/sourcefiles

In [None]:
# Point at the dunnhumby dataset and load the raw transactions from its CSV folder.
data_base_dir = "../../datasets/dunnhumby/"
full_data = data_prepro_to_seq(
    f"{data_base_dir}raw/dunnhumby_The-Complete-Journey/csv/"
)
# Show the first ten rows as the cell output.
full_data.head(10)

In [None]:
# Preview the first five rows of the loaded interactions.
full_data.head(5)

## Raw data statistics

In [None]:
# Headline sizes of the raw data: interactions, distinct orders, users and items.
n_interact = full_data.shape[0]
n_orders, n_users, n_items = (
    full_data[column].nunique()
    for column in (DEFAULT_ORDER_COL, DEFAULT_USER_COL, DEFAULT_ITEM_COL)
)
(n_interact, n_orders, n_users, n_items)

In [None]:
# Column-wise maximum of every column in the loaded data.
full_data.max(axis=0)

### Test the integrity of the saved data

In [None]:
# NOTE(review): this recomputes the statistics from the in-memory `full_data`;
# nothing is read back from disk in this cell.
n_interact = len(full_data)
n_orders, n_users, n_items = (
    full_data[[DEFAULT_ORDER_COL, DEFAULT_USER_COL, DEFAULT_ITEM_COL]]
    .nunique()
    .tolist()
)
(n_interact, n_orders, n_users, n_items)

## negative sampling

In [None]:
def feed_neg_sample(eval_df, negative_num, item_sampler):
    """Build an evaluation set with sampled negative items for every user.

    For each user in ``eval_df`` the user's own items are kept with rating 1
    and up to ``negative_num`` sampled items the user has not interacted with
    are added with rating 0.

    Args:
        eval_df: DataFrame with at least ``user_ids`` and ``item_ids`` columns.
        negative_num: Number of negative items wanted per user.
        item_sampler: Object whose ``sample(n, 1, True)`` call returns an
            iterable of item ids (argument meaning is defined by the sampler).

    Returns:
        A row-shuffled DataFrame with integer ``user_ids``, ``item_ids`` and
        ``ratings`` columns. A user gets fewer than ``negative_num`` negatives
        only if the sampler cannot supply enough distinct unseen items within
        a bounded number of draws.
    """
    print("sampling negative items...")
    max_rounds = 10  # upper bound on re-sampling when a draw comes up short
    frames = []
    for user, user_items in eval_df.groupby("user_ids")["item_ids"]:
        positives = set(user_items)  # item set for this user
        want = negative_num + len(positives)
        negatives = set()
        for _ in range(max_rounds):
            # Duplicates and the user's own items are discarded, so one draw
            # may yield fewer than negative_num usable negatives.
            negatives |= set(item_sampler.sample(want, 1, True)) - positives
            if len(negatives) >= negative_num:
                break
        negatives = list(negatives)[:negative_num]
        positives = list(positives)
        # Size every column from the items actually collected so the columns
        # always have equal length.
        n_rows = len(positives) + len(negatives)
        frames.append(
            pd.DataFrame(
                {
                    "user_ids": np.full(n_rows, int(user), dtype=np.int64),
                    "item_ids": np.array(positives + negatives),
                    "ratings": np.concatenate(
                        [
                            np.ones(len(positives), dtype=np.int64),
                            np.zeros(len(negatives), dtype=np.int64),
                        ]
                    ),
                }
            )
        )

    if not frames:
        return pd.DataFrame(
            {"user_ids": [], "item_ids": [], "ratings": []}, dtype=np.int64
        )

    # Concatenate once instead of appending inside the loop, then shuffle rows.
    total_interact = pd.concat(frames, ignore_index=True)
    return total_interact.sample(frac=1)

## Divide data into train, validation and test sets, where the validation set is taken from the train set

In [None]:
def split_test_valid_by_orders(seq_data, validate_size=0.2, test_size=0.2):
    """Label each row as train / validate / test according to its order id.

    The distinct order ids are sorted ascending. The last ``test_size``
    fraction of them becomes the test split, the ``validate_size`` fraction
    (of all orders) immediately before it becomes the validation split, and
    everything else stays in the training split. The labels are written to
    the ``"flag"`` column of ``seq_data`` in place.

    Args:
        seq_data: DataFrame containing the order-id column.
        validate_size: Fraction of all distinct orders used for validation.
        test_size: Fraction of all distinct orders used for testing.

    Returns:
        The same ``seq_data`` object, for convenience.
    """
    print("split_test_valid_by_orders")
    # np.sort returns a sorted copy, so the result has to be kept.
    orders = np.sort(seq_data[DEFAULT_ORDER_COL].unique())
    total_size = len(orders)
    n_validate = int(total_size * validate_size)
    n_test = int(total_size * test_size)
    # Clamp at zero so an oversized fraction cannot wrap around as a negative
    # slice index.
    test_start = max(total_size - n_test, 0)
    validate_start = max(test_start - n_validate, 0)

    order_ids = seq_data[DEFAULT_ORDER_COL]
    seq_data["flag"] = "train"  # default split for every row
    # the last orders form the test set
    seq_data.loc[order_ids.isin(orders[test_start:]), "flag"] = "test"
    # the orders right before the test orders form the validation set
    seq_data.loc[
        order_ids.isin(orders[validate_start:test_start]), "flag"
    ] = "validate"
    print("labeling train validate test dataset finished ")
    return seq_data

## filter data by count of users, items and orders

In [None]:
# group by group_col and filter filter_col that has less num unique() count
def fiter_by_count(tp, group_col, filter_col, num):
    """Keep the rows whose ``group_col`` value has at least ``num`` distinct
    ``filter_col`` values.

    Args:
        tp: Interaction DataFrame.
        group_col: Column whose values are kept or dropped as a whole.
        filter_col: Column whose distinct values are counted per group.
        num: Minimum number of distinct ``filter_col`` values required.

    Returns:
        The rows of ``tp`` belonging to the groups that meet the threshold.
    """
    distinct_per_group = tp.groupby(group_col)[filter_col].nunique()
    kept_groups = distinct_per_group.index[distinct_per_group >= num]
    return tp[tp[group_col].isin(kept_groups)]


# filter data by the minimum purcharce number of items and users
def filter_triplets(tp, min_u_c=5, min_i_c=5, min_o_c=5):
    """Apply the three count-based filters in sequence and report sizes.

    In order: keep users with at least ``min_o_c`` distinct orders, keep users
    with at least ``min_i_c`` distinct items, keep items with at least
    ``min_u_c`` distinct users. A threshold that is not positive skips its
    filter. Interaction / order / user / item counts are printed before and
    after filtering.

    Returns:
        The filtered DataFrame.
    """

    def _sizes(frame):
        # (interactions, distinct orders, distinct users, distinct items)
        return (
            len(frame.index),
            frame[DEFAULT_ORDER_COL].nunique(),
            frame[DEFAULT_USER_COL].nunique(),
            frame[DEFAULT_ITEM_COL].nunique(),
        )

    print("filter data by the minimum purcharce number of items and users and orders")
    print("before filter", *_sizes(tp))

    # (threshold, column to group by, column whose distinct values are counted)
    stages = (
        (min_o_c, DEFAULT_USER_COL, DEFAULT_ORDER_COL),
        (min_i_c, DEFAULT_USER_COL, DEFAULT_ITEM_COL),
        (min_u_c, DEFAULT_ITEM_COL, DEFAULT_USER_COL),
    )
    for threshold, group_col, counted_col in stages:
        if threshold > 0:
            tp = fiter_by_count(tp, group_col, counted_col, threshold)

    print("after filter", *_sizes(tp))
    return tp

In [None]:
def save_train_to_files(data, data_base_dir):
    """Save the training interactions as a compressed ``.npz`` archive.

    Writes ``<data_base_dir>/leave_one_item/train.npz`` with the arrays
    ``user_ids``, ``item_ids``, ``order_ids``, ``timestamp`` and ``ratings``,
    creating the directory if needed, then prints the largest item and user
    id that were written.

    Args:
        data: DataFrame holding the user, item, order, timestamp and rating
            columns.
        data_base_dir: Dataset root directory.
    """
    # Fixed 64-bit integers: np.long is missing from NumPy 1.24-1.x and, where
    # it exists again, is the platform C long, which may be only 32 bits wide.
    user_ids = data[DEFAULT_USER_COL].to_numpy(dtype=np.int64)
    item_ids = data[DEFAULT_ITEM_COL].to_numpy(dtype=np.int64)
    order_ids = data[DEFAULT_ORDER_COL].to_numpy(dtype=np.int64)
    timestamps = data[DEFAULT_TIMESTAMP_COL].to_numpy(dtype=np.int64)
    ratings = data[DEFAULT_RATING_COL].to_numpy(dtype=np.float32)

    out_dir = os.path.join(data_base_dir, "leave_one_item")
    os.makedirs(out_dir, exist_ok=True)  # no separate exists() check needed
    data_file = os.path.join(out_dir, "train.npz")
    np.savez_compressed(
        data_file,
        user_ids=user_ids,
        item_ids=item_ids,
        order_ids=order_ids,
        timestamp=timestamps,
        ratings=ratings,
    )
    # np.max raises on an empty array, so report None for an empty split.
    print(
        "Data saving to file:",
        data_base_dir,
        "max_item_num:",
        np.max(item_ids) if item_ids.size else None,
        "max_user_num:",
        np.max(user_ids) if user_ids.size else None,
    )

In [None]:
def save_test_to_files(data, data_base_dir, suffix):
    """Save an evaluation set as a compressed ``.npz`` archive.

    Writes the ``user_ids``, ``item_ids`` and ``ratings`` arrays to
    ``<data_base_dir>/leave_one_item/<suffix>`` (``np.savez_compressed``
    appends ``.npz`` when the name lacks it), creating the directory if
    needed, then prints the largest item and user id that were written.

    Args:
        data: DataFrame holding the user, item and rating columns.
        data_base_dir: Dataset root directory.
        suffix: File name of the archive inside ``leave_one_item``.
    """
    # Fixed 64-bit integers: np.long is missing from NumPy 1.24-1.x and, where
    # it exists again, is the platform C long, which may be only 32 bits wide.
    user_ids = data[DEFAULT_USER_COL].to_numpy(dtype=np.int64)
    item_ids = data[DEFAULT_ITEM_COL].to_numpy(dtype=np.int64)
    ratings = data[DEFAULT_RATING_COL].to_numpy(dtype=np.float32)

    out_dir = os.path.join(data_base_dir, "leave_one_item")
    os.makedirs(out_dir, exist_ok=True)  # no separate exists() check needed
    data_file = os.path.join(out_dir, suffix)
    np.savez_compressed(
        data_file, user_ids=user_ids, item_ids=item_ids, ratings=ratings,
    )
    # np.max raises on an empty array, so report None for an empty split.
    print(
        "Data saving to file:",
        data_base_dir,
        "max_item_num:",
        np.max(item_ids) if item_ids.size else None,
        "max_user_num:",
        np.max(user_ids) if user_ids.size else None,
    )

In [None]:
# Keep items bought by at least 10 distinct users, then users who bought at
# least 3 distinct items.
fiter_full_data = fiter_by_count(full_data, DEFAULT_ITEM_COL, DEFAULT_USER_COL, 10)
# Take an explicit copy: later cells write split flags into this frame, and
# writing into a filtered slice of `full_data` triggers pandas' chained
# assignment warning.
fiter_full_data = fiter_by_count(
    fiter_full_data, DEFAULT_USER_COL, DEFAULT_ITEM_COL, 3
).copy()
users = fiter_full_data[DEFAULT_USER_COL].unique()

In [None]:
from tqdm import tqdm

# Leave-one-out split: for every user the most recent interaction becomes the
# test row and the second most recent one the validation row.
# A single sort plus a per-user running count replaces one full-frame scan per
# user. Rows are addressed by index label, as before, so labels must be unique.
newest_first = fiter_full_data.sort_values(
    by=[DEFAULT_TIMESTAMP_COL], ascending=False, kind="mergesort"
)
# 0 for a user's newest row, 1 for the next one, and so on.
recency_rank = newest_first.groupby(DEFAULT_USER_COL).cumcount()
fiter_full_data.loc[recency_rank[recency_rank == 0].index, DEFAULT_FLAG_COL] = "test"
fiter_full_data.loc[
    recency_rank[recency_rank == 1].index, DEFAULT_FLAG_COL
] = "validate"

In [None]:
print("sampling negatives and saving")
data_base_dir = "../../datasets/dunnhumby/"

# Split the flagged interactions into the three subsets.
tp_train = fiter_full_data[fiter_full_data[DEFAULT_FLAG_COL] == "train"]
tp_validate = fiter_full_data[fiter_full_data[DEFAULT_FLAG_COL] == "validate"]
tp_test = fiter_full_data[fiter_full_data[DEFAULT_FLAG_COL] == "test"]

save_train_to_files(tp_train, data_base_dir)
# The sampler is built from item occurrence counts in the training split only.
item_sampler = UnigramTable(tp_train[DEFAULT_ITEM_COL].value_counts().to_dict())
# generate 10 different validation and testing sets
for i in range(10):
    # Use the notebook-level `negative_size` constant rather than repeating
    # the literal 100.
    tp_validate_new = feed_neg_sample(tp_validate, negative_size, item_sampler)
    tp_test_new = feed_neg_sample(tp_test, negative_size, item_sampler)
    save_test_to_files(tp_validate_new, data_base_dir, suffix=f"valid_{i}")
    save_test_to_files(tp_test_new, data_base_dir, suffix=f"test_{i}")