# Ta-Feng

## Import packages and define constants

In [1]:
import numpy as np
import pandas as pd
import os
import sys

sys.path.append("../")
import random
from pandas.core.frame import DataFrame
import sklearn
from utils.unigramTable import UnigramTable


# Column names shared by the DataFrames built in this notebook.
DEFAULT_USER_COL = "user_ids"
DEFAULT_ITEM_COL = "item_ids"
DEFAULT_ORDER_COL = "order_ids"
DEFAULT_RATING_COL = "ratings"
DEFAULT_LABEL_COL = "label"
DEFAULT_TIMESTAMP_COL = "timestamp"
DEFAULT_PREDICTION_COL = "prediction"
DEFAULT_FLAG_COL = "flag"
# Root directory of the dataset. The raw-file paths are built by plain string
# concatenation with this value, so the trailing slash is required.
# NOTE(review): the recorded cell outputs print "../../datasets/ta-feng/" --
# confirm which directory name is the current one.
data_base_dir = "../../datasets/tafeng/"


# Fractions of the orders held out as the test split; one dataset is written
# per value.
test_percents = [0.1, 0.15, 0.2, 0.25, 0.3]
# The validation split uses the same fraction as the test split.

# Presumably the number of negative items sampled per user for evaluation --
# TODO confirm where this constant is consumed.
negative_size = 100

min_u_c = 20  # keep items that were purchased by at least min_u_c users
min_i_c = 30  # keep users who bought at least min_i_c distinct items
min_o_c = 10  # keep users who placed at least min_o_c orders

/users/tr.zaiqm/mycode/tr_rec/configs/
/users/tr.zaiqm/mycode/tr_rec/datasets/
/users/tr.zaiqm/mycode/tr_rec/checkpoints/
/users/tr.zaiqm/mycode/tr_rec/results/
/users/tr.zaiqm/mycode/tr_rec/samples/
/users/tr.zaiqm/mycode/tr_rec/logs/
/users/tr.zaiqm/mycode/tr_rec/runs/


## load full data to sequence DataFrame

In [2]:
def _read_order_interactions(path, flag="train"):
    """Parse one raw order file into a list of interaction records.

    Each line is tab-separated: the first field is the order id, the last two
    fields are the user id and the order date, and every field in between is
    an item id. One record ``[order_id, user_id, item_id, flag, "1", date]``
    is produced per item; dashes are stripped from the date.

    Args:
        path: Path of the raw text file to read.
        flag: Value stored in the flag position of every record.

    Returns:
        A list of six-element lists of strings, in file order.
    """
    records = []
    with open(path) as order_file:
        for line in order_file:
            # Strip the newline instead of turning it into an extra empty
            # field, so a final line without a trailing newline is indexed
            # the same way as every other line.
            fields = line.rstrip("\n").split("\t")
            if len(fields) < 3:
                continue  # blank or malformed line: no order/user/date triple
            order_id = fields[0]
            user_id = fields[-2]
            time_order = fields[-1].replace("-", "")
            records.extend(
                [order_id, user_id, item_id, flag, "1", time_order]
                for item_id in fields[1:-2]
            )
    return records


original_train_file = data_base_dir + "train.txt"
original_test_file = data_base_dir + "test.txt"
# Empty placeholder frames kept from the original cell; nothing in the visible
# notebook reads them.
prior_test_df = DataFrame(columns=["order_id", "user_id", "time", "item_id"])
prior_train_df = DataFrame(
    columns=["order_id", "user_id", "time", "item_id"]
)  # initial dataframe
interaction_list = []
for original_file in (original_train_file, original_test_file):
    # Both files are tagged "train", exactly as the original cell did; the
    # flag column is reassigned later when the data is split by orders.
    interaction_list.extend(_read_order_interactions(original_file, flag="train"))
    # Running total of interactions after each file.
    print(len(interaction_list))

405742
464118


In [4]:
# Stack the per-interaction records into a single 2-D array (one row per
# record, one column per field) and show its dimensions.
interactions = np.asarray(interaction_list)
interactions.shape

(464118, 6)

In [5]:
# Column name for each positional field of an interaction record, in the
# order the fields were appended when the raw files were parsed.
column_names = [
    DEFAULT_ORDER_COL,
    DEFAULT_USER_COL,
    DEFAULT_ITEM_COL,
    DEFAULT_FLAG_COL,
    DEFAULT_RATING_COL,
    DEFAULT_TIMESTAMP_COL,
]
full_data = pd.DataFrame(
    data={
        name: interactions[:, position]
        for position, name in enumerate(column_names)
    }
)

In [6]:
# Preview the first rows of the assembled interaction table.
full_data.head()

Unnamed: 0,order_ids,user_ids,item_ids,flag,ratings,timestamp
0,0,261647,4710852001013,train,1,20001214
1,0,261647,4711128778882,train,1,20001214
2,0,261647,4710467551224,train,1,20001214
3,0,261647,4902775015356,train,1,20001214
4,0,261647,4710362500051,train,1,20001214


## Raw data statistics

In [7]:
# Dataset size: number of interaction rows, then the number of distinct
# orders, users and items.
n_interact = full_data.shape[0]
n_orders, n_users, n_items = (
    full_data[column].nunique()
    for column in (DEFAULT_ORDER_COL, DEFAULT_USER_COL, DEFAULT_ITEM_COL)
)
(n_interact, n_orders, n_users, n_items)

(464118, 77202, 9238, 7973)

## Save these sequence data to csv files

In [8]:
def save_raw_seq_to_files(data, data_base_dir, suff_str="full_raw"):
    """Write the interaction table to ``<data_base_dir>/data/ta-feng_<suff_str>.csv``.

    Args:
        data: DataFrame to save; its index is written as the first CSV column.
        data_base_dir: Dataset root directory, with or without a trailing
            path separator.
        suff_str: Suffix that distinguishes the saved file.
    """
    # Join path components instead of concatenating strings so a missing
    # trailing separator cannot produce a wrong directory name, and create the
    # output directory so the first run does not fail.
    out_dir = os.path.join(data_base_dir, "data")
    os.makedirs(out_dir, exist_ok=True)
    data.to_csv(os.path.join(out_dir, "ta-feng_" + str(suff_str) + ".csv"))
    print("Data saving to file:", data_base_dir)

In [9]:
# Persist the assembled interaction table as a CSV file under data_base_dir.
save_raw_seq_to_files(full_data, data_base_dir)

Data saving to file: ../../datasets/ta-feng/


## Load the sequence data from file

In [10]:
def load_raw_seq_from_files(data_base_dir, suff_str="full_raw"):
    """Read the interaction table from ``<data_base_dir>/data/ta-feng_<suff_str>.csv``.

    Args:
        data_base_dir: Dataset root directory, with or without a trailing
            path separator.
        suff_str: Suffix that identifies the saved file.

    Returns:
        The DataFrame read from the CSV, using the first CSV column as index.
    """
    print("Loading data from:", data_base_dir)
    # Join path components rather than concatenating strings so the lookup
    # does not depend on data_base_dir ending with a separator.
    csv_path = os.path.join(data_base_dir, "data", "ta-feng_" + str(suff_str) + ".csv")
    loaded = pd.read_csv(csv_path, index_col=0)
    return loaded

In [11]:
# Rebind full_data to the table read back from the saved CSV, then preview it.
# Every later cell works on this reloaded frame.
full_data = load_raw_seq_from_files(data_base_dir)
full_data.head()

Loading data from: ../../datasets/ta-feng/


Unnamed: 0,order_ids,user_ids,item_ids,flag,ratings,timestamp
0,0,261647,4710852001013,train,1,20001214
1,0,261647,4711128778882,train,1,20001214
2,0,261647,4710467551224,train,1,20001214
3,0,261647,4902775015356,train,1,20001214
4,0,261647,4710362500051,train,1,20001214


In [54]:
# Column-wise maxima of the interaction table.
# NOTE(review): this cell's execution count (54) is out of sequence with its
# neighbours, so the recorded output may come from a different state of
# full_data than a fresh top-to-bottom run would produce -- re-run to confirm.
full_data.max()

order_ids             9999
user_ids          20002000
item_ids     9789578032880
flag                 train
ratings                  1
timestamp         20010228
dtype: object

### Test the integrity of the saved data

In [12]:
# Recompute the size statistics on the reloaded table so they can be compared
# with the values obtained before saving.
n_interact = full_data.shape[0]
n_orders, n_users, n_items = (
    full_data[column].nunique()
    for column in (DEFAULT_ORDER_COL, DEFAULT_USER_COL, DEFAULT_ITEM_COL)
)
(n_interact, n_orders, n_users, n_items)

(464118, 77202, 9238, 7973)

## negative sampling

In [8]:
def feed_neg_sample(eval_df, negative_num, item_sampler):
    """Build a shuffled evaluation table of positive items plus sampled negatives.

    For every user in ``eval_df`` the user's items are kept with rating 1 and
    up to ``negative_num`` sampled items that the user does not have in
    ``eval_df`` are added with rating 0.

    Args:
        eval_df: DataFrame with ``user_ids`` and ``item_ids`` columns holding
            integer ids.
        negative_num: Number of negative items wanted per user.
        item_sampler: Object whose ``sample(count, 1, True)`` call returns an
            iterable of candidate item ids.

    Returns:
        DataFrame with int64 columns ``user_ids``, ``item_ids`` and
        ``ratings``, passed through ``sklearn.utils.shuffle``. A user gets
        fewer than ``negative_num`` negatives only when repeated sampling
        cannot find enough distinct items.
    """
    print("sampling negative items...")
    columns = ["user_ids", "item_ids", "ratings"]
    # Upper bound on re-sampling rounds per user, so a sampler with a tiny
    # catalogue cannot loop forever.
    max_sampling_rounds = 10
    interact_status = eval_df.groupby(["user_ids"])["item_ids"].apply(set).reset_index()
    per_user_frames = []
    for _, user_items in interact_status.iterrows():
        u = int(user_items["user_ids"])
        positives = set(user_items["item_ids"])  # item set for user u
        n_pos = len(positives)
        negatives = set()
        for _ in range(max_sampling_rounds):
            # Over-sample by n_pos because positives are filtered out below.
            drawn = item_sampler.sample(negative_num + n_pos, 1, True)
            negatives.update(set(drawn) - positives)
            if len(negatives) >= negative_num:
                break
        neg_items = list(negatives)[:negative_num]
        # Size every column from the number of negatives actually obtained;
        # duplicates and filtered positives can leave fewer than negative_num,
        # which previously produced columns of different lengths.
        n_neg = len(neg_items)
        per_user_frames.append(
            pd.DataFrame(
                {
                    "user_ids": np.full(n_pos + n_neg, u, dtype=np.int64),
                    # Explicit int64 keeps the ids integral even when the
                    # negative list is empty.
                    "item_ids": np.array(list(positives) + neg_items, dtype=np.int64),
                    "ratings": np.concatenate(
                        [np.ones(n_pos, dtype=np.int64), np.zeros(n_neg, dtype=np.int64)]
                    ),
                }
            )
        )

    if per_user_frames:
        # One concatenation instead of growing a frame row-block by row-block.
        total_interact = pd.concat(per_user_frames)
    else:
        total_interact = pd.DataFrame(
            {name: np.array([], dtype=np.int64) for name in columns}
        )

    total_interact = sklearn.utils.shuffle(total_interact)
    return total_interact

## Divide data into train, validate and test sets, where the validate set is taken from the train set

In [9]:
def split_test_valid_by_orders(seq_data, validate_size=0.2, test_size=0.2):
    """Label each row of ``seq_data`` as train, validate or test by its order id.

    The distinct order ids are sorted; the highest ``test_size`` fraction is
    labelled ``"test"``, the ``validate_size`` fraction just below it is
    labelled ``"validate"`` and everything else ``"train"``. The ``"flag"``
    column is rewritten in place on every call.

    NOTE(review): ranking by order id matches chronological order only if ids
    were assigned chronologically -- TODO confirm for this dataset.

    Args:
        seq_data: DataFrame with an order-id column (``DEFAULT_ORDER_COL``);
            modified in place.
        validate_size: Fraction of the distinct orders used for validation.
        test_size: Fraction of the distinct orders used for testing.

    Returns:
        None.
    """
    print("split_test_valid_by_orders")
    # Keep the sorted array: np.sort returns a new array and leaves its
    # argument untouched.
    orders = np.sort(seq_data[DEFAULT_ORDER_COL].unique())
    total_size = len(orders)
    # Clamp the counts so the slice bounds below can never go negative and
    # wrap around to the end of the array.
    n_test = min(int(total_size * test_size), total_size)
    n_validate = min(int(total_size * validate_size), total_size - n_test)
    test_start = total_size - n_test
    validate_start = test_start - n_validate

    # Reset every row first so labels from a previous call do not linger.
    seq_data["flag"] = "train"
    order_ids = seq_data[DEFAULT_ORDER_COL]
    # The last orders before the test range form the validation set.
    seq_data.loc[
        order_ids.isin(orders[validate_start:test_start]), "flag"
    ] = "validate"
    # The highest-ranked orders form the test set.
    seq_data.loc[order_ids.isin(orders[test_start:]), "flag"] = "test"
    print("labeling train validate test dataset finished ")

## filter data by count of users, items and orders

In [10]:
# group by group_col and filter filter_col that has less num unique() count
def fiter_by_count(tp, group_col, filter_col, num):
    """Keep the rows whose ``group_col`` value has enough distinct ``filter_col`` values.

    Args:
        tp: DataFrame to filter.
        group_col: Column whose values define the groups.
        filter_col: Column whose distinct values are counted within each group.
        num: Minimum number of distinct ``filter_col`` values a group needs.

    Returns:
        The rows of ``tp`` belonging to groups that reach the minimum.
    """
    distinct_per_group = tp.groupby([group_col])[filter_col].nunique()
    surviving_keys = distinct_per_group.index[distinct_per_group >= num]
    keep_mask = tp[group_col].isin(surviving_keys)
    return tp[keep_mask]


# filter data by the minimum purcharce number of items and users
def filter_triplets(tp, min_u_c=5, min_i_c=5, min_o_c=5):
    """Apply the order, item and user count filters once each, in that order.

    Args:
        tp: Interaction DataFrame with order, user and item id columns.
        min_u_c: Minimum number of distinct users an item needs to be kept.
        min_i_c: Minimum number of distinct items a user needs to be kept.
        min_o_c: Minimum number of distinct orders a user needs to be kept.
            A threshold that is not positive disables its filter.

    Returns:
        The filtered DataFrame. Size statistics are printed before and after.
    """
    print("filter data by the minimum purcharce number of items and users and orders")

    def _summary(frame):
        # (rows, distinct orders, distinct users, distinct items)
        return (
            len(frame.index),
            frame[DEFAULT_ORDER_COL].nunique(),
            frame[DEFAULT_USER_COL].nunique(),
            frame[DEFAULT_ITEM_COL].nunique(),
        )

    print("before filter", *_summary(tp))

    # (threshold, grouped column, counted column) for each pass, in the order
    # they are applied: users by orders, users by items, items by users.
    filter_passes = (
        (min_o_c, DEFAULT_USER_COL, DEFAULT_ORDER_COL),
        (min_i_c, DEFAULT_USER_COL, DEFAULT_ITEM_COL),
        (min_u_c, DEFAULT_ITEM_COL, DEFAULT_USER_COL),
    )
    for threshold, grouped_col, counted_col in filter_passes:
        if threshold > 0:
            tp = fiter_by_count(tp, grouped_col, counted_col, threshold)

    print("after filter", *_summary(tp))
    return tp

In [11]:
def save_train_to_files(data, data_base_dir, test_percent):
    """Save the training interactions to ``<data_base_dir>/temporal/<percent>/train.npz``.

    Args:
        data: DataFrame with user, item, order, timestamp and rating columns.
        data_base_dir: Dataset root directory.
        test_percent: Split size naming the output folder. Values below 1 are
            treated as fractions and converted to a whole percentage.
    """
    if test_percent < 1:
        # Round before truncating: 0.29 * 100 is 28.999999999999996 in binary
        # floating point, which int() alone would turn into 28.
        test_percent = int(round(test_percent * 100))
    # Explicit 64-bit integers: np.long is not available in every NumPy
    # release, and the item ids in this dataset exceed the 32-bit range.
    user_ids = data[DEFAULT_USER_COL].to_numpy(dtype=np.int64)
    item_ids = data[DEFAULT_ITEM_COL].to_numpy(dtype=np.int64)
    order_ids = data[DEFAULT_ORDER_COL].to_numpy(dtype=np.int64)
    timestamps = data[DEFAULT_TIMESTAMP_COL].to_numpy(dtype=np.int64)
    ratings = data[DEFAULT_RATING_COL].to_numpy(dtype=np.float32)
    out_dir = os.path.join(data_base_dir, "temporal", str(test_percent))
    os.makedirs(out_dir, exist_ok=True)
    data_file = os.path.join(out_dir, "train.npz")
    np.savez_compressed(
        data_file,
        user_ids=user_ids,
        item_ids=item_ids,
        order_ids=order_ids,
        timestamp=timestamps,
        ratings=ratings,
    )
    print(
        "Data saving to file:",
        data_base_dir,
        "max_item_num:",
        # An empty split has no maximum; report None instead of raising.
        np.max(item_ids) if item_ids.size else None,
        "max_user_num:",
        np.max(user_ids) if user_ids.size else None,
    )

In [12]:
def save_test_to_files(data, data_base_dir, test_percent, suffix):
    """Save an evaluation set to ``<data_base_dir>/temporal/<percent>/<suffix>.npz``.

    Args:
        data: DataFrame with user, item and rating columns.
        data_base_dir: Dataset root directory.
        test_percent: Split size naming the output folder. Values below 1 are
            treated as fractions and converted to a whole percentage.
        suffix: File name of the archive; NumPy appends ``.npz`` when missing.
    """
    if test_percent < 1:
        # Round before truncating: 0.29 * 100 is 28.999999999999996 in binary
        # floating point, which int() alone would turn into 28.
        test_percent = int(round(test_percent * 100))
    # Explicit 64-bit integers: np.long is not available in every NumPy
    # release, and the item ids in this dataset exceed the 32-bit range.
    user_ids = data[DEFAULT_USER_COL].to_numpy(dtype=np.int64)
    item_ids = data[DEFAULT_ITEM_COL].to_numpy(dtype=np.int64)
    ratings = data[DEFAULT_RATING_COL].to_numpy(dtype=np.float32)
    out_dir = os.path.join(data_base_dir, "temporal", str(test_percent))
    os.makedirs(out_dir, exist_ok=True)
    data_file = os.path.join(out_dir, suffix)
    np.savez_compressed(
        data_file, user_ids=user_ids, item_ids=item_ids, ratings=ratings,
    )
    print(
        "Data saving to file:",
        data_base_dir,
        "max_item_num:",
        # An empty split has no maximum; report None instead of raising.
        np.max(item_ids) if item_ids.size else None,
        "max_user_num:",
        np.max(user_ids) if user_ids.size else None,
    )

In [None]:
# Filter the interactions once. The copy matters: the split step below writes
# the "flag" column in place, and without it those writes would target a
# filtered slice of full_data (pandas SettingWithCopy hazard).
filter_tp = filter_triplets(
    full_data, min_u_c=min_u_c, min_i_c=min_i_c, min_o_c=min_o_c
).copy()

# Number of independently sampled validation/test sets written per split.
n_sample_sets = 10

for test_percent in test_percents:
    # Relabel every row for this split size; validation uses the same
    # fraction as test.
    split_test_valid_by_orders(
        filter_tp, validate_size=test_percent, test_size=test_percent
    )

    tp_train = filter_tp[filter_tp["flag"] == "train"]
    tp_validate = filter_tp[filter_tp["flag"] == "validate"]
    tp_test = filter_tp[filter_tp["flag"] == "test"]
    save_train_to_files(tp_train, data_base_dir, test_percent)

    # Negative items are drawn from a sampler built on the training-set item
    # frequencies.
    item_sampler = UnigramTable(tp_train[DEFAULT_ITEM_COL].value_counts().to_dict())
    for i in range(n_sample_sets):
        # Use the configured negative_size rather than a repeated literal.
        tp_validate_new = feed_neg_sample(tp_validate, negative_size, item_sampler)
        tp_test_new = feed_neg_sample(tp_test, negative_size, item_sampler)
        save_test_to_files(
            tp_validate_new,
            data_base_dir,
            test_percent,
            suffix="valid" + "_" + str(i),
        )
        save_test_to_files(
            tp_test_new, data_base_dir, test_percent, suffix="test" + "_" + str(i),
        )

filter data by the minimum purcharce number of items and users and orders
before filter 464118 77202 9238 7973
after filter 170589 36433 2265 4181
split_test_valid_by_orders
labeling train validate test dataset finished 
Data saving to file: ../../datasets/ta-feng/ max_item_num: 9557615168355 max_user_num: 20002000
Filling unigram table
sampling negative items...
sampling negative items...
Data saving to file: ../../datasets/ta-feng/ max_item_num: 9557615168355 max_user_num: 20002000
Data saving to file: ../../datasets/ta-feng/ max_item_num: 9557615168355 max_user_num: 20002000
sampling negative items...
sampling negative items...
Data saving to file: ../../datasets/ta-feng/ max_item_num: 9557615168355 max_user_num: 20002000
Data saving to file: ../../datasets/ta-feng/ max_item_num: 9557615168355 max_user_num: 20002000
sampling negative items...
sampling negative items...
Data saving to file: ../../datasets/ta-feng/ max_item_num: 9557615168355 max_user_num: 20002000
Data saving to file