In [None]:
import pandas as pd
import numpy as np
import os
import sklearn
import utils.constants as Constants
from utils.unigramTable import UnigramTable

# Column-name constants, aliased from utils.constants so the cells below can
# refer to them without the module prefix.
DEFAULT_USER_COL = Constants.DEFAULT_USER_COL
DEFAULT_ITEM_COL = Constants.DEFAULT_ITEM_COL
DEFAULT_ORDER_COL = Constants.DEFAULT_ORDER_COL
DEFAULT_RATING_COL = Constants.DEFAULT_RATING_COL
DEFAULT_LABEL_COL = Constants.DEFAULT_LABEL_COL
DEFAULT_TIMESTAMP_COL = Constants.DEFAULT_TIMESTAMP_COL
DEFAULT_PREDICTION_COL = Constants.DEFAULT_PREDICTION_COL
DEFAULT_FLAG_COL = Constants.DEFAULT_FLAG_COL

# Raw ratings file: header-less records with "::"-separated fields, read here
# as uid / mid / rating / timestamp.
ml1m_dir = "../datasets/ml-1m/raw/ratings.dat"
ml1m_rating = pd.read_csv(
    ml1m_dir,
    names=["uid", "mid", "rating", "timestamp"],
    header=None,
    sep="::",
    engine="python",  # multi-character separators need the Python parser
)

In [None]:
# Rename the raw ratings columns to the project-wide column names.
data_df = ml1m_rating.rename(
    columns=dict(
        uid=DEFAULT_USER_COL,
        mid=DEFAULT_ITEM_COL,
        rating=DEFAULT_RATING_COL,
        timestamp=DEFAULT_TIMESTAMP_COL,
    )
)

In [None]:
def feed_neg_sample(eval_df, negative_num, item_sampler):
    """Add sampled negative items to every user's positive evaluation items.

    For each user found in ``eval_df`` the user's distinct items are kept with
    rating 1, and ``negative_num`` items drawn from ``item_sampler`` (with the
    user's own items removed) are added with rating 0.

    Parameters:
            eval_df: DataFrame containing at least the ``DEFAULT_USER_COL`` and
            ``DEFAULT_ITEM_COL`` columns; every row is treated as a positive
            interaction.
            negative_num: number of negative items to add for each user.
            item_sampler: object exposing ``sample(count, 1, True)`` that returns
            an iterable of item ids.
            NOTE(review): the ``1, True`` arguments are passed through unchanged;
            presumably they ask for single, non-repeated items -- confirm
            against UnigramTable.

    Returns:
            Shuffled DataFrame with the ``DEFAULT_USER_COL``, ``DEFAULT_ITEM_COL``
            and ``DEFAULT_RATING_COL`` columns: positives rated 1, sampled
            negatives rated 0. The row index is not reset, so the per-user
            positional labels repeat across users.

    Raises:
            ValueError: if, after removing duplicates and the user's positives,
            the sampler delivered fewer than ``negative_num`` items for a user.
    """
    print("sampling negative items...")
    per_user_frames = []
    # Grouping by a scalar key yields the users in sorted order; the sampler is
    # called exactly once per user, in that order.
    for user, user_items in eval_df.groupby(DEFAULT_USER_COL)[DEFAULT_ITEM_COL]:
        u = int(user)
        items = set(user_items)  # positive item set for user u
        n_items = len(items)  # number of positive items for user u
        # Ask for n_items extra samples so that negative_num candidates can
        # remain even if every positive item happens to be drawn.
        candidates = set(item_sampler.sample(negative_num + n_items, 1, True))
        # Drop the positives and keep at most negative_num of the rest.
        sample_neg_items = list(candidates - items)[:negative_num]
        if len(sample_neg_items) < negative_num:
            # Fail with an explicit message instead of the opaque length-mismatch
            # error the DataFrame constructor would raise further down.
            raise ValueError(
                "item_sampler returned only {} usable negative items for user {}; "
                "{} are required".format(len(sample_neg_items), u, negative_num)
            )
        # Build the item array from one list so an empty negative list cannot
        # turn the item ids into floats.
        tp_items = np.array(list(items) + sample_neg_items)
        tp_users = np.ones(negative_num + n_items, dtype=np.int32) * u
        ratings = np.append(
            np.ones(n_items, dtype=np.int32), np.zeros(negative_num, dtype=np.int32)
        )
        per_user_frames.append(
            pd.DataFrame(
                {
                    DEFAULT_USER_COL: tp_users,
                    DEFAULT_ITEM_COL: tp_items,
                    DEFAULT_RATING_COL: ratings,
                }
            )
        )

    if per_user_frames:
        # Concatenate once: DataFrame.append inside the loop copied the whole
        # frame on every iteration and no longer exists in pandas >= 2.0.
        total_interact = pd.concat(per_user_frames)
    else:
        total_interact = pd.DataFrame(
            {DEFAULT_USER_COL: [], DEFAULT_ITEM_COL: [], DEFAULT_RATING_COL: []},
            dtype=np.int32,
        )

    total_interact = sklearn.utils.shuffle(total_interact)
    return total_interact

In [None]:
### Keep only the groups that have at least `num` distinct values
def fiter_by_count(tp, group_col, filter_col, num):
    """Keep the rows of ``tp`` whose group has enough distinct ``filter_col`` values.

    Parameters:
            tp: DataFrame to filter; it must also contain the ``DEFAULT_USER_COL``
            and ``DEFAULT_ITEM_COL`` columns, which are used for the printed
            summary.
            group_col: column whose values define the groups.
            filter_col: column whose distinct values are counted per group.
            num: minimum number of distinct ``filter_col`` values a group needs
            to be kept.

    Returns:
            The rows of ``tp`` that belong to a kept group.
    """
    print("Filter", group_col, "by ", filter_col, "less than", num)
    # One row per group, with the number of distinct filter_col values.
    distinct_counts = (
        tp.groupby([group_col])[filter_col].nunique().rename("count").reset_index()
    )
    has_enough = distinct_counts["count"] >= num
    kept_groups = distinct_counts[has_enough][group_col]
    row_mask = tp[group_col].isin(kept_groups)
    filter_tp = tp[row_mask]
    # Summary of what survived: | users | items | interactions |
    n_interact = len(filter_tp.index)
    n_users = filter_tp[DEFAULT_USER_COL].nunique()
    n_items = filter_tp[DEFAULT_ITEM_COL].nunique()
    print("|", n_users, "|", n_items, "|", n_interact, "|")
    return filter_tp

In [21]:
def save_test_to_files(data, data_base_dir, suff_str="full"):
    """Save an evaluation set as a compressed ``.npz`` archive.

    The archive is written to ``<data_base_dir>/leave_one_out/<suff_str>``
    (``np.savez_compressed`` appends ``.npz`` when the name lacks it) and holds
    the arrays ``user_ids`` (int32), ``item_ids`` (int32) and ``ratings``
    (float32).

    Parameters:
            data: DataFrame with the ``DEFAULT_USER_COL``, ``DEFAULT_ITEM_COL``
            and ``DEFAULT_RATING_COL`` columns.
            data_base_dir: dataset base directory, with or without a trailing
            path separator.
            suff_str: file name (without extension) of the archive.
    """
    user_ids = data[DEFAULT_USER_COL].to_numpy(dtype=np.int32)
    item_ids = data[DEFAULT_ITEM_COL].to_numpy(dtype=np.int32)
    ratings = data[DEFAULT_RATING_COL].to_numpy(dtype=np.float32)
    # os.path.join keeps the path correct when data_base_dir has no trailing
    # slash; makedirs also creates missing parents and tolerates an existing
    # directory.
    file_path = os.path.join(data_base_dir, "leave_one_out")
    os.makedirs(file_path, exist_ok=True)
    np.savez_compressed(
        os.path.join(file_path, suff_str),
        user_ids=user_ids,
        item_ids=item_ids,
        ratings=ratings,
    )
    print(
        "Data saving to file:",
        data_base_dir,
        "max_item_num:",
        np.max(item_ids),
        "max_user_num:",
        np.max(user_ids),
    )


def save_train_to_files(data, data_base_dir, suff_str=""):
    """Save the training set as a compressed ``.npz`` archive.

    The archive is written to ``<data_base_dir>/leave_one_out/<suff_str>train``
    (``np.savez_compressed`` appends ``.npz`` when the name lacks it) and holds
    the arrays ``user_ids`` (int32), ``item_ids`` (int32), ``timestamp`` (int32)
    and ``ratings`` (float32).

    Parameters:
            data: DataFrame with the ``DEFAULT_USER_COL``, ``DEFAULT_ITEM_COL``,
            ``DEFAULT_TIMESTAMP_COL`` and ``DEFAULT_RATING_COL`` columns.
            data_base_dir: dataset base directory, with or without a trailing
            path separator.
            suff_str: prefix placed in front of ``"train"`` in the file name.
    """
    user_ids = data[DEFAULT_USER_COL].to_numpy(dtype=np.int32)
    item_ids = data[DEFAULT_ITEM_COL].to_numpy(dtype=np.int32)
    timestamps = data[DEFAULT_TIMESTAMP_COL].to_numpy(dtype=np.int32)
    ratings = data[DEFAULT_RATING_COL].to_numpy(dtype=np.float32)
    # os.path.join keeps the path correct when data_base_dir has no trailing
    # slash; makedirs also creates missing parents and tolerates an existing
    # directory.
    file_path = os.path.join(data_base_dir, "leave_one_out")
    os.makedirs(file_path, exist_ok=True)
    np.savez_compressed(
        os.path.join(file_path, suff_str + "train"),
        user_ids=user_ids,
        item_ids=item_ids,
        timestamp=timestamps,
        ratings=ratings,
    )
    print(
        "Data saving to file:",
        data_base_dir,
        "max_item_num:",
        np.max(item_ids),
        "max_user_num:",
        np.max(user_ids),
    )

In [None]:
# Keep only the users that interacted with at least 3 distinct items.
fiter_full_data = fiter_by_count(
    tp=data_df, group_col=DEFAULT_USER_COL, filter_col=DEFAULT_ITEM_COL, num=3
)
# Distinct user ids that survived the filter.
users = pd.unique(fiter_full_data[DEFAULT_USER_COL])

In [None]:
# Every interaction starts out in the training split.
# fiter_full_data is a boolean-mask selection of data_df; take an explicit copy
# before adding the flag column so pandas does not treat the assignment as a
# write into a slice (SettingWithCopyWarning).
fiter_full_data = fiter_full_data.copy()
fiter_full_data[DEFAULT_FLAG_COL] = "train"

In [None]:
# Leave-one-out split: for every user the most recent interaction becomes
# "test" and the second most recent becomes "validate".
# All rows are ranked in a single pass instead of filtering and sorting the
# whole frame once per user. The stable sort makes the choice among equal
# timestamps deterministic.
recency_rank = (
    fiter_full_data.sort_values(
        by=[DEFAULT_TIMESTAMP_COL], ascending=False, kind="mergesort"
    )
    .groupby(DEFAULT_USER_COL)
    .cumcount()
)  # 0 = latest row of the user, 1 = second latest, ...
fiter_full_data.loc[recency_rank[recency_rank == 0].index, DEFAULT_FLAG_COL] = "test"
fiter_full_data.loc[
    recency_rank[recency_rank == 1].index, DEFAULT_FLAG_COL
] = "validate"

In [22]:
print("sampling negatives and saving")
data_base_dir = "../datasets/ml-1m/"

# Partition the flagged interactions into the three splits.
split_flag = fiter_full_data[DEFAULT_FLAG_COL]
tp_train = fiter_full_data[split_flag == "train"]
tp_validate = fiter_full_data[split_flag == "validate"]
tp_test = fiter_full_data[split_flag == "test"]

save_train_to_files(tp_train, data_base_dir, suff_str="")

# The negative sampler is built from the per-item interaction counts of the
# training split.
train_item_counts = tp_train[DEFAULT_ITEM_COL].value_counts().to_dict()
item_sampler = UnigramTable(train_item_counts)

# Write 10 separately sampled validation/test sets, each with 100 negatives
# per user, as valid_<i> / test_<i>.
for i in range(10):
    tp_validate_new = feed_neg_sample(tp_validate, 100, item_sampler)
    tp_test_new = feed_neg_sample(tp_test, 100, item_sampler)
    save_test_to_files(tp_validate_new, data_base_dir, suff_str=f"valid_{i}")
    save_test_to_files(tp_test_new, data_base_dir, suff_str=f"test_{i}")

sampling negatives and saving
Data saving to file: ../datasets/ml-1m/ max_item_num: 3952 max_user_num: 6040
Filling unigram table
sampling negative items...
sampling negative items...
Data saving to file: ../datasets/ml-1m/ max_item_num: 3952 max_user_num: 6040
Data saving to file: ../datasets/ml-1m/ max_item_num: 3952 max_user_num: 6040
sampling negative items...
sampling negative items...
Data saving to file: ../datasets/ml-1m/ max_item_num: 3952 max_user_num: 6040
Data saving to file: ../datasets/ml-1m/ max_item_num: 3952 max_user_num: 6040
sampling negative items...
sampling negative items...
Data saving to file: ../datasets/ml-1m/ max_item_num: 3952 max_user_num: 6040
Data saving to file: ../datasets/ml-1m/ max_item_num: 3952 max_user_num: 6040
sampling negative items...
sampling negative items...
Data saving to file: ../datasets/ml-1m/ max_item_num: 3952 max_user_num: 6040
Data saving to file: ../datasets/ml-1m/ max_item_num: 3952 max_user_num: 6040
sampling negative items...
sam