In [1]:
import pandas as pd
import numpy as np
import os
import sklearn
import utils.constants as Constants
from utils.unigramTable import UnigramTable

# Column-name indicators, re-bound from utils.constants so the rest of the
# notebook can use them without the `Constants.` prefix.
DEFAULT_USER_COL = Constants.DEFAULT_USER_COL
DEFAULT_ITEM_COL = Constants.DEFAULT_ITEM_COL
DEFAULT_ORDER_COL = Constants.DEFAULT_ORDER_COL
DEFAULT_RATING_COL = Constants.DEFAULT_RATING_COL
DEFAULT_LABEL_COL = Constants.DEFAULT_LABEL_COL
DEFAULT_TIMESTAMP_COL = Constants.DEFAULT_TIMESTAMP_COL
DEFAULT_PREDICTION_COL = Constants.DEFAULT_PREDICTION_COL
DEFAULT_FLAG_COL = Constants.DEFAULT_FLAG_COL

# NOTE: despite the `_dir` suffix this is the path of the ratings file itself.
ml1m_dir = "../datasets/ml-1m/raw/ratings.dat"
# The file is read without a header row and its fields are separated by "::".
# engine="python" is requested explicitly because a multi-character separator
# is not handled by pandas' default C parser.
ml1m_rating = pd.read_csv(
    ml1m_dir,
    sep="::",
    header=None,
    names=["uid", "mid", "rating", "timestamp"],
    engine="python",
)

In [2]:
# Map the raw MovieLens column names onto the project-wide column constants.
data_df = ml1m_rating.rename(
    columns=dict(
        uid=DEFAULT_USER_COL,
        mid=DEFAULT_ITEM_COL,
        rating=DEFAULT_RATING_COL,
        timestamp=DEFAULT_TIMESTAMP_COL,
    )
)

In [3]:
def feed_neg_sample(eval_df, negative_num, item_sampler):
    """
    Add sampled negative items to every user's positives for evaluation.

    For each user found in ``eval_df`` the user's items are emitted with
    rating 1, followed by up to ``negative_num`` sampled items that are not
    among that user's items in ``eval_df``, emitted with rating 0.

    Parameters:
            eval_df: DataFrame containing at least the DEFAULT_USER_COL and
            DEFAULT_ITEM_COL columns; every row is treated as a positive.
            negative_num: number of negative items requested per user.
            item_sampler: object exposing ``sample(count, 1, True)`` whose
            result can be turned into a set of item ids.
            NOTE(review): the meaning of the 2nd and 3rd ``sample`` arguments
            is defined by the sampler class and is not visible here.

    Returns:
            Shuffled DataFrame with the DEFAULT_USER_COL, DEFAULT_ITEM_COL and
            DEFAULT_RATING_COL columns (rating 1 for positives, 0 for sampled
            negatives). The per-user row index is not reset. An empty
            ``eval_df`` yields an empty frame with those three columns.
    """
    print("sampling negative items...")
    interact_status = (
        eval_df.groupby([DEFAULT_USER_COL])[DEFAULT_ITEM_COL].apply(set).reset_index()
    )
    # If the first draw leaves too few distinct negatives, draw again at most
    # this many times before settling for fewer negatives.
    max_resample_rounds = 10

    # Collect one frame per user and concatenate once at the end: growing a
    # DataFrame inside the loop is quadratic, and DataFrame.append no longer
    # exists in pandas >= 2.0.
    per_user_frames = []
    for _, user_items in interact_status.iterrows():
        u = int(user_items[DEFAULT_USER_COL])
        items = set(user_items[DEFAULT_ITEM_COL])  # item set for user u
        n_items = len(items)  # number of positive items for user u

        # Over-sample by n_items so that removing the positives can still
        # leave negative_num candidates.
        candidates = set(item_sampler.sample(negative_num + n_items, 1, True)) - items
        rounds = 0
        while len(candidates) < negative_num and rounds < max_resample_rounds:
            candidates |= (
                set(item_sampler.sample(negative_num + n_items, 1, True)) - items
            )
            rounds += 1
        sample_neg_items = list(candidates)[:negative_num]
        # Size every column by the number of negatives actually obtained so the
        # columns always have equal length, even when the sampler ran short.
        n_neg = len(sample_neg_items)

        per_user_frames.append(
            pd.DataFrame(
                {
                    DEFAULT_USER_COL: np.full(n_items + n_neg, u, dtype=np.int32),
                    DEFAULT_ITEM_COL: list(items) + sample_neg_items,
                    DEFAULT_RATING_COL: np.append(
                        np.ones(n_items, dtype=np.int32),
                        np.zeros(n_neg, dtype=np.int32),
                    ),
                }
            )
        )

    if not per_user_frames:
        # Nothing to sample for: return an empty frame with the expected columns.
        return pd.DataFrame(
            {DEFAULT_USER_COL: [], DEFAULT_ITEM_COL: [], DEFAULT_RATING_COL: []},
            dtype=np.int32,
        )

    total_interact = pd.concat(per_user_frames)
    total_interact = sklearn.utils.shuffle(total_interact)
    return total_interact

In [4]:
### Keep only the groups that have at least `num` distinct values in the filter column
def fiter_by_count(tp, group_col, filter_col, num):
    """Keep the rows of ``tp`` whose group has at least ``num`` distinct values.

    Rows are grouped by ``group_col``; a group is kept when the number of
    distinct ``filter_col`` values inside it is >= ``num``. The number of
    unique users, unique items and remaining rows is printed as a table row.

    Parameters:
            tp: DataFrame that contains ``group_col``, ``filter_col`` and the
            DEFAULT_USER_COL / DEFAULT_ITEM_COL columns.
            group_col: column whose groups are kept or dropped as a whole.
            filter_col: column whose distinct values are counted per group.
            num: minimum distinct count a group needs in order to be kept.

    Returns:
            The selection of ``tp`` restricted to the kept groups.
    """
    print("Filter", group_col, "by ", filter_col, "less than", num)
    distinct_counts = (
        tp.groupby([group_col])[filter_col].nunique().rename("count").reset_index()
    )
    enough = distinct_counts["count"] >= num
    kept_groups = distinct_counts.loc[enough, group_col]
    filter_tp = tp[tp[group_col].isin(kept_groups)]

    summary = (
        filter_tp[DEFAULT_USER_COL].nunique(),
        filter_tp[DEFAULT_ITEM_COL].nunique(),
        len(filter_tp.index),
    )
    print("|", summary[0], "|", summary[1], "|", summary[2], "|")
    return filter_tp

In [6]:
def save_test_to_files(data, data_base_dir, suff_str="full"):
    """Write an evaluation split to ``<data_base_dir>/temporal/<suff_str>.npz``.

    Parameters:
            data: DataFrame with the DEFAULT_USER_COL, DEFAULT_ITEM_COL and
            DEFAULT_RATING_COL columns.
            data_base_dir: dataset root directory, with or without a trailing
            path separator.
            suff_str: file name (without extension) of the archive.

    The archive holds the arrays ``user_ids`` (int32), ``item_ids`` (int32)
    and ``ratings`` (float32). The largest item and user id are printed, or
    None for each when ``data`` has no rows.
    """
    user_ids = data[DEFAULT_USER_COL].to_numpy(dtype=np.int32)
    item_ids = data[DEFAULT_ITEM_COL].to_numpy(dtype=np.int32)
    ratings = data[DEFAULT_RATING_COL].to_numpy(dtype=np.float32)
    # os.path.join works whether or not data_base_dir ends with a separator;
    # makedirs creates missing parents and tolerates an existing directory.
    file_path = os.path.join(data_base_dir, "temporal")
    os.makedirs(file_path, exist_ok=True)
    np.savez_compressed(
        os.path.join(file_path, suff_str),
        user_ids=user_ids,
        item_ids=item_ids,
        ratings=ratings,
    )
    print(
        "Data saving to file:",
        data_base_dir,
        "max_item_num:",
        np.max(item_ids) if item_ids.size else None,  # np.max raises on empty input
        "max_user_num:",
        np.max(user_ids) if user_ids.size else None,
    )


def save_train_to_files(data, data_base_dir, suff_str=""):
    """Write the training split to ``<data_base_dir>/temporal/<suff_str>train.npz``.

    Parameters:
            data: DataFrame with the DEFAULT_USER_COL, DEFAULT_ITEM_COL,
            DEFAULT_TIMESTAMP_COL and DEFAULT_RATING_COL columns.
            data_base_dir: dataset root directory, with or without a trailing
            path separator.
            suff_str: prefix placed in front of ``train`` in the file name.

    The archive holds the arrays ``user_ids`` (int32), ``item_ids`` (int32),
    ``timestamp`` (int32) and ``ratings`` (float32). The largest item and user
    id are printed, or None for each when ``data`` has no rows.
    """
    user_ids = data[DEFAULT_USER_COL].to_numpy(dtype=np.int32)
    item_ids = data[DEFAULT_ITEM_COL].to_numpy(dtype=np.int32)
    timestamps = data[DEFAULT_TIMESTAMP_COL].to_numpy(dtype=np.int32)
    ratings = data[DEFAULT_RATING_COL].to_numpy(dtype=np.float32)
    # os.path.join works whether or not data_base_dir ends with a separator;
    # makedirs creates missing parents and tolerates an existing directory.
    file_path = os.path.join(data_base_dir, "temporal")
    os.makedirs(file_path, exist_ok=True)
    np.savez_compressed(
        os.path.join(file_path, suff_str + "train"),
        user_ids=user_ids,
        item_ids=item_ids,
        timestamp=timestamps,
        ratings=ratings,
    )
    print(
        "Data saving to file:",
        data_base_dir,
        "max_item_num:",
        np.max(item_ids) if item_ids.size else None,  # np.max raises on empty input
        "max_user_num:",
        np.max(user_ids) if user_ids.size else None,
    )

In [None]:
def temporal_split(seq_data, validate_size=0.2, test_size=0.2):
    """Label every row of ``seq_data`` as train / validate / test by order id.

    The distinct values of DEFAULT_ORDER_COL are sorted in ascending order;
    the last ``test_size`` fraction of them is labelled "test", the
    ``validate_size`` fraction just before those is labelled "validate" and
    the remaining ones "train".
    NOTE(review): this is a temporal split only if larger order ids are later
    in time -- confirm for the dataset being used.
    NOTE(review): the label is written to the literal column "flag", whereas
    other cells of this notebook use DEFAULT_FLAG_COL -- confirm which column
    downstream code expects.

    Parameters:
            seq_data: DataFrame with a DEFAULT_ORDER_COL column; it is
            modified in place (a "flag" column is set).
            validate_size: fraction of the distinct orders used for validation.
            test_size: fraction of the distinct orders used for testing.

    Returns:
            ``seq_data`` itself, for convenience.
    """
    print("split_test_valid_by_orders")
    # np.sort returns a sorted copy, so the result has to be kept; without the
    # assignment the orders stay in first-appearance order.
    orders = np.sort(seq_data[DEFAULT_ORDER_COL].unique())
    total_size = len(orders)
    n_validate = int(total_size * validate_size)
    n_test = int(total_size * test_size)
    test_start = total_size - n_test
    # Clamp at 0 so an oversized validate+test request cannot turn into a
    # negative slice start that wraps around to the end of `orders`.
    validate_start = max(test_start - n_validate, 0)

    order_ids = seq_data[DEFAULT_ORDER_COL]
    # the last test_size share of the orders is the test set
    seq_data.loc[order_ids.isin(orders[test_start:]), "flag"] = "test"
    # everything before it is the training set ...
    seq_data.loc[order_ids.isin(orders[:test_start]), "flag"] = "train"
    # ... except its most recent validate_size share, which is the validation set
    seq_data.loc[
        order_ids.isin(orders[validate_start:test_start]), "flag"
    ] = "validate"
    print("labeling train validate test dataset finished ")
    return seq_data

In [7]:
# Keep only the users that have at least 3 distinct items.
fiter_full_data = fiter_by_count(
    tp=data_df,
    group_col=DEFAULT_USER_COL,
    filter_col=DEFAULT_ITEM_COL,
    num=3,
)
users = fiter_full_data[DEFAULT_USER_COL].unique()

Filter col_user by  col_item less than 3
| 6040 | 3706 | 1000209 |


In [8]:
# fiter_full_data is a boolean-mask selection of data_df; take an explicit copy
# before adding a column so the write never targets a view of data_df (this is
# what triggers pandas' SettingWithCopyWarning).
fiter_full_data = fiter_full_data.copy()
fiter_full_data[DEFAULT_FLAG_COL] = "train"

In [25]:
# Display the filtered frame to check that the flag column is present.
fiter_full_data

Unnamed: 0,col_user,col_item,col_rating,col_timestamp,col_flag
0,1,1193,5,978300760,train
1,1,661,3,978302109,train
2,1,914,3,978301968,train
3,1,3408,4,978300275,train
4,1,2355,5,978824291,train
...,...,...,...,...,...
1000204,6040,1091,1,956716541,train
1000205,6040,1094,5,956704887,train
1000206,6040,562,5,956704746,train
1000207,6040,1096,4,956715648,train


In [55]:
# Split sizes: 20% of all rows for validation and 20% for test. The names
# validate_size / test_size hold row counts (not fractions) after this cell.
total_size = len(fiter_full_data.index)
validate_size, test_size = (
    int(total_size * fraction) for fraction in (0.2, 0.2)
)
train_size = total_size - test_size  # rows before the test part (train + validate)
validate_index = train_size - validate_size  # first row of the validation part
# Order the interactions chronologically; reset_index() keeps the previous row
# labels as an "index" column.
sorted_data = fiter_full_data.sort_values(by=DEFAULT_TIMESTAMP_COL).reset_index()

In [72]:
# Build the label of every row positionally (rows are already in time order):
# [0, validate_index) -> train, [validate_index, train_size) -> validate,
# [train_size, total_size) -> test.
sorted_data[DEFAULT_FLAG_COL] = np.concatenate(
    [
        np.full(validate_index, 'train', dtype=object),
        np.full(train_size - validate_index, 'validate', dtype=object),
        np.full(total_size - train_size, 'test', dtype=object),
    ]
)

In [73]:
# Number of rows carrying each split label, as (train, validate, test).
tuple(
    int((sorted_data[DEFAULT_FLAG_COL] == split_name).sum())
    for split_name in ('train', 'validate', 'test')
)

(600127, 200041, 200041)

In [76]:
# Display the time-sorted frame to check the flag labels at both ends.
sorted_data

Unnamed: 0,index,col_user,col_item,col_rating,col_timestamp,col_flag
0,1000138,6040,858,4,956703932,train
1,1000153,6040,2384,4,956703954,train
2,999873,6040,593,5,956703954,train
3,1000007,6040,1961,4,956703977,train
4,1000192,6040,2019,5,956703977,train
...,...,...,...,...,...,...
1000204,825793,4958,2399,1,1046454338,test
1000205,825438,4958,1407,5,1046454443,test
1000206,825724,4958,3264,4,1046454548,test
1000207,825731,4958,2634,3,1046454548,test


In [75]:
# Sanity check: the train / validate / test counts shown above, added together.
600127+200041+200041

1000209

In [77]:
print("sampling negatives and saving")
data_base_dir = "../datasets/ml-1m/"

# Slice the labelled frame into its three splits.
tp_train = sorted_data.loc[sorted_data[DEFAULT_FLAG_COL].eq("train")]
tp_validate = sorted_data.loc[sorted_data[DEFAULT_FLAG_COL].eq("validate")]
tp_test = sorted_data.loc[sorted_data[DEFAULT_FLAG_COL].eq("test")]

save_train_to_files(tp_train, data_base_dir, suff_str="")

# The negative sampler is built from the item counts of the training split only.
train_item_counts = tp_train[DEFAULT_ITEM_COL].value_counts()
item_sampler = UnigramTable(train_item_counts.to_dict())

# Write 10 independently sampled copies of each evaluation split, every one
# with 100 negatives per user.
for i in range(10):
    tp_validate_new = feed_neg_sample(tp_validate, 100, item_sampler)
    tp_test_new = feed_neg_sample(tp_test, 100, item_sampler)
    save_test_to_files(tp_validate_new, data_base_dir, suff_str=f"valid_{i}")
    save_test_to_files(tp_test_new, data_base_dir, suff_str=f"test_{i}")

sampling negatives and saving
Data saving to file: ../datasets/ml-1m/ max_item_num: 3952 max_user_num: 6040
Filling unigram table
sampling negative items...
sampling negative items...
Data saving to file: ../datasets/ml-1m/ max_item_num: 3952 max_user_num: 5996
Data saving to file: ../datasets/ml-1m/ max_item_num: 3952 max_user_num: 6040
sampling negative items...
sampling negative items...
Data saving to file: ../datasets/ml-1m/ max_item_num: 3952 max_user_num: 5996
Data saving to file: ../datasets/ml-1m/ max_item_num: 3952 max_user_num: 6040
sampling negative items...
sampling negative items...
Data saving to file: ../datasets/ml-1m/ max_item_num: 3952 max_user_num: 5996
Data saving to file: ../datasets/ml-1m/ max_item_num: 3952 max_user_num: 6040
sampling negative items...
sampling negative items...
Data saving to file: ../datasets/ml-1m/ max_item_num: 3952 max_user_num: 5996
Data saving to file: ../datasets/ml-1m/ max_item_num: 3952 max_user_num: 6040
sampling negative items...
sam