In [2]:
import json
import os

import pandas as pd

from globals import BASE_DIR, available_datasets


In [3]:
def group_user_events(user_events, user_groups):
    """Compute descriptive interaction statistics for each user group.

    Parameters
    ----------
    user_events : pandas.DataFrame
        Interaction table containing the columns ``"user_id:token"`` and
        ``"item_id:token"``. Every row counts as one check-in. The frame is
        only read, never modified.
    user_groups : dict
        Maps a group name to a collection of user ids. Rows are assigned to a
        group via ``Series.isin`` on ``"user_id:token"``.

    Returns
    -------
    dict
        ``{group_name: metrics}`` where ``metrics`` holds, in this order:
        ``num_users``, ``mean_checkins``, ``min_checkins``, ``max_checkins``,
        ``num_items``, ``num_checkins`` and ``sparsity``
        (``1 - num_checkins / (distinct users * distinct items)``).
        For a group without any matching rows the counts are 0, ``sparsity``
        is NaN, and mean/min/max are pandas' empty-series aggregates (NaN).
    """
    stats = {}
    for group, group_user_ids in user_groups.items():
        # Boolean-mask selection already yields a new frame, so copying the
        # whole interaction table for every group is unnecessary.
        group_events = user_events.loc[user_events["user_id:token"].isin(group_user_ids)]

        # Check-ins per user / per item, computed once and reused below.
        user_dist = group_events["user_id:token"].value_counts()
        item_dist = group_events["item_id:token"].value_counts()

        num_checkins = group_events.shape[0]
        # Size of the user x item matrix spanned by this group's events.
        matrix_cells = len(group_events["user_id:token"].unique()) * len(
            group_events["item_id:token"].unique()
        )
        # An empty group has a 0-cell matrix; report NaN instead of raising
        # ZeroDivisionError and aborting the statistics for all other groups.
        sparsity = 1 - num_checkins / matrix_cells if matrix_cells else float("nan")

        stats[group] = {
            "num_users": user_dist.shape[0],
            "mean_checkins": user_dist.mean(),
            "min_checkins": user_dist.min(),
            "max_checkins": user_dist.max(),
            "num_items": item_dist.shape[0],
            "num_checkins": num_checkins,
            "sparsity": sparsity,
        }

    return stats



In [7]:
# Per-dataset statistics: {dataset: {group: metrics}}.
full_stats = {}
for dataset in available_datasets:
    # Read the three tab-separated interaction splits of this dataset.
    train_data = pd.read_csv(
        f"{BASE_DIR}{dataset}_dataset/processed_data_recbole/{dataset}_sample.train.inter",
        sep="\t",
    )
    test_data = pd.read_csv(
        f"{BASE_DIR}{dataset}_dataset/processed_data_recbole/{dataset}_sample.test.inter",
        sep="\t",
    )
    valid_data = pd.read_csv(
        f"{BASE_DIR}{dataset}_dataset/processed_data_recbole/{dataset}_sample.valid.inter",
        sep="\t",
    )

    # Load the user-id lists of each user group from the JSON file.
    user_group_dir = f"{BASE_DIR}{dataset}_dataset/{dataset}_user_id_popularity.json"
    with open(user_group_dir) as f:
        user_groups = json.load(f)

    # Add an "all" group made of every user in the high/medium/low groups.
    all_user_ids = set(user_groups["high"]) | set(user_groups["medium"]) | set(user_groups["low"])
    user_groups["all"] = list(all_user_ids)

    # Stack the splits and keep a single row per (user, item) pair.
    user_events = pd.concat([train_data, valid_data, test_data]).drop_duplicates(
        subset=["user_id:token", "item_id:token"]
    )

    full_stats[dataset] = group_user_events(user_events, user_groups)

# Flatten the nested statistics into one row per (dataset, group).
long_format_df = pd.DataFrame(
    [
        {"dataset": dataset, "group": group, **metrics}
        for dataset, group_stats in full_stats.items()
        for group, metrics in group_stats.items()
    ]
)


In [8]:
# Display the long-format table: one row per (dataset, group) with its metrics.
long_format_df


Unnamed: 0,dataset,group,num_users,mean_checkins,min_checkins,max_checkins,num_items,num_checkins,sparsity
0,foursquaretky,high,300,32.34,15,69,1992,9702,0.983765
1,foursquaretky,medium,900,53.182222,15,168,2803,47864,0.981027
2,foursquaretky,low,300,39.45,15,271,2608,11835,0.984873
3,foursquaretky,all,1500,46.267333,15,271,2804,69401,0.9835


In [None]:
# Keep only the rows of the aggregated "all" user group and export them.
stats_all_users = long_format_df.loc[long_format_df["group"] == "all"]
# os.path.join inserts a separator only when BASE_DIR lacks one, so the output
# path is well-formed whether or not BASE_DIR ends with a slash (the previous
# f-string produced a doubled separator for a slash-terminated BASE_DIR).
stats_all_users.to_csv(os.path.join(BASE_DIR, "descriptive_stats.csv"), index=False)