In [1]:
import pandas as pd
import os
import json
from globals import BASE_DIR

include_categories = False

dataset = "foursquaretky"

DATASET_DIR = f"{BASE_DIR}{dataset}_dataset/"
#DATASET_DIR = f"/Users/andreaforster/Documents/data_thesis/{dataset}_dataset/"


available_datasets = ["foursquarenyc", "foursquaretky", "yelp", "gowalla", "brightkite", "snowcard"]

In [2]:
def open_big_json(file_path):
    data = []

    # Open the file and read it line by line
    with open(file_path, 'r') as file:
        for line in file:
            # Parse the JSON data and append to the list
            data.append(json.loads(line))

    # Create a DataFrame from the list of records
    df = pd.DataFrame(data)

    return df

In [3]:
def convert_to_unix_timestamp(df, column_name):
    """
    Convert a column of timestamps in a DataFrame to Unix timestamps.

    Args:
        df (pd.DataFrame): The DataFrame containing the timestamp column.
        column_name (str): The name of the column with timestamps in "%Y-%m-%d %H:%M:%S" format.

    Returns:
        pd.DataFrame: The DataFrame with an additional column for Unix timestamps.
    """
    # Convert the column to datetime objects
    df[column_name] = pd.to_datetime(df[column_name], format="mixed")
    
    # Convert datetime objects to Unix timestamps
    df[f'{column_name}'] = df[column_name].apply(lambda x: x.timestamp())
    
    return df

In [4]:
if dataset == "snowcard":
    checkin_df = pd.read_csv(DATASET_DIR+"TSC_EEL_EXPORT.csv", encoding="latin1", sep=";", header=None, names=["timestamp:float", "user_id:token", "category_id:token", "category_name:token_seq", "name:token_seq", "user_type:token_seq"])
    checkin_df["item_id:token"], item_id = pd.factorize(checkin_df["name:token_seq"])
    user_df = checkin_df[["user_id:token", "user_type:token_seq"]].drop_duplicates(subset=["user_id:token"])
    poi_df = checkin_df[["item_id:token", "name:token_seq", "category_id:token", "category_name:token_seq"]].drop_duplicates(subset=["item_id:token"])
    checkin_df = checkin_df[["user_id:token", "item_id:token", "timestamp:float"]]

elif dataset == "foursquarenyc" or dataset == "foursquaretky":
    checkin_df = pd.read_csv(DATASET_DIR + "foursquare_data.csv", sep=",")
    checkin_df = checkin_df.drop(columns=["timezoneOffset"])
    checkin_df = checkin_df.rename(columns={"venueId": "item_id:token", "venueCategoryId": "category_id:token", "venueCategory": "category_name:token_seq", "userId": "user_id:token", "utcTimestamp": "timestamp:float", "latitude": "lat:float", "longitude": "lon:float"})
    user_df = checkin_df[["user_id:token"]].drop_duplicates()

    poi_df = checkin_df[["item_id:token", "category_id:token", "category_name:token_seq", "lat:float", "lon:float"]].drop_duplicates(subset=["item_id:token"])
    checkin_df = checkin_df[["user_id:token", "item_id:token", "timestamp:float"]]

elif dataset == "gowalla" or dataset == "brightkite":
    checkin_df = pd.read_csv(DATASET_DIR + f"loc-{dataset}_totalCheckins.txt", sep="\t", header=None, names=['user_id:token', 'timestamp:float', 'lat:float', 'lon:float', 'item_id:token'])
    checkin_df = checkin_df[~checkin_df['item_id:token'].isin(["00000000000000000000000000000000", "ede07eeea22411dda0ef53e233ec57ca"])]
    user_df = pd.read_csv(DATASET_DIR + f"loc-{dataset}_edges.txt", sep="\t", header=None, names=['user_id:token', 'friends:token_seq'])
    user_df = user_df.groupby('user_id:token')['friends:token_seq'].apply(lambda x: ','.join(map(str, x))).reset_index()
    user_df.columns = ['user_id:token', 'friends:token_seq']
    poi_df = checkin_df[['item_id:token', "lat:float", "lon:float"]].drop_duplicates(subset="item_id:token")
    checkin_df = checkin_df.drop(columns=["lat:float", "lon:float"])

elif dataset == "yelp":
    poi_df = pd.read_json(DATASET_DIR + "yelp_academic_dataset_business.json", lines=True)
    poi_df = poi_df.loc[poi_df['is_open'] == 1]
    poi_df = poi_df.drop(columns=["review_count", "stars", "hours", "is_open", "city", "state", "postal_code", "attributes", "address"])
    poi_df = poi_df.rename(columns={"latitude": "lat:float", "longitude": "lon:float", "business_id": "item_id:token", "name":"name:token_seq", "categories":"category_name:token_seq"})
    user_df = open_big_json(DATASET_DIR + "yelp_academic_dataset_user.json")
    user_df = user_df.drop(columns=["review_count", "name", "yelping_since", "useful", "funny", "cool", "elite", "fans", "compliment_hot", "average_stars", "compliment_more", "compliment_profile", "compliment_cute", "compliment_list", "compliment_note", "compliment_plain", "compliment_cool", "compliment_funny", "compliment_writer", "compliment_photos"])
    user_df = user_df.rename(columns={"user_id": "user_id:token", "friends": "friends:token_seq"})
    checkin_df = open_big_json(DATASET_DIR + "yelp_academic_dataset_review.json")
    checkin_df = checkin_df.drop(columns=["text", "cool", "stars", "useful", "funny", "review_id"])
    checkin_df = checkin_df.rename(columns={"user_id": "user_id:token", "business_id": "item_id:token", "date": "timestamp:float"})
    

In [5]:
checkin_df_timestamp = checkin_df.copy()

In [6]:
#Step 1: Group by user_id and business_id and count check-ins
checkin_df['checkin_count:float'] = checkin_df.groupby(['user_id:token', 'item_id:token'])['item_id:token'].transform('count')
checkin_df = checkin_df.drop_duplicates(subset=["user_id:token", "item_id:token"], keep="first")


In [7]:
checkin_df

Unnamed: 0,user_id:token,item_id:token,timestamp:float,checkin_count:float
0,1541,4f0fd5a8e4b03856eeb6c8cb,Tue Apr 03 18:17:18 +0000 2012,8
1,868,4b7b884ff964a5207d662fe3,Tue Apr 03 18:22:04 +0000 2012,1
2,114,4c16fdda96040f477cc473a5,Tue Apr 03 19:12:07 +0000 2012,1
3,868,4c178638c2dfc928651ea869,Tue Apr 03 19:12:13 +0000 2012,5
4,1458,4f568309e4b071452e447afe,Tue Apr 03 19:18:23 +0000 2012,3
...,...,...,...,...
573691,390,50ada82ce4b0d4508a244756,Sat Feb 16 02:31:44 +0000 2013,1
573698,326,4bab3456f964a5204d993ae3,Sat Feb 16 02:34:35 +0000 2013,1
573699,853,4b559c09f964a520efe827e3,Sat Feb 16 02:34:53 +0000 2013,1
573700,1502,5101e81ee4b020384100b0a5,Sat Feb 16 02:34:55 +0000 2013,1


In [8]:
print("Number of users, number of POIs", len(checkin_df["user_id:token"].unique()), len(checkin_df["item_id:token"].unique())
)
print("Sparsity:", 1 - len(checkin_df) / (len(checkin_df["user_id:token"].unique()) * len(checkin_df["item_id:token"].unique())))

Number of users, number of POIs 2293 61858
Sparsity: 0.9985056795598015


In [9]:
def filter_df(df, min_reviews_user=15, min_reviews_business=10):
    while True:
        # Filter businesses with at least min_reviews reviews
        business_counts = df["item_id:token"].value_counts()
        business_mask = df['item_id:token'].map(business_counts) >= min_reviews_business
        df_filtered = df.loc[business_mask]

        # Filter users with at least min_reviews reviews
        user_counts = df_filtered['user_id:token'].value_counts()
        user_mask = df_filtered['user_id:token'].map(user_counts) >= min_reviews_user
        df_filtered = df_filtered.loc[user_mask]

        # If the size of the filtered DataFrame didn't change, break the loop
        if df_filtered.shape[0] == df.shape[0]:
            break

        # Update the DataFrame for the next iteration
        df = df_filtered

    return df_filtered

In [10]:
checkin_df_filtered = filter_df(checkin_df, min_reviews_business=10, min_reviews_user=15)

In [11]:
checkin_df_filtered["user_id:token"].nunique(), checkin_df_filtered["item_id:token"].nunique()

(2110, 2804)

In [12]:
# Step 1: Calculate the value counts of `business_id`
value_counts = checkin_df_filtered['item_id:token'].value_counts().reset_index()
value_counts.columns = ['item_id:token', 'count']

# Step 2: Normalize the counts y dividing by the maximum value count
max_count = value_counts['count'].max()
value_counts['business_popularity:float'] = value_counts['count'] / max_count

# Step 3: Merge the normalized counts back into the original DataFrame
checkin_df_filtered = checkin_df_filtered.merge(value_counts[['item_id:token', 'business_popularity:float']], on = "item_id:token", how='left')


In [13]:
def user_popularity_sample_calculator(checkin_df_filtered, poi_df, user_df, sep_num, checkin_df_timestamp):
    # Calculate average popularity per user
    average_popularity_per_user = checkin_df_filtered.groupby('user_id:token')['business_popularity:float'].mean().reset_index()
    average_popularity_per_user.columns = ['user_id:token', 'average_popularity']

    average_popularity_per_user = average_popularity_per_user.sort_values(by="average_popularity", ascending=False)

    
    # Sort by average popularity
    

    # Get top 1000 users
    high_pop_user_df_sample = average_popularity_per_user.head(sep_num)
    
    # Get the middle 1000 users around the median
    median_index = len(average_popularity_per_user) // 2
    start_med_index = max(median_index -int (sep_num/2), 0)
    end_med_index = min(median_index + int(sep_num/2), len(average_popularity_per_user))
    med_pop_user_df_sample = average_popularity_per_user.iloc[start_med_index:end_med_index]
    
    # Get the lowest 1000 users
    low_pop_user_df_sample = average_popularity_per_user.tail(sep_num)

    unique_users = list(set(high_pop_user_df_sample["user_id:token"].tolist() + med_pop_user_df_sample["user_id:token"].tolist() + low_pop_user_df_sample["user_id:token"].tolist()))

    checkin_df_sample = checkin_df_filtered[checkin_df_filtered["user_id:token"].isin(unique_users)]
    checkin_df_sample = checkin_df_sample[checkin_df_sample["user_id:token"].isin(unique_users)]

    # unique_items = checkin_df_sample["item_id:token"].unique()
    # print(len(unique_items))

    user_df_sample = user_df[user_df["user_id:token"].isin(unique_users)]
    poi_df_sample = poi_df[poi_df["item_id:token"].isin(checkin_df_sample["item_id:token"])]

    checkin_df_sample = checkin_df_sample[checkin_df_sample["item_id:token"].isin(poi_df_sample["item_id:token"])]

    checkin_df_timestamp = checkin_df_timestamp[checkin_df_timestamp["user_id:token"].isin(unique_users)]
    checkin_df_timestamp = checkin_df_timestamp[checkin_df_timestamp["item_id:token"].isin(poi_df_sample["item_id:token"])]

    
    



    
    return checkin_df_sample, high_pop_user_df_sample, med_pop_user_df_sample, low_pop_user_df_sample, user_df_sample, poi_df_sample, checkin_df_timestamp

In [14]:
if checkin_df_filtered["user_id:token"].nunique() > 1500:
    sep_num = 500
else:
    sep_num = checkin_df_filtered["user_id:token"].nunique() // 3

print(sep_num)

500


In [15]:
checkin_df_sample, high_pop_user_df_sample, med_pop_user_df_sample, low_pop_user_df_sample, user_df_sample, poi_df_sample, checkin_df_timestamp = user_popularity_sample_calculator(checkin_df_filtered, poi_df, user_df, sep_num, checkin_df_timestamp)

In [16]:
checkin_df_sample["item_id:token"].nunique()

2804

In [17]:
checkin_df_timestamp["item_id:token"].nunique()

2804

In [18]:
checkin_df_sample["user_id:token"].nunique()

1500

In [19]:
checkin_df_timestamp["user_id:token"].nunique()

1500

In [20]:
high_pop_user_df_sample

Unnamed: 0,user_id:token,average_popularity
1539,1663,0.375598
1255,1355,0.320627
1688,1826,0.315125
1878,2037,0.293083
1523,1647,0.292703
...,...,...
692,751,0.161415
558,603,0.161411
887,961,0.161367
1865,2020,0.161305


In [21]:
med_pop_user_df_sample

Unnamed: 0,user_id:token,average_popularity
2034,2211,0.142278
803,873,0.142158
416,452,0.142158
1410,1529,0.142045
2021,2198,0.141976
...,...,...
60,65,0.116345
1803,1953,0.116162
516,558,0.116140
1404,1522,0.116065


In [22]:
low_pop_user_df_sample

Unnamed: 0,user_id:token,average_popularity
114,125,0.096929
1925,2091,0.096907
389,423,0.096900
652,708,0.096900
393,427,0.096863
...,...,...
1033,1117,0.026903
1295,1401,0.026694
103,112,0.018677
1751,1892,0.017872


In [23]:
def id_factorizer(checkin_df_sample, high_pop_user_df_sample, med_pop_user_df_sample, low_pop_user_df_sample, user_df_sample, poi_df_sample):
    """Overwriting the actual ID with a factorized ID so that we can use the same ID both in RecBole and CAPRI"""
    checkin_df_sample['user_id:token'], user_id_map = pd.factorize(checkin_df_sample['user_id:token'])
    checkin_df_sample['item_id:token'], business_id_map = pd.factorize(checkin_df_sample['item_id:token'])

    # Create mapping dictionaries
    user_id_mapping = {original: i for i, original in enumerate(user_id_map)}
    business_id_mapping = {original: j for j, original in enumerate(business_id_map)}

    high_pop_user_df_sample['user_id:token'] = high_pop_user_df_sample['user_id:token'].map(user_id_mapping)
    med_pop_user_df_sample['user_id:token'] = med_pop_user_df_sample['user_id:token'].map(user_id_mapping)
    low_pop_user_df_sample['user_id:token'] = low_pop_user_df_sample['user_id:token'].map(user_id_mapping)

    user_df_sample['user_id:token'] = user_df_sample['user_id:token'].map(user_id_mapping)
    poi_df_sample['item_id:token'] = poi_df_sample['item_id:token'].map(business_id_mapping)


    return checkin_df_sample, high_pop_user_df_sample, med_pop_user_df_sample, low_pop_user_df_sample, user_df_sample, poi_df_sample

In [24]:
checkin_df_sample, high_pop_user_df_sample, med_pop_user_df_sample, low_pop_user_df_sample, user_df_sample, poi_df_sample = id_factorizer(checkin_df_sample, high_pop_user_df_sample, med_pop_user_df_sample, low_pop_user_df_sample, user_df_sample, poi_df_sample)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_df_sample['user_id:token'] = user_df_sample['user_id:token'].map(user_id_mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  poi_df_sample['item_id:token'] = poi_df_sample['item_id:token'].map(business_id_mapping)


In [25]:
def user_id_token_adder(df, column_name_list = ["user_id:token", "item_id:token"]):
    """ Recbole needs a token instead of a number for the user and item ID"""
    for column_name in column_name_list:
        try:
            df[column_name] = df[column_name].astype(str) + "_x"
        except KeyError:
            pass
    return df

In [26]:
checkin_df_sample = user_id_token_adder(checkin_df_sample)
high_pop_user_df_sample = user_id_token_adder(high_pop_user_df_sample)
med_pop_user_df_sample = user_id_token_adder(med_pop_user_df_sample)
low_pop_user_df_sample = user_id_token_adder(low_pop_user_df_sample)
user_df_sample = user_id_token_adder(user_df_sample)
poi_df_sample = user_id_token_adder(poi_df_sample)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_name] = df[column_name].astype(str) + "_x"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_name] = df[column_name].astype(str) + "_x"


In [27]:
med_pop_user_df_sample

Unnamed: 0,user_id:token,average_popularity
2034,326_x,0.142278
803,1488_x,0.142158
416,657_x,0.142158
1410,246_x,0.142045
2021,581_x,0.141976
...,...,...
60,397_x,0.116345
1803,448_x,0.116162
516,766_x,0.116140
1404,1166_x,0.116065


In [28]:
# get a json with the user id's of the respective popularity groups
user_id_popularity = {}
user_id_popularity["high"] = high_pop_user_df_sample["user_id:token"].tolist()
user_id_popularity["medium"] = med_pop_user_df_sample["user_id:token"].tolist()
user_id_popularity["low"] = low_pop_user_df_sample["user_id:token"].tolist()
json.dump(user_id_popularity, open(f"{DATASET_DIR}/{dataset}_user_id_popularity.json", "w"))



In [29]:
def data_saver(df, filename, framework):
    if not os.path.exists(DATASET_DIR + "processed_data_" + framework):
        os.makedirs(DATASET_DIR + "processed_data_" + framework)
    
    df.to_csv(DATASET_DIR + "processed_data_" + framework + "/" + filename + ".csv")
    print("Data saved as " + framework + filename + ".csv")
    

In [30]:
# first of all saving data for cornac
data_saver(checkin_df_sample, "user_events", "cornac")
data_saver(high_pop_user_df_sample, "high_pop_user_sample", "cornac")
data_saver(med_pop_user_df_sample, "medium_pop_user_sample", "cornac")
data_saver(low_pop_user_df_sample, "low_pop_user_sample", "cornac")


Data saved as cornacuser_events.csv
Data saved as cornachigh_pop_user_sample.csv
Data saved as cornacmedium_pop_user_sample.csv
Data saved as cornaclow_pop_user_sample.csv


In [31]:
def data_saver_recbole(df, framework, suffix):
    
    if not os.path.exists(DATASET_DIR + "processed_data_" + framework):
        os.makedirs(DATASET_DIR + "processed_data_" + framework)

    df.to_csv(f"{DATASET_DIR}processed_data_{framework}/{dataset}_sample.{suffix}", sep="\t", index=False)

In [32]:
checkin_df_sample['review_id:token'] = range(1, len(checkin_df_sample) + 1)
# Step 1: Group by user_id and business_id and count check-ins
checkin_df_sample['checkin_count:float'] = checkin_df_sample.groupby(['user_id:token', 'item_id:token'])['item_id:token'].transform('count')


In [33]:
checkin_df_sample

Unnamed: 0,user_id:token,item_id:token,timestamp:float,checkin_count:float,business_popularity:float,review_id:token
0,0_x,0_x,Tue Apr 03 19:35:36 +0000 2012,1,0.104466,1
1,1_x,1_x,Tue Apr 03 19:59:06 +0000 2012,1,0.173844,2
2,1_x,2_x,Tue Apr 03 20:09:41 +0000 2012,1,0.053429,3
3,2_x,3_x,Tue Apr 03 20:14:18 +0000 2012,1,0.039075,4
4,2_x,4_x,Tue Apr 03 20:28:32 +0000 2012,1,0.086124,5
...,...,...,...,...,...,...
99678,321_x,911_x,Sat Feb 16 02:15:32 +0000 2013,1,0.157895,67522
99681,1309_x,966_x,Sat Feb 16 02:27:43 +0000 2013,1,0.048644,67523
99683,789_x,1498_x,Sat Feb 16 02:28:47 +0000 2013,1,0.012759,67524
99684,383_x,1129_x,Sat Feb 16 02:34:35 +0000 2013,1,0.015949,67525


In [34]:
checkin_df_sample = convert_to_unix_timestamp(checkin_df_sample, "timestamp:float")
checkin_df_timestamp = convert_to_unix_timestamp(checkin_df_timestamp, "timestamp:float")

In [35]:
checkin_df_timestamp

Unnamed: 0,user_id:token,item_id:token,timestamp:float
7,114,4b3eae5cf964a520b4a025e3,1.333482e+09
10,589,4b5ed39cf964a520079a29e3,1.333483e+09
14,589,4d69a46cde28224b27ff45be,1.333484e+09
15,2290,4b53b05ef964a520e8a727e3,1.333484e+09
17,2290,4b6e3e46f964a520e2b32ce3,1.333485e+09
...,...,...,...
573689,1718,4b0587a6f964a5203d9e22e3,1.360982e+09
573692,2200,4b0587a6f964a5203d9e22e3,1.360982e+09
573697,2277,4b56c4c5f964a520c41a28e3,1.360982e+09
573698,326,4bab3456f964a5204d993ae3,1.360982e+09


In [36]:
checkin_df_sample

Unnamed: 0,user_id:token,item_id:token,timestamp:float,checkin_count:float,business_popularity:float,review_id:token
0,0_x,0_x,1.333482e+09,1,0.104466,1
1,1_x,1_x,1.333483e+09,1,0.173844,2
2,1_x,2_x,1.333484e+09,1,0.053429,3
3,2_x,3_x,1.333484e+09,1,0.039075,4
4,2_x,4_x,1.333485e+09,1,0.086124,5
...,...,...,...,...,...,...
99678,321_x,911_x,1.360981e+09,1,0.157895,67522
99681,1309_x,966_x,1.360982e+09,1,0.048644,67523
99683,789_x,1498_x,1.360982e+09,1,0.012759,67524
99684,383_x,1129_x,1.360982e+09,1,0.015949,67525


In [37]:
checkin_df_sample.sort_values(by="checkin_count:float", ascending=False)
# very important: keeping the duplicate check-ins for the context aware recommendation to have the timestamps saved


# very important: dropping duplicate check-ins 
checkin_df_sample = checkin_df_sample.drop_duplicates(subset=["user_id:token", "item_id:token"], keep="first")

In [38]:
user_df_sample = user_df_sample[["user_id:token"]]

In [40]:
data_saver_recbole(checkin_df_sample, "recbole", "inter")
data_saver_recbole(user_df_sample, "recbole", "user")
data_saver_recbole(poi_df_sample, "recbole", "item")

In [41]:
poi_df_sample

Unnamed: 0,item_id:token,category_id:token,category_name:token_seq,lat:float,lon:float
7,0_x,4bf58dd8d48988d129951735,Train Station,35.700253,139.480255
10,1_x,4bf58dd8d48988d1eb931735,Airport,35.548963,139.784611
14,2_x,4bf58dd8d48988d1df941735,Bridge,35.609929,139.825659
15,3_x,4bf58dd8d48988d129951735,Train Station,35.749538,139.586540
17,4_x,4bf58dd8d48988d129951735,Train Station,35.729025,139.711096
...,...,...,...,...,...
412051,2797_x,4bf58dd8d48988d120941735,Bar,35.697700,139.770384
425285,2799_x,4bf58dd8d48988d16d941735,Café,35.702436,139.770470
450169,2800_x,4bf58dd8d48988d129951735,Train Station,35.607054,139.734894
462519,2801_x,4d954b0ea243a5684a65b473,Convenience Store,35.701178,139.771038


In [42]:
checkin_df_timestamp = checkin_df_timestamp[["user_id:token", "item_id:token", "timestamp:float"]] # FINAL
checkins_capri_train_test_tune = checkin_df_sample[["user_id:token", "item_id:token", "timestamp:float", "checkin_count:float"]]
try:
    poi_df_sample_capri = poi_df_sample[["item_id:token", "lat:float", "lon:float"]] # FINAL
except KeyError: # in the snowcard data the coordinates are not given
    poi_df_sample_capri = poi_df_sample[["item_id:token"]]
datasize_capri = pd.DataFrame(data={"num_users" : [len(checkins_capri_train_test_tune["user_id:token"].unique())], "num_items" : [len(checkins_capri_train_test_tune["item_id:token"].unique())]}) # FINAL

In [43]:
# splitting the data into train, test, and tune
checkins_capri_train_test_tune = checkins_capri_train_test_tune.sort_values(by=["user_id:token", "timestamp:float"])
checkins_capri_train_test_tune = checkins_capri_train_test_tune[["user_id:token", "item_id:token", "checkin_count:float"]]

# Split the data
train_list = []
val_list = []
test_list = []

for user, group in checkins_capri_train_test_tune.groupby('user_id:token'):
    n = len(group)
    train_end = int(n * 0.65)
    val_end = int(n * 0.80)
    
    train_list.append(group.iloc[:train_end])
    val_list.append(group.iloc[train_end:val_end])
    test_list.append(group.iloc[val_end:])

# Combine lists into DataFrames
train_df = pd.concat(train_list)
val_df = pd.concat(val_list)
test_df = pd.concat(test_list)



# Check the splits

# FINAL 6-8
print("Train Set:")
print(train_df.head())
print("\nValidation Set:")
print(val_df.head())
print("\nTest Set:")
print(test_df.head())


Train Set:
     user_id:token item_id:token  checkin_count:float
0              0_x           0_x                    1
9              0_x           9_x                    1
2998           0_x          56_x                    1
4963           0_x        1318_x                    1
4983           0_x         406_x                    1

Validation Set:
      user_id:token item_id:token  checkin_count:float
34388           0_x          93_x                    1
34397           0_x        1933_x                    1
34416           0_x          33_x                    1
34504           0_x        2384_x                    1
34543           0_x        1345_x                    1

Test Set:
      user_id:token item_id:token  checkin_count:float
38256           0_x        1220_x                    1
38335           0_x        2599_x                    1
38650           0_x         296_x                    1
41874           0_x          23_x                    1
42233           0_x         206_

In [44]:
def datasaver_capri(df, filename):
    
    if not os.path.exists(DATASET_DIR + "processed_data_capri"):
        os.makedirs(DATASET_DIR + "processed_data_capri")
    
    df.to_csv(DATASET_DIR + "processed_data_capri/" + filename + ".txt", sep='\t', index=False, header=False)
    print("Data saved as " + filename + ".txt")
    



In [45]:
# adding a category column
if include_categories is True:
    if dataset == "yelp":
        # Split the 'category_name' column by commas
        poi_df_sample['category_name_unstacked:token_seq'] = poi_df_sample['category_name:token_seq'].str.split(', ')

        # Unstack the categories into multiple rows
        category_df_sample = poi_df_sample.explode('category_name_unstacked:token_seq')
        category_counts = category_df_sample["category_name_unstacked:token_seq"].value_counts()
        category_mask = category_df_sample["category_name_unstacked:token_seq"].map(category_counts) >= 25
        category_df_sample_filtered = category_df_sample.loc[category_mask]
        category_df_sample_filtered["category_id:token"], category_id = pd.factorize(category_df_sample_filtered["category_name_unstacked:token_seq"])
        category_df_sample_filtered.dropna(inplace=True)
        datasize_capri = pd.DataFrame(data={"num_users" : [len(checkins_capri_train_test_tune["user_id:token"].unique())], "num_items" : [len(checkins_capri_train_test_tune["item_id:token"].unique())], "num_categories" : [len(category_df_sample_filtered["category_id:token"].unique())]}) # FINAL
        datasaver_capri(category_df_sample_filtered, "poiCategories")


    elif dataset == "foursquarenyc" or dataset == "foursquaretky":
        poi_df_sample["category_id:token"], category_id = pd.factorize(poi_df_sample["category_name:token_seq"])
        datasize_capri = pd.DataFrame(data={"num_users" : [len(checkins_capri_train_test_tune["user_id:token"].unique())], "num_items" : [len(checkins_capri_train_test_tune["item_id:token"].unique())], "num_categories" : [len(poi_df_sample["category_id:token"].unique())]})
        poi_df_categories = poi_df_sample[["item_id:token", "category_id:token"]]
        datasaver_capri(poi_df_categories, "poiCategories")

    elif dataset == "snowcard":
        datasize_capri = pd.DataFrame(data={"num_users" : [len(checkins_capri_train_test_tune["user_id:token"].unique())], "num_items" : [len(checkins_capri_train_test_tune["item_id:token"].unique())], "num_categories" : [len(poi_df_sample["category_id:token"].unique())]})
        poi_df_categories = poi_df_sample[["item_id:token", "category_id:token"]]
        datasaver_capri(poi_df_categories, "poiCategories")
    

In [46]:
train_df

Unnamed: 0,user_id:token,item_id:token,checkin_count:float
0,0_x,0_x,1
9,0_x,9_x,1
2998,0_x,56_x,1
4963,0_x,1318_x,1
4983,0_x,406_x,1
...,...,...,...
15808,9_x,1324_x,1
33365,9_x,1818_x,1
42277,9_x,679_x,1
45786,9_x,1206_x,1


In [47]:
data_saver_recbole(train_df, "recbole_debias", "train.inter")
data_saver_recbole(test_df, "recbole_debias", "test.inter")
data_saver_recbole(val_df, "recbole_debias", "valid.inter")

In [48]:
def user_id_cleaner(df, column_name_list = ["user_id:token", "item_id:token"]):
    for column_name in column_name_list:
        df[column_name] = df[column_name].str.split("_")
        df[column_name] = df[column_name].apply(lambda x: x[0])

    return df

In [49]:
poi_df_sample_capri = user_id_cleaner(poi_df_sample_capri, ["item_id:token"])
train_df = user_id_cleaner(train_df)
val_df = user_id_cleaner(val_df)
test_df = user_id_cleaner(test_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_name] = df[column_name].str.split("_")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_name] = df[column_name].apply(lambda x: x[0])


In [50]:
datasaver_capri(checkin_df_timestamp, "checkins")
datasaver_capri(datasize_capri, "dataSize")
datasaver_capri(poi_df_sample_capri, "poiCoos")
datasaver_capri(train_df, "train")
datasaver_capri(val_df, "tune")
datasaver_capri(test_df, "test")

Data saved as checkins.txt
Data saved as dataSize.txt
Data saved as poiCoos.txt
Data saved as train.txt
Data saved as tune.txt
Data saved as test.txt
