In [233]:
import pandas as pd
import os
import json
from globals import BASE_DIR

available_datasets = ["foursquaretky", "yelp", "gowalla", "brightkite", "snowcard"]


dataset = "yelp" # beware: opening the yelp file with pandas will take a lot of time, approx 10 min on my machine
include_categories = False # for context-aware recommendation

DATASET_DIR = f"{BASE_DIR}{dataset}_dataset/"
#DATASET_DIR = f"/Users/andreaforster/Documents/data_thesis/{dataset}_dataset/"




In [234]:
def open_big_json(file_path):
    """This function is used to open the Yelp data"""
    data = []

    with open(file_path, 'r') as file:
        for line in file:
            data.append(json.loads(line))
    df = pd.DataFrame(data)

    return df

In [235]:
def convert_to_unix_timestamp(df, column_name):
    """
    Convert a column of timestamps in a DataFrame to Unix timestamps.

    Args:
        df (pd.DataFrame): The DataFrame containing the timestamp column.
        column_name (str): The name of the column with timestamps in "%Y-%m-%d %H:%M:%S" format.

    Returns:
        pd.DataFrame: The DataFrame with an additional column for Unix timestamps.
    """
    df[column_name] = pd.to_datetime(df[column_name], format="mixed")
    
    df[f'{column_name}'] = df[column_name].apply(lambda x: x.timestamp())
    
    return df

In [236]:
# dataset-specific preprocessing
if dataset == "snowcard":
    checkin_df = pd.read_csv(DATASET_DIR+"TSC_EEL_EXPORT.csv", encoding="latin1", sep=";", header=None, names=["timestamp:float", "user_id:token", "category_id:token", "category_name:token_seq", "name:token_seq", "user_type:token_seq"])
    checkin_df["item_id:token"], item_id = pd.factorize(checkin_df["name:token_seq"])
    user_df = checkin_df[["user_id:token", "user_type:token_seq"]].drop_duplicates(subset=["user_id:token"])
    poi_df = checkin_df[["item_id:token", "name:token_seq", "category_id:token", "category_name:token_seq"]].drop_duplicates(subset=["item_id:token"])
    coordinates_df = pd.read_excel(DATASET_DIR+"snowcard_lifts.xlsx")
    coordinates_df[["lat:float", "lon:float"]] = coordinates_df["lat_lon"].str.split(', ', expand=True)
    coordinates_df.drop(columns=["lat_lon"], inplace=True)
    poi_df = pd.merge(poi_df, coordinates_df[['category_name:token_seq', 'lat:float', 'lon:float']], on='category_name:token_seq', how='left')
    checkin_df = checkin_df[["user_id:token", "item_id:token", "timestamp:float"]]

elif dataset == "foursquarenyc" or dataset == "foursquaretky":
    checkin_df = pd.read_csv(DATASET_DIR + "foursquare_data.csv", sep=",")
    checkin_df = checkin_df.drop(columns=["timezoneOffset"])
    checkin_df = checkin_df.rename(columns={"venueId": "item_id:token", "venueCategoryId": "category_id:token", "venueCategory": "category_name:token_seq", "userId": "user_id:token", "utcTimestamp": "timestamp:float", "latitude": "lat:float", "longitude": "lon:float"})
    user_df = checkin_df[["user_id:token"]].drop_duplicates()

    poi_df = checkin_df[["item_id:token", "category_id:token", "category_name:token_seq", "lat:float", "lon:float"]].drop_duplicates(subset=["item_id:token"])
    checkin_df = checkin_df[["user_id:token", "item_id:token", "timestamp:float"]]

elif dataset == "gowalla" or dataset == "brightkite":
    checkin_df = pd.read_csv(DATASET_DIR + f"loc-{dataset}_totalCheckins.txt", sep="\t", header=None, names=['user_id:token', 'timestamp:float', 'lat:float', 'lon:float', 'item_id:token'])
    checkin_df = checkin_df[~checkin_df['item_id:token'].isin(["00000000000000000000000000000000", "ede07eeea22411dda0ef53e233ec57ca"])]
    user_df = pd.read_csv(DATASET_DIR + f"loc-{dataset}_edges.txt", sep="\t", header=None, names=['user_id:token', 'friends:token_seq'])
    user_df = user_df.groupby('user_id:token')['friends:token_seq'].apply(lambda x: ','.join(map(str, x))).reset_index()
    user_df.columns = ['user_id:token', 'friends:token_seq']
    poi_df = checkin_df[['item_id:token', "lat:float", "lon:float"]].drop_duplicates(subset="item_id:token")
    checkin_df = checkin_df.drop(columns=["lat:float", "lon:float"])

elif dataset == "yelp":
    poi_df = pd.read_json(DATASET_DIR + "yelp_academic_dataset_business.json", lines=True)
    poi_df = poi_df.loc[poi_df['is_open'] == 1]
    poi_df = poi_df.drop(columns=["review_count", "stars", "hours", "is_open", "city", "state", "postal_code", "attributes", "address"])
    poi_df = poi_df.rename(columns={"latitude": "lat:float", "longitude": "lon:float", "business_id": "item_id:token", "name":"name:token_seq", "categories":"category_name:token_seq"})
    user_df = open_big_json(DATASET_DIR + "yelp_academic_dataset_user.json")
    user_df = user_df.drop(columns=["review_count", "name", "yelping_since", "useful", "funny", "cool", "elite", "fans", "compliment_hot", "average_stars", "compliment_more", "compliment_profile", "compliment_cute", "compliment_list", "compliment_note", "compliment_plain", "compliment_cool", "compliment_funny", "compliment_writer", "compliment_photos"])
    user_df = user_df.rename(columns={"user_id": "user_id:token", "friends": "friends:token_seq"})
    checkin_df = open_big_json(DATASET_DIR + "yelp_academic_dataset_review.json")
    checkin_df = checkin_df.drop(columns=["text", "cool", "stars", "useful", "funny", "review_id"])
    checkin_df = checkin_df.rename(columns={"user_id": "user_id:token", "business_id": "item_id:token", "date": "timestamp:float"})
    checkin_df['timestamp'] = pd.to_datetime(checkin_df['timestamp:float'], errors='coerce')

    checkin_df['year'] = checkin_df['timestamp'].dt.year      # Extract the year from the 'timestamp' column
    checkin_df = checkin_df[checkin_df['year'] >= 2018]       # Keep only the check-ins from 2018 and 2019
    checkin_df = checkin_df[checkin_df['year'] < 2020]
    checkin_df.drop(columns=["year", "timestamp"], inplace=True)
    

In [237]:
checkin_df.sort_values(by="timestamp:float", ascending=True, inplace=True)  

In [238]:
poi_df

Unnamed: 0,item_id:token,name:token_seq,lat:float,lon:float,category_name:token_seq
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,38.551126,-90.335695,"Shipping Centers, Local Services, Notaries, Ma..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,39.955505,-75.155564,"Restaurants, Food, Bubble Tea, Coffee & Tea, B..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,40.338183,-75.471659,"Brewpubs, Breweries, Food"
5,CF33F8-E6oudUQ46HnavjQ,Sonic Drive-In,36.269593,-87.058943,"Burgers, Fast Food, Sandwiches, Food, Ice Crea..."
6,n_0UpQx1hsNbnPUSlodU8w,Famous Footwear,38.627695,-90.340465,"Sporting Goods, Fashion, Shoe Stores, Shopping..."
...,...,...,...,...,...
150341,IUQopTMmYQG-qRtBk-8QnA,Binh's Nails,53.468419,-113.492054,"Nail Salons, Beauty & Spas"
150342,c8GjPIOTGVmIemT7j5_SyQ,Wild Birds Unlimited,36.115118,-86.766925,"Pets, Nurseries & Gardening, Pet Stores, Hobby..."
150343,_QAMST-NrQobXduilWEqSw,Claire's Boutique,39.908707,-86.065088,"Shopping, Jewelry, Piercing, Toy Stores, Beaut..."
150344,mtGm22y5c2UHNXDFAjaPNw,Cyclery & Fitness Center,38.782351,-89.950558,"Fitness/Exercise Equipment, Eyewear & Optician..."


In [239]:
user_df

Unnamed: 0,user_id:token,friends:token_seq
0,qVc8ODYU5SZjKXVBgXdI7w,"NSCy54eWehBJyZdG2iE84w, pe42u7DcCH2QmI81NX-8qA..."
1,j14WgRoU_-2ZE1aw1dXrJg,"ueRPE0CX75ePGMqOFVj6IQ, 52oH4DrRvzzl8wh5UXyU0A..."
2,2WnXYQFK0hXEoTxPtV2zvg,"LuO3Bn4f3rlhyHIaNfTlnA, j9B4XdHUhDfTKVecyWQgyA..."
3,SZDeASXq7o05mMNLshsdIA,"enx1vVPnfdNUdPho6PH_wg, 4wOcvMLtU6a9Lslggq74Vg..."
4,hA5lMy-EnncsH4JoR-hFGQ,"PBK4q9KEEBHhFvSXCUirIw, 3FWPpM7KU1gXeOM_ZbYMbA..."
...,...,...
1987892,fB3jbHi3m0L2KgGOxBv6uw,
1987893,68czcr4BxJyMQ9cJBm6C7Q,
1987894,1x3KMskYxOuJCjRz70xOqQ,
1987895,ulfGl4tdbrH05xKzh5lnog,


In [240]:
checkin_df

Unnamed: 0,user_id:token,item_id:token,timestamp:float
6664443,vYPvl2ngX8UdDB8Pf0JCPA,THJ0i8yRyx1OfvzLsJXgng,2018-01-01 00:00:11
2542113,PcgyHvdduilIJ-O-z_05Sw,HhZDu-IEC7owaHCeEXSf1g,2018-01-01 00:00:28
4742901,SsL2nYQGx-l_MVOKoJExJw,D8UM3J3mx2cyYPy84yU9ag,2018-01-01 00:00:52
4240067,1T-Y8oA4Frv3eRmGO3ciew,Sf9E4E8yo8actSXHFvrZbQ,2018-01-01 00:02:05
4947842,unekdBHjTsiBVtOX5UwbZg,rDr7zhYBOmC3NzJXcZd1BQ,2018-01-01 00:02:12
...,...,...,...
2636555,Yy5uYBI7PH5fVtfQjYI76Q,l4uH-6afJzbm0NFRp7lKog,2019-12-31 23:58:17
2677248,IlGYj_XAMG3v75rfmtBs_Q,EagkHaaC-kUozD3MPzbRIw,2019-12-31 23:58:18
1252247,FhBHx01UWFh3_R_sphucMA,gWJSE8CNWsHS_sUG_sVoRw,2019-12-31 23:58:52
2728567,U4iKsl_nFscRfNDxNvdZ8Q,xG9oeXDZldT5_CZLLzABsw,2019-12-31 23:59:11


In [241]:
checkin_df_timestamp = checkin_df.copy()

In [242]:
#Step 1: Group by user_id and business_id and count check-ins
checkin_df['checkin_count:float'] = checkin_df.groupby(['user_id:token', 'item_id:token'])['item_id:token'].transform('count')
checkin_df = checkin_df.drop_duplicates(subset=["user_id:token", "item_id:token"], keep="first")


In [243]:
print("Number of users, number of POIs", len(checkin_df["user_id:token"].unique()), len(checkin_df["item_id:token"].unique())
)
print("Sparsity:", 1 - len(checkin_df) / (len(checkin_df["user_id:token"].unique()) * len(checkin_df["item_id:token"].unique())))

Number of users, number of POIs 747847 119726
Sparsity: 0.9999802560021843


In [244]:
def filter_df(df, min_reviews_user=15, min_reviews_business=10):
    while True:

        # Filter users with at least min_reviews reviews
        user_counts = df['user_id:token'].value_counts()
        user_mask = df['user_id:token'].map(user_counts) >= min_reviews_user
        df_filtered = df.loc[user_mask]

        # Filter businesses with at least min_reviews reviews
        business_counts = df_filtered["item_id:token"].value_counts()
        business_mask = df_filtered['item_id:token'].map(business_counts) >= min_reviews_business
        df_filtered = df_filtered.loc[business_mask]

        

        # If the size of the filtered DataFrame didn't change, break the loop
        if df_filtered.shape[0] == df.shape[0]:
            break

        # Update the DataFrame for the next iteration
        df = df_filtered

    return df_filtered

In [245]:
checkin_df_filtered = filter_df(checkin_df, min_reviews_business=10, min_reviews_user=15)

In [246]:
checkin_df_filtered["user_id:token"].value_counts().min()

15

In [247]:
checkin_df_filtered["user_id:token"].nunique()

4150

In [248]:
checkin_df_filtered["user_id:token"].nunique(), checkin_df_filtered["item_id:token"].nunique()

(4150, 5259)

In [249]:
checkin_df_filtered

Unnamed: 0,user_id:token,item_id:token,timestamp:float,checkin_count:float
796759,-_yn1IOKetBJuPxfK6u8IA,uEe6LCrh8vcPY9-T-CGvGQ,2018-01-01 00:04:39,1
5640196,wJ57oQxkmdN2Milto9nOmw,4YoVUmkpUBNtdYa804Wzvg,2018-01-01 00:13:15,1
109088,LEJ0Ux_flXswhbDKBl2alQ,8eDkw7CE0NKqMknPIu26fw,2018-01-01 00:25:04,1
851990,R2ztwUadqjgqAGTIXvrZtQ,68b9-2VCkQQ_Rj-dYQPfMw,2018-01-01 00:54:00,1
2193768,bHCwwbbDoMLT2OacNQVyOw,8SOgWpYKJgiEfTuyXKGdHw,2018-01-01 00:54:01,1
...,...,...,...,...
4756549,C7t1Jaha23kRaudGlotPPg,lOipP8dh4Daqid9uro4xNA,2019-12-31 23:14:18,1
3385078,qj4eSnrR0aZUK2OPT-qYvA,MgJ_P-xlnTorvkW7l7fXKg,2019-12-31 23:17:16,1
5362123,xx_kgsWOShVCGMcKYOQFgQ,NL_RDDQ_uPAT8t-x7hINvA,2019-12-31 23:28:14,1
628448,jvIISCRUSoSafUx9Ak1oew,3StNEgKAwpCFR1q0urmJrw,2019-12-31 23:36:29,1


In [250]:
value_counts = checkin_df_filtered['item_id:token'].value_counts().reset_index()
value_counts.columns = ['item_id:token', 'count']

max_count = value_counts['count'].max()
value_counts['business_popularity:float'] = value_counts['count'] / max_count
 
checkin_df_filtered = checkin_df_filtered.merge(value_counts[['item_id:token', 'business_popularity:float']], on = "item_id:token", how='left')


In [251]:
def user_popularity_sample_calculator(checkin_df_filtered, poi_df, user_df, sep_num, checkin_df_timestamp):
    # Calculate average popularity per user
    average_popularity_per_user = checkin_df_filtered.groupby('user_id:token')['business_popularity:float'].mean().reset_index() # try out median of item popularities in user profile instead of mean 
    average_popularity_per_user.columns = ['user_id:token', 'average_popularity']

    average_popularity_per_user = average_popularity_per_user.sort_values(by="average_popularity", ascending=False)

    # Get top users
    high_pop_user_df_sample = average_popularity_per_user.head(sep_num)
    
    # Get the users around the median
    median_index = len(average_popularity_per_user) // 2
    start_med_index = max(median_index -int (sep_num*1.5), 0)
    end_med_index = min(median_index + int(sep_num*1.5), len(average_popularity_per_user))
    med_pop_user_df_sample = average_popularity_per_user.iloc[start_med_index:end_med_index]
    
    # Get the lowest users
    low_pop_user_df_sample = average_popularity_per_user.tail(sep_num)

    unique_users = list(set(high_pop_user_df_sample["user_id:token"].tolist() + med_pop_user_df_sample["user_id:token"].tolist() + low_pop_user_df_sample["user_id:token"].tolist()))

    checkin_df_sample = checkin_df_filtered[checkin_df_filtered["user_id:token"].isin(unique_users)]
    checkin_df_sample = checkin_df_sample[checkin_df_sample["user_id:token"].isin(unique_users)]

    user_df_sample = user_df[user_df["user_id:token"].isin(unique_users)]
    poi_df_sample = poi_df[poi_df["item_id:token"].isin(checkin_df_sample["item_id:token"])]

    checkin_df_sample = checkin_df_sample[checkin_df_sample["item_id:token"].isin(poi_df_sample["item_id:token"])]

    checkin_df_timestamp = checkin_df_timestamp[checkin_df_timestamp["user_id:token"].isin(unique_users)]
    checkin_df_timestamp = checkin_df_timestamp[checkin_df_timestamp["item_id:token"].isin(poi_df_sample["item_id:token"])]

    
    return checkin_df_sample, high_pop_user_df_sample, med_pop_user_df_sample, low_pop_user_df_sample, user_df_sample, poi_df_sample, checkin_df_timestamp

In [252]:
sep_num = 1500//5

sep_num*1.5

450.0

In [253]:
if checkin_df_filtered["user_id:token"].nunique() > 1500:
    sep_num = 1500 // 5
else:
    sep_num = checkin_df_filtered["user_id:token"].nunique() // 5

print(sep_num)

300


In [254]:
checkin_df_sample, high_pop_user_df_sample, med_pop_user_df_sample, low_pop_user_df_sample, user_df_sample, poi_df_sample, checkin_df_timestamp = user_popularity_sample_calculator(checkin_df_filtered, poi_df, user_df, sep_num, checkin_df_timestamp)

In [255]:
def id_factorizer(checkin_df_sample, high_pop_user_df_sample, med_pop_user_df_sample, low_pop_user_df_sample, user_df_sample, poi_df_sample, checkin_df_timestamp):
    """Overwriting the actual ID with a factorized ID so that we can use the same ID both in RecBole and CAPRI"""
    checkin_df_sample['user_id:token'], user_id_map = pd.factorize(checkin_df_sample['user_id:token'])
    checkin_df_sample['item_id:token'], business_id_map = pd.factorize(checkin_df_sample['item_id:token'])

    # Create mapping dictionaries
    user_id_mapping = {original: i for i, original in enumerate(user_id_map)}
    business_id_mapping = {original: j for j, original in enumerate(business_id_map)}

    high_pop_user_df_sample['user_id:token'] = high_pop_user_df_sample['user_id:token'].map(user_id_mapping)
    med_pop_user_df_sample['user_id:token'] = med_pop_user_df_sample['user_id:token'].map(user_id_mapping)
    low_pop_user_df_sample['user_id:token'] = low_pop_user_df_sample['user_id:token'].map(user_id_mapping)

    checkin_df_timestamp["user_id:token"] = checkin_df_timestamp["user_id:token"].map(user_id_mapping)
    checkin_df_timestamp["item_id:token"] = checkin_df_timestamp["item_id:token"].map(business_id_mapping)

    user_df_sample['user_id:token'] = user_df_sample['user_id:token'].map(user_id_mapping)
    poi_df_sample['item_id:token'] = poi_df_sample['item_id:token'].map(business_id_mapping)


    return checkin_df_sample, high_pop_user_df_sample, med_pop_user_df_sample, low_pop_user_df_sample, user_df_sample, poi_df_sample, checkin_df_timestamp

In [256]:
checkin_df_sample, high_pop_user_df_sample, med_pop_user_df_sample, low_pop_user_df_sample, user_df_sample, poi_df_sample, checkin_df_timestamp = id_factorizer(checkin_df_sample, high_pop_user_df_sample, med_pop_user_df_sample, low_pop_user_df_sample, user_df_sample, poi_df_sample, checkin_df_timestamp)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_df_sample['user_id:token'] = user_df_sample['user_id:token'].map(user_id_mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  poi_df_sample['item_id:token'] = poi_df_sample['item_id:token'].map(business_id_mapping)


In [257]:
checkin_df_sample

Unnamed: 0,user_id:token,item_id:token,timestamp:float,checkin_count:float,business_popularity:float
4,0,0,2018-01-01 00:54:01,1,0.086957
7,1,1,2018-01-01 01:08:53,1,0.047826
8,2,2,2018-01-01 01:19:18,1,0.130435
11,3,3,2018-01-01 01:32:55,1,0.086957
16,4,4,2018-01-01 02:46:20,1,0.134783
...,...,...,...,...,...
113819,662,1796,2019-12-31 20:18:02,1,0.065217
113820,1073,2998,2019-12-31 21:16:55,1,0.173913
113824,852,897,2019-12-31 21:48:14,1,0.304348
113834,1029,4492,2019-12-31 23:28:14,1,0.052174


In [258]:
def user_id_token_adder(df, column_name_list = ["user_id:token", "item_id:token"]):
    """ Recbole needs a token (string) instead of a number for the user and item ID"""
    for column_name in column_name_list:
        try:
            df[column_name] = df[column_name].astype(int)
            df[column_name] = df[column_name].astype(str) + "_x"
        except KeyError:
            pass
    return df

In [259]:
checkin_df_sample = user_id_token_adder(checkin_df_sample)
high_pop_user_df_sample = user_id_token_adder(high_pop_user_df_sample)
med_pop_user_df_sample = user_id_token_adder(med_pop_user_df_sample)
low_pop_user_df_sample = user_id_token_adder(low_pop_user_df_sample)
user_df_sample = user_id_token_adder(user_df_sample)
poi_df_sample = user_id_token_adder(poi_df_sample)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_name] = df[column_name].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_name] = df[column_name].astype(str) + "_x"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_name] = df[column_name].astype(int)
A value is trying to be set on a copy of a slice from a Data

In [260]:
# get a json with the user id's of the respective popularity groups
user_id_popularity = {}
user_id_popularity["high"] = high_pop_user_df_sample["user_id:token"].tolist()
user_id_popularity["medium"] = med_pop_user_df_sample["user_id:token"].tolist()
user_id_popularity["low"] = low_pop_user_df_sample["user_id:token"].tolist()
json.dump(user_id_popularity, open(f"{DATASET_DIR}/{dataset}_user_id_popularity.json", "w"))


In [261]:
def data_saver_recbole(df, framework, suffix):
    
    if not os.path.exists(DATASET_DIR + "processed_data_" + framework):
        os.makedirs(DATASET_DIR + "processed_data_" + framework)

    df.to_csv(f"{DATASET_DIR}processed_data_{framework}/{dataset}_sample.{suffix}", sep="\t", index=False)

In [262]:
checkin_df_sample['review_id:token'] = range(1, len(checkin_df_sample) + 1)

In [263]:
checkin_df_sample = convert_to_unix_timestamp(checkin_df_sample, "timestamp:float")
checkin_df_timestamp = convert_to_unix_timestamp(checkin_df_timestamp, "timestamp:float")

In [264]:
checkin_df_sample.sort_values(by="checkin_count:float", ascending=False)
# very important: keeping the duplicate check-ins for the context aware recommendation to have the timestamps saved


# very important: dropping duplicate check-ins 
checkin_df_sample = checkin_df_sample.drop_duplicates(subset=["user_id:token", "item_id:token"], keep="first")

In [265]:
user_df_sample = user_df_sample[["user_id:token"]]

In [266]:
# This would be the correct splits if we let recbole do the splitting
# data_saver_recbole(checkin_df_sample, "recbole", "inter")
# data_saver_recbole(user_df_sample, "recbole", "user")
# data_saver_recbole(poi_df_sample, "recbole", "item")

In [267]:
checkin_df_timestamp = checkin_df_timestamp[["user_id:token", "item_id:token", "timestamp:float"]] # FINAL
checkins_capri_train_test_tune = checkin_df_sample[["user_id:token", "item_id:token", "timestamp:float", "checkin_count:float"]]
try:
    poi_df_sample_capri = poi_df_sample[["item_id:token", "lat:float", "lon:float"]] # FINAL
except KeyError: # in the snowcard data the coordinates are not given
    poi_df_sample_capri = poi_df_sample[["item_id:token"]]
datasize_capri = pd.DataFrame(data={"num_users" : [len(checkins_capri_train_test_tune["user_id:token"].unique())], "num_items" : [len(checkins_capri_train_test_tune["item_id:token"].unique())]}) # FINAL

### Creating train, test and val splits (user-based temporal split)

In [268]:
# splitting the data into train, test, and tune
checkins_capri_train_test_tune = checkins_capri_train_test_tune.sort_values(by=["user_id:token", "timestamp:float"])
checkins_capri_train_test_tune = checkins_capri_train_test_tune[["user_id:token", "item_id:token", "checkin_count:float"]]

# Split the data
train_list = []
val_list = []
test_list = []

for user, group in checkins_capri_train_test_tune.groupby('user_id:token'):
    n = len(group)
    train_end = int(n * 0.65)
    val_end = int(n * 0.80)
    
    train_list.append(group.iloc[:train_end])
    val_list.append(group.iloc[train_end:val_end])
    test_list.append(group.iloc[val_end:])

# Combine lists into DataFrames
train_df = pd.concat(train_list)
val_df = pd.concat(val_list)
test_df = pd.concat(test_list)



# Check the splits

# FINAL 6-8
print("Train Set:")
print(train_df.head())
print("\nValidation Set:")
print(val_df.head())
print("\nTest Set:")
print(test_df.head())


Train Set:
      user_id:token item_id:token  checkin_count:float
4               0_x           0_x                    1
1620            0_x         451_x                    1
2151            0_x         569_x                    1
10078           0_x        1810_x                    1
13923           0_x         311_x                    1

Validation Set:
      user_id:token item_id:token  checkin_count:float
50004           0_x          52_x                    1
52294           0_x        3869_x                    1
69045           0_x        2749_x                    1
87441        1000_x        1419_x                    1
89454        1000_x        3346_x                    2

Test Set:
      user_id:token item_id:token  checkin_count:float
75051           0_x         532_x                    1
83036           0_x        1872_x                    1
84999           0_x        2269_x                    1
88813           0_x        2619_x                    1
92581           0_x       

In [269]:
def datasaver_capri(df, filename):
    
    if not os.path.exists(DATASET_DIR + "processed_data_capri"):
        os.makedirs(DATASET_DIR + "processed_data_capri")
    
    df.to_csv(DATASET_DIR + "processed_data_capri/" + filename + ".txt", sep='\t', index=False, header=False)
    print("Data saved as " + filename + ".txt")
    



In [270]:
# adding a category column
if include_categories is True:
    if dataset == "yelp":
        # Split the 'category_name' column by commas
        poi_df_sample['category_name_unstacked:token_seq'] = poi_df_sample['category_name:token_seq'].str.split(', ')

        # Unstack the categories into multiple rows
        category_df_sample = poi_df_sample.explode('category_name_unstacked:token_seq')
        category_counts = category_df_sample["category_name_unstacked:token_seq"].value_counts()
        category_mask = category_df_sample["category_name_unstacked:token_seq"].map(category_counts) >= 25
        category_df_sample_filtered = category_df_sample.loc[category_mask]
        category_df_sample_filtered["category_id:token"], category_id = pd.factorize(category_df_sample_filtered["category_name_unstacked:token_seq"])
        category_df_sample_filtered.dropna(inplace=True)
        datasize_capri = pd.DataFrame(data={"num_users" : [len(checkins_capri_train_test_tune["user_id:token"].unique())], "num_items" : [len(checkins_capri_train_test_tune["item_id:token"].unique())], "num_categories" : [len(category_df_sample_filtered["category_id:token"].unique())]}) # FINAL
        datasaver_capri(category_df_sample_filtered, "poiCategories")


    elif dataset == "foursquarenyc" or dataset == "foursquaretky":
        poi_df_sample["category_id:token"], category_id = pd.factorize(poi_df_sample["category_name:token_seq"])
        datasize_capri = pd.DataFrame(data={"num_users" : [len(checkins_capri_train_test_tune["user_id:token"].unique())], "num_items" : [len(checkins_capri_train_test_tune["item_id:token"].unique())], "num_categories" : [len(poi_df_sample["category_id:token"].unique())]})
        poi_df_categories = poi_df_sample[["item_id:token", "category_id:token"]]
        datasaver_capri(poi_df_categories, "poiCategories")

    elif dataset == "snowcard":
        datasize_capri = pd.DataFrame(data={"num_users" : [len(checkins_capri_train_test_tune["user_id:token"].unique())], "num_items" : [len(checkins_capri_train_test_tune["item_id:token"].unique())], "num_categories" : [len(poi_df_sample["category_id:token"].unique())]})
        poi_df_categories = poi_df_sample[["item_id:token", "category_id:token"]]
        datasaver_capri(poi_df_categories, "poiCategories")
    

In [271]:
train_df["user_id:token"].nunique(), val_df["user_id:token"].nunique(), test_df["user_id:token"].nunique()

(1500, 1500, 1500)

In [272]:
# This is the correct split since we perform the splitting ourselves
data_saver_recbole(train_df, "recbole", "train.inter")
data_saver_recbole(test_df, "recbole", "test.inter")
data_saver_recbole(val_df, "recbole", "valid.inter")

In [273]:
# RECBOLE DEBIAS PREPROCESSING

# ### Make an intervened sample for RecBole debias (not needed for current approach since RecBole Debias not used )
# checkins_debias = checkins_capri_train_test_tune.copy()
# intervened_set = checkins_debias.sample(frac=0.5, random_state=10002)
# normal_set = checkins_debias.drop(intervened_set.index)

# original_user_ids = set(checkins_debias["user_id:token"].unique())
# intervened_user_ids = set(intervened_set["user_id:token"].unique())
# normal_user_ids = set(normal_set["user_id:token"].unique())

# missing_in_intervened = original_user_ids - intervened_user_ids
# missing_in_normal = original_user_ids - normal_user_ids

# if not missing_in_intervened and not missing_in_normal:
#     print("All user IDs are present in both subsets.")
# else:
#     print("Missing user IDs in intervened set:", missing_in_intervened)
#     print("Missing user IDs in normal set:", missing_in_normal)


# # SOURCE: https://github.com/DavyMorgan/dps/blob/19a3e5fb2cb6932c3f093ed443760ecd3d95bfdb/data_process_service/splitter.py

# popularity = (
#             intervened_set[["item_id:token", "user_id:token"]]
#             .groupby("item_id:token")
#             .count()
#             .reset_index()
#             .rename(columns={"user_id:token": "pop"})
#         )
# intervened_set = intervened_set.merge(popularity, on="item_id:token", how="left")
# intervened_set["pop"] = intervened_set["pop"].apply(lambda x: 1 / x)

# intervened_set.sort_values(by=["pop", "item_id:token"], ascending=False, inplace=True)
# intervened_set.drop(columns=["pop"], inplace=True)

# # Lists to collect data for each split
# train_list_intervened, val_list_intervened, test_list_intervened = [], [], []

# # Loop through each user's data in the sorted intervened set
# for user, group in intervened_set.groupby('user_id:token'):
#     n = len(group)
    
#     # Calculate end indices based on the desired split ratios
#     test_end = int(n * 0.5)  # Top 50% for test
#     train_end = test_end + int(n * 0.25)  # Next 25% for train
    
#     # Add to respective lists
#     test_list_intervened.append(group.iloc[:test_end])  # First 50% goes to test
#     train_list_intervened.append(group.iloc[test_end:train_end])  # Next 25% goes to train
#     val_list_intervened.append(group.iloc[train_end:])  # Remaining 25% goes to validation

# # Concatenate lists into DataFrames
# train_df_intervened = pd.concat(train_list_intervened, ignore_index=True)
# train_df_intervened = pd.concat([train_df_intervened, normal_set], ignore_index=True).sort_values(by=["user_id:token"])
# val_df_intervened = pd.concat(val_list_intervened, ignore_index=True)
# test_df_intervened = pd.concat(test_list_intervened, ignore_index=True)

# # Output the sizes of each set
# print("Intervened Training Set Size:", len(train_df_intervened))
# print("Intervened Validation Set Size:", len(val_df_intervened))
# print("Intervened Test Set Size:", len(test_df_intervened))

# data_saver_recbole(train_df_intervened, "recbole_debias", "train.inter")
# data_saver_recbole(test_df_intervened, "recbole_debias", "test.inter")
# data_saver_recbole(val_df_intervened, "recbole_debias", "valid.inter")

### CAPRI (Context-Aware POI Recommendation)

In [274]:
def user_id_cleaner(df, column_name_list = ["user_id:token", "item_id:token"]):
    """Save for CAPRI without _x since they require integers as IDs"""
    for column_name in column_name_list:
        df[column_name] = df[column_name].str.split("_")
        df[column_name] = df[column_name].apply(lambda x: x[0])

    return df

In [275]:
poi_df_sample_capri = user_id_cleaner(poi_df_sample_capri, ["item_id:token"])
train_df = user_id_cleaner(train_df)
val_df = user_id_cleaner(val_df)
test_df = user_id_cleaner(test_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_name] = df[column_name].str.split("_")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_name] = df[column_name].apply(lambda x: x[0])


In [276]:
train_df["user_id:token"] = train_df["user_id:token"].astype(int)
train_df.sort_values(by="item_id:token", ascending=False)

Unnamed: 0,user_id:token,item_id:token,checkin_count:float
17202,465,999,1
34809,809,999,1
4328,459,999,1
103560,1491,999,1
78642,1298,999,1
...,...,...,...
84855,1421,0,1
75735,521,0,2
36337,1001,0,1
35420,1113,0,1


In [277]:
datasaver_capri(checkin_df_timestamp, "checkins")
datasaver_capri(datasize_capri, "dataSize")
datasaver_capri(poi_df_sample_capri, "poiCoos")
datasaver_capri(train_df, "train")
datasaver_capri(val_df, "tune")
datasaver_capri(test_df, "test")

Data saved as checkins.txt
Data saved as dataSize.txt
Data saved as poiCoos.txt
Data saved as train.txt
Data saved as tune.txt
Data saved as test.txt
