In [713]:
import pandas as pd
import os
import json


dataset = "foursquaretky"

DATASET_DIR = f"/Volumes/Forster Neu/Masterarbeit Data/{dataset}_dataset/"


available_datasets = ["foursquarenyc", "foursquaretky", "yelp", "gowalla", "brightkite", "snowcard"]

In [714]:
def open_big_json(file_path):
    data = []

    # Open the file and read it line by line
    with open(file_path, 'r') as file:
        for line in file:
            # Parse the JSON data and append to the list
            data.append(json.loads(line))

    # Create a DataFrame from the list of records
    df = pd.DataFrame(data)

    # Print the first few rows of the DataFram

    return df

In [715]:
def convert_to_unix_timestamp(df, column_name):
    """
    Convert a column of timestamps in a DataFrame to Unix timestamps.

    Args:
        df (pd.DataFrame): The DataFrame containing the timestamp column.
        column_name (str): The name of the column with timestamps in "%Y-%m-%d %H:%M:%S" format.

    Returns:
        pd.DataFrame: The DataFrame with an additional column for Unix timestamps.
    """
    # Convert the column to datetime objects
    df[column_name] = pd.to_datetime(df[column_name], format="mixed")
    
    # Convert datetime objects to Unix timestamps
    df[f'{column_name}'] = df[column_name].apply(lambda x: x.timestamp())
    
    return df

In [716]:
if dataset == "snowcard":
    checkin_df = pd.read_csv(DATASET_DIR+"TSC_EEL_EXPORT.csv", encoding="latin1", sep=";", header=None, names=["timestamp:float", "user_id:token", "category_id:token", "category_name:token_seq", "name:token_seq", "user_type:token_seq"])
    checkin_df["item_id:token"], item_id = pd.factorize(checkin_df["name:token_seq"])
    user_df = checkin_df[["user_id:token", "user_type:token_seq"]].drop_duplicates(subset=["user_id:token"])
    poi_df = checkin_df[["item_id:token", "name:token_seq", "category_id:token", "category_name:token_seq"]].drop_duplicates(subset=["item_id:token"])
    checkin_df = checkin_df[["user_id:token", "item_id:token", "timestamp:float"]]

elif dataset == "foursquarenyc" or dataset == "foursquaretky":
    checkin_df = pd.read_csv(DATASET_DIR + "foursquare_data.csv", sep=",")
    checkin_df = checkin_df.drop(columns=["timezoneOffset"])
    checkin_df = checkin_df.rename(columns={"venueId": "item_id:token", "venueCategoryId": "category_id:token", "venueCategory": "category_name:token_seq", "userId": "user_id:token", "utcTimestamp": "timestamp:float", "latitude": "lat:float", "longitude": "lon:float"})
    user_df = checkin_df[["user_id:token"]].drop_duplicates()

    poi_df = checkin_df[["item_id:token", "category_id:token", "category_name:token_seq", "lat:float", "lon:float"]].drop_duplicates(subset=["item_id:token"])
    checkin_df = checkin_df[["user_id:token", "item_id:token", "timestamp:float"]]

elif dataset == "gowalla" or dataset == "brightkite":
    checkin_df = pd.read_csv(DATASET_DIR + f"loc-{dataset}_totalCheckins.txt", sep="\t", header=None, names=['user_id:token', 'timestamp:float', 'lat:float', 'lon:float', 'item_id:token'])
    checkin_df = checkin_df[~checkin_df['item_id:token'].isin(["00000000000000000000000000000000", "ede07eeea22411dda0ef53e233ec57ca"])]
    user_df = pd.read_csv(DATASET_DIR + f"loc-{dataset}_edges.txt", sep="\t", header=None, names=['user_id:token', 'friends:token_seq'])
    user_df = user_df.groupby('user_id:token')['friends:token_seq'].apply(lambda x: ','.join(map(str, x))).reset_index()
    user_df.columns = ['user_id:token', 'friends:token_seq']
    poi_df = checkin_df[['item_id:token', "lat:float", "lon:float"]].drop_duplicates(subset="item_id:token")
    checkin_df = checkin_df.drop(columns=["lat:float", "lon:float"])




elif dataset == "yelp":
    poi_df = pd.read_json(DATASET_DIR + "yelp_academic_dataset_business.json", lines=True)
    poi_df = poi_df.loc[poi_df['is_open'] == 1]
    poi_df = poi_df.drop(columns=["review_count", "stars", "hours", "is_open", "city", "state", "postal_code", "attributes", "address"])
    poi_df = poi_df.rename(columns={"latitude": "lat:float", "longitude": "lon:float", "business_id": "item_id:token", "name":"name:token_seq", "categories":"category_name:token_seq"})
    user_df = open_big_json(DATASET_DIR + "yelp_academic_dataset_user.json")
    user_df = user_df.drop(columns=["review_count", "name", "yelping_since", "useful", "funny", "cool", "elite", "fans", "compliment_hot", "average_stars", "compliment_more", "compliment_profile", "compliment_cute", "compliment_list", "compliment_note", "compliment_plain", "compliment_cool", "compliment_funny", "compliment_writer", "compliment_photos"])
    user_df = user_df.rename(columns={"user_id": "user_id:token", "friends": "friends:token_seq"})
    checkin_df = open_big_json(DATASET_DIR + "yelp_academic_dataset_review.json")
    checkin_df = checkin_df.drop(columns=["text", "cool", "stars", "useful", "funny", "review_id"])
    checkin_df = checkin_df.rename(columns={"user_id": "user_id:token", "business_id": "item_id:token", "date": "timestamp:float"})
    

In [717]:
poi_df

Unnamed: 0,item_id:token,category_id:token,category_name:token_seq,lat:float,lon:float
0,4f0fd5a8e4b03856eeb6c8cb,4bf58dd8d48988d10c951735,Cosmetics Shop,35.705101,139.619590
1,4b7b884ff964a5207d662fe3,4bf58dd8d48988d1d1941735,Ramen / Noodle House,35.715581,139.800317
2,4c16fdda96040f477cc473a5,4d954b0ea243a5684a65b473,Convenience Store,35.714542,139.480065
3,4c178638c2dfc928651ea869,4bf58dd8d48988d118951735,Food & Drink Shop,35.725592,139.776633
4,4f568309e4b071452e447afe,4f2a210c4b9023bd5841ed28,Housing Development,35.656083,139.734046
...,...,...,...,...,...
573640,4bb55c4e2f70c9b66f2b8430,4bf58dd8d48988d124941735,Office,35.665447,139.836892
573663,4b91d257f964a520a8d933e3,4bf58dd8d48988d1e0931735,Coffee Shop,35.750061,139.587119
573668,50fcc9a3e4b07380b3aae03e,4bf58dd8d48988d1cc941735,Steakhouse,35.671634,139.857184
573675,4bda94b8c79cc928e5327fe9,4bf58dd8d48988d1f9941735,Food & Drink Shop,35.703205,139.579236


In [718]:
print("Number of users, number of POIs", len(checkin_df["user_id:token"].unique()), len(checkin_df["item_id:token"].unique())
)
print("Sparsity:", 1 - len(checkin_df) / (len(checkin_df["user_id:token"].unique()) * len(checkin_df["item_id:token"].unique())))

Number of users, number of POIs 2293 61858
Sparsity: 0.9959552918331572


In [719]:
def filter_df(df, min_reviews_user=15, min_reviews_business=10):
    while True:
        # Filter businesses with at least min_reviews reviews
        business_counts = df["item_id:token"].value_counts()
        business_mask = df['item_id:token'].map(business_counts) >= min_reviews_business
        df_filtered = df.loc[business_mask]

        # Filter users with at least min_reviews reviews
        user_counts = df_filtered['user_id:token'].value_counts()
        user_mask = df_filtered['user_id:token'].map(user_counts) >= min_reviews_user
        df_filtered = df_filtered.loc[user_mask]

        # If the size of the filtered DataFrame didn't change, break the loop
        if df_filtered.shape[0] == df.shape[0]:
            break

        # Update the DataFrame for the next iteration
        df = df_filtered

    return df_filtered

In [720]:
checkin_df_filtered = filter_df(checkin_df, min_reviews_business=10, min_reviews_user=15)

In [721]:
checkin_df_filtered

Unnamed: 0,user_id:token,item_id:token,timestamp:float
2,114,4c16fdda96040f477cc473a5,Tue Apr 03 19:12:07 +0000 2012
7,114,4b3eae5cf964a520b4a025e3,Tue Apr 03 19:35:36 +0000 2012
8,1635,4cca7bd67965b60c80f0858a,Tue Apr 03 19:51:50 +0000 2012
9,2033,4b5c7671f964a520083129e3,Tue Apr 03 19:51:59 +0000 2012
10,589,4b5ed39cf964a520079a29e3,Tue Apr 03 19:59:06 +0000 2012
...,...,...,...
573695,2277,4b82669cf964a5209ed130e3,Sat Feb 16 02:33:48 +0000 2013
573697,2277,4b56c4c5f964a520c41a28e3,Sat Feb 16 02:34:32 +0000 2013
573698,326,4bab3456f964a5204d993ae3,Sat Feb 16 02:34:35 +0000 2013
573699,853,4b559c09f964a520efe827e3,Sat Feb 16 02:34:53 +0000 2013


In [722]:
# Step 1: Calculate the value counts of `business_id`
value_counts = checkin_df_filtered['item_id:token'].value_counts().reset_index()
value_counts.columns = ['item_id:token', 'count']

# Step 2: Normalize the counts y dividing by the maximum value count
max_count = value_counts['count'].max()
value_counts['business_popularity:float'] = value_counts['count'] / max_count

# Step 3: Merge the normalized counts back into the original DataFrame
checkin_df_filtered = checkin_df_filtered.merge(value_counts[['item_id:token', 'business_popularity:float']], on = "item_id:token", how='left')


In [723]:
def user_popularity_sample_calculator(checkin_df_filtered, poi_df, user_df, sep_num):
    # Calculate average popularity per user
    average_popularity_per_user = checkin_df_filtered.groupby('user_id:token')['business_popularity:float'].mean().reset_index()
    average_popularity_per_user.columns = ['user_id:token', 'average_popularity']

    average_popularity_per_user = average_popularity_per_user.sort_values(by="average_popularity", ascending=False)

    
    # Sort by average popularity
    

    # Get top 1000 users
    high_pop_user_df_sample = average_popularity_per_user.head(sep_num)
    
    # Get the middle 1000 users around the median
    median_index = len(average_popularity_per_user) // 2
    start_med_index = max(median_index -int (sep_num/2), 0)
    end_med_index = min(median_index + int(sep_num/2), len(average_popularity_per_user))
    med_pop_user_df_sample = average_popularity_per_user.iloc[start_med_index:end_med_index]
    
    # Get the lowest 1000 users
    low_pop_user_df_sample = average_popularity_per_user.tail(sep_num)

    unique_users = list(set(high_pop_user_df_sample["user_id:token"].tolist() + med_pop_user_df_sample["user_id:token"].tolist() + low_pop_user_df_sample["user_id:token"].tolist()))

    checkin_df_sample = checkin_df_filtered[checkin_df_filtered["user_id:token"].isin(unique_users)]

    checkin_df_sample = checkin_df_sample[checkin_df_sample["user_id:token"].isin(unique_users)]

    unique_items = checkin_df_sample["item_id:token"].unique()

    user_df_sample = user_df[user_df["user_id:token"].isin(unique_users)]
    poi_df_sample = poi_df[poi_df["item_id:token"].isin(unique_items)]


    
    return checkin_df_sample, high_pop_user_df_sample, med_pop_user_df_sample, low_pop_user_df_sample, user_df_sample, poi_df_sample

In [724]:
checkin_df_sample, high_pop_user_df_sample, med_pop_user_df_sample, low_pop_user_df_sample, user_df_sample, poi_df_sample = user_popularity_sample_calculator(checkin_df_filtered, poi_df, user_df, 500)

In [725]:
high_pop_user_df_sample

Unnamed: 0,user_id:token,average_popularity
46,47,0.692056
22,23,0.680855
1038,1039,0.612799
837,838,0.610174
1274,1275,0.605600
...,...,...
2129,2130,0.150612
2214,2215,0.150303
338,339,0.149993
2083,2084,0.149761


In [726]:
def id_factorizer(checkin_df_sample, high_pop_user_df_sample, med_pop_user_df_sample, low_pop_user_df_sample, user_df_sample, poi_df_sample):
    """Overwriting the actual ID with a factorized ID so that we can use the same ID both in RecBole and CAPRI"""
    checkin_df_sample['user_id:token'], user_id_map = pd.factorize(checkin_df_sample['user_id:token'])
    checkin_df_sample['item_id:token'], business_id_map = pd.factorize(checkin_df_sample['item_id:token'])

    # Create mapping dictionaries
    user_id_mapping = {original: i for i, original in enumerate(user_id_map)}
    business_id_mapping = {original: j for j, original in enumerate(business_id_map)}

    high_pop_user_df_sample['user_id:token'] = high_pop_user_df_sample['user_id:token'].map(user_id_mapping)
    med_pop_user_df_sample['user_id:token'] = med_pop_user_df_sample['user_id:token'].map(user_id_mapping)
    low_pop_user_df_sample['user_id:token'] = low_pop_user_df_sample['user_id:token'].map(user_id_mapping)

    user_df_sample['user_id:token'] = user_df_sample['user_id:token'].map(user_id_mapping)
    poi_df_sample['item_id:token'] = poi_df_sample['item_id:token'].map(business_id_mapping)

    return checkin_df_sample, high_pop_user_df_sample, med_pop_user_df_sample, low_pop_user_df_sample, user_df_sample, poi_df_sample

In [727]:
checkin_df_sample, high_pop_user_df_sample, med_pop_user_df_sample, low_pop_user_df_sample, user_df_sample, poi_df_sample = id_factorizer(checkin_df_sample, high_pop_user_df_sample, med_pop_user_df_sample, low_pop_user_df_sample, user_df_sample, poi_df_sample)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_df_sample['user_id:token'] = user_df_sample['user_id:token'].map(user_id_mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  poi_df_sample['item_id:token'] = poi_df_sample['item_id:token'].map(business_id_mapping)


In [728]:
checkin_df_sample

Unnamed: 0,user_id:token,item_id:token,timestamp:float,business_popularity:float
0,0,0,Tue Apr 03 19:12:07 +0000 2012,0.006790
1,0,1,Tue Apr 03 19:35:36 +0000 2012,0.068623
2,1,2,Tue Apr 03 19:51:50 +0000 2012,0.010346
3,2,3,Tue Apr 03 19:51:59 +0000 2012,0.001536
4,3,4,Tue Apr 03 19:59:06 +0000 2012,0.043970
...,...,...,...,...
447561,816,24,Sat Feb 16 02:30:20 +0000 2013,0.961122
447562,881,24,Sat Feb 16 02:32:51 +0000 2013,0.961122
447564,1312,4300,Sat Feb 16 02:33:18 +0000 2013,0.001778
447567,464,1575,Sat Feb 16 02:34:35 +0000 2013,0.003071


In [729]:
def data_saver(df, filename, framework):
    if not os.path.exists(DATASET_DIR + "processed_data_" + framework):
        os.makedirs(DATASET_DIR + "processed_data_" + framework)
    
    df.to_csv(DATASET_DIR + "processed_data_" + framework + "/" + filename + ".csv")
    print("Data saved as " + framework + filename + ".csv")
    

In [730]:
# first of all saving data for cornac
data_saver(checkin_df_sample, "user_events", "cornac")
data_saver(high_pop_user_df_sample, "high_pop_user_sample", "cornac")
data_saver(med_pop_user_df_sample, "medium_pop_user_sample", "cornac")
data_saver(low_pop_user_df_sample, "low_pop_user_sample", "cornac")


Data saved as cornacuser_events.csv
Data saved as cornachigh_pop_user_sample.csv
Data saved as cornacmedium_pop_user_sample.csv
Data saved as cornaclow_pop_user_sample.csv


In [731]:
def data_saver_recbole(df, framework, suffix):
    
    if not os.path.exists(DATASET_DIR + "processed_data_" + framework):
        os.makedirs(DATASET_DIR + "processed_data_" + framework)

    df.to_csv(f"{DATASET_DIR}processed_data_{framework}/{dataset}_sample.{suffix}", sep="\t", index=False)

In [732]:
checkin_df_sample['review_id:token'] = range(1, len(checkin_df_sample) + 1)
# Step 1: Group by user_id and business_id and count check-ins
checkin_df_sample['checkin_count:float'] = checkin_df_sample.groupby(['user_id:token', 'item_id:token'])['item_id:token'].transform('count')


In [733]:
checkin_df_sample

Unnamed: 0,user_id:token,item_id:token,timestamp:float,business_popularity:float,review_id:token,checkin_count:float
0,0,0,Tue Apr 03 19:12:07 +0000 2012,0.006790,1,1
1,0,1,Tue Apr 03 19:35:36 +0000 2012,0.068623,2,25
2,1,2,Tue Apr 03 19:51:50 +0000 2012,0.010346,3,128
3,2,3,Tue Apr 03 19:51:59 +0000 2012,0.001536,4,1
4,3,4,Tue Apr 03 19:59:06 +0000 2012,0.043970,5,15
...,...,...,...,...,...,...
447561,816,24,Sat Feb 16 02:30:20 +0000 2013,0.961122,279325,143
447562,881,24,Sat Feb 16 02:32:51 +0000 2013,0.961122,279326,11
447564,1312,4300,Sat Feb 16 02:33:18 +0000 2013,0.001778,279327,3
447567,464,1575,Sat Feb 16 02:34:35 +0000 2013,0.003071,279328,1


In [734]:
checkin_df_sample = convert_to_unix_timestamp(checkin_df_sample, "timestamp:float")

In [735]:
checkin_df_sample.sort_values(by="checkin_count:float", ascending=False)
# very important: keeping the duplicate check-ins for the context aware recommendation to have the timestamps saved
checkin_df_timestamps = checkin_df_sample.copy()

# very important: dropping duplicate check-ins 
checkin_df_sample = checkin_df_sample.drop_duplicates(subset=["user_id:token", "item_id:token"], keep="first")

In [736]:
data_saver_recbole(checkin_df_sample, "recbole", "inter")
data_saver_recbole(user_df_sample, "recbole", "user")
data_saver_recbole(poi_df_sample, "recbole", "item")

In [737]:
poi_df_sample

Unnamed: 0,item_id:token,category_id:token,category_name:token_seq,lat:float,lon:float
2,0,4d954b0ea243a5684a65b473,Convenience Store,35.714542,139.480065
7,1,4bf58dd8d48988d129951735,Train Station,35.700253,139.480255
8,2,4bf58dd8d48988d162941735,Other Great Outdoors,35.755759,139.733573
9,3,4bf58dd8d48988d1d1941735,Ramen / Noodle House,35.693121,139.699447
10,4,4bf58dd8d48988d1eb931735,Airport,35.548963,139.784611
...,...,...,...,...,...
509497,7288,4d954b0ea243a5684a65b473,Convenience Store,35.600404,139.592516
510129,7289,4bf58dd8d48988d1d2941735,Sushi Restaurant,35.697197,139.815391
519107,7293,4bf58dd8d48988d103941735,Home (private),35.650417,139.757541
528409,7298,4bf58dd8d48988d1fe931735,Bus Station,35.698772,139.617411


In [738]:
checkin_df_timestamps = checkin_df_timestamps[["user_id:token", "item_id:token", "timestamp:float"]] # FINAL
checkins_capri_train_test_tune = checkin_df_sample[["user_id:token", "item_id:token", "timestamp:float", "checkin_count:float"]]
try:
    poi_df_sample_capri = poi_df_sample[["item_id:token", "lat:float", "lon:float"]] # FINAL
except KeyError: # in the snowcard data the coordinates are not given
    poi_df_sample_capri = poi_df_sample[["item_id:token"]]
datasize_capri = pd.DataFrame(data={"num_users" : [len(checkins_capri_train_test_tune["user_id:token"].unique())], "num_items" : [len(checkins_capri_train_test_tune["item_id:token"].unique())]}) # FINAL

In [739]:
# splitting the data into train, test, and tune
checkins_capri_train_test_tune = checkins_capri_train_test_tune.sort_values(by=["user_id:token", "timestamp:float"])
checkins_capri_train_test_tune = checkins_capri_train_test_tune[["user_id:token", "item_id:token", "checkin_count:float"]]

# Split the data
train_list = []
val_list = []
test_list = []

for user, group in checkins_capri_train_test_tune.groupby('user_id:token'):
    n = len(group)
    train_end = int(n * 0.65)
    val_end = int(n * 0.80)
    
    train_list.append(group.iloc[:train_end])
    val_list.append(group.iloc[train_end:val_end])
    test_list.append(group.iloc[val_end:])

# Combine lists into DataFrames
train_df = pd.concat(train_list)
val_df = pd.concat(val_list)
test_df = pd.concat(test_list)



# Check the splits

# FINAL 6-8
print("Train Set:")
print(train_df.head())
print("\nValidation Set:")
print(val_df.head())
print("\nTest Set:")
print(test_df.head())


Train Set:
      user_id:token  item_id:token  checkin_count:float
0                 0              0                    1
1                 0              1                   25
16                0             12                    1
5686              0             79                    3
5712              0           1793                    1

Validation Set:
        user_id:token  item_id:token  checkin_count:float
121461              0           4224                    1
121505              0            124                    1
121525              0           3891                    1
121558              0           1887                    1
121779              0           3239                    1

Test Set:
        user_id:token  item_id:token  checkin_count:float
127808              0           6265                    2
138497              0           2223                    1
138696              0           5780                    1
139514              0            425         

In [740]:
def datasaver_capri(df, filename):
    
    if not os.path.exists(DATASET_DIR + "processed_data_capri"):
        os.makedirs(DATASET_DIR + "processed_data_capri")
    
    df.to_csv(DATASET_DIR + "processed_data_capri/" + filename + ".txt", sep='\t', index=False, header=False)
    print("Data saved as " + filename + ".txt")
    



In [741]:
# adding a category column

if dataset == "yelp":
    # Split the 'category_name' column by commas
    poi_df_sample['category_name_unstacked:token_seq'] = poi_df_sample['category_name:token_seq'].str.split(', ')

    # Unstack the categories into multiple rows
    category_df_sample = poi_df_sample.explode('category_name_unstacked:token_seq')
    category_counts = category_df_sample["category_name_unstacked:token_seq"].value_counts()
    category_mask = category_df_sample["category_name_unstacked:token_seq"].map(category_counts) >= 25
    category_df_sample_filtered = category_df_sample.loc[category_mask]
    category_df_sample_filtered["category_id:token"], category_id = pd.factorize(category_df_sample_filtered["category_name_unstacked:token_seq"])
    category_df_sample_filtered.dropna(inplace=True)
    datasize_capri = pd.DataFrame(data={"num_users" : [len(checkins_capri_train_test_tune["user_id:token"].unique())], "num_items" : [len(checkins_capri_train_test_tune["item_id:token"].unique())], "num_categories" : [len(category_df_sample_filtered["category_id:token"].unique())]}) # FINAL
    datasaver_capri(category_df_sample_filtered, "poiCategories")


elif dataset == "foursquarenyc" or dataset == "foursquaretky":
    poi_df_sample["category_id:token"], category_id = pd.factorize(poi_df_sample["category_name:token_seq"])
    datasize_capri = pd.DataFrame(data={"num_users" : [len(checkins_capri_train_test_tune["user_id:token"].unique())], "num_items" : [len(checkins_capri_train_test_tune["item_id:token"].unique())], "num_categories" : [len(poi_df_sample["category_id:token"].unique())]})
    poi_df_categories = poi_df_sample[["item_id:token", "category_id:token"]]
    datasaver_capri(poi_df_categories, "poiCategories")

elif dataset == "snowcard":
    datasize_capri = pd.DataFrame(data={"num_users" : [len(checkins_capri_train_test_tune["user_id:token"].unique())], "num_items" : [len(checkins_capri_train_test_tune["item_id:token"].unique())], "num_categories" : [len(poi_df_sample["category_id:token"].unique())]})
    poi_df_categories = poi_df_sample[["item_id:token", "category_id:token"]]
    datasaver_capri(poi_df_categories, "poiCategories")
    

Data saved as poiCategories.txt


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  poi_df_sample["category_id:token"], category_id = pd.factorize(poi_df_sample["category_name:token_seq"])


In [742]:
datasaver_capri(checkin_df_timestamps, "checkins")
datasaver_capri(datasize_capri, "dataSize")
datasaver_capri(poi_df_sample_capri, "poiCoos")
datasaver_capri(train_df, "train")
datasaver_capri(val_df, "tune")
datasaver_capri(test_df, "test")

Data saved as checkins.txt
Data saved as dataSize.txt
Data saved as poiCoos.txt
Data saved as train.txt
Data saved as tune.txt
Data saved as test.txt
