In [7]:
import pandas as pd
import os


dataset = "foursquarenyc"

DATASET_DIR = f"/Volumes/Forster Neu/Masterarbeit Data/{dataset}_dataset/"


In [8]:
checkin_df = pd.read_csv(DATASET_DIR + "foursquare_data.csv", sep=",")

In [9]:
checkin_df.rename(columns={"userId": "user_id", "venueId": "business_id", "venueCategoryId" : "category_id", "utcTimestamp": "timestamp"}, inplace=True)

In [10]:
checkin_df["user_id"].nunique()

1083

In [4]:
print("Number of users, number of POIs", len(checkin_df["user_id"].unique()), len(checkin_df["business_id"].unique())
)
print("Sparsity:", 1 - len(checkin_df) / (len(checkin_df["user_id"].unique()) * len(checkin_df["business_id"].unique())))

Number of users, number of POIs 2293 61858
Sparsity: 0.9959552918331572


In [89]:
def filter_df(df, min_reviews=10):
    while True:
        # Filter businesses with at least min_reviews reviews
        business_counts = df["business_id"].value_counts()
        business_mask = df['business_id'].map(business_counts) >= min_reviews
        df_filtered = df.loc[business_mask]

        # Filter users with at least min_reviews reviews
        user_counts = df_filtered['user_id'].value_counts()
        user_mask = df_filtered['user_id'].map(user_counts) >= min_reviews
        df_filtered = df_filtered.loc[user_mask]

        # If the size of the filtered DataFrame didn't change, break the loop
        if df_filtered.shape[0] == df.shape[0]:
            break

        # Update the DataFrame for the next iteration
        df = df_filtered

    return df_filtered

In [90]:
checkin_df_filtered = filter_df(checkin_df, min_reviews=10)

In [91]:
sep_num = int(checkin_df_filtered["user_id"].nunique()/3)

In [92]:
# Step 1: Calculate the value counts of `business_id`
value_counts = checkin_df_filtered['business_id'].value_counts().reset_index()
value_counts.columns = ['business_id', 'count']

# Step 2: Normalize the counts y dividing by the maximum value count
max_count = value_counts['count'].max()
value_counts['business_popularity'] = value_counts['count'] / max_count

# Step 3: Merge the normalized counts back into the original DataFrame
checkin_df_filtered = checkin_df_filtered.merge(value_counts[['business_id', 'business_popularity']], on='business_id', how='left')


In [93]:
def user_popularity_calculator(checkin_df_filtered, sep_num):
    # Filter out instances with the specified business_id
    try:
        checkin_df_filtered = checkin_df_filtered[checkin_df_filtered['business_id'] != "00000000000000000000000000000000"]
    except Exception as e:
        print("No such field found to filter out")
    # Calculate average popularity per user
    average_popularity_per_user = checkin_df_filtered.groupby('user_id')['business_popularity'].mean().reset_index()
    average_popularity_per_user.columns = ['user_id', 'average_popularity']

    average_popularity_per_user = average_popularity_per_user.sort_values(by="average_popularity", ascending=False)

    
    # Sort by average popularity
    

    # Get top 1000 users
    high_pop_user_df_sample = average_popularity_per_user.head(sep_num)
    
    # Get the middle 1000 users around the median
    median_index = len(average_popularity_per_user) // 2
    start_med_index = max(median_index -int (sep_num/2), 0)
    end_med_index = min(median_index + int(sep_num/2), len(average_popularity_per_user))
    med_pop_user_df_sample = average_popularity_per_user.iloc[start_med_index:end_med_index]
    
    # Get the lowest 1000 users
    low_pop_user_df_sample = average_popularity_per_user.tail(sep_num)

    unique_users = list(set(high_pop_user_df_sample["user_id"].tolist() + med_pop_user_df_sample["user_id"].tolist() + low_pop_user_df_sample["user_id"].tolist()))

    checkin_df_sample = checkin_df_filtered[checkin_df_filtered["user_id"].isin(unique_users)]

    checkin_df_sample = filter_df(checkin_df_sample, min_reviews=10)

    checkin_df_sample = checkin_df_sample[checkin_df_sample["user_id"].isin(unique_users)]


    
    return checkin_df_sample, high_pop_user_df_sample, med_pop_user_df_sample, low_pop_user_df_sample

In [94]:
checkin_df_sample, high_pop_user_df_sample, medium_pop_user_df_sample, low_pop_user_df_sample = user_popularity_calculator(checkin_df_filtered, sep_num)

In [95]:
len(checkin_df_sample["user_id"].unique())

2292

In [96]:
len(low_pop_user_df_sample)

764

In [97]:
# def data_sample_maker(high_pop_user_df, medium_pop_user_df, low_pop_user_df, checkin_df_filtered):
#     # sampling not necessary because the dataset is so small

#     high_pop_user_df_sample = high_pop_user_df.copy()
#     medium_pop_user_df_sample = medium_pop_user_df.copy()
#     low_pop_user_df_sample = low_pop_user_df.copy()

#     unique_users = list(set(high_pop_user_df_sample["user_id"].tolist() + medium_pop_user_df_sample["user_id"].tolist() + low_pop_user_df_sample["user_id"].tolist()))

#     checkin_df_sample = checkin_df_filtered.loc[checkin_df_filtered["user_id"].isin(unique_users)]

#     return checkin_df_sample, high_pop_user_df_sample, medium_pop_user_df_sample, low_pop_user_df_sample

    

In [98]:
# checkin_df_sample, high_pop_user_df_sample, medium_pop_user_df_sample, low_pop_user_df_sample = data_sample_maker(high_pop_user_df, medium_pop_user_df, low_pop_user_df, checkin_df_filtered)

In [99]:
def data_saver(df, filename, framework):
    if not os.path.exists(DATASET_DIR + "processed_data_" + framework):
        os.makedirs(DATASET_DIR + "processed_data_" + framework)
    
    df.to_csv(DATASET_DIR + "processed_data_" + framework + "/" + filename + ".csv")
    print("Data saved as " + framework + filename + ".csv")
    

In [100]:
# first of all saving data for cornac
data_saver(checkin_df_sample, "user_events", "cornac")
data_saver(high_pop_user_df_sample, "high_pop_user_sample", "cornac")
data_saver(medium_pop_user_df_sample, "medium_pop_user_sample", "cornac")
data_saver(low_pop_user_df_sample, "low_pop_user_sample", "cornac")


Data saved as cornacuser_events.csv
Data saved as cornachigh_pop_user_sample.csv
Data saved as cornacmedium_pop_user_sample.csv
Data saved as cornaclow_pop_user_sample.csv


In [101]:
def data_saver_recbole(df, framework, suffix):
    
    if not os.path.exists(DATASET_DIR + "processed_data_" + framework):
        os.makedirs(DATASET_DIR + "processed_data_" + framework)

    df.to_csv(f"{DATASET_DIR}processed_data_{framework}/{dataset}_sample.{suffix}", sep="\t", index=False)

In [102]:
checkin_df_sample['review_id'] = range(1, len(checkin_df_sample) + 1)
# Step 1: Group by user_id and business_id and count check-ins
checkin_df_sample['checkin_count'] = checkin_df_sample.groupby(['user_id', 'business_id'])['business_id'].transform('count')


In [103]:
business_sample = checkin_df_sample.groupby('business_id').first().reset_index()
business_sample.rename(columns={"latitude":"lat", "longitude":"lon"}, inplace=True)
business_sample = business_sample[["business_id", "lat", "lon"]]


In [104]:
user_df_sample = checkin_df_sample.groupby('user_id').size().reset_index(name='review_counts:float')
checkin_df_sample = checkin_df_sample[["review_id","user_id","business_id","timestamp", "checkin_count"]]

In [105]:
def convert_to_unix_timestamp(df, column_name):
    """
    Convert a column of timestamps in a DataFrame to Unix timestamps.

    Args:
        df (pd.DataFrame): The DataFrame containing the timestamp column.
        column_name (str): The name of the column with timestamps in "%Y-%m-%d %H:%M:%S" format.

    Returns:
        pd.DataFrame: The DataFrame with an additional column for Unix timestamps.
    """
    # Convert the column to datetime objects
    df[column_name] = pd.to_datetime(df[column_name], format="mixed")
    
    # Convert datetime objects to Unix timestamps
    df[f'{column_name}'] = df[column_name].apply(lambda x: x.timestamp())
    
    return df

In [106]:
checkin_df_sample.rename(columns={"user_id":"user_id:token", "business_id":"item_id:token", "checkin_count":"rating:float", "timestamp":"timestamp:float", "review_id": "review_id:token"}, inplace=True)
user_df_sample.rename(columns={"user_id":"user_id:token"}, inplace=True)
business_sample.rename(columns={"business_id": "item_id:token", "lat" : "lat:float", "lon" : "lon:float"}, inplace=True)

In [107]:
checkin_df_sample = convert_to_unix_timestamp(checkin_df_sample, "timestamp:float")

In [108]:
checkin_df_sample = checkin_df_sample.loc[checkin_df_sample["rating:float"] < 100]

checkin_df_sample.sort_values(by="rating:float", ascending=False)

checkin_df_timestamps = checkin_df_sample.copy()

checkin_df_sample = checkin_df_sample.drop_duplicates(subset=["user_id:token", "item_id:token"], keep="first")

In [109]:
business_sample

Unnamed: 0,item_id:token,lat:float,lon:float
0,4b058799f964a5208b9b22e3,35.690712,139.691119
1,4b058799f964a5208d9b22e3,35.672271,139.758711
2,4b058799f964a520929b22e3,35.687043,139.698871
3,4b058799f964a520959b22e3,35.670580,139.727849
4,4b058799f964a520979b22e3,35.690062,139.694569
...,...,...,...
7866,50dce1a4e4b0f3777e14560e,35.630669,139.795103
7867,50e137f4e4b0073eb57aeeb6,35.697748,139.564680
7868,50f351a3e4b077d65674c57d,35.650417,139.757541
7869,50f91a7fe4b0b1597b2c6f3a,35.642719,139.608459


In [110]:
data_saver_recbole(checkin_df_sample, "recbole", "inter")
data_saver_recbole(user_df_sample, "recbole", "user")
data_saver_recbole(business_sample, "recbole", "item")

In [111]:
# reverting to unix timestamps and only using the necessary columns
review_df_sample = checkin_df_sample.copy()
checkins_capri_min = checkin_df_timestamps[["user_id:token", "item_id:token", "timestamp:float"]]

checkins_capri_train_test_tune = review_df_sample[["user_id:token", "item_id:token", "timestamp:float", "rating:float"]]

In [112]:
len(checkins_capri_min["user_id:token"].unique())
len(checkins_capri_min["item_id:token"].unique())

print("Length of users: ", len(checkins_capri_min["user_id:token"].unique()))
print("Length of POIs: ", len(checkins_capri_min["item_id:token"].unique()))



Length of users:  2292
Length of POIs:  7855


In [113]:
# FINAL 3
datasize_capri = pd.DataFrame(data={"num_users" : [len(checkins_capri_min["user_id:token"].unique())], "num_items" : [len(checkins_capri_min["item_id:token"].unique())]})

In [114]:
# FINAL 4
poi_coos_capri = business_sample.copy()
user_df_sample = pd.concat([high_pop_user_df_sample, medium_pop_user_df_sample, low_pop_user_df_sample])

In [115]:
checkins_capri_min

Unnamed: 0,user_id:token,item_id:token,timestamp:float
0,114,4c16fdda96040f477cc473a5,1.333480e+09
1,114,4b3eae5cf964a520b4a025e3,1.333482e+09
3,2033,4b5c7671f964a520083129e3,1.333483e+09
4,589,4b5ed39cf964a520079a29e3,1.333483e+09
5,589,4e014c11c65b896d116d480c,1.333483e+09
...,...,...,...
447565,2277,4b82669cf964a5209ed130e3,1.360982e+09
447566,2277,4b56c4c5f964a520c41a28e3,1.360982e+09
447567,326,4bab3456f964a5204d993ae3,1.360982e+09
447568,853,4b559c09f964a520efe827e3,1.360982e+09


In [116]:
checkins_capri_min['user_id_int'], user_id_map = pd.factorize(checkins_capri_min['user_id:token'])
checkins_capri_min['business_id_int'], business_id_map = pd.factorize(checkins_capri_min['item_id:token'])

# FINAL 2.0
checkins_capri_min_int = checkins_capri_min[["user_id_int", "business_id_int", "timestamp:float"]]
# Create mapping dictionaries
user_id_mapping = {original: i for i, original in enumerate(user_id_map)}
business_id_mapping = {original: j for j, original in enumerate(business_id_map)}

poi_coos_capri["business_id_int"] = poi_coos_capri["item_id:token"].map(business_id_mapping)
# FINAL 2.0
poi_coos_capri_int = poi_coos_capri[["business_id_int", "lat:float", "lon:float"]]


# FINAL 2.0

checkins_capri_train_test_tune['user_id_int'] = checkins_capri_train_test_tune['user_id:token'].map(user_id_mapping)
checkins_capri_train_test_tune['business_id_int'] = checkins_capri_train_test_tune['item_id:token'].map(business_id_mapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  checkins_capri_min['user_id_int'], user_id_map = pd.factorize(checkins_capri_min['user_id:token'])


In [117]:
checkins_capri_train_test_tune = checkins_capri_train_test_tune[["user_id_int", "business_id_int", "rating:float", "timestamp:float"]]

In [118]:
poi_coos_capri_int.dropna(inplace=True)
poi_coos_capri_int["business_id_int"] = poi_coos_capri_int["business_id_int"].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  poi_coos_capri_int.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  poi_coos_capri_int["business_id_int"] = poi_coos_capri_int["business_id_int"].astype(int)


In [119]:
remaining_businesses = poi_coos_capri_int["business_id_int"].values.tolist()

In [120]:
def filter_remaining_businesses(df, remaining_businesses):
    return df[df["business_id_int"].isin(remaining_businesses)]


In [121]:
# splitting the data into train, test, and tune
checkins_capri_train_test_tune = checkins_capri_train_test_tune.sort_values(by=["user_id_int", "timestamp:float"])
checkins_capri_train_test_tune = checkins_capri_train_test_tune[["user_id_int", "business_id_int", "rating:float"]]

# Split the data
train_list = []
val_list = []
test_list = []

for user, group in checkins_capri_train_test_tune.groupby('user_id_int'):
    n = len(group)
    train_end = int(n * 0.65)
    val_end = int(n * 0.80)
    
    train_list.append(group.iloc[:train_end])
    val_list.append(group.iloc[train_end:val_end])
    test_list.append(group.iloc[val_end:])

# Combine lists into DataFrames
train_df = pd.concat(train_list)
val_df = pd.concat(val_list)
test_df = pd.concat(test_list)



# Check the splits

# FINAL 6-8
print("Train Set:")
print(train_df.head())
print("\nValidation Set:")
print(val_df.head())
print("\nTest Set:")
print(test_df.head())


Train Set:
      user_id_int  business_id_int  rating:float
0               0                0             1
1               0                1            25
16              0               15             1
5686            0              116             3
5712            0             2366             1

Validation Set:
        user_id_int  business_id_int  rating:float
121461            0             2739             1
121505            0              185             1
121525            0              401             1
121558            0               65             1
121779            0             4120             1

Test Set:
        user_id_int  business_id_int  rating:float
127808            0             5811             2
138497            0              820             1
138696            0             6765             1
139514            0              571             1
157393            0               67             1


In [122]:
train_df

Unnamed: 0,user_id_int,business_id_int,rating:float
0,0,0,1
1,0,1,25
16,0,15,1
5686,0,116,3
5712,0,2366,1
...,...,...,...
419845,2291,130,1
419904,2291,1942,1
420315,2291,156,1
421981,2291,903,4


In [123]:
def datasaver_capri(df, filename):
    
    if not os.path.exists(DATASET_DIR + "processed_data_capri"):
        os.makedirs(DATASET_DIR + "processed_data_capri")
    
    df.to_csv(DATASET_DIR + "processed_data_capri/" + filename + ".txt", sep='\t', index=False, header=False)
    print("Data saved as " + filename + ".txt")
    

    


In [124]:
checkins_capri_min_int

Unnamed: 0,user_id_int,business_id_int,timestamp:float
0,0,0,1.333480e+09
1,0,1,1.333482e+09
3,1,2,1.333483e+09
4,2,3,1.333483e+09
5,2,4,1.333483e+09
...,...,...,...
447565,2279,6492,1.360982e+09
447566,2279,566,1.360982e+09
447567,667,2102,1.360982e+09
447568,2276,1127,1.360982e+09


In [125]:
checkins_capri_min_int = filter_remaining_businesses(checkins_capri_min_int, remaining_businesses)
poi_coos_capri_int = filter_remaining_businesses(poi_coos_capri_int, remaining_businesses)
datasize_capri = pd.DataFrame(data={"num_users" : [len(checkins_capri_min_int["user_id_int"].unique())], "num_items" : [len(checkins_capri_min_int["business_id_int"].unique())]})
train_df = filter_remaining_businesses(train_df, remaining_businesses)
val_df = filter_remaining_businesses(val_df, remaining_businesses)
test_df = filter_remaining_businesses(test_df, remaining_businesses)

In [126]:
datasaver_capri(checkins_capri_min_int, "checkins")
datasaver_capri(datasize_capri, "dataSize")
datasaver_capri(poi_coos_capri_int, "poiCoos")
datasaver_capri(train_df, "train")
datasaver_capri(val_df, "tune")
datasaver_capri(test_df, "test")

Data saved as checkins.txt
Data saved as dataSize.txt
Data saved as poiCoos.txt
Data saved as train.txt
Data saved as tune.txt
Data saved as test.txt
