In [159]:
import pandas as pd
import os


# Name of the dataset; used for the input folder and for the RecBole file names.
dataset = "foursquarenyc"

# Folder that holds one "<dataset>_dataset/" directory per dataset. The external-volume
# path is only the default: set MASTERARBEIT_DATA_ROOT to run the notebook on a machine
# where the data lives somewhere else.
DATA_ROOT = os.environ.get("MASTERARBEIT_DATA_ROOT", "/Volumes/Forster Neu/Masterarbeit Data").rstrip("/")

# Keep the trailing slash: the rest of the notebook builds paths by string concatenation.
DATASET_DIR = f"{DATA_ROOT}/{dataset}_dataset/"


In [160]:
# Load the raw check-in table ("," is already the default delimiter of read_csv).
checkin_df = pd.read_csv(f"{DATASET_DIR}foursquare_data.csv")


In [161]:
# Map the raw column names onto the names used in the rest of the notebook.
raw_to_notebook_names = {
    "userId": "user_id",
    "venueId": "business_id",
    "venueCategoryId": "category_id",
    "utcTimestamp": "timestamp",
}
checkin_df.rename(columns=raw_to_notebook_names, inplace=True)

In [162]:
# Size and sparsity of the raw user x POI matrix.
n_users = len(checkin_df["user_id"].unique())
n_pois = len(checkin_df["business_id"].unique())
print("Number of users, number of POIs", n_users, n_pois)
# NOTE(review): len(checkin_df) counts every check-in row, including repeat visits of the
# same user to the same POI - confirm that this is the intended definition of sparsity.
print("Sparsity:", 1 - len(checkin_df) / (n_users * n_pois))

Number of users, number of POIs 1083 38333
Sparsity: 0.9945217396687467


In [163]:
# Keep only POIs with at least 10 check-ins, then only users with at least 10 check-ins
# among the remaining rows (one pass each, in that order).

poi_checkins = checkin_df["business_id"].value_counts()
rows_of_frequent_pois = checkin_df["business_id"].map(poi_checkins).ge(10)
checkin_df_filtered = checkin_df[rows_of_frequent_pois]

user_checkins = checkin_df_filtered["user_id"].value_counts()
rows_of_active_users = checkin_df_filtered["user_id"].map(user_checkins).ge(10)
checkin_df_filtered = checkin_df_filtered[rows_of_active_users]

In [164]:
# Check-ins per user after filtering, largest count first.
checkin_df_filtered["user_id"].value_counts()

user_id
293     2515
185     1918
354     1620
315     1530
974      967
        ... 
627       25
607       21
693       20
1069      19
1073      17
Name: count, Length: 1083, dtype: int64

In [165]:
# Step 1: check-ins per POI as a two-column frame (business_id, count)
value_counts = (
    checkin_df_filtered['business_id']
    .value_counts()
    .rename_axis('business_id')
    .reset_index(name='count')
)

# Step 2: popularity = a POI's count relative to the count of the most visited POI
value_counts['business_popularity'] = value_counts['count'].div(value_counts['count'].max())

# Step 3: attach the popularity score to every check-in row
checkin_df_filtered = pd.merge(
    checkin_df_filtered,
    value_counts[['business_id', 'business_popularity']],
    on='business_id',
    how='left',
)


In [166]:
def user_popularity_calculator(checkin_df_filtered, low_quantile=0.2, high_quantile=0.8):
    """Split users into high / medium / low groups by the popularity of the POIs they visit.

    For every user the mean ``business_popularity`` over all of their check-in rows is
    computed. Users are then compared against two quantiles of that per-user mean.

    Args:
        checkin_df_filtered (pd.DataFrame): Check-ins with at least the columns
            ``user_id`` and ``business_popularity``.
        low_quantile (float): Quantile below which a user counts as "low". Defaults to 0.2.
        high_quantile (float): Quantile at or above which a user counts as "high".
            Defaults to 0.8.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: ``(high, medium, low)`` frames with
        the columns ``user_id`` and ``business_popularity`` (the per-user mean). A user whose
        mean equals a threshold falls into the upper of the two adjacent groups.

    Raises:
        ValueError: If the quantiles are not ordered as 0 <= low <= high <= 1.
    """
    if not 0 <= low_quantile <= high_quantile <= 1:
        raise ValueError("quantiles must satisfy 0 <= low_quantile <= high_quantile <= 1")

    average_popularity_per_user = (
        checkin_df_filtered.groupby('user_id')['business_popularity'].mean().reset_index()
    )
    mean_popularity = average_popularity_per_user['business_popularity']

    threshold_low = mean_popularity.quantile(low_quantile)
    threshold_high = mean_popularity.quantile(high_quantile)

    # Boolean masks keep the rows in the order produced by the groupby (sorted by user_id).
    is_high = mean_popularity >= threshold_high
    is_low = mean_popularity < threshold_low

    high_pop_user_df = average_popularity_per_user[is_high]
    medium_pop_user_df = average_popularity_per_user[~is_high & ~is_low]
    low_pop_user_df = average_popularity_per_user[is_low]

    return high_pop_user_df, medium_pop_user_df, low_pop_user_df

In [167]:
# Group the users by the 0.2 / 0.8 quantiles of the mean popularity of the POIs they visited.
high_pop_user_df, medium_pop_user_df, low_pop_user_df = user_popularity_calculator(checkin_df_filtered)

In [168]:
def data_sample_maker(high_pop_user_df, medium_pop_user_df, low_pop_user_df, checkin_df_filtered):
    """Collect the check-ins of all users that appear in the three popularity groups.

    No sub-sampling is done (the dataset is small enough): the three user frames are
    returned as copies, and the check-ins are restricted to the users they contain.

    Args:
        high_pop_user_df (pd.DataFrame): Users of the high-popularity group (``user_id`` column).
        medium_pop_user_df (pd.DataFrame): Users of the medium-popularity group.
        low_pop_user_df (pd.DataFrame): Users of the low-popularity group.
        checkin_df_filtered (pd.DataFrame): Check-ins with a ``user_id`` column.

    Returns:
        tuple: ``(checkin_df_sample, high_pop_user_df_sample, medium_pop_user_df_sample,
        low_pop_user_df_sample)``. All four are independent copies of their inputs.
    """
    high_pop_user_df_sample = high_pop_user_df.copy()
    medium_pop_user_df_sample = medium_pop_user_df.copy()
    low_pop_user_df_sample = low_pop_user_df.copy()

    sampled_user_ids = (
        set(high_pop_user_df_sample["user_id"])
        | set(medium_pop_user_df_sample["user_id"])
        | set(low_pop_user_df_sample["user_id"])
    )

    # Callers add columns to the result, so hand back an explicit copy instead of a slice
    # of checkin_df_filtered (writing into such a slice triggers SettingWithCopyWarning).
    belongs_to_sample = checkin_df_filtered["user_id"].isin(sampled_user_ids)
    checkin_df_sample = checkin_df_filtered.loc[belongs_to_sample].copy()

    return checkin_df_sample, high_pop_user_df_sample, medium_pop_user_df_sample, low_pop_user_df_sample

    

In [169]:
# Build the working sample: the check-ins plus the three user groups.
(
    checkin_df_sample,
    high_pop_user_df_sample,
    medium_pop_user_df_sample,
    low_pop_user_df_sample,
) = data_sample_maker(
    high_pop_user_df,
    medium_pop_user_df,
    low_pop_user_df,
    checkin_df_filtered,
)

In [170]:
def data_saver(df, filename, framework):
    """Write ``df`` to ``<DATASET_DIR>processed_data_<framework>/<filename>.csv``.

    The output folder is created if it does not exist yet. The frame is written with
    ``to_csv`` defaults, so the index is included as the first column.

    Args:
        df (pd.DataFrame): Frame to save.
        filename (str): File name without the ``.csv`` extension.
        framework (str): Framework name; selects the ``processed_data_<framework>`` folder.
    """
    output_dir = DATASET_DIR + "processed_data_" + framework
    # exist_ok removes the gap between "check" and "create" and is a no-op on re-runs.
    os.makedirs(output_dir, exist_ok=True)

    df.to_csv(output_dir + "/" + filename + ".csv")
    print("Data saved as " + framework + filename + ".csv")
    

In [171]:
# Cornac export first: the check-in events plus one file per user popularity group.
cornac_exports = (
    (checkin_df_sample, "user_events"),
    (high_pop_user_df_sample, "high_pop_user_sample"),
    (medium_pop_user_df_sample, "medium_pop_user_sample"),
    (low_pop_user_df_sample, "low_pop_user_sample"),
)
for cornac_frame, cornac_name in cornac_exports:
    data_saver(cornac_frame, cornac_name, "cornac")


Data saved as cornacuser_events.csv
Data saved as cornachigh_pop_user_sample.csv
Data saved as cornacmedium_pop_user_sample.csv
Data saved as cornaclow_pop_user_sample.csv


In [172]:
def data_saver_recbole(df, framework, suffix):
    """Write ``df`` as a tab-separated file ``<dataset>_sample.<suffix>``.

    The file goes to ``<DATASET_DIR>processed_data_<framework>/``; the folder is created
    if it does not exist yet. The index is not written.

    Args:
        df (pd.DataFrame): Frame to save.
        framework (str): Framework name; selects the ``processed_data_<framework>`` folder.
        suffix (str): File extension without the dot (this notebook passes "inter", "user", "item").
    """
    output_dir = f"{DATASET_DIR}processed_data_{framework}"
    # exist_ok removes the gap between "check" and "create" and is a no-op on re-runs.
    os.makedirs(output_dir, exist_ok=True)

    df.to_csv(f"{output_dir}/{dataset}_sample.{suffix}", sep="\t", index=False)

In [173]:
# Number the check-in rows 1..N in their current order to get a unique id per row.
n_checkins = len(checkin_df_sample)
checkin_df_sample['review_id'] = range(1, n_checkins + 1)

In [174]:
# Number of check-ins per (user, POI) pair, broadcast back onto every row of that pair.
# NOTE(review): repeat visits stay in the table as separate rows that all carry the same
# pair total - confirm that the downstream frameworks expect duplicated pairs.
visits_per_pair = checkin_df_sample.groupby(['user_id', 'business_id'])['business_id'].transform('count')
checkin_df_sample['checkin_count'] = visits_per_pair


In [175]:
# One row per POI with its coordinates, taken from the first non-null values seen per POI.
first_row_per_poi = checkin_df_sample.groupby('business_id').first()
business_sample = first_row_per_poi.reset_index()[["business_id", "latitude", "longitude"]]


In [176]:
# Per-user check-in totals; the column name already carries its RecBole type suffix.
checkins_per_user = checkin_df_sample.groupby('user_id').size()
user_df_sample = checkins_per_user.reset_index(name='review_counts:float')

# From here on only the interaction columns of the check-in table are needed.
interaction_columns = ["review_id", "user_id", "business_id", "timestamp", "checkin_count"]
checkin_df_sample = checkin_df_sample[interaction_columns]

In [177]:
def convert_to_unix_timestamp(df, column_name):
    """
    Overwrite a timestamp column with Unix timestamps (seconds since the epoch).

    The column is parsed with ``pd.to_datetime(..., format="mixed")``, so the format of
    every value is inferred individually. Each parsed value is then replaced by the
    result of its ``timestamp()`` method.

    Args:
        df (pd.DataFrame): The DataFrame containing the timestamp column. It is modified
            in place; no new column is added.
        column_name (str): The name of the column holding the timestamps.

    Returns:
        pd.DataFrame: The same DataFrame object, with ``column_name`` now holding float
        Unix timestamps.
    """
    parsed = pd.to_datetime(df[column_name], format="mixed")

    # Write the result back once instead of first storing the intermediate datetimes.
    df[column_name] = parsed.apply(lambda moment: moment.timestamp())

    return df

In [178]:
# Switch to RecBole-style "name:type" column headers.
inter_headers = {
    "user_id": "user_id:token",
    "business_id": "item_id:token",
    "checkin_count": "rating:float",
    "timestamp": "timestamp:float",
    "review_id": "review_id:token",
}
checkin_df_sample.rename(columns=inter_headers, inplace=True)
user_df_sample.rename(columns={"user_id": "user_id:token"}, inplace=True)

In [179]:
# Replace the check-in times in "timestamp:float" with Unix epoch seconds.
checkin_df_sample = convert_to_unix_timestamp(checkin_df_sample, "timestamp:float")

In [180]:
# Write the three RecBole files (interactions, users, items).
data_saver_recbole(checkin_df_sample, "recbole", "inter")
data_saver_recbole(user_df_sample, "recbole", "user")

# The .inter and .user headers use the "name:type" form, and the .inter file keys POIs as
# "item_id:token". Give the POI table the same header form and the same key on the way
# out. business_sample itself keeps its plain column names because later cells still
# read "business_id" from it.
item_headers = {
    "business_id": "item_id:token",
    "latitude": "latitude:float",
    "longitude": "longitude:float",
}
data_saver_recbole(business_sample.rename(columns=item_headers), "recbole", "item")

In [181]:
# CAPRI export: the timestamps are already Unix seconds, so only the columns are selected.
review_df_sample = checkin_df_sample.copy()
# checkins.txt is written without the rating; train/tune/test keep the rating and use the
# timestamp only for ordering. Both frames get new columns assigned in later cells, so
# take real copies: writing into a plain column slice of review_df_sample raises
# SettingWithCopyWarning.
checkins_capri_min = review_df_sample[["user_id:token", "item_id:token", "timestamp:float"]].copy()
checkins_capri_train_test_tune = review_df_sample[["user_id:token", "item_id:token", "timestamp:float", "rating:float"]].copy()

In [182]:
# Number of distinct users and POIs that go into the CAPRI export.
print("Length of users: ", len(checkins_capri_min["user_id:token"].unique()))
print("Length of POIs: ", len(checkins_capri_min["item_id:token"].unique()))



Length of users:  1083
Length of POIs:  5135


In [183]:
# FINAL 3
# One-row table with the number of distinct users and POIs.
num_users = len(checkins_capri_min["user_id:token"].unique())
num_items = len(checkins_capri_min["item_id:token"].unique())
datasize_capri = pd.DataFrame(data={"num_users": [num_users], "num_items": [num_items]})

In [184]:
# FINAL 4
# POI coordinates for the CAPRI export; a copy, so columns added to it later do not
# show up in business_sample.
poi_coos_capri = business_sample.copy()
# Stack the three popularity groups into one user table. This rebinds user_df_sample,
# which until here held the per-user check-in counts saved for RecBole.
user_df_sample = pd.concat([high_pop_user_df_sample, medium_pop_user_df_sample, low_pop_user_df_sample])

In [185]:
# Preview of the (user, POI, timestamp) table before the ids are turned into integers.
checkins_capri_min

Unnamed: 0,user_id:token,item_id:token,timestamp:float
0,470,49bbd6c0f964a520f4531fe3,1.333476e+09
1,979,4a43c0aef964a520c6a61fe3,1.333476e+09
2,69,4c5cc7b485a1e21e00d35711,1.333476e+09
3,87,4cf2c5321d18a143951b5cec,1.333476e+09
4,642,4ab966c3f964a5203c7f20e3,1.333476e+09
...,...,...,...
147933,994,45850853f964a5209f3f1fe3,1.360982e+09
147934,688,3fd66200f964a52000e71ee3,1.360982e+09
147935,560,4bca32ff0687ef3be789dbcc,1.360982e+09
147936,945,50a77716e4b0b5a9492f6f56,1.360982e+09


In [186]:
# checkins_capri_min was created as a column slice of another frame; work on an own copy
# so that adding the id columns does not raise SettingWithCopyWarning.
checkins_capri_min = checkins_capri_min.copy()

# factorize numbers the ids 0..n-1 in order of first appearance and also returns the
# distinct original ids, which are used further down to build the lookup dictionaries.
checkins_capri_min['user_id_int'], user_id_map = pd.factorize(checkins_capri_min['user_id:token'])
checkins_capri_min['business_id_int'], business_id_map = pd.factorize(checkins_capri_min['item_id:token'])
# FINAL 2.0
checkins_capri_min_int = checkins_capri_min[["user_id_int", "business_id_int", "timestamp:float"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  checkins_capri_min['user_id_int'], user_id_map = pd.factorize(checkins_capri_min['user_id:token'])


In [187]:
# Preview of the check-in table with integer user and POI ids.
checkins_capri_min_int

Unnamed: 0,user_id_int,business_id_int,timestamp:float
0,0,0,1.333476e+09
1,1,1,1.333476e+09
2,2,2,1.333476e+09
3,3,3,1.333476e+09
4,4,4,1.333476e+09
...,...,...,...
147933,518,4150,1.360982e+09
147934,97,3682,1.360982e+09
147935,443,2476,1.360982e+09
147936,637,5073,1.360982e+09


In [188]:
# Lookup tables: original id -> integer code (its position in the factorize result)
user_id_mapping = dict(zip(user_id_map, range(len(user_id_map))))
business_id_mapping = dict(zip(business_id_map, range(len(business_id_map))))


In [189]:
# Translate the POI ids of the coordinate table into the integer codes used for the check-ins.
poi_int_ids = poi_coos_capri["business_id"].map(business_id_mapping)
poi_coos_capri["business_id_int"] = poi_int_ids
# FINAL 2.0
poi_columns = ["business_id_int", "latitude", "longitude"]
poi_coos_capri_int = poi_coos_capri[poi_columns]

In [190]:
# checkins_capri_train_test_tune was created as a column slice of another frame; work on
# an own copy so that adding the id columns does not raise SettingWithCopyWarning.
checkins_capri_train_test_tune = checkins_capri_train_test_tune.copy()

# Use the same integer codes as the check-in table.
checkins_capri_train_test_tune['user_id_int'] = checkins_capri_train_test_tune['user_id:token'].map(user_id_mapping)
checkins_capri_train_test_tune['business_id_int'] = checkins_capri_train_test_tune['item_id:token'].map(business_id_mapping)

In [191]:
# Keep only the integer ids, the rating and the timestamp, in this column order.
capri_split_columns = ["user_id_int", "business_id_int", "rating:float", "timestamp:float"]
checkins_capri_train_test_tune = checkins_capri_train_test_tune[capri_split_columns]

In [192]:
# FINAL
# The CAPRI tables at this point are checkins_capri_min_int, datasize_capri and
# poi_coos_capri_int. Only the last expression of a cell is rendered, so just the POI
# table is shown here.
poi_coos_capri_int


Unnamed: 0,business_id_int,latitude,longitude
0,3682,40.733596,-74.003139
1,322,40.756377,-73.967653
2,1914,40.739685,-74.006020
3,2334,40.718363,-73.990817
4,62,40.722842,-73.994116
...,...,...,...
5130,5126,40.743355,-73.932838
5131,5127,40.756945,-73.986011
5132,5132,40.889077,-74.108426
5133,5133,40.741677,-74.004831


In [193]:
# Preview of the table that is split into train / tune / test below.
checkins_capri_train_test_tune

Unnamed: 0,user_id_int,business_id_int,rating:float,timestamp:float
0,0,0,27,1.333476e+09
1,1,1,40,1.333476e+09
2,2,2,104,1.333476e+09
3,3,3,1,1.333476e+09
4,4,4,9,1.333476e+09
...,...,...,...,...
147933,518,4150,1,1.360982e+09
147934,97,3682,1,1.360982e+09
147935,443,2476,1,1.360982e+09
147936,637,5073,25,1.360982e+09


In [194]:
# Chronological per-user split: order each user's check-ins by time, then drop the
# timestamp, which is only needed for the ordering.
time_ordered = checkins_capri_train_test_tune.sort_values(by=["user_id_int", "timestamp:float"])
checkins_capri_train_test_tune = time_ordered[["user_id_int", "business_id_int", "rating:float"]]

# Per user: first 65 % of the rows -> train, next 15 % -> validation, remaining 20 % -> test.
train_list, val_list, test_list = [], [], []

for _, history in checkins_capri_train_test_tune.groupby('user_id_int'):
    history_len = len(history)
    first_cut = int(history_len * 0.65)
    second_cut = int(history_len * 0.80)

    train_list.append(history.iloc[:first_cut])
    val_list.append(history.iloc[first_cut:second_cut])
    test_list.append(history.iloc[second_cut:])

# Stitch the per-user pieces back together, one frame per split
train_df, val_df, test_df = (pd.concat(pieces) for pieces in (train_list, val_list, test_list))

# Quick look at the splits

# FINAL 6-8
for heading, split_df in (
    ("Train Set:", train_df),
    ("\nValidation Set:", val_df),
    ("\nTest Set:", test_df),
):
    print(heading)
    print(split_df.head())


Train Set:
      user_id_int  business_id_int  rating:float
0               0                0            27
462             0              422             1
1349            0                0            27
2946            0             1452             1
2948            0             1781             1

Validation Set:
       user_id_int  business_id_int  rating:float
25386            0             3872            13
25466            0                0            27
26677            0             3872            13
26735            0                0            27
28254            0             3872            13

Test Set:
       user_id_int  business_id_int  rating:float
33036            0                0            27
33535            0             3872            13
33590            0                0            27
35013            0                0            27
36041            0                0            27


In [195]:
# Preview of the training split (rows keep their index from before the split).
train_df

Unnamed: 0,user_id_int,business_id_int,rating:float
0,0,0,27
462,0,422,1
1349,0,0,27
2946,0,1452,1
2948,0,1781,1
...,...,...,...
138569,1082,5130,11
138570,1082,5131,10
138571,1082,2575,7
138605,1082,2463,5


In [196]:
def datasaver_capri(df, filename):
    """Write ``df`` to ``<DATASET_DIR>processed_data_capri/<filename>.txt``.

    The file is tab-separated and contains neither the header row nor the index. The
    output folder is created if it does not exist yet.

    Args:
        df (pd.DataFrame): Frame to save.
        filename (str): File name without the ``.txt`` extension.
    """
    output_dir = DATASET_DIR + "processed_data_capri"
    # exist_ok removes the gap between "check" and "create" and is a no-op on re-runs.
    os.makedirs(output_dir, exist_ok=True)

    df.to_csv(output_dir + "/" + filename + ".txt", sep='\t', index=False, header=False)
    print("Data saved as " + filename + ".txt")
    

    


In [197]:
# Write all CAPRI input files; the validation split is stored under the name "tune".
capri_exports = (
    (checkins_capri_min_int, "checkins"),
    (datasize_capri, "dataSize"),
    (poi_coos_capri_int, "poiCoos"),
    (train_df, "train"),
    (val_df, "tune"),
    (test_df, "test"),
)
for capri_frame, capri_name in capri_exports:
    datasaver_capri(capri_frame, capri_name)

Data saved as checkins.txt
Data saved as dataSize.txt
Data saved as poiCoos.txt
Data saved as train.txt
Data saved as tune.txt
Data saved as test.txt
