In [1]:
import pandas as pd
import os

dataset =  "brightkite"     # choose between "gowalla" and "brightkite"


DATASET_DIR = f"/Volumes/Forster Neu/Masterarbeit Data/{dataset}_dataset/"

In [2]:
checkin_df = pd.read_csv(DATASET_DIR + f"loc-{dataset}_totalCheckins.txt", sep="\t", header=None, names=['user_id', 'date', 'lat', 'lon', 'business_id'])
socials_df = pd.read_csv(DATASET_DIR + f"loc-{dataset}_edges.txt", sep="\t", header=None, names=['user_id', 'friend_id'])

In [77]:
checkin_df = checkin_df[~checkin_df['business_id'].isin(["00000000000000000000000000000000", "ede07eeea22411dda0ef53e233ec57ca"])]

In [3]:
socials_df

Unnamed: 0,user_id,friend_id
0,0,1
1,0,2
2,0,3
3,0,4
4,0,5
...,...,...
428151,58225,58226
428152,58225,58227
428153,58226,58220
428154,58226,58225


In [79]:
def filter_df(df, min_reviews=10):
    while True:
        # Filter businesses with at least min_reviews reviews
        business_counts = df["business_id"].value_counts()
        business_mask = df['business_id'].map(business_counts) >= min_reviews
        df_filtered = df.loc[business_mask]

        # Filter users with at least min_reviews reviews
        user_counts = df_filtered['user_id'].value_counts()
        user_mask = df_filtered['user_id'].map(user_counts) >= min_reviews
        df_filtered = df_filtered.loc[user_mask]

        # If the size of the filtered DataFrame didn't change, break the loop
        if df_filtered.shape[0] == df.shape[0]:
            break

        # Update the DataFrame for the next iteration
        df = df_filtered

    return df_filtered

In [80]:
checkin_df_filtered = filter_df(checkin_df, min_reviews=10)

In [81]:
checkin_df_filtered

Unnamed: 0,user_id,date,lat,lon,business_id
1,0,2010-10-16T06:02:04Z,39.891383,-105.070814,7a0f88982aa015062b95e3b4843f9ca2
2,0,2010-10-16T03:48:54Z,39.891077,-105.068532,dd7cd3d264c2d063832db506fba8bf79
3,0,2010-10-14T18:25:51Z,39.750469,-104.999073,9848afcc62e500a01cf6fbf24b797732f8963683
4,0,2010-10-14T00:21:47Z,39.752713,-104.996337,2ef143e12038c870038df53e0478cefc
8,0,2010-10-13T03:57:23Z,39.827022,-105.143191,f6f52a75fd80e27e3770cd3a87054f27
...,...,...,...,...,...
4747187,58186,2008-12-31T19:46:34Z,39.739154,-104.984703,ee8b1d0ea22411ddb074dbd65f1665cf
4747188,58186,2008-12-08T23:49:07Z,39.633321,-105.317215,ee8b88dea22411dda02b03b8e03e14ab
4747189,58186,2008-12-03T21:09:14Z,39.633321,-105.317215,ee8b88dea22411dda02b03b8e03e14ab
4747190,58186,2008-11-30T22:30:12Z,39.633321,-105.317215,ee8b88dea22411dda02b03b8e03e14ab


In [82]:
# Step 1: Calculate the value counts of `business_id`
value_counts = checkin_df_filtered['business_id'].value_counts().reset_index()
value_counts.columns = ['business_id', 'count']

# Step 2: Normalize the counts y dividing by the maximum value count
max_count = value_counts['count'].max()
value_counts['business_popularity'] = value_counts['count'] / max_count

# Step 3: Merge the normalized counts back into the original DataFrame
checkin_df_filtered = checkin_df_filtered.merge(value_counts[['business_id', 'business_popularity']], on='business_id', how='left')


In [83]:
def user_popularity_calculator(checkin_df_filtered, socials_df):
    # Filter out instances with the specified business_id
    try:
        checkin_df_filtered = checkin_df_filtered[checkin_df_filtered['business_id'] != "00000000000000000000000000000000"]
    except Exception as e:
        print("No such field found to filter out")
    # Calculate average popularity per user
    average_popularity_per_user = checkin_df_filtered.groupby('user_id')['business_popularity'].mean().reset_index()
    average_popularity_per_user.columns = ['user_id', 'average_popularity']

    average_popularity_per_user = average_popularity_per_user.sort_values(by="average_popularity", ascending=False)

    
    # Sort by average popularity
    

    # Get top 1000 users
    high_pop_user_df_sample = average_popularity_per_user.head(1500)
    
    # Get the middle 1000 users around the median
    median_index = len(average_popularity_per_user) // 2
    start_med_index = max(median_index - 750, 0)
    end_med_index = min(median_index + 750, len(average_popularity_per_user))
    med_pop_user_df_sample = average_popularity_per_user.iloc[start_med_index:end_med_index]
    
    # Get the lowest 1000 users
    low_pop_user_df_sample = average_popularity_per_user.tail(1500)

    unique_users = list(set(high_pop_user_df_sample["user_id"].tolist() + med_pop_user_df_sample["user_id"].tolist() + low_pop_user_df_sample["user_id"].tolist()))

    checkin_df_sample = checkin_df_filtered[checkin_df_filtered["user_id"].isin(unique_users)]

    checkin_df_sample = filter_df(checkin_df_sample, min_reviews=10)

    checkin_df_sample = checkin_df_sample[checkin_df_sample["user_id"].isin(unique_users)]



    socials_df_sample = socials_df.loc[socials_df["user_id"].isin(unique_users)]
    socials_df_sample = socials_df_sample.loc[socials_df_sample["friend_id"].isin(unique_users)]
    
    return checkin_df_sample, high_pop_user_df_sample, med_pop_user_df_sample, low_pop_user_df_sample, socials_df_sample

In [84]:
checkin_df_sample, high_pop_user_df_sample, medium_pop_user_df_sample, low_pop_user_df_sample, socials_df_sample = user_popularity_calculator(checkin_df_filtered, socials_df)

In [85]:
len(low_pop_user_df_sample)

1500

In [86]:
len(checkin_df_sample["user_id"].unique())

4202

In [87]:
def data_saver(df, filename, framework):
    if not os.path.exists(DATASET_DIR + "processed_data_" + framework):
        os.makedirs(DATASET_DIR + "processed_data_" + framework)
    
    df.to_csv(DATASET_DIR + "processed_data_" + framework + "/" + filename + ".csv")
    print("Data saved as " + framework + filename + ".csv")
    

In [88]:
# first of all saving data for cornac
data_saver(checkin_df_sample, "user_events", "cornac")
data_saver(high_pop_user_df_sample, "high_pop_user_sample", "cornac")
data_saver(medium_pop_user_df_sample, "medium_pop_user_sample", "cornac")
data_saver(low_pop_user_df_sample, "low_pop_user_sample", "cornac")
data_saver(socials_df, "socials_sample", "cornac")


Data saved as cornacuser_events.csv
Data saved as cornachigh_pop_user_sample.csv
Data saved as cornacmedium_pop_user_sample.csv
Data saved as cornaclow_pop_user_sample.csv
Data saved as cornacsocials_sample.csv


In [89]:
def data_saver_recbole(df, framework, suffix):
    
    if not os.path.exists(DATASET_DIR + "processed_data_" + framework):
        os.makedirs(DATASET_DIR + "processed_data_" + framework)

    df.to_csv(f"{DATASET_DIR}processed_data_{framework}/{dataset}_sample.{suffix}", sep="\t", index=False)

In [90]:
checkin_df_sample['review_id'] = range(1, len(checkin_df_sample) + 1)
# Step 1: Group by user_id and business_id and count check-ins
checkin_df_sample['checkin_count'] = checkin_df_sample.groupby(['user_id', 'business_id'])['business_id'].transform('count')


In [91]:
business_sample = checkin_df_sample.groupby('business_id').first().reset_index()
business_sample = business_sample[["business_id", "lat", "lon"]]


In [92]:
business_sample

Unnamed: 0,business_id,lat,lon
0,00204e1a5b3b38239885d598ee136dc54e3b6bba,42.015791,-87.842834
1,002d84f6f69032864901e9389416016b4e18cce8,30.371864,-97.725859
2,0055f6812c66283b3beedeafb1fac2f8a0468b23,35.171178,-106.589938
3,0065008261f45817d16bde83af272e47,42.907334,11.915677
4,00879999f574e6241f84d3ba6442fc5f,52.681236,-0.306004
...,...,...,...
9424,ffd485f6cd6611dd9980003048c0801e,36.862314,-76.304705
9425,ffdabea6ed3911ddba25003048c0801e,35.399708,-78.011890
9426,ffead5c643e111de9f3a003048c10834,38.176224,-83.439606
9427,fffa7914b1e00a4a384f592e96131ab9,35.187334,136.969434


In [93]:
user_df_sample = checkin_df_sample.groupby('user_id').size().reset_index(name='review_counts:float')
checkin_df_sample = checkin_df_sample[["review_id","user_id","business_id","date", "checkin_count"]]


In [94]:
user_df_sample

Unnamed: 0,user_id,review_counts:float
0,5,318
1,12,954
2,17,619
3,21,1356
4,25,1473
...,...,...
4197,57982,14
4198,58011,43
4199,58038,34
4200,58094,14


In [95]:
checkin_df_sample

Unnamed: 0,review_id,user_id,business_id,date,checkin_count
5175,1,5,8fde23d6245c11debf73003048c0801e,2010-08-12T23:22:05Z,6
5179,2,5,427afd521bfc11deaa07003048c0801e,2009-12-24T04:23:46Z,49
5183,3,5,ed128b88a22411dd947eb3052da0b148,2009-10-09T20:29:29Z,1
5184,4,5,ee81ef22a22411ddb5e97f082c799f59,2009-10-03T22:32:57Z,92
5185,5,5,ee6b8534a22411dd96da6f185082f76e,2009-10-02T20:26:43Z,69
...,...,...,...,...,...
3001583,553150,58185,c7017a267a9c11dd99970030487eb504,2008-09-05T17:34:39Z,21
3001584,553151,58185,c7017a267a9c11dd99970030487eb504,2008-09-05T05:59:09Z,21
3001585,553152,58185,c7017a267a9c11dd99970030487eb504,2008-09-05T05:58:44Z,21
3001586,553153,58185,c7017a267a9c11dd99970030487eb504,2008-09-04T17:17:45Z,21


In [20]:
def convert_to_unix_timestamp(df, column_name):
    """
    Convert a column of timestamps in a DataFrame to Unix timestamps.

    Args:
        df (pd.DataFrame): The DataFrame containing the timestamp column.
        column_name (str): The name of the column with timestamps in "%Y-%m-%d %H:%M:%S" format.

    Returns:
        pd.DataFrame: The DataFrame with an additional column for Unix timestamps.
    """
    # Convert the column to datetime objects
    df[column_name] = pd.to_datetime(df[column_name], format="mixed")
    
    # Convert datetime objects to Unix timestamps
    df[f'{column_name}'] = df[column_name].apply(lambda x: x.timestamp())
    
    return df

In [21]:
checkin_df_sample.rename(columns={"user_id":"user_id:token", "business_id":"item_id:token", "checkin_count":"rating:float", "date":"timestamp:float", "review_id": "review_id:token"}, inplace=True)
user_df_sample.rename(columns={"user_id":"user_id:token"}, inplace=True)
business_sample.rename(columns={"business_id": "item_id:token", "lat" : "lat:float", "lon" : "lon:float"}, inplace=True)

In [22]:
checkin_df_sample = convert_to_unix_timestamp(checkin_df_sample, "timestamp:float")

In [24]:
checkin_df_sample = checkin_df_sample.loc[checkin_df_sample["rating:float"] < 100]

checkin_df_sample.sort_values(by="rating:float", ascending=False)

checkin_df_timestamps = checkin_df_sample.copy()

checkin_df_sample = checkin_df_sample.drop_duplicates(subset=["user_id:token", "item_id:token"], keep="first")

In [25]:
checkin_df_timestamps["user_id:token"].nunique()

4027

In [26]:
data_saver_recbole(checkin_df_sample, "recbole", "inter")
data_saver_recbole(user_df_sample, "recbole", "user")
data_saver_recbole(business_sample, "recbole", "item")

### Lastly, getting the data ready for CAPRI

In [27]:
# reverting to unix timestamps and only using the necessary columns
review_df_sample = checkin_df_sample.copy()
checkins_capri_min = checkin_df_timestamps[["user_id:token", "item_id:token", "timestamp:float"]]

checkins_capri_train_test_tune = review_df_sample[["user_id:token", "item_id:token", "timestamp:float", "rating:float"]]

In [28]:
len(checkins_capri_min["user_id:token"].unique())
len(checkins_capri_min["item_id:token"].unique())

print("Length of users: ", len(checkins_capri_min["user_id:token"].unique()))
print("Length of POIs: ", len(checkins_capri_min["item_id:token"].unique()))



Length of users:  4027
Length of POIs:  8803


In [29]:
# FINAL 3
datasize_capri = pd.DataFrame(data={"num_users" : [len(checkins_capri_min["user_id:token"].unique())], "num_items" : [len(checkins_capri_min["item_id:token"].unique())]})

In [30]:
datasize_capri

Unnamed: 0,num_users,num_items
0,4027,8803


In [31]:
# FINAL 4
poi_coos_capri = business_sample.copy()

In [32]:
user_df_sample = pd.concat([high_pop_user_df_sample, medium_pop_user_df_sample, low_pop_user_df_sample])

In [33]:
# FINAL 5
socials_df_sample

Unnamed: 0,user_id,friend_id
537,5,29
547,5,123
549,5,125
550,5,127
1025,12,41
...,...,...
426437,57084,57083
427306,57708,55487
427555,57859,57860
427557,57860,57859


In [34]:
checkins_capri_train_test_tune.sort_values(by="rating:float", ascending=False)

Unnamed: 0,user_id:token,item_id:token,timestamp:float,rating:float
25696,36,6b92ac4dc45a850b326177f8e654bae58a453f16,1.255388e+09,99
1762077,10751,b9d89b5ea22411dda5ef27ee20f87cee,1.283949e+09,99
1395903,7844,bc03df0c7a6411dd95460030487eb504,1.258764e+09,99
2802526,36729,ef45799ca22411dd9236df37bed1f662,1.271716e+09,99
228250,674,ef31eaf8a22411dda712cf84a0eb3a89,1.255394e+09,99
...,...,...,...,...
2348576,19406,ee6b8534a22411dd96da6f185082f76e,1.223313e+09,1
382114,1587,947cb88bcbc87d6a3ef9a591f4302497a5b57da3,1.248669e+09,1
382120,1587,6cf5d08cbefe8a0c83fa8084800acac680e9504d,1.245122e+09,1
2348515,19406,be2f1e669cc111dd9a50003048c0801e,1.250832e+09,1


In [35]:
checkins_capri_min['user_id_int'], user_id_map = pd.factorize(checkins_capri_min['user_id:token'])
checkins_capri_min['business_id_int'], business_id_map = pd.factorize(checkins_capri_min['item_id:token'])

# FINAL 2.0
checkins_capri_min_int = checkins_capri_min[["user_id_int", "business_id_int", "timestamp:float"]]
# Create mapping dictionaries
user_id_mapping = {original: i for i, original in enumerate(user_id_map)}
business_id_mapping = {original: j for j, original in enumerate(business_id_map)}

poi_coos_capri["business_id_int"] = poi_coos_capri["item_id:token"].map(business_id_mapping)
# FINAL 2.0
poi_coos_capri_int = poi_coos_capri[["business_id_int", "lat:float", "lon:float"]]
socials_df_sample['user_id_int'] = socials_df_sample['user_id'].map(user_id_mapping)
socials_df_sample["friends_int"] = socials_df_sample["friend_id"].map(user_id_mapping)



# FINAL 2.0
social_relations_min_int = socials_df_sample[["user_id_int", "friends_int"]]
checkins_capri_train_test_tune['user_id_int'] = checkins_capri_train_test_tune['user_id:token'].map(user_id_mapping)
checkins_capri_train_test_tune['business_id_int'] = checkins_capri_train_test_tune['item_id:token'].map(business_id_mapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  checkins_capri_min['user_id_int'], user_id_map = pd.factorize(checkins_capri_min['user_id:token'])


In [37]:
# splitting the data into train, test, and tune
checkins_capri_train_test_tune = checkins_capri_train_test_tune.sort_values(by=["user_id_int", "timestamp:float"])
checkins_capri_train_test_tune = checkins_capri_train_test_tune[["user_id_int", "business_id_int", "rating:float"]]

# Split the data
train_list = []
val_list = []
test_list = []

for user, group in checkins_capri_train_test_tune.groupby('user_id_int'):
    n = len(group)
    train_end = int(n * 0.65)
    val_end = int(n * 0.80)
    
    train_list.append(group.iloc[:train_end])
    val_list.append(group.iloc[train_end:val_end])
    test_list.append(group.iloc[val_end:])

# Combine lists into DataFrames
train_df = pd.concat(train_list)
val_df = pd.concat(val_list)
test_df = pd.concat(test_list)



# Check the splits

# FINAL 6-8
print("Train Set:")
print(train_df.head())
print("\nValidation Set:")
print(val_df.head())
print("\nTest Set:")
print(test_df.head())


Train Set:
      user_id_int  business_id_int  rating:float
5516            0               28             2
5496            0               27             1
5473            0               26             1
5469            0               25             1
5439            0               24             3

Validation Set:
      user_id_int  business_id_int  rating:float
5222            0               10             3
5214            0                9             5
5209            0                8             4
5191            0                7             2
5189            0                6             2

Test Set:
      user_id_int  business_id_int  rating:float
5187            0                5             2
5185            0                4            69
5184            0                3            92
5183            0                2             1
5179            0                1            49


In [38]:
social_relations_min_int

Unnamed: 0,user_id_int,friends_int
537,0.0,6.0
547,0.0,27.0
549,0.0,28.0
550,0.0,29.0
1025,1.0,8.0
...,...,...
426437,,3999.0
427306,4011.0,3955.0
427555,4017.0,4018.0
427557,4018.0,4017.0


In [39]:
social_relations_min_int

Unnamed: 0,user_id_int,friends_int
537,0.0,6.0
547,0.0,27.0
549,0.0,28.0
550,0.0,29.0
1025,1.0,8.0
...,...,...
426437,,3999.0
427306,4011.0,3955.0
427555,4017.0,4018.0
427557,4018.0,4017.0


In [40]:
def datasaver_capri(df, filename):
    
    if not os.path.exists(DATASET_DIR + "processed_data_capri"):
        os.makedirs(DATASET_DIR + "processed_data_capri")
    df.to_csv(DATASET_DIR + "processed_data_capri/" + filename + ".txt", sep='\t', index=False, header=False)
    print("Data saved as " + filename + ".txt")
    

    


In [41]:
poi_coos_capri_int.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  poi_coos_capri_int.dropna(inplace=True)


In [42]:
poi_coos_capri_int["business_id_int"] = poi_coos_capri_int["business_id_int"].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  poi_coos_capri_int["business_id_int"] = poi_coos_capri_int["business_id_int"].astype(int)


In [43]:
social_relations_min_int.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  social_relations_min_int.dropna(inplace=True)


In [44]:
social_relations_min_int["user_id_int"] = social_relations_min_int["user_id_int"].astype(int)
social_relations_min_int["friends_int"] = social_relations_min_int["friends_int"].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  social_relations_min_int["user_id_int"] = social_relations_min_int["user_id_int"].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  social_relations_min_int["friends_int"] = social_relations_min_int["friends_int"].astype(int)


In [45]:
datasaver_capri(checkins_capri_min_int, "checkins")
datasaver_capri(datasize_capri, "dataSize")
datasaver_capri(poi_coos_capri_int, "poiCoos")
datasaver_capri(social_relations_min_int, "socialRelations")
datasaver_capri(train_df, "train")
datasaver_capri(val_df, "tune")
datasaver_capri(test_df, "test")

Data saved as checkins.txt
Data saved as dataSize.txt
Data saved as poiCoos.txt
Data saved as socialRelations.txt
Data saved as train.txt
Data saved as tune.txt
Data saved as test.txt
