In [None]:
import os, csv
import pandas as pd

# Show the current working directory; relative paths passed to pandas resolve against it.
os.getcwd()

In [None]:
# Read the interaction pairs from the CSV located one directory above the working directory.
df = pd.read_csv('../MicroLens-50k_pairs.csv')
print(f'shape: {df.shape}')
# Preview the first five rows.
df.head()

In [None]:
from collections import Counter
import numpy as np

# k-core thresholds: the minimum number of interactions a user / an item must
# have for its rows to survive filtering.
min_u_num = 7
min_i_num = 5

def get_illegal_ids_by_inter_num(df, field, max_num=None, min_num=None):
    """Return the ids in ``df[field]`` whose interaction count is out of bounds.

    Args:
        df: DataFrame with one row per interaction.
        field: Name of the id column to count; ``None`` disables the check.
        max_num: Largest number of rows an id may have, or ``None`` for no
            upper bound.
        min_num: Smallest number of rows an id must have, or ``None`` for no
            lower bound.

    Returns:
        Set of ids occurring fewer than ``min_num`` or more than ``max_num``
        times. Empty when ``field`` is ``None`` or both bounds are ``None``.
    """
    if field is None:
        return set()
    if max_num is None and min_num is None:
        return set()

    # Compare against None explicitly: ``x or default`` would silently treat
    # a legitimate bound of 0 as "no bound".
    upper = np.inf if max_num is None else max_num
    lower = -1 if min_num is None else min_num

    inter_num = Counter(df[field].values)
    ids = {id_ for id_, count in inter_num.items() if count < lower or count > upper}
    print(f'{len(ids)} illegal_ids_by_inter_num, field={field}')

    return ids


def filter_by_k_core(df):
    """Iteratively prune ``df`` in place until no user or item is too infrequent.

    Each pass drops every interaction whose user has fewer than
    ``min_u_num`` rows or whose item has fewer than ``min_i_num`` rows.
    Removing rows can push other ids below their threshold, so passes repeat
    until a pass flags nothing.

    Args:
        df: Interaction DataFrame with ``'user'`` and ``'item'`` columns.
            Modified in place; nothing is returned.
    """
    while True:
        ban_users = get_illegal_ids_by_inter_num(df, field='user', max_num=None, min_num=min_u_num)
        ban_items = get_illegal_ids_by_inter_num(df, field='item', max_num=None, min_num=min_i_num)
        if not ban_users and not ban_items:
            return

        # Rows that touch a banned user or a banned item.
        dropped_inter = df['user'].isin(ban_users) | df['item'].isin(ban_items)
        # Report the number of rows actually removed (True entries in the
        # mask); the mask's length is just the current row count.
        print(f'{int(dropped_inter.sum())} dropped interactions')
        df.drop(df.index[dropped_inter], inplace=True)

In [None]:
# Apply the k-core filter; `df` is pruned in place by the call.
filter_by_k_core(df)
# Report the filtered shape once.
print(f'shape after k-core: {df.shape}')
df.head()

In [None]:
# Number of distinct users, then distinct items, present in df.
for column in ('user', 'item'):
    print(df[column].nunique())

In [None]:
import pandas as pd
import numpy as np

# df has three columns: 'user', 'item', 'timestamp'

# Fixed seed so the sampled negatives are identical on every run.
NEGATIVE_SAMPLING_SEED = 42
# Upper limit on the number of negatives drawn per user.
NUM_NEGATIVES_PER_USER = 20
rng = np.random.default_rng(NEGATIVE_SAMPLING_SEED)

# calculate the frequency of each item
item_frequency = df.groupby('item').size().reset_index(name='frequency')

# sort items by frequency in descending order
item_frequency_sorted = item_frequency.sort_values(by='frequency', ascending=False)

# sort the original dataframe by timestamp
df_sorted = df.sort_values('timestamp')

# get all unique items and users
all_items = set(df_sorted['item'].unique())
all_users = df_sorted['user'].unique()

# Collect every user's interacted items in a single pass instead of
# re-scanning the whole frame once per user.
items_by_user = df_sorted.groupby('user')['item'].agg(set).to_dict()

# generate negative samples for each user: items the user never interacted with
negative_samples_per_user = {}
for user in all_users:
    # Sort the candidates so the seeded draw does not depend on set iteration order.
    available_items = sorted(all_items - items_by_user[user])
    negative_samples = rng.choice(
        available_items,
        size=min(NUM_NEGATIVES_PER_USER, len(available_items)),
        replace=False,
    )
    negative_samples_per_user[user] = negative_samples

print(len(negative_samples_per_user.keys()))

In [None]:
# Item -> overall interaction count; built once because it is the same for every user.
item_popularity = item_frequency_sorted.set_index('item')['frequency']

top_6_items_per_user = {}
# One group per user (rows keep their timestamp order inside each group),
# which avoids masking the whole frame once per user.
for user, user_rows in df_sorted.groupby('user', sort=False):
    # Popularity of each of this user's interactions, keyed by the row's index label.
    interaction_popularity = user_rows['item'].map(item_popularity)
    # Row labels of the (up to) six interactions whose items are most frequent overall.
    top_6_rows = interaction_popularity.sort_values(ascending=False).index[:6]
    # Put the selected interactions back in chronological order and keep the item ids.
    # The DataFrame method is `sort_values`; `sort_value` does not exist.
    top_6_items_per_user[user] = user_rows.loc[top_6_rows].sort_values('timestamp')['item'].values

In [None]:
# Build one record per user: "user_id\ttop_items\tnegative_samples",
# where each item list is joined with ", ".
lines = []
for user in all_users:
    positives = ', '.join(map(str, top_6_items_per_user[user]))
    negatives = ', '.join(map(str, negative_samples_per_user[user]))
    lines.append(f"{user}\t{positives}\t{negatives}")

# Write all records, newline-separated, to a single TSV file.
tsv_file_path = 'user_items_negs.tsv'
with open(tsv_file_path, 'w') as tsv_out:
    tsv_out.write('\n'.join(lines))

In [None]:
from sklearn.model_selection import train_test_split

# Load the per-user TSV: user id, item list, negative-sample list (no header row).
tsv_file_path = 'user_items_negs.tsv'
data = pd.read_csv(
    tsv_file_path,
    sep='\t',
    header=None,
    names=['user', 'items', 'negative_samples'],
)

users = data['user'].unique()

# Hold out 20% of the users, then cut that hold-out in half:
# one half for validation, the other for test.
train_users, test_val_users = train_test_split(users, test_size=0.2, random_state=42)
val_users, test_users = train_test_split(test_val_users, test_size=0.5, random_state=42)

# Select each split's rows by user membership.
user_column = data['user']
train_data = data[user_column.isin(train_users)]
val_data = data[user_column.isin(val_users)]
test_data = data[user_column.isin(test_users)]

print(f"Training data size: {len(train_data)}")
print(f"Validation data size: {len(val_data)}")
print(f"Test data size: {len(test_data)}")

In [None]:
def save_to_tsv(df, file_path):
    """Write ``df`` to ``file_path`` as tab-separated text, without a header row or index column.

    Prints a confirmation line naming the destination once the file is written.
    """
    df.to_csv(file_path, index=False, header=False, sep='\t')
    print(f"File saved to {file_path}")

# Write each split to its own TSV file: train, then validation, then test.
for split_frame, split_path in (
    (train_data, 'train.tsv'),
    (val_data, 'val.tsv'),
    (test_data, 'test.tsv'),
):
    save_to_tsv(split_frame, split_path)
