In [1]:
import os, csv
import pandas as pd

os.getcwd()

'/Users/yyykobe/PycharmProjects/MM_Rec/data/MicroLens-50k/sample_subset'

In [2]:
df = pd.read_csv('../MicroLens-50k_pairs.csv')
print(f'shape: {df.shape}')
df[:5]

shape: (359708, 3)


Unnamed: 0,user,item,timestamp
0,36121,9580,1583378629552
1,26572,9580,1583436719018
2,37550,9580,1584412681021
3,14601,9580,1584848802432
4,15061,9580,1585388171106


In [50]:
from collections import Counter
import numpy as np

min_u_num, min_i_num = 7, 5

def get_illegal_ids_by_inter_num(df, field, max_num=None, min_num=None):
    if field is None:
        return set()
    if max_num is None and min_num is None:
        return set()

    max_num = max_num or np.inf
    min_num = min_num or -1

    ids = df[field].values
    inter_num = Counter(ids)
    ids = {id_ for id_ in inter_num if inter_num[id_] < min_num or inter_num[id_] > max_num}
    print(f'{len(ids)} illegal_ids_by_inter_num, field={field}')

    return ids


def filter_by_k_core(df):
    while True:
        ban_users = get_illegal_ids_by_inter_num(df, field='user', max_num=None, min_num=min_u_num)
        ban_items = get_illegal_ids_by_inter_num(df, field='item', max_num=None, min_num=min_i_num)
        if len(ban_users) == 0 and len(ban_items) == 0:
            return

        dropped_inter = pd.Series(False, index=df.index)
        if 'user':
            dropped_inter |= df['user'].isin(ban_users)
        if 'item':
            dropped_inter |= df['item'].isin(ban_items)
        print(f'{len(dropped_inter)} dropped interactions')
        df.drop(df.index[dropped_inter], inplace=True)

In [51]:
filter_by_k_core(df)
print(f'k-core shape: {df.shape}')
print(f'shape after k-core: {df.shape}')
df[:5]

9184 illegal_ids_by_inter_num, field=user
0 illegal_ids_by_inter_num, field=item
220472 dropped interactions
0 illegal_ids_by_inter_num, field=user
1634 illegal_ids_by_inter_num, field=item
165368 dropped interactions
1261 illegal_ids_by_inter_num, field=user
0 illegal_ids_by_inter_num, field=item
159739 dropped interactions
0 illegal_ids_by_inter_num, field=user
462 illegal_ids_by_inter_num, field=item
152431 dropped interactions
409 illegal_ids_by_inter_num, field=user
0 illegal_ids_by_inter_num, field=item
150685 dropped interactions
0 illegal_ids_by_inter_num, field=user
159 illegal_ids_by_inter_num, field=item
148271 dropped interactions
135 illegal_ids_by_inter_num, field=user
0 illegal_ids_by_inter_num, field=item
147648 dropped interactions
0 illegal_ids_by_inter_num, field=user
49 illegal_ids_by_inter_num, field=item
146845 dropped interactions
49 illegal_ids_by_inter_num, field=user
0 illegal_ids_by_inter_num, field=item
146650 dropped interactions
0 illegal_ids_by_inter_num,

Unnamed: 0,user,item,timestamp
0,36121,9580,1583378629552
2,37550,9580,1584412681021
3,14601,9580,1584848802432
4,15061,9580,1585388171106
6,3542,9580,1585404918503


In [52]:
print(df['user'].nunique())
print(df['item'].nunique())

14323
9535


In [42]:
import pandas as pd
import numpy as np

# 假设df是原始DataFrame，包含'user', 'item', 'timestamp'列

# 计算整个数据集中每个item的出现频率
item_frequency = df.groupby('item').size().reset_index(name='frequency')

# 对频率进行降序排序
item_frequency_sorted = item_frequency.sort_values(by='frequency', ascending=False)

# 对原始DataFrame根据timestamp进行排序
df_sorted = df.sort_values('timestamp')

# 获取所有独特的items和users
all_items = set(df_sorted['item'].unique())
all_users = df_sorted['user'].unique()

# 生成Negative Samples
negative_samples_per_user = {}
for user in all_users:
    user_items = df_sorted[df_sorted['user'] == user]['item'].unique()
    available_items = list(all_items - set(user_items))
    negative_samples = np.random.choice(available_items, size=min(20, len(available_items)), replace=False)
    negative_samples_per_user[user] = negative_samples

print(len(negative_samples_per_user.keys()))

25411


In [44]:
top_6_items_per_user = {}
for user in all_users:
    # 获取用户互动过的所有items
    user_items = df_sorted[df_sorted['user'] == user]['item']
    # 基于全局频率排序这些items，并取前6个
    top_6_items = user_items.map(item_frequency_sorted.set_index('item')['frequency']).sort_values(ascending=False).index[:6]
    top_6_items_per_user[user] = df_sorted.loc[top_6_items, 'item'].values

In [47]:
lines = []
for user in all_users:
    top_items_str = ', '.join([str(item) for item in top_6_items_per_user[user]])
    negative_samples_str = ', '.join([str(item) for item in negative_samples_per_user[user]])
    # 格式化每行数据
    line = f"{user}\t{top_items_str}\t{negative_samples_str}"
    lines.append(line)

tsv_file_path = 'user_items_negs.tsv'
with open(tsv_file_path, 'w') as file:
    file.write('\n'.join(lines))

In [48]:
from sklearn.model_selection import train_test_split

tsv_file_path = 'user_items_negs.tsv'
data = pd.read_csv(tsv_file_path, sep='\t', header=None, names=['user', 'items', 'negative_samples'])

users = data['user'].unique()

train_users, test_val_users = train_test_split(users, test_size=0.2, random_state=42)

val_users, test_users = train_test_split(test_val_users, test_size=0.5, random_state=42)

train_data = data[data['user'].isin(train_users)]
val_data = data[data['user'].isin(val_users)]
test_data = data[data['user'].isin(test_users)]

print(f"Training data size: {len(train_data)}")
print(f"Validation data size: {len(val_data)}")
print(f"Test data size: {len(test_data)}")

Training data size: 20328
Validation data size: 2541
Test data size: 2542


In [49]:
def save_to_tsv(df, file_path):
    df.to_csv(file_path, sep='\t', header=False, index=False)
    print(f"File saved to {file_path}")

# 按照原格式保存分割后的数据
save_to_tsv(train_data, 'train.tsv')
save_to_tsv(val_data, 'val.tsv')
save_to_tsv(test_data, 'test.tsv')


File saved to train.tsv
File saved to val.tsv
File saved to test.tsv
