In [1]:
import os
import sys

import pandas as pd
import numpy as np
import torch

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

In [2]:
# set seed
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [3]:
path = '/data/ephemeral/home/data/ml-20m/'
save_path = '/data/ephemeral/home/data/cold/'
df = pd.read_csv(path + 'ml-20m.inter', sep='\t', header=0)

In [4]:
df

Unnamed: 0,user_id:token,item_id:token,rating:float,timestamp:float
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580
...,...,...,...,...
20000258,138493,68954,4.5,1258126920
20000259,138493,69526,4.5,1259865108
20000260,138493,69644,3.0,1260209457
20000261,138493,70286,5.0,1258126944


## Rating filtering

In [5]:
# rating 3 이상만 남기기
df = df[df['rating:float'] > 2.5]

In [6]:
df.drop(columns=['rating:float', 'timestamp:float'], inplace=True)

In [7]:
df

Unnamed: 0,user_id:token,item_id:token
0,1,2
1,1,29
2,1,32
3,1,47
4,1,50
...,...,...
20000257,138493,68319
20000258,138493,68954
20000259,138493,69526
20000260,138493,69644


20000263 -> 16486759(-17.56%) 인터랙션 수 변동

In [8]:
df.rename(columns={'user_id:token': 'user_id', 'item_id:token': 'item_id'}, inplace=True)

In [9]:
df

Unnamed: 0,user_id,item_id
0,1,2
1,1,29
2,1,32
3,1,47
4,1,50
...,...,...
20000257,138493,68319
20000258,138493,68954
20000259,138493,69526
20000260,138493,69644


In [10]:
df.groupby('user_id').size().describe()

count    138445.000000
mean        119.085261
std         178.699187
min           1.000000
25%          30.000000
50%          59.000000
75%         133.000000
max        8241.000000
dtype: float64

## 10-core filtering

In [11]:
df = df.groupby('user_id').filter(lambda x: len(x) >= 10)

In [12]:
df.groupby('user_id').size().describe()

count    137534.000000
mean        119.830275
std         179.054563
min          10.000000
25%          30.000000
50%          59.000000
75%         133.000000
max        8241.000000
dtype: float64

138445 -> 137534(-0.65%) 유저 수 변동

In [13]:
df.groupby('item_id').size().describe()

count    24799.000000
mean       664.572644
std       2822.870765
min          1.000000
25%          3.000000
50%         17.000000
75%        176.500000
max      62631.000000
dtype: float64

item에 대해서는 10-core filtering 진행하지 않음, item 다양성 보전

In [14]:
df

Unnamed: 0,user_id,item_id
0,1,2
1,1,29
2,1,32
3,1,47
4,1,50
...,...,...
20000257,138493,68319
20000258,138493,68954
20000259,138493,69526
20000260,138493,69644


16486759 -> 16480737(-0.03%) 인터랙션 수 변동

## Split interactions

In [15]:
# Get unique user ids
unique_users = df['user_id'].unique()

# Split users into train and temp (80% train, 20% temp)
train_users, temp_users = train_test_split(unique_users, test_size=0.2, random_state=seed)

# Split temp into validation and test (50% validation, 50% test of the remaining 20%)
val_users, test_users = train_test_split(temp_users, test_size=0.5, random_state=seed)

# Create train, validation, and test dataframes
train_df = df[df['user_id'].isin(train_users)]
val_df = df[df['user_id'].isin(val_users)]
test_df = df[df['user_id'].isin(test_users)]

print(f'Train set: {len(train_df)} interactions')
print(f'Validation set: {len(val_df)} interactions')
print(f'Test set: {len(test_df)} interactions')

Train set: 13180359 interactions
Validation set: 1644096 interactions
Test set: 1656282 interactions


In [16]:
def split_interactions(val_df, test_df, k, n=5):
    # Select first k interactions for each user in val_df
    val_first_k = val_df.groupby('user_id').head(k)

    # Select following n interactions for each user in val_df
    val_following_n = val_df.groupby('user_id').apply(lambda x: x.iloc[k:k+n]).reset_index(drop=True)

    # Select first k interactions for each user in test_df
    test_first_k = test_df.groupby('user_id').head(k)

    # Select following n interactions for each user in test_df
    test_following_n = test_df.groupby('user_id').apply(lambda x: x.iloc[k:k+n]).reset_index(drop=True)

    return val_first_k, val_following_n, test_first_k, test_following_n

In [17]:
val_first_5, val_following_5, test_first_5, test_following_5 = split_interactions(val_df, test_df, k=5, n=5)

### Train data 생성

val_first_k, test_first_k를 train_df에 합침

In [18]:
def create_train_data(train_df, val_k, test_k):
    train_data_k = pd.merge(train_df, val_k, how='outer')
    train_data_k = pd.merge(train_data_k, test_k, how='outer')
    return train_data_k

In [19]:
train_data_5 = create_train_data(train_df, val_first_5, test_first_5)

In [20]:
train_df.groupby('user_id').size().describe()

count    110027.000000
mean        119.792042
std         178.307171
min          10.000000
25%          30.000000
50%          59.000000
75%         133.000000
max        8241.000000
dtype: float64

In [21]:
train_data_5.groupby('user_id').size().describe()

count    137534.000000
mean         96.833467
std         165.960978
min           5.000000
25%          19.000000
50%          42.000000
75%         106.000000
max        8241.000000
dtype: float64

In [22]:
train_data_5

Unnamed: 0,user_id,item_id
0,1,2
1,1,29
2,1,32
3,1,47
4,1,50
...,...,...
13317889,138493,68319
13317890,138493,68954
13317891,138493,69526
13317892,138493,69644


10-core filtering 이후의 유저 수 137534로 구성되었음을 확인, 시간 순서 보전

### 1-shot, 3-shot에 대해서도 학습 데이터 구성

In [23]:
val_first_3, val_following_3, test_first_3, test_following_3 = split_interactions(val_df, test_df, k=3, n=5)

In [24]:
train_data_3 = create_train_data(train_df, val_first_3, test_first_3)

In [25]:
val_first_1, val_following_1, test_first_1, test_following_1 = split_interactions(val_df, test_df, k=1, n=5)

In [26]:
train_data_1 = create_train_data(train_df, val_first_1, test_first_1)

# CSV 저장

In [27]:
one_shot_path = save_path + '1shot/'
os.makedirs(one_shot_path, exist_ok=True)

train_data_1.to_csv(one_shot_path + 'train_1.csv', index=False)
val_first_1.to_csv(one_shot_path + 'val_1_k.csv', index=False)
val_following_1.to_csv(one_shot_path + 'val_1_n.csv', index=False)
test_first_1.to_csv(one_shot_path + 'test_1_k.csv', index=False)
test_following_1.to_csv(one_shot_path + 'test_1_n.csv', index=False)

In [28]:
three_shot_path = save_path + '3shot/'
os.makedirs(three_shot_path, exist_ok=True)

train_data_3.to_csv(three_shot_path + 'train_3.csv', index=False)
val_first_3.to_csv(three_shot_path + 'val_3_k.csv', index=False)
val_following_3.to_csv(three_shot_path + 'val_3_n.csv', index=False)
test_first_3.to_csv(three_shot_path + 'test_3_k.csv', index=False)
test_following_3.to_csv(three_shot_path + 'test_3_n.csv', index=False)

In [29]:
five_shot_path = save_path + '5shot/'
os.makedirs(five_shot_path, exist_ok=True)

train_data_5.to_csv(five_shot_path + 'train_5.csv', index=False)
val_first_5.to_csv(five_shot_path + 'val_5_k.csv', index=False)
val_following_5.to_csv(five_shot_path + 'val_5_n.csv', index=False)
test_first_5.to_csv(five_shot_path + 'test_5_k.csv', index=False)
test_following_5.to_csv(five_shot_path + 'test_5_n.csv', index=False)

### 혹시 모르니 Rating filtering, 10-core user filtering 된 데이터 저장

In [30]:
df.to_csv(save_path + 'full.csv', index=False)