In [1]:
import os

import pandas as pd
import numpy as np
import torch

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Pin every random number generator used in this notebook to one fixed seed.
seed = 42

# cuDNN flags: request deterministic algorithms and turn benchmark mode off.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Seed NumPy's global RNG, then PyTorch (default generator, current CUDA
# device, all CUDA devices) with the same value.
for _seed_fn in (
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(seed)

In [3]:
# Directory holding the raw interaction file, and directory the CSV outputs are written to.
# Both end with '/' because later cells build file names by string concatenation.
path = '/data/ephemeral/home/data/ml-20m/'
save_path = '/data/ephemeral/home/data/cold/'

# The .inter file is tab-separated with a header row.
df = pd.read_csv(os.path.join(path, 'ml-20m.inter'), sep='\t', header=0)

In [4]:
# Preview the raw interaction table (user, item, rating, timestamp).
df

Unnamed: 0,user_id:token,item_id:token,rating:float,timestamp:float
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580
...,...,...,...,...
20000258,138493,68954,4.5,1258126920
20000259,138493,69526,4.5,1259865108
20000260,138493,69644,3.0,1260209457
20000261,138493,70286,5.0,1258126944


## Rating filtering

In [5]:
# Keep only interactions rated above 2.5, i.e. 3 or higher
# (assumes ratings come in 0.5 steps — TODO confirm).
df = df.loc[df['rating:float'].gt(2.5)]

In [6]:
# Only the (user, item) pairs are kept from here on, so drop rating and timestamp.
# Reassign instead of using inplace=True: `df` is a filtered selection of the
# loaded frame, and mutating it in place triggers SettingWithCopyWarning
# (silenced by the blanket warnings filter at the top of the notebook).
# NOTE(review): once the timestamp is gone, the later "first k interactions"
# selection relies on the file's row order rather than chronological order —
# confirm this is intended.
df = df.drop(columns=['rating:float', 'timestamp:float'])

In [7]:
# Preview the frame after the rating filter and column drop.
df

Unnamed: 0,user_id:token,item_id:token
0,1,2
1,1,29
2,1,32
3,1,47
4,1,50
...,...,...
20000257,138493,68319
20000258,138493,68954
20000259,138493,69526
20000260,138493,69644


20000263 -> 16486759(-17.56%) 인터랙션 수 변동

In [8]:
# Strip the ':token' type suffixes so the columns are plain 'user_id' / 'item_id'.
# Reassigned (no inplace=True) so the cell produces a new frame instead of
# mutating an object derived from an earlier selection.
df = df.rename(columns={'user_id:token': 'user_id', 'item_id:token': 'item_id'})

In [9]:
# Preview the frame with the renamed columns.
df

Unnamed: 0,user_id,item_id
0,1,2
1,1,29
2,1,32
3,1,47
4,1,50
...,...,...
20000257,138493,68319
20000258,138493,68954
20000259,138493,69526
20000260,138493,69644


In [10]:
# Summary statistics of the number of interactions per user, before the user filter.
df.groupby('user_id').size().describe()

count    138445.000000
mean        119.085261
std         178.699187
min           1.000000
25%          30.000000
50%          59.000000
75%         133.000000
max        8241.000000
dtype: float64

## 10-core filtering

In [11]:
# Keep only users with at least 10 interactions.
# A per-row group size from transform('size') gives the same rows, order and
# index as groupby.filter, without calling a Python lambda once per user.
df = df[df.groupby('user_id')['user_id'].transform('size') >= 10]

In [12]:
# Summary statistics of interactions per user after the user filter (minimum is now 10).
df.groupby('user_id').size().describe()

count    137534.000000
mean        119.830275
std         179.054563
min          10.000000
25%          30.000000
50%          59.000000
75%         133.000000
max        8241.000000
dtype: float64

138445 -> 137534(-0.65%) 유저 수 변동

In [13]:
# Summary statistics of the number of interactions per item (items are not filtered).
df.groupby('item_id').size().describe()

count    24799.000000
mean       664.572644
std       2822.870765
min          1.000000
25%          3.000000
50%         17.000000
75%        176.500000
max      62631.000000
dtype: float64

item에 대해서는 10-core filtering 진행하지 않음, item 다양성 보전

In [14]:
# Preview the frame after the user filter.
df

Unnamed: 0,user_id,item_id
0,1,2
1,1,29
2,1,32
3,1,47
4,1,50
...,...,...
20000257,138493,68319
20000258,138493,68954
20000259,138493,69526
20000260,138493,69644


16486759 -> 16480737(-0.03%) 인터랙션 수 변동

## Split interactions

In [16]:
# Partition the users of a frame into three disjoint groups and return each group's rows.
def split_users(df, first_split=2/3, second_split=0.5, seed=42):
    """Split ``df`` into three frames whose user sets do not overlap.

    Args:
        df: DataFrame with a ``user_id`` column.
        first_split: Passed as ``test_size`` of the first ``train_test_split``.
            It is therefore the share of users held out for the second and
            third groups together; the first group receives the remaining
            ``1 - first_split``.
        second_split: ``test_size`` of the second split, i.e. the share of the
            held-out users that ends up in the third group.
        seed: ``random_state`` used for both splits.

    Returns:
        Tuple ``(first, second, third)`` of DataFrames, each containing every
        row of ``df`` that belongs to the users of that group.
    """
    user_ids = df['user_id'].unique()

    # Two consecutive random splits over the unique user ids.
    first_ids, held_out_ids = train_test_split(user_ids, test_size=first_split, random_state=seed)
    second_ids, third_ids = train_test_split(held_out_ids, test_size=second_split, random_state=seed)

    # Select the rows of each user group from the original frame.
    user_column = df['user_id']
    return tuple(df[user_column.isin(ids)] for ids in (first_ids, second_ids, third_ids))

In [None]:
# User-level split: first_split=0.2 is the share of users held out for
# validation + test, so roughly 80% of users go to train and second_split=0.5
# halves the held-out users into validation and test (about 10% each).
train_df, val_df, test_df = split_users(df, first_split=0.2, second_split=0.5, seed=seed)

In [32]:
# Cut the validation users and the test users into three equally sized cohorts
# each (holding out 2/3 and then halving it); the cohorts are later given
# 1, 3 and 5 known interactions.
cohort_split_kwargs = dict(first_split=2/3, second_split=0.5, seed=seed)
val_1, val_3, val_5 = split_users(val_df, **cohort_split_kwargs)
test_1, test_3, test_5 = split_users(test_df, **cohort_split_kwargs)

In [34]:
# For every user, take the first k interactions and the n interactions that follow them.
def split_interactions(df, k, n=5):
    """Split each user's rows into a leading block and the block right after it.

    "First" and "following" refer to the row order of ``df``; no sorting by
    time is done here.

    Args:
        df: DataFrame with a ``user_id`` column.
        k: Number of leading rows to take per user.
        n: Number of rows to take per user immediately after the first ``k``.

    Returns:
        Tuple ``(first_k, following_n)``:
        ``first_k`` holds the first ``k`` rows of every user, in the original
        row order and with the original index.
        ``following_n`` holds rows ``k .. k+n-1`` of every user, ordered by
        ``user_id`` (row order preserved within a user) with a fresh
        0..m-1 index. Users with at most ``k`` rows contribute nothing to it.
    """
    by_user = df.groupby('user_id')
    first_k = by_user.head(k)

    # 0-based position of each row inside its user's group. A boolean window
    # on it replaces groupby.apply(lambda x: x.iloc[k:k+n]), which is slow and
    # depends on apply handing the grouping column to the lambda.
    position = by_user.cumcount()
    in_window = (position >= k) & (position < k + n)

    # Stable sort by user_id reproduces the group-by-group order apply returned.
    following_n = (
        df[in_window]
        .sort_values('user_id', kind='mergesort')
        .reset_index(drop=True)
    )

    return first_k, following_n

In [None]:
# For each cohort, k is the number of known interactions per user and the next
# 5 interactions are the ones to predict.
(val_1_k, val_1_n), (val_3_k, val_3_n), (val_5_k, val_5_n) = (
    split_interactions(cohort, k=known, n=5)
    for cohort, known in ((val_1, 1), (val_3, 3), (val_5, 5))
)

(test_1_k, test_1_n), (test_3_k, test_3_n), (test_5_k, test_5_n) = (
    split_interactions(cohort, k=known, n=5)
    for cohort, known in ((test_1, 1), (test_3, 3), (test_5, 5))
)

## Merge data

### val_k, test_k를 train_df에 합침

In [36]:
# Training data = all interactions of the training users plus the first-k
# interactions of every validation/test cohort.
# The frames hold disjoint users, so the row union is a plain concatenation;
# one concat replaces six chained outer merges over the whole table.
# Sorting by (user_id, item_id) reproduces the row order an outer merge yields.
train_data = (
    pd.concat(
        [train_df, val_1_k, val_3_k, val_5_k, test_1_k, test_3_k, test_5_k],
        ignore_index=True,
    )
    .sort_values(['user_id', 'item_id'], ignore_index=True)
)

train에 사용된 인터랙션 데이터 취합

In [None]:
# Collect the first-k (known) interactions of the three cohorts into one frame
# per evaluation set. The cohorts hold disjoint users, so concat gives the row
# union directly; sorting by (user_id, item_id) reproduces the row order of the
# chained outer merges it replaces.
val_k = (
    pd.concat([val_1_k, val_3_k, val_5_k], ignore_index=True)
    .sort_values(['user_id', 'item_id'], ignore_index=True)
)

test_k = (
    pd.concat([test_1_k, test_3_k, test_5_k], ignore_index=True)
    .sort_values(['user_id', 'item_id'], ignore_index=True)
)

정답 데이터 취합

In [46]:
# Collect the following-n (ground-truth) interactions of the three cohorts into
# one frame per evaluation set. The cohorts hold disjoint users, so concat gives
# the row union directly; sorting by (user_id, item_id) reproduces the row order
# of the chained outer merges it replaces.
val_n = (
    pd.concat([val_1_n, val_3_n, val_5_n], ignore_index=True)
    .sort_values(['user_id', 'item_id'], ignore_index=True)
)

test_n = (
    pd.concat([test_1_n, test_3_n, test_5_n], ignore_index=True)
    .sort_values(['user_id', 'item_id'], ignore_index=True)
)

## CSV 저장

In [51]:
# Write the full filtered interaction table.
# to_csv does not create missing directories, so make sure the target exists first.
os.makedirs(save_path, exist_ok=True)
df.to_csv(save_path + 'full.csv', index=False)

In [52]:
# Write the training set and the known (k) / ground-truth (n) parts of the
# validation and test sets, one CSV each, without the index column.
# to_csv does not create missing directories, so make sure the target exists first.
os.makedirs(save_path, exist_ok=True)

for file_name, frame in (
    ('train.csv', train_data),
    ('val_k.csv', val_k),
    ('val_n.csv', val_n),
    ('test_k.csv', test_k),
    ('test_n.csv', test_n),
):
    frame.to_csv(save_path + file_name, index=False)