In [1]:
import os

import pandas as pd
import numpy as np
import torch

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Fixed seed shared by the RNG setup below.
seed = 42

# cuDNN flags: request deterministic algorithms and switch benchmark mode off.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Seed NumPy, then the PyTorch CPU and CUDA generators.
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

In [3]:
# Directory holding the ML-20M interaction file, and directory the CSVs are written to.
path = '/data/ephemeral/home/data/ml-20m/'
save_path = '/data/ephemeral/home/data/cold/'

# The .inter file is tab-separated; its first row holds the column names.
df = pd.read_csv(os.path.join(path, 'ml-20m.inter'), delimiter='\t', header=0)

In [None]:
# Display the interaction table as loaded.
df

Unnamed: 0,user_id:token,item_id:token,rating:float,timestamp:float
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580
...,...,...,...,...
20000258,138493,68954,4.5,1258126920
20000259,138493,69526,4.5,1259865108
20000260,138493,69644,3.0,1260209457
20000261,138493,70286,5.0,1258126944


타임스탬프의 오름차순으로 정렬이 되어있지 않음

## 시간 순 정렬

In [5]:
# Order rows by user, then chronologically within each user.
# One two-key sort gives the same user-then-timestamp ordering as sorting every
# user's group separately, without calling a Python function once per user and
# without depending on groupby.apply passing the grouping column to the lambda
# (pandas has deprecated that behaviour). The original row index is kept.
# NOTE(review): rows with a missing user id would be kept here (sorted last),
# whereas groupby drops them - presumably none exist in this file; confirm.
df = df.sort_values(['user_id:token', 'timestamp:float'])

## Rating filtering

In [6]:
# Keep interactions rated strictly above 2.5 (original note: "keep only ratings of 3 or more").
liked_mask = df['rating:float'].gt(2.5)
df = df[liked_mask]

In [7]:
# Drop the rating and timestamp columns; only the user/item pairs are used from here on.
# Rebinding instead of inplace=True avoids mutating a boolean-filtered frame in
# place, which is the pattern that raises SettingWithCopyWarning.
df = df.drop(columns=['rating:float', 'timestamp:float'])

In [8]:
# Display the table after the rating filter and column drop.
df

Unnamed: 0,user_id:token,item_id:token
20,1,924
19,1,919
86,1,2683
61,1,1584
23,1,1079
...,...,...
20000140,138493,6534
20000242,138493,53464
19999965,138493,1275
20000154,138493,6996


20000263 -> 16486759(-17.57%) 인터랙션 수 변동

In [9]:
# Strip the ':token' type suffix from the id column names.
# Rebinding instead of inplace=True keeps the step free of in-place mutation.
df = df.rename(columns={'user_id:token': 'user_id', 'item_id:token': 'item_id'})

In [10]:
# Display the table with the renamed columns.
df

Unnamed: 0,user_id,item_id
20,1,924
19,1,919
86,1,2683
61,1,1584
23,1,1079
...,...,...
20000140,138493,6534
20000242,138493,53464
19999965,138493,1275
20000154,138493,6996


In [11]:
# Summary statistics of the number of interactions per user.
interactions_per_user = df.groupby('user_id').size()
interactions_per_user.describe()

count    138445.000000
mean        119.085261
std         178.699187
min           1.000000
25%          30.000000
50%          59.000000
75%         133.000000
max        8241.000000
dtype: float64

## 20-core filtering

In [12]:
# Keep only users with at least 20 interactions.
# transform('size') broadcasts each user's row count onto that user's rows, so
# the filter becomes a single boolean mask instead of one Python lambda call per
# user (what groupby.filter does). Row order and index are unchanged.
min_interactions = 20
rows_per_user = df.groupby('user_id')['user_id'].transform('size')
df = df[rows_per_user >= min_interactions]

In [13]:
# Per-user interaction counts again, now after the minimum-count filter.
interactions_per_user_filtered = df.groupby('user_id').size()
interactions_per_user_filtered.describe()

count    126972.000000
mean        128.442554
std         183.741964
min          20.000000
25%          35.000000
50%          66.000000
75%         144.000000
max        8241.000000
dtype: float64

138445 -> 126972(-8.29%) 유저 수 변동

In [14]:
# Summary statistics of the number of interactions per item.
interactions_per_item = df.groupby('item_id').size()
interactions_per_item.describe()

count    24790.000000
mean       657.870432
std       2786.483233
min          1.000000
25%          3.000000
50%         17.000000
75%        176.000000
max      61307.000000
dtype: float64

item에 대해서는 20-core filtering 진행하지 않음, item 다양성 보전

In [15]:
# Display the table after the per-user minimum-count filter.
df

Unnamed: 0,user_id,item_id
20,1,924
19,1,919
86,1,2683
61,1,1584
23,1,1079
...,...,...
20000140,138493,6534
20000242,138493,53464
19999965,138493,1275
20000154,138493,6996


16486759 -> 16308608(-1.08%) 인터랙션 수 변동

## Split interactions

In [17]:
# Split the users of a frame into three user groups.
def split_users(df, first_split, second_split, seed=42):
    """Partition the rows of ``df`` into three frames by user.

    The unique ``user_id`` values are cut twice with ``train_test_split``:
    ``first_split`` is the ``train_size`` of the first cut, and ``second_split``
    is the ``train_size`` applied to the users left over from that cut.

    Args:
        df: DataFrame with a ``user_id`` column.
        first_split: ``train_size`` for the cut over all unique users.
        second_split: ``train_size`` for the cut over the remaining users.
        seed: ``random_state`` passed to both cuts.

    Returns:
        Tuple of three DataFrames with the rows of the first, second and third
        user group, in that order.
    """
    user_ids = df['user_id'].unique()

    # Two consecutive cuts: (group_a | remainder), then (group_b | group_c).
    group_a, remainder = train_test_split(user_ids, train_size=first_split, random_state=seed)
    group_b, group_c = train_test_split(remainder, train_size=second_split, random_state=seed)

    # Select each group's rows from the original frame.
    return tuple(df[df['user_id'].isin(group)] for group in (group_a, group_b, group_c))

In [19]:
# First cut keeps 80% of the users for training; the second cut halves the
# remaining users into validation and test.
train_df, val_df, test_df = split_users(
    df,
    first_split=0.8,
    second_split=0.5,
    seed=seed,
)

In [20]:
# Cut the validation users into five groups (named after the support sizes
# 1, 3, 5, 10, 15): 1/5 of all users, 1/4 of the rest, then 1/3 and 1/2 of
# what is left over.
val_1, val_3, tmp = split_users(
    val_df, first_split=0.2, second_split=0.25, seed=seed
)
val_5, val_10, val_15 = split_users(
    tmp, first_split=1/3, second_split=0.5, seed=seed
)

# Same five-way cut for the test users.
test_1, test_3, tmp = split_users(
    test_df, first_split=0.2, second_split=0.25, seed=seed
)
test_5, test_10, test_15 = split_users(
    tmp, first_split=1/3, second_split=0.5, seed=seed
)

In [21]:
# For every user, extract the first k interactions and the n interactions after them.
def split_interactions(df, k, n=5):
    """Return each user's first ``k`` rows and the ``n`` rows that follow them.

    "First" and "following" refer to the order in which a user's rows appear
    in ``df``. Users with fewer than ``k + n`` rows contribute as many rows as
    they have.

    Args:
        df: DataFrame with a ``user_id`` column.
        k: Number of leading rows to take per user (non-negative).
        n: Number of rows to take per user right after the first ``k``.

    Returns:
        Tuple ``(first_k, following_n)``. ``first_k`` keeps the row order and
        index of ``df``. ``following_n`` is ordered by ascending ``user_id``
        (rows of one user stay in their ``df`` order) with a fresh 0..N-1 index.
    """
    first_k = df.groupby('user_id').head(k)

    # 0-based position of every row within its user's group. A position mask
    # replaces groupby.apply(lambda x: x.iloc[k:k+n]): it avoids one Python
    # call per user and always keeps the user_id column, independent of how
    # groupby.apply treats grouping columns.
    position = df.groupby('user_id').cumcount()
    in_window = (position >= k) & (position < k + n)

    # Stable sort on user_id reproduces the group-key ordering of the apply
    # result while leaving each user's rows in their original relative order.
    following_n = (
        df[in_window]
        .sort_values('user_id', kind='stable')
        .reset_index(drop=True)
    )

    return first_k, following_n

In [22]:
# Support sizes k, paired position-by-position with the user groups below.
# Every group gets its first k interactions (*_k) and the next 5 (*_n).
support_sizes = (1, 3, 5, 10, 15)

(
    (val_1_k, val_1_n),
    (val_3_k, val_3_n),
    (val_5_k, val_5_n),
    (val_10_k, val_10_n),
    (val_15_k, val_15_n),
) = [
    split_interactions(group, k=size, n=5)
    for group, size in zip((val_1, val_3, val_5, val_10, val_15), support_sizes)
]

(
    (test_1_k, test_1_n),
    (test_3_k, test_3_n),
    (test_5_k, test_5_n),
    (test_10_k, test_10_n),
    (test_15_k, test_15_n),
) = [
    split_interactions(group, k=size, n=5)
    for group, size in zip((test_1, test_3, test_5, test_10, test_15), support_sizes)
]

## Merge data

### val_k, test_k를 train_df에 합침

In [23]:
# Training data = every interaction of the train users plus the first-k
# interactions of each validation and test user group.
# The frames hold different users, so combining them is a row-wise
# concatenation. pd.merge(how='outer') joins on all shared columns and sorts
# the join keys, which reorders every user's rows by item_id and loses the
# chronological order established earlier. concat keeps each frame's row order;
# the stable sort then groups rows by ascending user_id without disturbing the
# order of rows within a user.
train_parts = [
    train_df,
    val_1_k, val_3_k, val_5_k, val_10_k, val_15_k,
    test_1_k, test_3_k, test_5_k, test_10_k, test_15_k,
]
train_data = (
    pd.concat(train_parts, ignore_index=True)
    .sort_values('user_id', kind='stable')
    .reset_index(drop=True)
)

train에 사용된 인터랙션 데이터 취합

In [24]:
# Collect the first-k interactions of all validation groups and of all test groups.
# concat + stable sort on user_id replaces the chained pd.merge(how='outer'):
# the outer merge sorts the join keys and so reorders each user's rows by
# item_id, whereas this keeps every user's rows in their original order.
val_k = (
    pd.concat([val_1_k, val_3_k, val_5_k, val_10_k, val_15_k], ignore_index=True)
    .sort_values('user_id', kind='stable')
    .reset_index(drop=True)
)

test_k = (
    pd.concat([test_1_k, test_3_k, test_5_k, test_10_k, test_15_k], ignore_index=True)
    .sort_values('user_id', kind='stable')
    .reset_index(drop=True)
)

정답 데이터 취합

In [25]:
# Collect the following-n (ground-truth) interactions of all validation groups
# and of all test groups.
# concat + stable sort on user_id replaces the chained pd.merge(how='outer'):
# the outer merge sorts the join keys and so reorders each user's rows by
# item_id, whereas this keeps every user's rows in their original order.
val_n = (
    pd.concat([val_1_n, val_3_n, val_5_n, val_10_n, val_15_n], ignore_index=True)
    .sort_values('user_id', kind='stable')
    .reset_index(drop=True)
)

test_n = (
    pd.concat([test_1_n, test_3_n, test_5_n, test_10_n, test_15_n], ignore_index=True)
    .sort_values('user_id', kind='stable')
    .reset_index(drop=True)
)

## CSV 저장

In [26]:
# Write the full filtered interaction table.
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(save_path, exist_ok=True)
df.to_csv(os.path.join(save_path, 'full.csv'), index=False)

In [27]:
# Write the training set and the validation/test support (k) and target (n) sets.
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(save_path, exist_ok=True)

outputs = {
    'train.csv': train_data,
    'val_k.csv': val_k,
    'val_n.csv': val_n,
    'test_k.csv': test_k,
    'test_n.csv': test_n,
}
for filename, frame in outputs.items():
    frame.to_csv(os.path.join(save_path, filename), index=False)