In [1]:
# Notebook setup: IPython display option, imports, and the global seed constant.
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity="all"

# Standard library / numeric stack.
import os
import json
import argparse
import pandas as pd
import numpy as np
import time, datetime
from tqdm import tqdm
from logging import getLogger
import torch

# RecBole model classes.
from recbole.model.general_recommender.ease import EASE
from recbole.model.context_aware_recommender.ffm import FFM

# RecBole configuration, dataset and utility entry points.
from recbole.config import Config
from recbole.data import create_dataset, data_preparation, Interaction
from recbole.utils import init_logger, get_trainer, get_model, init_seed, set_color


# Random seed constant; in the cells visible here it is referenced only by
# commented-out sampling code.
SEED=13

In [41]:
# Raw interaction log; later cells read its `user`, `item` and `time` columns.
train = pd.read_csv("/opt/ml/input/data/train/train_ratings.csv")

# Distinct raw ids in ascending order: an id's position in the list is its
# contiguous index.
_sorted_users = sorted(set(train.user))
_sorted_items = sorted(set(train.item))

# index -> raw id
uidx2user = dict(enumerate(_sorted_users))
iidx2item = dict(enumerate(_sorted_items))

# raw id -> index (exact inverse of the two maps above)
user2idx = {raw_id: idx for idx, raw_id in uidx2user.items()}
item2idx = {raw_id: idx for idx, raw_id in iidx2item.items()}

## create train_data.item

In [None]:
data_path = '../../data/train'


def _join_values_by_item(frame, column):
    """Map each item id to the space-joined values of ``column``.

    Args:
        frame: DataFrame with an ``item`` column and a string-valued ``column``.
        column: Name of the column whose values are concatenated per item.

    Returns:
        dict mapping item id -> all values of ``column`` for that item,
        joined with a single space.
    """
    joined = frame.groupby('item')[column].apply(lambda values: " ".join(list(values)))
    return dict(joined)


year_data = pd.read_csv(os.path.join(data_path, 'years.tsv'), sep='\t')
title_data = pd.read_csv(os.path.join(data_path, 'titles.tsv'), sep='\t')
genre_data = pd.read_csv(os.path.join(data_path, 'genres.tsv'), sep='\t')

## genre
# One space-separated genre string per item, attached to the title table.
genre_dict = _join_values_by_item(genre_data, 'genre')
title_data['genre'] = title_data['item'].map(genre_dict)

## year
title_data = title_data.merge(year_data, on='item', how='left')
# Fill only the missing years, using the four characters that precede the last
# character of the title (i.e. the "YYYY" of a trailing "(YYYY)").
# This must run before the year suffix is stripped from the title below.
title_data['year'] = (
    title_data['year']
    .fillna(title_data['title'].map(lambda x: x[-5:-1]))
    .astype(int)
)

## title
# Remove the "(year)" / "(year-year)" marker from the title.
# regex=True is required: since pandas 2.0 the default is a literal match,
# which would leave the titles untouched.
title_data['title'] = (
    title_data['title']
    .str.replace(r"(\(\d+-*\d*\))", "", regex=True)
    .str.strip()
)

## writer
writer_data = pd.read_csv(os.path.join(data_path, 'writers.tsv'), sep='\t')
writer_dict = _join_values_by_item(writer_data, 'writer')
title_data['writer'] = title_data['item'].map(writer_dict)

## director
director_data = pd.read_csv(os.path.join(data_path, 'directors.tsv'), sep='\t')
director_dict = _join_values_by_item(director_data, 'director')
title_data['director'] = title_data['item'].map(director_dict)

In [65]:
# Swap raw item ids for their contiguous indices.
# Not idempotent: running this cell a second time would look up the indices
# themselves in item2idx.
title_data['item'] = title_data['item'].map(item2idx)

In [71]:
# "name:type" header strings; a following cell assigns them positionally to
# title_data.columns, so the order here matters.
_item_fields = (
    ('item_id', 'token'),
    ('title', 'token_seq'),
    ('genre', 'token_seq'),
    ('year', 'token'),
    ('writer', 'token_seq'),
    ('director', 'token_seq'),
)
cols = [f"{field}:{ftype}" for field, ftype in _item_fields]

In [72]:
# Rename the columns positionally to the "name:type" headers held in `cols`;
# the number of headers must equal the number of columns.
title_data.columns = cols

In [73]:
# Write the item table as a tab-separated file without the row index.
# NOTE(review): this writes to dataset/train_data.item, while other cells in
# this notebook read from / write to the dataset/train_data/ directory —
# confirm the intended location.
title_data.to_csv("dataset/train_data.item",sep='\t',index=False)

In [74]:
# Display the item table after the column rename.
title_data

Unnamed: 0,item_id:token,title:token_seq,genre:token_seq,year:token,writer:token_seq,director:token_seq
0,193,"Shawshank Redemption, The",Crime Drama,1994,nm0000175 nm0001104,nm0001104
1,1435,"Matrix, The",Action Sci-Fi Thriller,1999,nm0905152 nm0905154,nm0905152 nm0905154
2,1642,Fight Club,Action Crime Drama Thriller,1999,nm0657333 nm0880243,nm0000399
3,179,Pulp Fiction,Comedy Crime Drama Thriller,1994,nm0000233 nm0000812,nm0000233
4,220,Forrest Gump,Comedy Drama Romance War,1994,nm0744839,nm0000709
...,...,...,...,...,...,...
6802,5943,American Pie Presents: The Book of Love (Ameri...,Comedy,2009,nm0381221 nm0825738,nm0003289
6803,6734,Need for Speed,Action Crime Drama,2014,nm0309691,nm0915304
6804,3979,Taxi 3,Action Comedy,2003,nm0000108,nm0470443
6805,2027,Porky's II: The Next Day,Comedy,1983,nm0163706 nm0650276,nm0163706


In [62]:
# Peek at the first lines of the item file under dataset/train_data/.
# NOTE(review): the to_csv cell in this notebook writes to
# dataset/train_data.item, a different path — confirm which file is meant.
!head dataset/train_data/train_data.item

item_id:token	year:float	writer:token	title:token	genre:token	director:token
2033	2001	nm0099541	Planet of the Apes (2001)	Action	nm0000318
2433	2002	nm0001392	Lord of the Rings: The Two Towers, The (2002)	Adventure	nm0001392
2863	2004	nm1286500	50 First Dates (2004)	Comedy	nm0781842
1262	1984	nm0000101	Ghostbusters (a.k.a. Ghost Busters) (1984)	Action	nm0718645
3433	2004	nm0258268	Ghost in the Shell 2: Innocence (a.k.a. Innocence) (Inosensu) (2004)	Action	nm0651900
1123	1998	nm0001779	Shakespeare in Love (1998)	Comedy	nm0006960
339	1995	nm0411872	Ghost in the Shell (Kôkaku kidôtai) (1995)	Animation	nm0651900
884	1988	nm0060103	Rain Man (1988)	Drama	nm0001469
724	1997	nm0000108	Fifth Element, The (1997)	Action	nm0000108


In [60]:
# Peek at the first lines of the general_data interaction file.
!head dataset/general_data/general_data.inter

user_id:token	item_id:token	timestamp:float
0	2505	1230782529
0	109	1230782534
0	319	1230782539
0	368	1230782542
0	1183	1230782563
0	1510	1230782583
0	1274	1230782646
0	1486	1230782656
0	1359	1230782694


## Create the ~200 million (user, item) interaction candidates

In [27]:
# Replace raw user/item ids in the interaction log with contiguous indices.
# Not idempotent: a second run would look up the indices themselves in the maps.
train['user'] = train['user'].map(user2idx)
train['item'] = train['item'].map(item2idx)

In [16]:
# Enumerate every (user index, item index) pair, user-major: all items for
# user 0, then all items for user 1, and so on.
# Built with vectorised NumPy instead of a nested Python loop, which would
# allocate hundreds of millions of tuple objects.
# Sizes come from the id maps rather than hard-coded counts.
n_users = len(user2idx)
n_items = len(item2idx)
li = np.column_stack((
    np.repeat(np.arange(n_users, dtype=np.int64), n_items),  # user index, each repeated n_items times
    np.tile(np.arange(n_items, dtype=np.int64), n_users),    # item indices 0..n_items-1, cycled per user
))

In [18]:
# Two-column frame of the (user, item) pairs; columns get default integer labels.
df = pd.DataFrame(li)

In [29]:
# Name the columns so the frame can be merged with `train` on ['user', 'item'].
df.columns=['user','item']

In [23]:
# Inspect the column labels of the pair frame.
df.columns

RangeIndex(start=0, stop=2, step=1)

In [34]:
# (rows, columns) of the pair frame.
df.shape

(213467520, 2)

In [30]:
# Left-join the observed interactions onto the full pair grid; pairs that never
# occur in `train` keep NaN in the columns brought in from `train`.
df2 = pd.merge(df, train, how='left', on=['user', 'item'])

In [33]:
# Shape of the subset of pairs with no matching row in `train` (time is NaN).
df2.loc[df2['time'].isna()].shape

(208313049, 3)

In [35]:
# Sanity check: all candidate pairs minus the observed interactions should equal
# the number of NaN-time rows found after the merge (this assumes `train` has no
# duplicate (user, item) rows). Uses the pair frame's length, not a literal.
df.shape[0] - train.shape[0]

208313049

In [36]:
# Keep only the pairs that have no observed interaction (time is NaN).
df3 = df2.loc[df2['time'].isna()]

In [38]:
# Rename the three columns positionally to "name:type" headers.
# The timestamp column is entirely NaN here, because df3 holds only the rows
# whose `time` was missing.
df3.columns=['user_id:token','item_id:token','timestamp:float']

In [39]:
# Display the renamed frame of unobserved pairs.
df3

Unnamed: 0,user_id:token,item_id:token,timestamp:float
1,0,1,
2,0,2,
3,0,3,
4,0,4,
5,0,5,
...,...,...,...
213467515,31359,6802,
213467516,31359,6803,
213467517,31359,6804,
213467518,31359,6805,


In [40]:
# Output directory for the generated interaction file; created if absent.
outpath = "dataset/train_data"
os.makedirs(outpath, exist_ok=True)
# Disabled experiment, kept for reference (per-user subsample of `train`):
# sub_train=train.groupby("user").sample(n=10, random_state=SEED)
# sub_train.shape
# Write the unobserved pairs as a tab-separated file without the row index.
inter_file = os.path.join(outpath, "test_data.inter")
df3.to_csv(inter_file, sep='\t', index=False)