In [42]:
import numpy as np
import pandas as pd
import datetime
import json
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupShuffleSplit
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F


In [2]:
def my_aggregation(x):
    """Pass a non-empty pandas object through unchanged.

    Args:
        x: Any object exposing a boolean ``.empty`` attribute
            (e.g. a ``pd.Series`` or ``pd.DataFrame``).

    Returns:
        ``x`` itself when ``x.empty`` is false, otherwise ``None``.
    """
    # NOTE(review): not called by any of the cells visible in this notebook.
    return None if x.empty else x

In [3]:
def convert_time_column(data):
    """Add a datetime ``timestamp`` column built from the epoch-seconds ``time`` column.

    The conversion is done with a single vectorized ``pd.to_datetime`` call and
    always yields timezone-naive UTC wall-clock values. The previous
    implementation went through ``datetime.fromtimestamp``, which converts to
    the *local* timezone of whatever machine runs the notebook, so the
    resulting timestamps (and any calendar bucketing built on them) were not
    reproducible across hosts.

    Args:
        data: DataFrame with a numeric ``time`` column holding seconds since
            the Unix epoch.

    Returns:
        The same ``data`` object (mutated in place) with an added or
        overwritten ``timestamp`` column of datetime values at whole-second
        resolution.
    """
    # Work on the raw values so the result is assigned positionally,
    # independent of whatever index ``data`` carries.
    epoch_seconds = data['time'].to_numpy()

    # floor('s') keeps whole-second resolution, matching the
    # '%Y-%m-%d %H:%M:%S' formatting this column was originally built from.
    data['timestamp'] = pd.to_datetime(epoch_seconds, unit='s').floor('s')
    return data

In [38]:
# Root folder of the training data; later cells reuse `data_dir` when writing
# the session splits back to disk.
data_dir = '/opt/ml/movie-recommendation/data/train/'

# Raw interaction log with `user`, `item` and epoch-seconds `time` columns.
train = pd.read_csv(f'{data_dir}train_ratings.csv')
train

Unnamed: 0,user,item,time
0,11,4643,1230782529
1,11,170,1230782534
2,11,531,1230782539
3,11,616,1230782542
4,11,2140,1230782563
...,...,...,...
5154466,138493,44022,1260209449
5154467,138493,4958,1260209482
5154468,138493,68319,1260209720
5154469,138493,40819,1260209726


In [39]:
# Attach a datetime `timestamp` column derived from the epoch-seconds `time`
# column. convert_time_column mutates the frame it is given and returns that
# same object, so `train` is updated in place here.
train = convert_time_column(train)
train

Unnamed: 0,user,item,time,timestamp
0,11,4643,1230782529,2009-01-01 04:02:09
1,11,170,1230782534,2009-01-01 04:02:14
2,11,531,1230782539,2009-01-01 04:02:19
3,11,616,1230782542,2009-01-01 04:02:22
4,11,2140,1230782563,2009-01-01 04:02:43
...,...,...,...,...
5154466,138493,44022,1260209449,2009-12-07 18:10:49
5154467,138493,4958,1260209482,2009-12-07 18:11:22
5154468,138493,68319,1260209720,2009-12-07 18:15:20
5154469,138493,40819,1260209726,2009-12-07 18:15:26


In [40]:
# Materialize the per-user groups as a list of (user_id, user_frame) pairs.
# Unlike the lazy groupby object, a list has a length, so the progress bar in
# the session-building loop can report a total.
user_group_dfs = [(user_id, user_df) for user_id, user_df in train.groupby('user')]

In [41]:
# Turn the per-user interaction logs into "sessions": every month-end ('1M')
# bin of a user's history that holds at least two interactions gets its own
# session id. The user id itself is not carried into the result.
session_id = 0  # next id to hand out; equals the number of kept sessions after the loop

# One flat list per output column; the DataFrame is built once at the end.
session_col, item_col, time_col = [], [], []

for user, group_df in tqdm(user_group_dfs):
    # Index by the datetime column so the rows can be bucketed by resampling.
    monthly_bins = group_df.set_index('timestamp').resample('1M')

    for session, item_df in monthly_bins:
        n_rows = len(item_df)
        # Empty bins and bins with a single interaction are skipped and do
        # not consume a session id.
        if n_rows > 1:
            session_col.extend([session_id] * n_rows)
            item_col.extend(item_df['item'].values)
            time_col.extend(item_df['time'])
            session_id += 1

session_data = pd.DataFrame({'session': session_col, 'item': item_col, 'time': time_col})
session_data

100%|██████████| 31360/31360 [02:25<00:00, 216.16it/s]


Unnamed: 0,session,item,time
0,0,4643,1230782529
1,0,170,1230782534
2,0,531,1230782539
3,0,616,1230782542
4,0,2140,1230782563
...,...,...,...
5120523,165275,44022,1260209449
5120524,165275,4958,1260209482
5120525,165275,68319,1260209720
5120526,165275,40819,1260209726


In [59]:
# Split by session id so that all rows of one session land on the same side
# (train or validation); a single shuffle with a fixed seed keeps the split
# reproducible.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
train_idx, test_idx = next(
    splitter.split(session_data, groups=session_data['session'])
)

# NOTE(review): this rebinds `train`, which earlier cells use for the raw
# ratings frame -- re-running those cells after this one will not work
# without reloading the data first.
train, valid = session_data.iloc[train_idx], session_data.iloc[test_idx]

In [62]:
# Persist both session splits without the positional index column.
# NOTE(review): assumes the `gru/` sub-directory already exists under
# `data_dir` -- confirm, since nothing in this notebook creates it.
train.to_csv(f'{data_dir}gru/train.csv', index=False)
valid.to_csv(f'{data_dir}gru/valid.csv', index=False)