In [65]:
from collections import defaultdict
from datetime import datetime
import torch
import numpy as np
import pandas as pd
from datetime import datetime

In [40]:
m1_1m_path = '../../../datasets/ml-1m'
PATH_TO_PROCESSED_DATA = 'processed/'


In [23]:
file_path = m1_1m_path + '/ratings.dat'

data_ratings = pd.read_csv(file_path, sep='::', engine='python',
                   names=['SessionID', 'ItemID', 'Rating', 'Time'], index_col='SessionID').drop('Rating', axis=1, inplace=False)

In [27]:
data_ratings

Unnamed: 0_level_0,ItemID,Time
SessionID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1193,978300760
1,661,978302109
1,914,978301968
1,3408,978300275
1,2355,978824291
...,...,...
6040,1091,956716541
6040,1094,956704887
6040,562,956704746
6040,1096,956715648


In [30]:
len(set(data_ratings.index))

6040

# SASRec Handling

In [58]:
order_ratings = data_ratings
# session_lengths>1
order_ratings = order_ratings.groupby('ItemID').filter(lambda  x: len(x) >= 5)
order_ratings = order_ratings.groupby('SessionID').filter(lambda  x: len(x) >= 2)
order_ratings = order_ratings.groupby('SessionID').apply(lambda  x: x.sort_values('Time'))
train_tr = order_ratings.groupby('SessionID').apply(lambda x: x.iloc[:-2]).reset_index(level=0, drop=True).reindex(columns=['Time', 'ItemID'])
valid = order_ratings.groupby('SessionID').apply(lambda x: x.iloc[:-1]).reset_index(level=0, drop=True).reindex(columns=['Time', 'ItemID'])
test = order_ratings.reindex(columns=['Time', 'ItemID'])

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  order_ratings = order_ratings.groupby('SessionID').apply(lambda  x: x.sort_values('Time'))


In [60]:
train_tr.to_csv(PATH_TO_PROCESSED_DATA + 'ml-1m_train_tr.txt', sep=',', index=True)
valid.to_csv(PATH_TO_PROCESSED_DATA + 'ml-1m_train_valid.txt', sep=',', index=True)
test.to_csv(PATH_TO_PROCESSED_DATA + 'ml-1m_test.txt', sep=',', index=True)

In [57]:
train_tr.reindex(columns=['Time', 'ItemID'])

Unnamed: 0_level_0,Time,ItemID
SessionID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,978300019,3186
1,978300055,1721
1,978300055,1022
1,978300055,1270
1,978300103,2340
...,...,...
6040,997454367,3671
6040,997454398,232
6040,997454429,2917
6040,997454464,1784


In [48]:
train_tr

Unnamed: 0_level_0,ItemID,Time
SessionID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3186,978300019
1,1721,978300055
1,1022,978300055
1,1270,978300055
1,2340,978300103
...,...,...
6040,3671,997454367
6040,232,997454398
6040,2917,997454429
6040,1784,997454464


In [49]:
valid

Unnamed: 0_level_0,ItemID,Time
SessionID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3186,978300019
1,1721,978300055
1,1022,978300055
1,1270,978300055
1,2340,978300103
...,...,...
6040,232,997454398
6040,2917,997454429
6040,1784,997454464
6040,1921,997454464


In [50]:
test

Unnamed: 0_level_0,ItemID,Time
SessionID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3186,978300019
1,1721,978300055
1,1022,978300055
1,1270,978300055
1,2340,978300103
...,...,...
6040,2917,997454429
6040,1784,997454464
6040,1921,997454464
6040,161,997454486


# GRU4Rec handling


In [14]:
order_ratings = data_ratings
# session_lengths>1
order_ratings = order_ratings.groupby('SessionID').filter(lambda  x: len(x) > 1)
order_ratings = order_ratings.groupby('ItemID').filter(lambda  x: len(x) >= 5)
order_ratings = order_ratings.groupby('SessionID').filter(lambda  x: len(x) >= 2)
order_ratings = order_ratings.groupby('SessionID').apply(lambda  x: x.sort_values('Time'))
# We discard users and items with fewer than 5 related actions.
order_ratings

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  order_ratings = order_ratings.groupby('SessionID').apply(lambda  x: x.sort_values('Time'))


Unnamed: 0_level_0,ItemID,Time
SessionID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3186,978300019
1,1721,978300055
1,1022,978300055
1,1270,978300055
1,2340,978300103
...,...,...
6040,2917,997454429
6040,1784,997454464
6040,1921,997454464
6040,161,997454486


In [15]:
len(set(order_ratings.index))

6040

In [16]:
tmax = order_ratings.Time.max()
interval = 86400 * 30 * 12 * 2
split_point_1 = tmax - interval
session_train = order_ratings[order_ratings['Time'] <= split_point_1]
session_test = order_ratings[order_ratings['Time'] > split_point_1]
session_test = session_test.groupby('SessionID').filter(lambda  x: len(x) >= 2)
len(set(session_train.index)), len(set(session_test.index)), len(session_train), len(session_test)

(6035, 914, 932987, 66579)

In [17]:
# train = session_train.reset_index().rename(columns={'index': 'SessionID'})
# train

In [18]:
train = session_train
test = session_test
print('Full train set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(len(train), train.index.nunique(), train.ItemID.nunique()))
train.to_csv(PATH_TO_PROCESSED_DATA + 'ml-1m_train_full.txt', sep=',', index=True)
print('Test set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(len(test), test.index.nunique(), test.ItemID.nunique()))
test.to_csv(PATH_TO_PROCESSED_DATA + 'ml-1m_test.txt', sep=',', index=True)

Full train set
	Events: 932987
	Sessions: 6035
	Items: 3414
Test set
	Events: 66579
	Sessions: 914
	Items: 3246


In [19]:
tmax = train.Time.max()
interval = 86400 * 30 * 3
split_point_2 = tmax - interval
session_train = train[train['Time'] <= split_point_2]
session_val = train[train['Time'] > split_point_2]
train_tr = session_train
valid = session_val
valid = valid.groupby('SessionID').filter(lambda  x: len(x) >= 2)
print('Train set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(len(train_tr), train_tr.index.nunique(), train_tr.ItemID.nunique()))
train_tr.to_csv(PATH_TO_PROCESSED_DATA + 'ml-1m_train_tr.txt', sep=',', index=True)
print('Validation set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(len(valid), valid.index.nunique(), valid.ItemID.nunique()))
valid.to_csv(PATH_TO_PROCESSED_DATA + 'ml-1m_train_valid.txt', sep=',', index=True)x

Train set
	Events: 850937
	Sessions: 5719
	Items: 3411
Validation set
	Events: 81984
	Sessions: 1100
	Items: 3239


In [62]:
dataBefore = 'C:\\Users\\Yunlong\\Desktop\\Recommender-System\\datasets\\ml-1m\\ratings.dat'
train = pd.read_csv(dataBefore, sep='::', header=None, usecols=[0, 1, 2], dtype={0: np.int32, 1: str, 2: np.int64})

  train = pd.read_csv(dataBefore, sep='::', header=None, usecols=[0, 1, 2], dtype={0: np.int32, 1: str, 2: np.int64})


In [63]:
train

Unnamed: 0,0,1,2
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
...,...,...,...
1000204,6040,1091,1
1000205,6040,1094,5
1000206,6040,562,5
1000207,6040,1096,4
