In [3]:
import pandas as pd

from preprocess import train_val_test_split, prepare_splitted_data

In [4]:
# Filesystem locations. Both are stored WITHOUT a trailing slash so that joins
# written as f'{BASE}/child' never produce doubled separators ('//').
PROJECT_PATH = '~/Projects/negative_feedback'
DATA_PATH = f'{PROJECT_PATH}/data/gg'

# Column holding the explicit feedback value, and the minimum value of that
# column that is treated as a positive (relevant) interaction.
RELEVANCE_COL = 'rating'
RELEVANCE_THRESHOLD = 5

In [5]:
# Build the input path from PROJECT_PATH instead of repeating the project root,
# so relocating the project only requires editing the config cell.
# rstrip('/') makes the join independent of whether PROJECT_PATH ends with a slash.
path_to_df = f"{PROJECT_PATH.rstrip('/')}/data/reviews_Toys_and_Games_5.json"

# lines=True: the file is read as JSON Lines (one JSON record per line).
data = pd.read_json(path_to_df, lines=True)

In [6]:
# Median of the raw rating column (still named 'overall' at this point).
data['overall'].median()

5.0

In [7]:
# Share of reviews rated below the relevance threshold.
# The mean of a boolean mask is its fraction of True values; this stays vectorised
# (no element-by-element built-in sum) and follows RELEVANCE_THRESHOLD if it changes.
(data['overall'] < RELEVANCE_THRESHOLD).mean()

0.3866835325214652

In [8]:
# Keep only the four interaction fields and switch to the generic column names
# used by the rest of the notebook.
data = (
    data
    .loc[:, ['reviewerID', 'asin', 'overall', 'unixReviewTime']]
    .rename(columns={
        'reviewerID': 'user_id',
        'asin': 'item_id',
        'overall': 'rating',
        'unixReviewTime': 'timestamp',
    })
)

In [9]:
# Replace raw identifiers with integer category codes (one code per distinct value).
data['user_id'] = data['user_id'].astype('category').cat.codes
data['item_id'] = data['item_id'].astype('category').cat.codes

In [10]:
# Number of rows that repeat an earlier (user, item) pair; 0 means each pair occurs at most once.
data.duplicated(subset=['user_id', 'item_id']).sum()

0

In [11]:
# Fraction of items that have fewer than 5 interactions.
data.groupby('item_id')['user_id'].count().lt(5).mean()

0.0

In [12]:
# Fraction of users that have fewer than 5 interactions.
data.groupby('user_id')['item_id'].count().lt(5).mean()

0.0

In [13]:
# Split the interactions into train / validation / test with the project helper.
# NOTE(review): test_quantile=0.9 presumably defines a timestamp-based cut-off —
# confirm against preprocess.train_val_test_split.
train, val, test = train_val_test_split(
    data,
    RELEVANCE_THRESHOLD,
    RELEVANCE_COL,
    test_quantile=0.9,
)

Filtering items..
Number of items before 11924
Number of items after 11924
Interactions length before: 167597
Interactions length after: 167597
Filtering users..
Number of users before 19412
Number of users after 19412
Interactions length before: 167597
Interactions length after: 167597


In [14]:
# Number of distinct users in the validation split.
val['user_id'].nunique()

2316

In [15]:
# Number of distinct users in the training split.
train['user_id'].nunique()

18776

In [16]:
# Summary statistics of the number of training interactions per user.
train.groupby('user_id')['item_id'].count().describe()

count    18776.000000
mean         8.032861
std          8.282813
min          1.000000
25%          5.000000
50%          6.000000
75%          8.000000
max        550.000000
Name: item_id, dtype: float64

In [17]:
def filter_last_positive(df, relevance_col=None, relevance_threshold=None):
    """Keep only the users whose chronologically last interaction is positive.

    Rows are ordered by ``user_id`` and ``timestamp``; for every user the final
    row is inspected, and the user is kept only if that row's relevance value
    is greater than or equal to the threshold. All rows of the kept users are
    returned, not just the last one.

    Parameters
    ----------
    df : pandas.DataFrame
        Interactions with at least ``user_id``, ``timestamp`` and the
        relevance column.
    relevance_col : str, optional
        Column compared against the threshold. Defaults to the notebook-level
        ``RELEVANCE_COL``.
    relevance_threshold : number, optional
        Minimum value of ``relevance_col`` for the last interaction to count
        as positive. Defaults to the notebook-level ``RELEVANCE_THRESHOLD``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` belonging to the kept users, sorted by
        ``user_id`` then ``timestamp``. The input frame is not modified.
    """
    # Fall back to the notebook constants at call time (not in the signature),
    # so the function can also be used where those globals are not defined.
    if relevance_col is None:
        relevance_col = RELEVANCE_COL
    if relevance_threshold is None:
        relevance_threshold = RELEVANCE_THRESHOLD

    ordered = df.sort_values(['user_id', 'timestamp'])

    # One row per user: the final one in (user_id, timestamp) order.
    # NOTE(review): when a user has several rows sharing the latest timestamp,
    # which of them ends up last is decided by the sort's tie order — confirm
    # this is acceptable for the data at hand.
    last_per_user = ordered.groupby('user_id').tail(1)

    is_positive = last_per_user[relevance_col] >= relevance_threshold
    kept_users = last_per_user.loc[is_positive, 'user_id']

    return ordered[ordered['user_id'].isin(kept_users)]

In [18]:
# Restrict validation to users whose latest interaction meets the relevance threshold.
val = filter_last_positive(val)

In [19]:
# Distinct validation users remaining after the last-positive filter.
val['user_id'].nunique()

1483

In [22]:
# Persist each split as <DATA_PATH>/<name>.parquet, without writing the index.
for split_name, split_frame in (('train', train), ('test', test), ('validation', val)):
    split_frame.to_parquet(f'{DATA_PATH}/{split_name}.parquet', index=False)