In [2]:
import sys
import pandas as pd

from preprocess import train_val_test_split, prepare_splitted_data

In [29]:
PROJECT_PATH = '~/Projects/negative_feedback/'
DATA_PATH = f'{PROJECT_PATH}/data/kion_genres/'

RELEVANCE_COL = 'watched_pct'
RELEVANCE_THRESHOLD = 15

In [30]:
path_to_df = '~/Projects/negative_feedback/data/interactions_df.csv'
data = pd.read_csv(path_to_df)

In [31]:
data.user_id.nunique()

962179

In [32]:
random_user_ids = data['user_id'].sample(n=50000, random_state=42)
data = data[data.user_id.isin(random_user_ids)]

In [43]:
len(data) / data.user_id.nunique() / data.item_id.nunique()

0.001611509104369135

In [34]:
data = data.rename(columns={'last_watch_dt': 'timestamp'})

In [35]:
data.watched_pct.median()

44.0

In [36]:
sum(data.watched_pct < 15) / len(data)

0.3597750799934417

In [15]:
data.duplicated(['user_id', 'item_id']).sum()

0

In [16]:
(data.groupby('item_id').count()['user_id'] < 5).mean()

0.43326550606677217

In [17]:
(data.groupby('user_id').count()['item_id'] < 5).mean()

0.2350282236726054

In [18]:
data = data[data.total_dur > 300]

In [19]:
data = data.sort_values(['user_id', 'timestamp'])

In [20]:
item_dict = dict(zip(data['item_id'], data['item_id'].astype('category').cat.codes))

In [21]:
data

Unnamed: 0,user_id,item_id,timestamp,total_dur,watched_pct
3599591,9,15297,2021-07-26,16861,63.0
1613143,9,9728,2021-07-28,7956,100.0
4959141,9,13865,2021-07-28,7744,100.0
4365617,9,3076,2021-07-30,761,12.0
1710329,9,10440,2021-08-01,13929,60.0
...,...,...,...,...,...
5148539,1097508,5220,2021-08-08,18877,25.0
1908283,1097508,3777,2021-08-11,894,3.0
5443553,1097508,9985,2021-08-22,2409,7.0
2575024,1097514,11778,2021-06-29,3036,42.0


In [22]:
train, val, test = train_val_test_split(data, RELEVANCE_THRESHOLD, RELEVANCE_COL, test_quantile=0.9)

Filtering items..
Number of items before 11650
Number of items after 6366
Interactions length before: 762253
Interactions length after: 752396
Filtering users..
Number of users before 43393
Number of users after 31153
Interactions length before: 752396
Interactions length after: 723924


In [28]:
import numpy as np
np.mean(pd.concat([train, val, test]).watched_pct < 15)

0.1891416726808882

In [29]:
val.user_id.nunique()

7194

In [30]:
train.user_id.nunique()

30432

In [31]:
train.groupby('user_id')['item_id'].agg('count').describe()

count    30432.000000
mean        21.220262
std         24.160900
min          1.000000
25%          8.000000
50%         13.000000
75%         25.000000
max        448.000000
Name: item_id, dtype: float64

In [32]:
def filter_last_positive(df):

    df = df.sort_values(['user_id', 'timestamp'])
    last_item = df.groupby('user_id').tail(1)
    last_item = last_item[last_item[RELEVANCE_COL] >= RELEVANCE_THRESHOLD]
    df = df[df['user_id'].isin(last_item['user_id'])]

    return df

In [34]:
val = filter_last_positive(val)

In [35]:
val.user_id.nunique()

5451

In [37]:
train.to_parquet(f'{DATA_PATH}/train.parquet', index=False)
test.to_parquet(f'{DATA_PATH}/test.parquet', index=False)
val.to_parquet(f'{DATA_PATH}/validation.parquet', index=False)

In [38]:
path_to_items = '~/Projects/negative_feedback/data/items.csv'
items = pd.read_csv(path_to_items)

In [83]:
items['item_id'] = items['item_id'].map(item_dict)

In [84]:
items.to_parquet(f'{DATA_PATH}/items.parquet', index=False)