In [11]:
import feather
import pandas as pd
import numpy as np

from tqdm import tqdm
from functools import partial

In [2]:
# Read both interaction shards, keep only rows whose interaction_type is
# non-zero, and stack them into a single frame with a fresh 0..n-1 index.
parts = []
for path in ('df/df_interactions_1.feather', 'df/df_interactions_2.feather'):
    part = feather.read_dataframe(path)
    parts.append(part[part.interaction_type != 0])

# The `ts` column is not used by anything below, so it is dropped right away.
df_int = pd.concat(parts).reset_index(drop=True).drop(columns='ts')

# Release the per-shard frames; only the combined frame is needed from here on.
del parts, part, path

In [6]:
# Remove exact duplicate rows (same user, item and interaction type once `ts`
# is gone). Rebinding instead of `inplace=1`: pandas validates `inplace` as a
# strict bool, so passing the integer 1 raises ValueError on current releases.
df_int = df_int.drop_duplicates()

In [8]:
def to_set(g):
    """Collect the values of the iterable `g` into a set (duplicates collapse)."""
    unique = set()
    unique.update(g)
    return unique

# One row per (user_id, item_id) pair; `interaction_type` becomes the set of
# all interaction codes observed for that pair.
# `.reset_index()` is chained instead of calling `reset_index(inplace=1)`:
# pandas requires `inplace` to be a real bool and rejects the integer 1.
int_dict = (
    df_int
    .groupby(by=['user_id', 'item_id'])
    .agg({'interaction_type': to_set})
    .reset_index()
)

In [13]:
# User table; its `premium` column is read further down via positional lookup.
df_users = feather.read_dataframe('df/df_users.feather')

# user_id -> row index in df_users. A later cell calls `uid_idx.get` but the
# mapping was never defined in this notebook (it only existed as leftover
# kernel state), so it is built here following the same pattern as iid_idx.
# NOTE(review): assumes df_users has a `user_id` column and the default
# 0..n-1 index, since the result is fed to `.iloc` — confirm.
uid_idx = dict(zip(df_users.user_id, df_users.index))

# NOTE(review): `df_items` is not loaded anywhere in the visible notebook, so
# on a fresh kernel this line raises NameError. Load the items table before
# this point (or remove the line if iid_idx is unused) — confirm with author.
iid_idx = dict(zip(df_items.item_id, df_items.index))

In [15]:
def is_in(s, t):
    """Return whether `t` is a member of the container `s`.

    `t` comes second so that it can be pre-bound by keyword with
    `functools.partial`, leaving `s` as the only remaining argument.
    """
    found = t in s
    return found

# Quick sanity check of the partial-application pattern used below.
partial(is_in, t=1)({1, 2})

True

In [10]:
# (flag column name, interaction_type code) pairs, in the order the flag
# columns are created.
codes = list({
    'clicked': 1,
    'bookmarked': 2,
    'replied': 3,
    'deleted': 4,
    'recruiter_interested': 5,
}.items())

In [16]:
# Add one boolean column per entry in `codes`: the column is True when the
# pair's set of interaction codes contains that entry's code.
for col, t in tqdm(codes):
    int_dict[col] = int_dict['interaction_type'].apply(partial(is_in, t=t))

# True only when the set of codes is exactly {4}, i.e. code 4 and nothing else.
int_dict['delete_only'] = int_dict['interaction_type'] == {4}

100%|██████████| 5/5 [00:30<00:00,  6.67s/it]


In [18]:
# The set-valued column has been expanded into flag columns and is not needed
# any more.
int_dict = int_dict.drop(columns=['interaction_type'])

In [20]:
# Row-wise mask: the pair has a bookmark or a reply (or both).
bookmark_or_replied = int_dict['bookmarked'].eq(1) | int_dict['replied'].eq(1)

In [21]:
# Translate each row's user_id through the uid_idx lookup; the result is used
# below as a positional indexer into df_users.
uidx = int_dict['user_id'].apply(uid_idx.get)

In [22]:
# Pull the `premium` value of each row's user by position, then renumber the
# result 0..n-1 so it lines up with int_dict's index on assignment.
is_premium = df_users['premium'].iloc[uidx].reset_index(drop=True)

# Boost is the premium value shifted up by one.
premium_boost = 1 + is_premium
int_dict['premium_boost'] = premium_boost

In [23]:
# Weighted sum of the interaction flags:
#   click +1, bookmark-or-reply +5, recruiter interest +20, delete-only -10.
int_dict['score'] = (
    int_dict.clicked * 1
    + bookmark_or_replied * 5
    + int_dict.recruiter_interested * 20
    - int_dict.delete_only * 10
)

In [25]:
# All-zero sentinel row that is appended after the real (user, item) rows.
no_interaction = {
    'item_id': 0,
    'user_id': 0,
    'bookmarked': 0,
    'clicked': 0,
    'delete_only': 0,
    'deleted': 0,
    'recruiter_interested': 0,
    'replied': 0,
    'premium_boost': 0,
    'score': 0,
}

# `DataFrame.append` was removed in pandas 2.0. Concatenating a one-row frame
# gives the same result: the sentinel becomes the last row, the index is
# renumbered, and the column order follows int_dict.
df_interactions = pd.concat(
    [int_dict, pd.DataFrame([no_interaction])],
    ignore_index=True,
)

# Shrink every column to a compact integer dtype:
#   *_id columns -> uint32, score -> int8 (it can be negative),
#   everything else -> uint8.
for c in df_interactions.columns:
    if c.endswith('_id'):
        target = 'uint32'
    elif c == 'score':
        target = 'int8'
    else:
        target = 'uint8'
    df_interactions[c] = df_interactions[c].astype(target)

df_interactions.dtypes

user_id                 uint32
item_id                 uint32
clicked                  uint8
bookmarked               uint8
replied                  uint8
deleted                  uint8
recruiter_interested     uint8
delete_only              uint8
premium_boost            uint8
score                     int8
dtype: object

In [26]:
# Save the scored interaction table (including the all-zero sentinel row) to
# a feather file.
feather.write_dataframe(df_interactions, 'df/df_int_scores.feather')

In [28]:
# Distribution of the final scores, as a quick sanity check of the weighting.
df_interactions['score'].value_counts()

 1     4764541
-10     856849
 6      327172
 5       31749
 20       2356
 21        130
 26         14
 25          2
 0           1
Name: score, dtype: int64