In [1]:
import feather
import pandas as pd
import numpy as np

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Read both interaction dumps, keep only rows whose interaction_type is
# non-zero, and stack them into a single frame without the `ts` column.
interaction_parts = []
for feather_path in ('df/df_interactions_1.feather', 'df/df_interactions_2.feather'):
    part = feather.read_dataframe(feather_path)
    interaction_parts.append(part[part.interaction_type != 0].reset_index(drop=True))

df_int = pd.concat(interaction_parts).reset_index(drop=True)
del df_int['ts']
# Release the temporaries so only df_int stays in the kernel namespace.
del interaction_parts, part, feather_path

In [3]:
# The file has no header row; the item ids are taken from its first column.
val_items = pd.read_csv('tmp/val_items.txt', header=None).iloc[:, 0]
# Set form of the same ids, used for membership tests further down.
vi = set(val_items)

In [4]:
# Remove every interaction that touches an item from the held-out set `vi`.
is_val_item = df_int.item_id.isin(vi)
df_int = df_int.loc[~is_val_item].reset_index(drop=True)
del is_val_item

In [5]:
df_users = feather.read_dataframe('df/df_users.feather')
df_items = feather.read_dataframe('df/df_items.feather')

# Lookup tables from an id to the index label of its row in the frame above.
uid_idx = {user_id: row for user_id, row in zip(df_users.user_id, df_users.index)}
iid_idx = {item_id: row for item_id, row in zip(df_items.item_id, df_items.index)}

In [7]:
# Tokens are runs of digits only (token_pattern=r'\d+'); counts are stored as uint8.
# The vectorizer is fitted on the item titles, and the user job roles are then
# encoded with that same fitted vectorizer, so both matrices use the same columns.
cv = CountVectorizer(token_pattern=r'\d+', dtype=np.uint8)
X_item_titles = cv.fit_transform(df_items.title)
X_user_job = cv.transform(df_users.jobroles)

In [8]:
# Columns compared between a user row and an item row to build the
# per-column equality features.
common_columns = [
    'career_level',
    'discipline_id',
    'industry_id',
    'country',
    'region',
]

In [9]:
# Translate each interaction's ids into row positions, then gather the matching
# user and item rows so that row k of both frames belongs to interaction k.
uidx = df_int.user_id.map(uid_idx.get)
df_int_users = df_users.iloc[uidx].reset_index(drop=True)

iidx = df_int.item_id.map(iid_idx.get)
df_int_items = df_items.iloc[iidx].reset_index(drop=True)

In [10]:
# 1 where the user and the item agree on a shared attribute, 0 otherwise.
user_attrs = df_int_users[common_columns]
item_attrs = df_int_items[common_columns]
df_features = (user_attrs == item_attrs).astype('uint8')
del user_attrs, item_attrs

In [11]:
# Row-wise dot product between the item-title counts and the user job-role
# counts of each interaction, flattened to a 1-D array.
item_counts = X_item_titles[iidx.values]
user_counts = X_user_job[uidx.values]
title_match = np.asarray(item_counts.multiply(user_counts).sum(axis=1)).ravel()
del item_counts, user_counts

In [12]:
# Attach the title/job-role overlap as an extra feature column.
df_features.loc[:, 'title_match'] = title_match

In [13]:
# Drop the large intermediates that are no longer needed, then ask the
# garbage collector to reclaim them.
del X_item_titles, X_user_job
del uidx, iidx, title_match
del df_int_items, df_int_users
import gc
gc.collect()

52

In [14]:
# Target is 1 for every interaction type other than 4, and 0 for type 4.
df_features['target'] = df_int.interaction_type.ne(4).astype('uint8')

In [15]:
# Train only on pairs with a positive title/job-role overlap.
has_title_match = df_features.title_match.gt(0)
df_features = df_features.loc[has_title_match].reset_index(drop=True)
del has_title_match

In [16]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20; the function now
# lives in `sklearn.model_selection`. Fall back to the old module only on
# installations that predate `model_selection`.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split



In [17]:
# Hold out 10% of the rows for validation; the seed is fixed for repeatability.
df_train, df_val = train_test_split(
    df_features,
    test_size=0.1,
    random_state=1,
)

In [18]:
# Every column except the label, in sorted order so the layout is stable.
features = sorted(set(df_train.columns).difference(['target']))

In [19]:
# Plain arrays for the training rows: feature matrix and label vector.
X_train, y_train = df_train[features].values, df_train['target'].values

In [20]:
# Plain arrays for the validation rows: feature matrix and label vector.
X_val, y_val = df_val[features].values, df_val['target'].values

In [21]:
import xgboost as xgb

In [22]:
# Wrap both splits the same way; the watchlist makes xgboost report the
# metric on each of them after every round.
dtrain, dval = (
    xgb.DMatrix(matrix, label=labels, feature_names=features)
    for matrix, labels in ((X_train, y_train), (X_val, y_val))
)
watchlist = [(dtrain, 'train'), (dval, 'val')]

In [23]:
# Booster parameters, assembled in the same key order as before.
xgb_pars = dict(
    eta=0.1,
    gamma=0,
    max_depth=2,
    min_child_weight=1,
    max_delta_step=0,
    subsample=1,
    colsample_bytree=1,
    colsample_bylevel=1,
)
# `lambda` is a Python keyword, so it has to be set by subscript.
xgb_pars['lambda'] = 1
xgb_pars.update(alpha=0, tree_method='approx')
# not defaults
xgb_pars.update(
    objective='reg:linear',
    eval_metric='rmse',
    nthread=8,
    seed=42,
    silent=1,
    base_score=0.0,
)

# Number of boosting rounds passed to xgb.train.
num_round = 25

In [24]:
# Train for num_round rounds, printing the watchlist metrics every round.
model = xgb.train(
    xgb_pars,
    dtrain,
    num_boost_round=num_round,
    evals=watchlist,
    verbose_eval=1,
)

[0]	train-rmse:0.838921	val-rmse:0.838582
[1]	train-rmse:0.771652	val-rmse:0.771348
[2]	train-rmse:0.712524	val-rmse:0.712259
[3]	train-rmse:0.660762	val-rmse:0.660541
[4]	train-rmse:0.615653	val-rmse:0.615479
[5]	train-rmse:0.576534	val-rmse:0.576409
[6]	train-rmse:0.542782	val-rmse:0.542708
[7]	train-rmse:0.513816	val-rmse:0.513795
[8]	train-rmse:0.489094	val-rmse:0.489128
[9]	train-rmse:0.468108	val-rmse:0.468196
[10]	train-rmse:0.450391	val-rmse:0.450533
[11]	train-rmse:0.435508	val-rmse:0.435701
[12]	train-rmse:0.423067	val-rmse:0.423308
[13]	train-rmse:0.4127	val-rmse:0.412986
[14]	train-rmse:0.404114	val-rmse:0.404443
[15]	train-rmse:0.397011	val-rmse:0.397381
[16]	train-rmse:0.391165	val-rmse:0.391573
[17]	train-rmse:0.386358	val-rmse:0.386799
[18]	train-rmse:0.38242	val-rmse:0.382893
[19]	train-rmse:0.379194	val-rmse:0.379695
[20]	train-rmse:0.376561	val-rmse:0.377089
[21]	train-rmse:0.374414	val-rmse:0.374964
[22]	train-rmse:0.372659	val-rmse:0.37323
[23]	train-rmse:0.371218	

## Validation

In [25]:
from tqdm import tqdm

In [26]:
# Validation user ids come from the `user_id` column of the file.
val_users = pd.read_csv('tmp/val_users2.txt')['user_id']
# Set form of the same ids for membership tests.
vu = set(val_users)

In [27]:
df_scores = feather.read_dataframe('df/df_int_scores.feather')
uval = df_scores.user_id.isin(vu)
ival = df_scores.item_id.isin(vi)
# Keep only (validation user, validation item) rows and re-attach the final row
# of the full table at the end, so that positional index -1 (the fallback used
# when a pair has no score entry) still resolves to that row.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` with a one-row
# frame (`iloc[[-1]]`) is the supported equivalent and preserves column dtypes.
df_scores = pd.concat([df_scores[uval & ival], df_scores.iloc[[-1]]]).reset_index(drop=True)

In [28]:
# Map each (user_id, item_id) pair to the index label of its row in df_scores.
scores_idx = {
    (user_id, item_id): row
    for user_id, item_id, row in zip(df_scores.user_id, df_scores.item_id, df_scores.index)
}
# The id columns are no longer needed once the lookup table exists.
for key_column in ('user_id', 'item_id'):
    del df_scores[key_column]
del key_column

In [29]:
# Rows of df_users for the validation users, in val_users order, together with
# their job roles encoded by the fitted vectorizer.
batch_uids = list(map(uid_idx.__getitem__, val_users))
df_candidates = df_users.iloc[batch_uids].reset_index(drop=True)
user_roles = cv.transform(df_candidates.jobroles)

In [34]:
# For the first 10000 validation items, collect one (prediction, delete flag)
# pair per candidate user whose job roles overlap with the item title.
delete_score = []

for item_id in tqdm(val_items[:10000]):
    item = df_items.iloc[iid_idx[item_id]]
    item_title = cv.transform([item.title])

    # Dot product of each candidate's job-role counts with the title counts.
    title_match = user_roles.dot(item_title.T).toarray().reshape(-1)

    # Select the overlapping users first, instead of copying the whole
    # candidate frame for every item and filtering afterwards.
    has_match = title_match > 0
    if not has_match.any():
        continue

    batch_res = df_candidates[has_match].reset_index(drop=True)
    batch_res['title_match'] = title_match[has_match]

    eq_features = (item[common_columns] == batch_res[common_columns]).astype('uint8')
    eq_features['title_match'] = batch_res.title_match

    X = eq_features[features].values
    dtest = xgb.DMatrix(X, feature_names=features)

    batch_res['pred'] = model.predict(dtest)
    # pandas validates `inplace` as a real bool, so `inplace=1` raises
    # ValueError on current versions; rebind the sorted frame instead.
    batch_res = batch_res.sort_values(by='pred', ascending=False)

    # Pairs absent from scores_idx fall back to position -1, the last row of df_scores.
    scores_join_idx = [scores_idx.get((u, item_id), -1) for u in batch_res.user_id]
    batch_scores = df_scores.iloc[scores_join_idx].reset_index(drop=True)

    is_delete = batch_scores.deleted + batch_scores.delete_only

    # zip pairs the two sequences by position; both are in sorted-by-pred order.
    delete_score.extend(zip(batch_res['pred'], is_delete))

100%|██████████| 10000/10000 [05:14<00:00, 31.76it/s]


In [37]:
# One row per collected (prediction, delete flag) pair.
delete_columns = ['score', 'is_delete']
df_deletes = pd.DataFrame(data=delete_score, columns=delete_columns)
del delete_columns

In [38]:
from sklearn.metrics import roc_auc_score

In [44]:
# Collapse the summed delete flags into a 0/1 label (1 when the sum is >= 1).
df_deletes['is_delete'] = df_deletes['is_delete'].ge(1).astype('uint8')

In [45]:
# AUC of the model prediction as a ranking score for the delete label.
roc_auc_score(
    y_true=df_deletes['is_delete'].values,
    y_score=df_deletes['score'].values,
)

0.44673225107789977

In [51]:
# Score the first 10000 validation items: for each item, take up to 100 users
# with prediction > 0.8 and add up their recorded scores plus an item bonus.
total_sum = 0
scores = []

for item_id in tqdm(val_items[:10000]):
    item = df_items.iloc[iid_idx[item_id]]
    item_title = cv.transform([item.title])

    # Dot product of each candidate's job-role counts with the title counts.
    title_match = user_roles.dot(item_title.T).toarray().reshape(-1)

    # Select the overlapping users first, instead of copying the whole
    # candidate frame for every item and filtering afterwards.
    has_match = title_match > 0
    if not has_match.any():
        continue

    batch_res = df_candidates[has_match].reset_index(drop=True)
    batch_res['title_match'] = title_match[has_match]

    eq_features = (item[common_columns] == batch_res[common_columns]).astype('uint8')
    eq_features['title_match'] = batch_res.title_match

    X = eq_features[features].values
    dtest = xgb.DMatrix(X, feature_names=features)

    batch_res['pred'] = model.predict(dtest)

    batch_res = batch_res[batch_res['pred'] > 0.8].reset_index(drop=True)
    if len(batch_res) == 0:
        continue

    # pandas validates `inplace` as a real bool, so `inplace=1` raises
    # ValueError on current versions; rebind the sorted frame instead.
    batch_res = batch_res.sort_values(by='pred', ascending=False)
    batch_res = batch_res.iloc[:100].reset_index(drop=True)

    # Pairs absent from scores_idx fall back to position -1, the last row of df_scores.
    scores_join_idx = [scores_idx.get((u, item_id), -1) for u in batch_res.user_id]
    batch_scores = df_scores.iloc[scores_join_idx].reset_index(drop=True)
    batch_res['score'] = batch_scores.premium_boost * batch_scores.score

    # Item bonus: granted when any selected user clicked, bookmarked or
    # replied; 50 if the item has is_paid == 1, otherwise 25.
    item_score = 0

    if batch_scores[['clicked', 'bookmarked', 'replied']].values.sum() > 0:
        if item.is_paid == 1:
            item_score = 50
        else:
            item_score = 25

    s = batch_res.score.sum() + item_score
    scores.append(s)

    total_sum = total_sum + s

100%|██████████| 10000/10000 [05:40<00:00, 29.41it/s]


In [1]:
# Mean score per validation item. The divisor mirrors the `[:10000]` slice used
# in the scoring loop; items that loop skipped add nothing to total_sum but are
# still counted here.
total_sum / 10000

NameError: name 'total_sum' is not defined

## Submission

In [180]:
# Reuse the validation variable names for the submission targets.
# targetItems.csv is read without a header row (ids from the first column);
# targetUsers.csv provides a `user_id` column.
val_items = pd.read_csv('data/targetItems.csv', header=None).iloc[:, 0]
val_users = pd.read_csv('data/targetUsers.csv')['user_id']

In [182]:
# Rows of df_users for the target users, in val_users order, together with
# their job roles encoded by the fitted vectorizer.
batch_uids = list(map(uid_idx.__getitem__, val_users))
df_candidates = df_users.iloc[batch_uids].reset_index(drop=True)
user_roles = cv.transform(df_candidates.jobroles)

In [183]:
# Build the submission: for every target item, up to 100 user ids with
# prediction > 0.8, ordered by descending prediction. Items with no qualifying
# user are left out of `res`.
res = []

for item_id in tqdm(val_items):
    item = df_items.iloc[iid_idx[item_id]]
    item_title = cv.transform([item.title])

    # Dot product of each candidate's job-role counts with the title counts.
    title_match = user_roles.dot(item_title.T).toarray().reshape(-1)

    # Select the overlapping users first, instead of copying the whole
    # candidate frame for every item and filtering afterwards.
    has_match = title_match > 0
    if not has_match.any():
        continue

    batch_res = df_candidates[has_match].reset_index(drop=True)
    batch_res['title_match'] = title_match[has_match]

    eq_features = (item[common_columns] == batch_res[common_columns]).astype('uint8')
    eq_features['title_match'] = batch_res.title_match

    X = eq_features[features].values
    dtest = xgb.DMatrix(X, feature_names=features)

    batch_res['pred'] = model.predict(dtest)
    batch_res = batch_res[batch_res['pred'] > 0.8].reset_index(drop=True)
    if len(batch_res) == 0:
        continue

    # pandas validates `inplace` as a real bool, so `inplace=1` raises
    # ValueError on current versions; rebind the sorted frame instead.
    batch_res = batch_res.sort_values(by='pred', ascending=False)
    batch_res = batch_res.iloc[:100].reset_index(drop=True)

    user_ids = list(batch_res.user_id)

    res.append((item_id, user_ids))

100%|██████████| 46559/46559 [26:24<00:00, 29.39it/s]


In [185]:
output_file = 'benchmark.txt'

# One line per item: "<item_id>\t<comma-separated user ids>". No header row is
# written. The file is flushed and closed when the `with` block exits, so no
# per-line flush is needed.
with open(output_file, 'w') as fp:
    for item, users in tqdm(res):
        top_users_joined = ','.join(map(str, users))
        fp.write('%d\t%s\n' % (item, top_users_joined))

100%|██████████| 44904/44904 [00:08<00:00, 5201.42it/s]
