In [1]:
import numpy as np
import pandas as pd

from collections import Counter
from tqdm.notebook import tqdm

from sklearn.metrics import *
from utils.eval import *
from utils.funcs import *

In [2]:
data = pd.read_csv("../data/User Interaction Data/dating_suggestions.csv")

test_period_start, test_period_end = '2022-02-01', '2022-02-15'
train_interaction = data[(data['created_at'] > '2021-11-30') & (data['created_at'] < test_period_start)]
test_interaction = data[(data['created_at'] > test_period_start) & (data['created_at'] < test_period_end)]

unique_users = list(set(pd.concat([train_interaction["source_id"], train_interaction["user_id"]])))
test_interaction = test_interaction[(test_interaction["user_id"].isin(unique_users)) & (test_interaction["source_id"].isin(unique_users))]
print(len(unique_users))

user_to_idx, idx_to_user = {}, {}
for i in tqdm(range(len(unique_users))):
  user_to_idx[unique_users[i]] = i
  idx_to_user[i] = unique_users[i]

21392


  0%|          | 0/21392 [00:00<?, ?it/s]

In [3]:
def jaccard(list1, list2):
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    return float(intersection) / union

def cf4(uid, sid):
    sxy, syx = 0, 0
        
    nei1 = train_accepted[train_accepted["source_id"] == sid]["user_id"].values
    nei2 = train_accepted[train_accepted["user_id"] == uid]["source_id"].values

    if len(nei1) > 0:
        for u in nei1:
            sxy += jaccard(train_accepted[train_accepted["user_id"] == u]["source_id"].values, nei2)
        sxy = sxy/len(nei1)

    if len(nei2) >0:
        for v in nei2:
            syx += jaccard(train_accepted[train_accepted["source_id"] == v]["user_id"].values, nei1)
        syx = syx/len(nei2)

    if sxy >0 and syx > 0:
        return 2/((1/sxy) +(1/syx))
    else:
        return 0

In [3]:
np.random.seed(777)
num_train, num_test = 1000000, 50000

idx = np.random.choice(np.arange(len(train_interaction)), num_train, replace=False)
train_data = train_interaction.iloc[idx]
train_accepted = train_data[train_data["accepted"] == 1]

idx = np.random.choice(np.arange(len(test_interaction)), num_test, replace=False)
test_data = test_interaction.iloc[idx]

cnt = Counter(train_interaction['user_id'])
unique_test_users = test_interaction['user_id'].unique()
to_include = []
threshold = 10
for u in unique_test_users:
  if cnt[u] > threshold:
    to_include.append(u)

test_data = test_interaction[test_interaction['user_id'].isin(to_include)]

In [8]:
train_accepted

Unnamed: 0,user_id,source_id,created_at,accepted
3476030,243387,261223,2022-01-07 09:25:50,1.0
903268,104601,254963,2021-12-01 20:03:58,1.0
366899,257106,27804,2022-01-07 03:16:59,1.0
4337033,88,260593,2022-01-18 10:13:04,1.0
827645,155263,254694,2021-12-01 01:17:34,1.0
...,...,...,...,...
5122472,228682,266915,2022-01-30 09:40:08,1.0
2747641,118430,259878,2021-12-28 13:44:31,1.0
554596,266190,3707,2022-01-25 10:22:20,1.0
1862412,226503,257624,2021-12-15 11:41:56,1.0


In [10]:
uids, sids = test_data['user_id'].values, test_data['source_id'].values
y_pred  =[]

for i in tqdm(range(len(test_data))):
    y_pred.append(cf4(uids[i], sids[i],train_accepted))

  0%|          | 0/155157 [00:00<?, ?it/s]

In [11]:
y_test = test_data["accepted"].values

print("ROAUC: ", roc_auc_score(y_test, y_pred))
print("PRAUC: ", average_precision_score(y_test, y_pred))  

ROAUC:  0.6855450393372671
PRAUC:  0.2889522051482133


In [12]:
unique_test_users = test_data["user_id"].unique()
test_data["y_prob"] = y_pred

ks = np.arange(1, 11)
# ks = [10]

for k in ks:
  hrs, ndcgs= [], []
  for uid in unique_test_users:
    pdf = test_data[test_data["user_id"] == uid]
    gt = pdf[pdf["accepted"]== 1]["source_id"].values
    pr = pdf["y_prob"].values
    items = pdf["source_id"].values
    if len(gt) > 2:
      hrs.append(hit_ratio(gt, items, pr, k))
      ndcgs.append(ndcg(gt, items, pr, k))
  print(k, np.mean(hrs), np.mean(ndcgs))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data["y_prob"] = y_pred


1 0.10904895884741109 0.5718700039338468
2 0.20908038446953933 0.5643696845054651
3 0.3016051860266215 0.5516562495410107
4 0.37531795319423344 0.5533591851918221
5 0.43800737949670815 0.5627600559759443
6 0.4922786018251689 0.5741648965729019
7 0.538833314711305 0.5861800427643881
8 0.5773189309486506 0.5960539380090132
9 0.6119211876977428 0.6069827459522013
10 0.6429012106995483 0.6170697705984763
