In [None]:
import ast
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.metrics.pairwise import cosine_similarity

In [219]:
# Number of leading sessions to work with. Features and labels must be cut to
# the same length: `evaluate` pairs predictions with labels via zip but divides
# by the length of the label list, so extra label rows deflate the score.
N_SESSIONS = 1000

# `nrows` stops parsing after N_SESSIONS rows and returns a standalone frame,
# so the column assignments below do not write into a slice of a larger frame.
df_train = pd.read_csv("datasets/x_train.csv", index_col=None, nrows=N_SESSIONS)
# NOTE(review): assumes row i of y_train.csv is the label for row i of
# x_train.csv — confirm against how the datasets were produced.
y_train = pd.read_csv("datasets/y_train.csv", index_col=None, nrows=len(df_train))

# Both columns hold Python list literals serialised as text; parse them back
# into real lists.
df_train["job_ids"] = df_train["job_ids"].apply(ast.literal_eval)
df_train["actions"] = df_train["actions"].apply(ast.literal_eval)
df_train.head()

Unnamed: 0,session_id,job_ids,actions
0,0,"[305, 299, 300, 290, 282, 274, 264, 261]","[view, view, view, view, view, view, view, view]"
1,1,"[84, 257, 252, 250]","[view, view, view, view]"
2,2,"[241, 237, 221, 309, 310, 306, 301]","[view, view, apply, apply, apply, apply, apply]"
3,3,"[303, 297, 296, 298, 294, 295, 292, 293]","[apply, apply, apply, apply, apply, apply, app..."
4,4,"[171, 291, 289, 166, 288, 155]","[apply, apply, apply, apply, apply, apply]"


In [220]:
# Every distinct job id that appears in at least one training session.
job_ids = set.union(*(set(session_jobs) for session_jobs in df_train.job_ids))

# Two-way lookup between a job id and its position in the iteration order of
# `job_ids`; that position is used as the job's column index further down.
jod_id_to_idx = {job_id: position for position, job_id in enumerate(job_ids)}
jod_idx_to_id = dict(enumerate(job_ids))

In [221]:
# Rewrite each session's job ids as their column indices.
# Not idempotent: running this cell twice would look indices up as if they
# were ids.
df_train["job_ids"] = df_train["job_ids"].apply(
    lambda session_jobs: [jod_id_to_idx[job_id] for job_id in session_jobs]
)

In [222]:
def indices_to_binary_vector(indices, size=None):
    """Return a multi-hot vector with ones at the given positions.

    Parameters
    ----------
    indices : sequence of int
        Positions to set to 1. Repeated positions are harmless and an empty
        sequence yields an all-zero vector.
    size : int, optional
        Length of the returned vector. When omitted it falls back to
        ``len(job_ids)``, i.e. the module-level collection of job ids, which
        keeps the one-argument call used with ``Series.apply`` working.

    Returns
    -------
    numpy.ndarray
        1-D float array (``np.zeros`` default dtype) of length ``size``.

    Raises
    ------
    IndexError
        If any position lies outside ``[-size, size)``.
    """
    if size is None:
        # Default to the global vocabulary so existing callers are unaffected.
        size = len(job_ids)
    binary_vector = np.zeros(size)
    binary_vector[indices] = 1
    return binary_vector

# Swap each session's list of column indices for its multi-hot vector.
df_train["job_ids"] = (
    df_train["job_ids"]
    .apply(indices_to_binary_vector)
    .to_list()
)

In [None]:
def get_prediction(K=20, T=.3, top_n=10):
    """Predict an action and a ranked job list for every row of ``df_train``.

    Each row's job vector (``df_train.job_ids``) is compared with every other
    row by cosine similarity. The ``K`` most similar *other* rows form the
    row's neighbourhood, and a job's score is the neighbourhood average of
    that job's column (the fraction of neighbours containing the job when the
    vectors are 0/1).

    Parameters
    ----------
    K : int
        Number of neighbours per row. Capped at ``n_rows - 1`` because a row
        is never its own neighbour.
    T : float
        A row is labelled ``'apply'`` when the mean score of its recommended
        jobs is strictly greater than ``T``; otherwise ``'view'``.
    top_n : int
        Number of jobs to recommend per row, capped at the number of job
        columns. Expected to be at least 1.

    Returns
    -------
    actions : list of str
        ``'apply'`` or ``'view'`` for each row, in row order.
    next_jobs : list of list
        For each row, the recommended job ids (translated back through
        ``jod_idx_to_id``), highest score first.
    """
    interaction_matrix = np.array(df_train.job_ids.to_list())
    n_rows, n_job_columns = interaction_matrix.shape
    n_neighbors = max(0, min(K, n_rows - 1))
    n_top = max(0, min(top_n, n_job_columns))

    similarity_matrix = cosine_similarity(interaction_matrix)
    # Exclude each row from its own neighbourhood explicitly. Dropping the last
    # argsort column is not reliable: a row with an identical job set ties at
    # the same similarity, and argsort's tie order is unspecified, so the row
    # itself could survive while a genuine neighbour is dropped.
    np.fill_diagonal(similarity_matrix, -np.inf)
    # Slice from an explicit start so a count of 0 selects nothing
    # (a `-0:` slice would select everything).
    similar_users = np.argsort(similarity_matrix, axis=1)[:, n_rows - n_neighbors:][:, ::-1]

    # Turn the neighbour indices into a 0/1 row-by-row membership matrix.
    vectorized_similar_users = np.zeros((n_rows, n_rows))
    np.put_along_axis(vectorized_similar_users, similar_users, 1, axis=1)

    # Average over the neighbours actually selected, not the requested K, so
    # scores stay comparable when K exceeds the number of available rows.
    job_popularity = vectorized_similar_users @ interaction_matrix / max(n_neighbors, 1)
    popular_jobs = np.argsort(job_popularity, axis=1)[:, n_job_columns - n_top:][:, ::-1]
    sorted_popularity = np.take_along_axis(job_popularity, popular_jobs, axis=1)

    is_apply = sorted_popularity.mean(axis=1) > T
    actions = ['apply' if flag else 'view' for flag in is_apply]
    next_jobs = [[jod_idx_to_id[idx] for idx in job_list] for job_list in popular_jobs]

    return actions, next_jobs

In [224]:
def evaluate(actions_pred, next_job_pred, actions_true, next_jobs_true):
    """Score predictions as ``0.3 * action accuracy + 0.7 * MRR``.

    Parameters
    ----------
    actions_pred, actions_true : sequence
        Predicted and true actions, compared position by position with ``==``.
    next_job_pred : sequence of list
        Ranked job recommendations per session, best first.
    next_jobs_true : sequence
        The true next job per session.

    Returns
    -------
    float
        Weighted score. Accuracy is the number of matching actions divided by
        ``len(actions_true)``. MRR is the sum of ``1 / rank`` of the true job
        in its recommendation list (contributing 0 when absent) divided by
        ``len(next_jobs_true)``.

    Notes
    -----
    Predictions and labels are paired with ``zip``, so surplus items on either
    side are ignored, while the denominators always use the label lengths.
    """
    n_correct = 0
    for predicted_action, true_action in zip(actions_pred, actions_true):
        if predicted_action == true_action:
            n_correct += 1
    accuracy = n_correct / len(actions_true)

    reciprocal_rank_total = 0
    for ranked_jobs, true_job in zip(next_job_pred, next_jobs_true):
        if true_job not in ranked_jobs:
            continue
        rank = 1 + ranked_jobs.index(true_job)
        reciprocal_rank_total += 1 / rank
    mean_reciprocal_rank = reciprocal_rank_total / len(next_jobs_true)

    return 0.3 * accuracy + 0.7 * mean_reciprocal_rank

In [225]:
# Ground-truth labels as plain Python lists, in row order, for `evaluate`.
actions_true = y_train["action"].tolist()
next_jobs_true = y_train["job_id"].tolist()

In [226]:
# Grid search over the neighbourhood size K (rows of `results`) and the
# apply-threshold T (columns of `results`); each entry is the `evaluate` score
# for that (K, T) pair.
results = []
for K in tqdm(range(1, 100, 5)):
    scores_for_K = []
    for T in np.arange(0.1, 1, 0.1):
        actions, next_jobs = get_prediction(K, T)
        score = evaluate(actions, next_jobs, actions_true, next_jobs_true)
        scores_for_K.append(score)
    results.append(scores_for_K)

100%|██████████| 20/20 [00:59<00:00,  2.98s/it]
