In [3]:
import ast
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

### 1. Training part

In [4]:
df_train = pd.read_csv("datasets/x_train.csv", index_col=None)
y_train = pd.read_csv("datasets/y_train.csv", index_col=None)
actions_true = y_train.action.to_list()
next_jobs_true = y_train.job_id.to_list()
df_train.job_ids = df_train.job_ids.apply(ast.literal_eval)
df_train.actions = df_train.actions.apply(ast.literal_eval)

df_train.head()

Unnamed: 0,session_id,job_ids,actions
0,0,"[305, 299, 300, 290, 282, 274, 264, 261]","[view, view, view, view, view, view, view, view]"
1,1,"[84, 257, 252, 250]","[view, view, view, view]"
2,2,"[241, 237, 221, 309, 310, 306, 301]","[view, view, apply, apply, apply, apply, apply]"
3,3,"[303, 297, 296, 298, 294, 295, 292, 293]","[apply, apply, apply, apply, apply, apply, app..."
4,4,"[171, 291, 289, 166, 288, 155]","[apply, apply, apply, apply, apply, apply]"


In [5]:
job_ids = [set(job_list) for job_list in df_train.job_ids]
job_ids = set.union(*job_ids)
job_id_to_idx = {id: idx for idx, id in enumerate(job_ids)}
job_idx_to_id = {idx: id for id, idx in job_id_to_idx.items()}

df_train.job_ids = df_train.job_ids.apply(lambda ids : [job_id_to_idx[id] for id in ids])

def indices_to_binary_vector(indices):
    binary_vector = np.zeros(len(job_ids))
    binary_vector[indices] = 1
    return binary_vector

df_train.job_ids = df_train.job_ids.apply(indices_to_binary_vector).to_list()

# we create the interaction matrix
interaction_matrix_train = np.array(df_train.job_ids.to_list())

In [6]:
def get_prediction(interaction_matrix_test=None, K=20, T=.3):
    
    if type(interaction_matrix_test) != type(None):
        interaction_matrix_test = interaction_matrix_train

    similarity_matrix = cosine_similarity(interaction_matrix_test, interaction_matrix_train)
    similar_users = np.argsort(similarity_matrix, axis=1)[:, -K-1:-1][:, ::-1]

    # transform the indices into a binary vector
    vectorized_similar_users = np.zeros((interaction_matrix_test.shape[0], interaction_matrix_train.shape[0]))
    np.put_along_axis(vectorized_similar_users, similar_users, 1, axis=1)
    
    # compute the popularity of the jobs
    job_popularity = vectorized_similar_users @ interaction_matrix_train / K
    popular_jobs = np.argsort(job_popularity, axis=1)[:, -10:][:, ::-1]
    sorted_popularity = np.take_along_axis(job_popularity, popular_jobs, axis=1)

    actions = list((sorted_popularity.mean(axis=1) > T))
    actions = ['apply' if action else 'view' for action in actions]
    next_jobs = [list(map(lambda idx: job_idx_to_id[idx], job_list)) for job_list in popular_jobs]

    return actions, next_jobs

In [14]:
def evaluate(actions_pred, next_job_pred, actions_true, next_jobs_true):
    accuracy = sum([1 if pred == true else 0 for pred, true in zip(actions_pred, actions_true)]) / len(actions_true)
    MRR = 0
    for pred, true in zip(next_job_pred, next_jobs_true):
        if true in pred:
            MRR += 1 / (1 + pred.index(true))
    MRR /= len(next_jobs_true)
    print(f"Accuracy: {accuracy: .4f}, MRR: {MRR: .4f}, Score: {0.3 * accuracy + 0.7 * MRR: .4f}")
    return 0.3 * accuracy + 0.7 * MRR

### 2. Testing part

In [8]:
df_test = pd.read_csv("datasets/x_test.csv", index_col=None)
y_test = pd.read_csv("datasets/y_test.csv", index_col=None)
actions_true = y_test.action.to_list()
next_jobs_true = y_test.job_id.to_list()
df_test.job_ids = df_test.job_ids.apply(ast.literal_eval)
df_test.actions = df_test.actions.apply(ast.literal_eval)

df_test.job_ids = df_test.job_ids.apply(lambda ids : [job_id_to_idx[id] for id in ids if id in job_id_to_idx])
df_test.job_ids = df_test.job_ids.apply(indices_to_binary_vector).to_list()
interaction_matrix_test = np.array(df_test.job_ids.to_list())

In [9]:
actions, next_jobs = get_prediction(interaction_matrix_test)

In [15]:
_ = evaluate(actions, next_jobs, actions_true, next_jobs_true)

Accuracy:  0.5278, MRR:  0.0018, Score:  0.1596
