## Build ranking dataset for topN recommendation - Part 2
## Source: TF-IDF features (tabular)
## Method: Knn (nearest neighbor), metrics: cosine similarity

- Add label to the knn_tfidf_ranking_data

- Process:
    - For each application in the augmented dataset, check if they already exist in the original interaction data
    - If the applicaiont has existed in the interaction data: retrieve corresponding label (1 or 0)
    - If the application has not existed in interaction data, assign label 0

- Result: knn_tfidf_ranking_data_v2.csv
    - UserID
    - ItemID
    - label
    - City, State, Country

In [1]:
import pandas as pd
import numpy as np

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Run this for reproduce
# Gets the current working directory
import os
cwd = os.getcwd()
print("Working directory:", cwd)
# Go up one directory from working directory
os.chdir("..")

Working directory: /Users/anhtth/Library/CloudStorage/OneDrive-UniversityofTwente/2023 UT- THESIS/1-Code/0.cb12_main/nb_recsys_tabular


In [4]:
# Load ranking data (generated by KNN-TFIDF features)
ranking_data = pd.read_csv('./nb_recsys_tabular/ranking_data_knn_tfidf.csv')

In [5]:
# Load input: train_data_flat, test_data_flat
train_data_flat = pd.read_csv('./xai_posthoc/train_data_flat.csv')
test_data_flat = pd.read_csv('./xai_posthoc/test_data_flat.csv')

## Build evaluation data (add label to the ranking data for all pairs of interaction)
- Look up in the original interaction data
- If the interaction exists, get the label there
- If the interaction does not exist, assign 0

In [6]:
# Positive interaction (from original data)
select_cols = ['UserID', 'JobID', 'label']
select_train = train_data_flat[select_cols]
select_test = test_data_flat[select_cols]
interactions_df = pd.concat([select_train,select_test])

In [7]:
len(select_train), len(select_test), len(interactions_df)

(563889, 15736, 579625)

In [8]:
interactions_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 579625 entries, 0 to 15735
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   UserID  579625 non-null  int64
 1   JobID   579625 non-null  int64
 2   label   579625 non-null  int64
dtypes: int64(3)
memory usage: 17.7 MB


In [9]:
test_uid = ranking_data.loc[0].UserID
test_jid = ranking_data.loc[0].JobID

In [10]:
check_interaction = [(u_id, j_id) for (u_id, j_id) in zip(interactions_df.UserID, interactions_df.JobID)]

In [11]:
len(set(check_interaction))

579625

In [12]:
len(check_interaction)

579625

In [13]:
(test_uid, test_jid)

(13.0, 824471.0)

In [14]:
(test_uid, test_jid) in check_interaction

False

In [15]:
(7, 309823) in check_interaction

True

In [16]:
interactions_df.loc[0]

Unnamed: 0,UserID,JobID,label
0,7,309823,1
0,13,821691,1


In [17]:
def get_label(u_id, j_id):
    check_interaction = [(u_id, j_id) for (u_id, j_id) in zip(interactions_df.UserID, interactions_df.JobID)]
    if (u_id, j_id) in check_interaction:
        label_result = interactions_df[(interactions_df.UserID == u_id) & (interactions_df.JobID == j_id)].reset_index().loc[0]['label']
    else:
        label_result = 0
    return label_result

In [18]:
get_label(u_id = test_uid, j_id=test_jid)

0

In [None]:
# %%time
# ranking_data['label'] = ranking_data.apply(lambda x: get_label(x.UserID, x.JobID), axis=1)

In [19]:
ranking_interaction = [(u_id, j_id) for (u_id, j_id) in zip(ranking_data.UserID, ranking_data.JobID)]

In [20]:
len(ranking_interaction)

371600

In [28]:
%%time
ranking_label =  []
# mark_stop = 0 
for item in ranking_interaction:
    if item in check_interaction:
        label_result = interactions_df[(interactions_df.UserID == item[0]) & (interactions_df.JobID == item[1])].reset_index().loc[0]['label']
        check_interaction.remove(item)
    else:
        label_result = 0
    ranking_label.append(label_result)
#     mark_stop +=1
#     if mark_stop ==3:
#         break
        

CPU times: user 1h 10min 23s, sys: 31.6 s, total: 1h 10min 54s
Wall time: 1h 12min 53s


In [29]:
len(ranking_label)

371600

In [36]:
evaluation_data = ranking_data.copy()

In [37]:
evaluation_data['label'] = ranking_label

In [38]:
evaluation_data.head()

Unnamed: 0,UserID,JobID,City,State,Country,label
0,13,824471,0.0,0.0,1.0,0
1,13,7415,0.0,0.0,1.0,0
2,13,759423,0.0,0.0,1.0,0
3,13,729658,0.0,0.0,1.0,0
4,13,976335,0.0,0.0,1.0,0


In [39]:
cols = ['UserID', 'JobID', 'label', 'City', 'State', 'Country']
evaluation_data = evaluation_data[cols]

In [40]:
evaluation_data

Unnamed: 0,UserID,JobID,label,City,State,Country
0,13,824471,0,0.0,0.0,1.0
1,13,7415,0,0.0,0.0,1.0
2,13,759423,0,0.0,0.0,1.0
3,13,729658,0,0.0,0.0,1.0
4,13,976335,0,0.0,0.0,1.0
...,...,...,...,...,...,...
371595,1471988,324597,0,0.0,0.0,1.0
371596,1471988,663219,0,0.0,0.0,1.0
371597,1471988,1046491,0,0.0,0.0,1.0
371598,1471988,226330,0,0.0,0.0,1.0


In [41]:
evaluation_data.to_csv('./nb_ranking_data/knn_tfidf_ranking_data_v2.csv', header = True, index = False)