## Build ranking dataset for topN recommendation - PART 1
## Source: TF-IDF features (tabular)
## Method: Knn (nearest neighbor), metrics: cosine similarity

- Result: knn_tfidf_ranking_data.csv
    - UserID
    - ItemID
    - City, State, Country

- Process:
    - Use tabular dataset to build KNN models with 100 neighbors (trained on X_train)
    - Get 100 neighbors for each test application (total: 15,776 applications)
    - Reduce duplication of (UserId, ItemID, cosine distance) in the final result
    - Sorted result based on cosine distance
    - Group result by UserID, retain only first 100 JobID for a User
    - Generate City, State, Country matching

In [127]:
import caffeine
caffeine.on(display=False)

In [128]:
import warnings
warnings.filterwarnings('ignore')

In [1]:
# Run this for reproduce
# Gets the current working directory
import os
cwd = os.getcwd()
print("Working directory:", cwd)
# Go up one directory from working directory
os.chdir("..")

Working directory: /Users/anhtth/Library/CloudStorage/OneDrive-UniversityofTwente/2023 UT- THESIS/1-Code/0.cb12_main/nb_recsys_tabular


In [2]:
import pandas as pd
import numpy as np

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_distances

In [4]:
# Load input: train_data_flat, test_data_flat
train_data_flat = pd.read_csv('./xai_posthoc/train_data_flat.csv')
test_data_flat = pd.read_csv('./xai_posthoc/test_data_flat.csv')

In [5]:
train_data_flat.head()

Unnamed: 0,UserID,JobID,label,City,State,Country,DegreeType,WorkHistoryCount,TotalYearsExperience,CurrentlyEmployed,...,job_matrix_90,job_matrix_91,job_matrix_92,job_matrix_93,job_matrix_94,job_matrix_95,job_matrix_96,job_matrix_97,job_matrix_98,job_matrix_99
0,7,309823,1,1.0,1.0,1.0,1,2,13.0,1,...,0.0,0.417426,0.0,0.0,0.065994,0.070672,0.101036,0.073804,0.0,0.055815
1,7,703889,1,1.0,1.0,1.0,1,2,13.0,1,...,0.0,0.105243,0.0,0.0,0.116469,0.0,0.044579,0.130255,0.0,0.049252
2,7,566574,0,0.0,0.0,1.0,1,2,13.0,1,...,0.0,0.052103,0.07757,0.0,0.0,0.0,0.264839,0.0,0.0,0.048767
3,7,481216,0,0.0,0.0,1.0,1,2,13.0,1,...,0.0,0.0,0.125297,0.204315,0.0,0.0,0.0,0.156244,0.0,0.078773
4,9,809208,1,1.0,1.0,1.0,1,3,3.0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.240853,0.0,0.0,0.266103


In [6]:
# Load input: train_data_extended, test_data_extended
train_data_extended = pd.read_csv('./xai_posthoc/train_data_extended.csv')
test_data_extended = pd.read_csv('./xai_posthoc/test_data_extended.csv')

In [7]:
train_data_extended.head()
test_data_extended.head()

Unnamed: 0,UserID,JobID,label,City,State,Country,DegreeType,WorkHistoryCount,TotalYearsExperience,CurrentlyEmployed,ManagedOthers,ManagedHowMany,u_idx,work_history_matrix,j_idx,job_matrix,mean_work_history_matrix,mean_job_matrix
0,13,821691,1,0.0,1.0,1.0,4,6,5.0,1,0,0,112440,[[0. 0. 0. 0.45926196 ...,905490,[[0. 0. 0. 0. ...,0.033316,0.033698
1,13,329572,0,0.0,0.0,1.0,4,6,5.0,1,0,0,112440,[[0. 0. 0. 0.45926196 ...,854303,[[0. 0. 0. 0. ...,0.033316,0.037174
2,514,131166,1,0.0,0.0,1.0,5,4,5.0,0,0,0,131256,[[0. 0. 0. 0. ...,949428,[[0.15860748 0. 0. 0. ...,0.028284,0.041802
3,514,620304,0,0.0,0.0,1.0,5,4,5.0,0,0,0,131256,[[0. 0. 0. 0. ...,341134,[[0.12856694 0. 0. 0.08879664 ...,0.028284,0.048736
4,681,654542,1,0.0,1.0,1.0,1,2,4.0,0,0,0,28761,[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0....,345495,[[0. 0.24413428 0. 0. ...,0.02,0.04394


In [8]:
train_data_extended.columns

Index(['UserID', 'JobID', 'label', 'City', 'State', 'Country', 'DegreeType',
       'WorkHistoryCount', 'TotalYearsExperience', 'CurrentlyEmployed',
       'ManagedOthers', 'ManagedHowMany', 'u_idx', 'work_history_matrix',
       'j_idx', 'job_matrix', 'mean_work_history_matrix', 'mean_job_matrix'],
      dtype='object')

## UDF function to convert string matrix to list

In [9]:
test_str = train_data_extended.work_history_matrix.loc[0]

In [10]:
test_str

'[[0.         0.         0.         0.         0.         0.\n  0.         0.         0.         0.         0.         0.\n  0.         0.         0.         0.         0.         0.\n  0.         0.         0.         0.         0.         0.\n  0.         0.         0.467288   0.         0.         0.\n  0.69007561 0.         0.         0.         0.         0.\n  0.         0.         0.55266407 0.         0.         0.\n  0.         0.         0.         0.         0.         0.\n  0.         0.        ]]'

In [11]:
replace_str = test_str.replace('\n', '').replace('[','') \
              .replace(']','') \
              .split()

In [12]:
replace_numpy = [float(i) for i in replace_str]

In [13]:
type(replace_numpy)

list

In [14]:
def convert_matrix(matrix_str):
    replace_str = matrix_str.replace('\n', '').replace('[','') \
              .replace(']','') \
              .split()
    new_list = [float(i) for i in replace_str]
    result = np.array(new_list)
    return result

In [15]:
convert_matrix(test_str)

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.467288  , 0.        , 0.        , 0.        ,
       0.69007561, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.55266407, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ])

## Apply UDF to work_history_matrix, job_matrix

In [16]:
%%time
clean_work_matrix = train_data_extended.work_history_matrix.apply(convert_matrix)

CPU times: user 6.25 s, sys: 115 ms, total: 6.36 s
Wall time: 6.42 s


In [17]:
%%time
clean_job_matrix = train_data_extended.job_matrix.apply(convert_matrix)

CPU times: user 13.4 s, sys: 202 ms, total: 13.6 s
Wall time: 13.6 s


In [18]:
len(clean_work_matrix)

563889

In [19]:
len(clean_work_matrix[10])

50

In [20]:
len(clean_job_matrix)

563889

In [21]:
len(clean_job_matrix[10])

100

In [22]:
train_data_extended['work_history_matrix'] = clean_work_matrix

In [23]:
train_data_extended['job_matrix'] = clean_job_matrix

In [24]:
train_data_extended.work_history_matrix.head()

0    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
2    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
3    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
4    [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...
Name: work_history_matrix, dtype: object

In [25]:
%%time

clean_work_matrix = test_data_extended.work_history_matrix.apply(convert_matrix)
clean_job_matrix = test_data_extended.job_matrix.apply(convert_matrix)
test_data_extended['work_history_matrix'] = clean_work_matrix
test_data_extended['job_matrix'] = clean_job_matrix

CPU times: user 550 ms, sys: 6.86 ms, total: 556 ms
Wall time: 557 ms


## Build feature vector by stacking numeric features and TF-IDF vectors

In [26]:
numeric_cols = ['City', 'State', 'Country', 'DegreeType',
       'WorkHistoryCount', 'TotalYearsExperience', 'CurrentlyEmployed',
       'ManagedOthers', 'ManagedHowMany']

In [27]:
tfidf_cols = ['job_matrix', 'work_history_matrix']

In [28]:
# numeric_features = train_data_extended[numeric_cols]
# numeric_features

In [29]:
# tfidf_features = train_data_extended[tfidf_cols]

In [30]:
# len(combined_features)

In [31]:
# combined_features[0]

In [32]:
# len(combined_features[0])

In [33]:
drop_cols = ['UserID','JobID', 'label','u_idx','j_idx', 
                  'mean_work_history_matrix', 'mean_job_matrix']
X_train = train_data_extended.drop(columns = drop_cols, axis = 1)
X_test = test_data_extended.drop(columns = drop_cols, axis = 1)

In [34]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 563889 entries, 0 to 563888
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   City                  563889 non-null  float64
 1   State                 563889 non-null  float64
 2   Country               563889 non-null  float64
 3   DegreeType            563889 non-null  int64  
 4   WorkHistoryCount      563889 non-null  int64  
 5   TotalYearsExperience  563889 non-null  float64
 6   CurrentlyEmployed     563889 non-null  int64  
 7   ManagedOthers         563889 non-null  int64  
 8   ManagedHowMany        563889 non-null  int64  
 9   work_history_matrix   563889 non-null  object 
 10  job_matrix            563889 non-null  object 
dtypes: float64(4), int64(5), object(2)
memory usage: 47.3+ MB


In [35]:
type(X_train.work_history_matrix.values)

numpy.ndarray

In [36]:
# Assuming you have a pandas DataFrame called 'data' containing the combined features

# Step 1: Convert 'work_history_matrix' and 'job_matrix' columns to dense TF-IDF matrices
work_history_matrix = np.array(X_train['work_history_matrix'].tolist())
job_matrix = np.array(X_train['job_matrix'].tolist())

# Step 2: Combine numeric features and TF-IDF matrices
numeric_features = X_train.drop(['work_history_matrix', 'job_matrix'], axis=1)
combined_features = np.hstack((numeric_features.values, work_history_matrix, job_matrix))

In [37]:
len(combined_features)

563889

In [38]:
combined_features[0]

array([ 1.        ,  1.        ,  1.        ,  1.        ,  2.        ,
       13.        ,  1.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.467288  ,  0.        ,  0.        ,  0.        ,  0.69007561,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.55266407,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.18734094,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.  

In [39]:
y_train = np.load("./xai_posthoc/y_train_tabular.npy")
y_test = np.load("./xai_posthoc/y_test_tabular.npy")

In [40]:
# Step 3: Create an instance of the NearestNeighbors model
knn = NearestNeighbors(metric='cosine', n_neighbors = 100)  # Set the distance metric, e.g., 'euclidean' or 'cosine'

In [41]:
# Step 4: Fit the model to the combined feature matrix
knn.fit(combined_features, y_train)

## Get 100 similar items for one test data
- 1 Test application: 1 row in X_test
- 100 similar transaction are derived from X_train using knn models
- Return a dictionary: 
    - key: user_id
    - value: tupple of (job_id, label of that transaction)
- For topN recommendation we only need list of 100 jobs 

In [42]:
X_test.loc[0]

City                                                                  0.0
State                                                                 1.0
Country                                                               1.0
DegreeType                                                              4
WorkHistoryCount                                                        6
TotalYearsExperience                                                  5.0
CurrentlyEmployed                                                       1
ManagedOthers                                                           0
ManagedHowMany                                                          0
work_history_matrix     [0.0, 0.0, 0.0, 0.45926196, 0.0, 0.0, 0.0, 0.0...
job_matrix              [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.12754311, 0.0...
Name: 0, dtype: object

In [43]:
# Step 5: Query for similar items
numeric_query = X_test[numeric_cols].loc[0]  # Example numeric query


In [44]:
numeric_query

City                    0.0
State                   1.0
Country                 1.0
DegreeType              4.0
WorkHistoryCount        6.0
TotalYearsExperience    5.0
CurrentlyEmployed       1.0
ManagedOthers           0.0
ManagedHowMany          0.0
Name: 0, dtype: float64

In [45]:
work_history_query = X_test['work_history_matrix'].loc[0]

In [46]:
work_history_query 

array([0.        , 0.        , 0.        , 0.45926196, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.77823495, 0.        ,
       0.        , 0.4282859 , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ])

In [47]:
job_query = X_test['job_matrix'].loc[0]

In [48]:
# Reshape the 1D arrays to 2D arrays
numeric_query = np.reshape(numeric_query, (1, -1))
work_history_query = np.reshape(work_history_query, (1, -1))
job_query = np.reshape(job_query, (1, -1))

query = np.hstack((numeric_query, work_history_query, job_query))
distances, indices = knn.kneighbors(query, n_neighbors=100)

In [50]:
distances

array([[0.0091867 , 0.01005865, 0.0108206 , 0.01136098, 0.01215592,
        0.01223885, 0.01228455, 0.01319291, 0.0133877 , 0.01349083,
        0.01376926, 0.01438585, 0.01462858, 0.01463986, 0.01466234,
        0.01476294, 0.0149151 , 0.0149296 , 0.01494694, 0.01496132,
        0.0150405 , 0.01505649, 0.01506081, 0.01512025, 0.01515492,
        0.01518929, 0.01522716, 0.01527963, 0.01527986, 0.01529073,
        0.01544319, 0.01547978, 0.0155155 , 0.01551824, 0.01557892,
        0.01566057, 0.01568255, 0.01568321, 0.01574798, 0.01575859,
        0.01580091, 0.01580308, 0.01581454, 0.01586965, 0.01597117,
        0.01598009, 0.01602946, 0.01603485, 0.01607026, 0.01607709,
        0.01608691, 0.01609292, 0.01610928, 0.01617589, 0.01618043,
        0.01620676, 0.01620973, 0.01621164, 0.01622227, 0.01625306,
        0.01626124, 0.01634722, 0.01634841, 0.01634867, 0.01636237,
        0.01638019, 0.01639921, 0.0164041 , 0.01641863, 0.01643086,
        0.01643327, 0.01649157, 0.01649871, 0.01

In [51]:
indices

array([[166619, 359909, 359907, 359908, 352856, 305167,  31744, 527596,
        281643, 334791, 416068, 387207,   4766, 174102, 352857, 319832,
        235677, 319829, 386311, 235679, 386312,  90221, 299609, 309726,
        145155, 319831, 307817,   8706, 421893, 242259, 531729, 493853,
        195033, 319830, 387208, 235678, 441187, 145322, 154571, 301137,
        441190, 134174, 129055, 512174, 216788, 134176, 481961, 206018,
         25402, 145321, 412709, 511491, 336205,  15248,  38642, 415757,
        170322, 233315, 111411,  15246, 293972, 366437, 206019, 210711,
        216787, 154572,  15252, 326794, 500827, 134173, 156792, 210714,
        184555,  10544, 349236, 153839, 492119,  63466, 349235, 117407,
         95322, 111412, 337127, 324569, 441188, 427149, 255825,  95324,
          2237, 195034, 527593, 303107, 366436, 493854, 352855,   6000,
        416067, 206020, 532667, 363529]])

In [52]:
indices[0]

array([166619, 359909, 359907, 359908, 352856, 305167,  31744, 527596,
       281643, 334791, 416068, 387207,   4766, 174102, 352857, 319832,
       235677, 319829, 386311, 235679, 386312,  90221, 299609, 309726,
       145155, 319831, 307817,   8706, 421893, 242259, 531729, 493853,
       195033, 319830, 387208, 235678, 441187, 145322, 154571, 301137,
       441190, 134174, 129055, 512174, 216788, 134176, 481961, 206018,
        25402, 145321, 412709, 511491, 336205,  15248,  38642, 415757,
       170322, 233315, 111411,  15246, 293972, 366437, 206019, 210711,
       216787, 154572,  15252, 326794, 500827, 134173, 156792, 210714,
       184555,  10544, 349236, 153839, 492119,  63466, 349235, 117407,
        95322, 111412, 337127, 324569, 441188, 427149, 255825,  95324,
         2237, 195034, 527593, 303107, 366436, 493854, 352855,   6000,
       416067, 206020, 532667, 363529])

In [53]:
# Retrieve the actual items using the indices (Retrieve from X_train)
knn_items = X_train.iloc[indices[0]]
# Print the retrieved items
knn_items.head()

Unnamed: 0,City,State,Country,DegreeType,WorkHistoryCount,TotalYearsExperience,CurrentlyEmployed,ManagedOthers,ManagedHowMany,work_history_matrix,job_matrix
166619,0.0,1.0,1.0,4,6,5.0,1,0,0,"[0.0, 0.0, 0.0, 0.45926196, 0.0, 0.0, 0.0, 0.0...","[0.25631867, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0..."
359909,0.0,1.0,1.0,4,6,5.0,1,0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.06730579, 0.05464249, 0.1050..."
359907,0.0,1.0,1.0,4,6,5.0,1,0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.10106707, 0.0, 0.16979964, 0.0815..."
359908,0.0,1.0,1.0,4,6,5.0,1,0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.05994761, 0.0, 0.08002949, 0.16561477, 0.06..."
352856,0.0,1.0,1.0,4,6,5.0,1,0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.05006725, 0.06774129, 0.0, 0.0, 0.0, 0.0, 0..."


In [54]:
# Retrieve the actual items using the indices (Retrieve from data_train)
knn_items_detail = train_data_extended.iloc[indices[0]]

In [81]:
knn_items_detail.head()

Unnamed: 0,UserID,JobID,label,City,State,Country,DegreeType,WorkHistoryCount,TotalYearsExperience,CurrentlyEmployed,ManagedOthers,ManagedHowMany,u_idx,work_history_matrix,j_idx,job_matrix,mean_work_history_matrix,mean_job_matrix
166619,444209,824471,1,0.0,1.0,1.0,4,6,5.0,1,0,0,118297,"[0.0, 0.0, 0.0, 0.45926196, 0.0, 0.0, 0.0, 0.0...",905769,"[0.25631867, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...",0.033316,0.051927
359909,946589,7415,0,0.0,1.0,1.0,4,6,5.0,1,0,0,145529,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",404990,"[0.0, 0.0, 0.0, 0.06730579, 0.05464249, 0.1050...",0.027165,0.061986
359907,946589,759423,1,0.0,1.0,1.0,4,6,5.0,1,0,0,145529,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1014218,"[0.0, 0.0, 0.10106707, 0.0, 0.16979964, 0.0815...",0.027165,0.043394
359908,946589,729658,1,0.0,1.0,1.0,4,6,5.0,1,0,0,145529,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1010769,"[0.05994761, 0.0, 0.08002949, 0.16561477, 0.06...",0.027165,0.059429
352856,929065,11852,1,0.0,1.0,1.0,4,6,5.0,1,0,0,83635,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",540631,"[0.05006725, 0.06774129, 0.0, 0.0, 0.0, 0.0, 0...",0.0,0.051566


In [67]:
len(knn_items_detail)

100

In [55]:
knn_items_detail.UserID.values

array([ 444209,  946589,  946589,  946589,  929065,  806599,   86729,
       1376600,  747316,  881965, 1087617, 1015529,   13411,  464077,
        929065,  844665,  629299,  844665, 1013302,  629299, 1013302,
        240691,  791693,  818116,  386704,  844665,  813286,   24188,
       1103627,  646298, 1387500, 1288029,  519668,  844665, 1015529,
        629299, 1152182,  387150,  412618,  795779, 1152182,  357243,
        343248, 1336072,  578269,  357243, 1256859,  549380,   69549,
        387150, 1078721, 1334208,  885507,   42466,  106491, 1086929,
        454274,  622810,  295971,   42466,  777687,  962847,  549380,
        561350,  578269,  412618,   42466,  861592, 1306651,  357243,
        418157,  561350,  491871,   29512,  919848,  410713, 1283477,
        171296,  919848,  312028,  253826,  295971,  888054,  856452,
       1152182, 1116969,  680983,  253826,    6622,  519668, 1376600,
        800966,  962847, 1288029,  929065,   16740, 1087617,  549380,
       1389869,  955

In [77]:
knn_items_detail.UserID.duplicated().sum()

31

In [71]:
len(knn_items_detail.UserID)

100

In [82]:
knn_items_detail[['UserID', 'JobID']].duplicated().sum()

0

In [72]:
# This may result in less than 100 items in knn_result if there is duplication in userid (In this case result is 69)
knn_result = {}
count = 0
for i in range(len(knn_items_detail.UserID)):
    rec_user = knn_items_detail.UserID.values[i]
    rec_item = knn_items_detail.JobID.values[i]
    rec_label = knn_items_detail.label.values[i]
    #print(rec_item, rec_label)
    zip_value = (rec_item, rec_label)
    knn_result[rec_user] = zip_value
    count +=1

In [73]:
print(count)

100


In [74]:
len(knn_result)

69

In [58]:
# gel all items for this user
rec_jobid = [i[0] for i in knn_result.values()]

In [59]:
test_data_extended.loc[0]

UserID                                                                     13
JobID                                                                  821691
label                                                                       1
City                                                                      0.0
State                                                                     1.0
Country                                                                   1.0
DegreeType                                                                  4
WorkHistoryCount                                                            6
TotalYearsExperience                                                      5.0
CurrentlyEmployed                                                           1
ManagedOthers                                                               0
ManagedHowMany                                                              0
u_idx                                                           

In [83]:
knn_result_df = knn_items_detail[['UserID','JobID','label','City','State','Country']]

In [85]:
knn_result_df['cosine_distance'] = distances[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  knn_result_df['cosine_distance'] = distances[0]


In [86]:
knn_result_df

Unnamed: 0,UserID,JobID,label,City,State,Country,cosine_distance
166619,444209,824471,1,0.0,1.0,1.0,0.009187
359909,946589,7415,0,0.0,1.0,1.0,0.010059
359907,946589,759423,1,0.0,1.0,1.0,0.010821
359908,946589,729658,1,0.0,1.0,1.0,0.011361
352856,929065,11852,1,0.0,1.0,1.0,0.012156
...,...,...,...,...,...,...,...
6000,16740,773803,1,0.0,1.0,1.0,0.016983
416067,1087617,1090870,1,0.0,1.0,1.0,0.016998
206020,549380,935186,1,0.0,1.0,1.0,0.017010
532667,1389869,286088,1,0.0,1.0,1.0,0.017082


In [87]:
def zipmap(df): return list(zip(*map(df.get, df)))

In [90]:
knn_result_list = zipmap(knn_result_df)

In [91]:
len(knn_result_list)

100

In [100]:
# knn_indices and knn_distances are 1D array from the result of knn_neighbors
def get_knn_results(knn_indices, knn_distances, select_cols): 
    # Retrieve the actual items using the indices (Retrieve from data_train)
    knn_items_detail = train_data_extended.iloc[knn_indices]
    
    # Select columns to be included in the result: Example: ['JobID']
    knn_result_df = knn_items_detail[select_cols]
    knn_result_df['cosine_distance'] = knn_distances
    
    def zipmap(df): return list(zip(*map(df.get, df)))
    
    knn_result_list = zipmap(knn_result_df)
    return knn_result_list

In [119]:
# Get only jobID
check_knn_list = get_knn_results(knn_indices = indices[0],
                                 knn_distances = distances[0],
                select_cols = ['JobID'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  knn_result_df['cosine_distance'] = knn_distances


In [103]:
len(check_knn_list)

100

In [125]:
check_knn_list

[(824471, 0.009186704246106236),
 (7415, 0.010058654315184823),
 (759423, 0.010820599504993256),
 (729658, 0.01136097770215072),
 (11852, 0.012155924188430145),
 (677614, 0.01223884716537138),
 (586140, 0.012284545052840823),
 (641999, 0.013192911006157049),
 (241804, 0.013387699446543722),
 (460628, 0.013490833802810798),
 (445769, 0.013769260442408227),
 (1100319, 0.014385848736315321),
 (248446, 0.014628580764413979),
 (317288, 0.014639863196743264),
 (411167, 0.01466234421867263),
 (261063, 0.01476293717053645),
 (881785, 0.014915104616859076),
 (334428, 0.014929598109359898),
 (601438, 0.014946941470374453),
 (1083803, 0.014961319060332379),
 (453088, 0.015040496887263277),
 (610480, 0.015056492153392775),
 (872594, 0.015060807517992103),
 (339239, 0.01512025055728472),
 (367918, 0.015154924832629924),
 (449205, 0.015189287083262504),
 (504586, 0.015227159609435548),
 (789664, 0.015279630451044302),
 (355903, 0.01527985502132656),
 (1061269, 0.015290733043243465),
 (848303, 0.0154

## Get 100 neighbors all test interaction data

In [104]:
# Assuming you have a pandas DataFrame called 'data' containing the combined features

# Step 1: Convert 'work_history_matrix' and 'job_matrix' columns to dense TF-IDF matrices
work_history_matrix = np.array(X_test['work_history_matrix'].tolist())
job_matrix = np.array(X_test['job_matrix'].tolist())

# Step 2: Combine numeric features and TF-IDF matrices
numeric_features_test = X_test.drop(['work_history_matrix', 'job_matrix'], axis=1)
combined_features_test = np.hstack((numeric_features_test.values, work_history_matrix, job_matrix))

In [105]:
%%time
distances_test, indices_test = knn.kneighbors(combined_features_test, 
                                              n_neighbors=100)

CPU times: user 4min 17s, sys: 1min 22s, total: 5min 39s
Wall time: 3min 26s


In [107]:
len(indices_test), len(distances_test)

(15736, 15736)

In [108]:
knn_test_results = test_data_extended[['UserID']]

In [109]:
knn_test_results

Unnamed: 0,UserID
0,13
1,13
2,514
3,514
4,681
...,...
15731,1471251
15732,1471251
15733,1471251
15734,1471988


In [111]:
knn_test_results.loc[0]

UserID    13
Name: 0, dtype: int64

In [112]:
indices_test[0]

array([166619, 359909, 359907, 359908, 352856, 305167,  31744, 527596,
       281643, 334791, 416068, 387207,   4766, 174102, 352857, 319832,
       235677, 319829, 386311, 235679, 386312,  90221, 299609, 309726,
       145155, 319831, 307817,   8706, 421893, 242259, 531729, 493853,
       195033, 319830, 387208, 235678, 441187, 145322, 154571, 301137,
       441190, 134174, 129055, 512174, 216788, 134176, 481961, 206018,
        25402, 145321, 412709, 511491, 336205,  15248,  38642, 415757,
       170322, 233315, 111411,  15246, 293972, 366437, 206019, 210711,
       216787, 154572,  15252, 326794, 500827, 134173, 156792, 210714,
       184555,  10544, 349236, 153839, 492119,  63466, 349235, 117407,
        95322, 111412, 337127, 324569, 441188, 427149, 255825,  95324,
         2237, 195034, 527593, 303107, 366436, 493854, 352855,   6000,
       416067, 206020, 532667, 363529])

In [114]:
distances_test[0]

array([0.0091867 , 0.01005865, 0.0108206 , 0.01136098, 0.01215592,
       0.01223885, 0.01228455, 0.01319291, 0.0133877 , 0.01349083,
       0.01376926, 0.01438585, 0.01462858, 0.01463986, 0.01466234,
       0.01476294, 0.0149151 , 0.0149296 , 0.01494694, 0.01496132,
       0.0150405 , 0.01505649, 0.01506081, 0.01512025, 0.01515492,
       0.01518929, 0.01522716, 0.01527963, 0.01527986, 0.01529073,
       0.01544319, 0.01547978, 0.0155155 , 0.01551824, 0.01557892,
       0.01566057, 0.01568255, 0.01568321, 0.01574798, 0.01575859,
       0.01580091, 0.01580308, 0.01581454, 0.01586965, 0.01597117,
       0.01598009, 0.01602946, 0.01603485, 0.01607026, 0.01607709,
       0.01608691, 0.01609292, 0.01610928, 0.01617589, 0.01618043,
       0.01620676, 0.01620973, 0.01621164, 0.01622227, 0.01625306,
       0.01626124, 0.01634722, 0.01634841, 0.01634867, 0.01636237,
       0.01638019, 0.01639921, 0.0164041 , 0.01641863, 0.01643086,
       0.01643327, 0.01649157, 0.01649871, 0.01651707, 0.01653

In [118]:
verify_knn_list = get_knn_results(knn_indices = indices_test[0], knn_distances = distances_test[0],
                select_cols = ['JobID'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  knn_result_df['cosine_distance'] = knn_distances


In [121]:
len(verify_knn_list)

100

In [122]:
len(check_knn_list)

100

In [123]:
verify_knn_list

[(824471, 0.009186704246106236),
 (7415, 0.010058654315184712),
 (759423, 0.010820599504993589),
 (729658, 0.011360977702150832),
 (11852, 0.012155924188430256),
 (677614, 0.012238847165371158),
 (586140, 0.012284545052841045),
 (641999, 0.013192911006156938),
 (241804, 0.013387699446543833),
 (460628, 0.013490833802810798),
 (445769, 0.013769260442408116),
 (1100319, 0.01438584873631521),
 (248446, 0.014628580764413535),
 (317288, 0.014639863196743041),
 (411167, 0.01466234421867263),
 (261063, 0.01476293717053645),
 (881785, 0.014915104616859076),
 (334428, 0.01492959810936001),
 (601438, 0.014946941470374453),
 (1083803, 0.014961319060332268),
 (453088, 0.015040496887263166),
 (610480, 0.015056492153392664),
 (872594, 0.015060807517991992),
 (339239, 0.01512025055728472),
 (367918, 0.015154924832629812),
 (449205, 0.015189287083262504),
 (504586, 0.015227159609435659),
 (789664, 0.015279630451044413),
 (355903, 0.015279855021326894),
 (1061269, 0.015290733043243354),
 (848303, 0.015

In [129]:
%%time
knn_test_details = []
for i in range(len(knn_test_results)):
    result_list = get_knn_results(knn_indices = indices_test[i], 
                                  knn_distances = distances_test[i],
                select_cols = ['JobID'])
    knn_test_details.append(result_list)

CPU times: user 12.8 s, sys: 204 ms, total: 13 s
Wall time: 13 s


In [130]:
len(knn_test_details)

15736

In [132]:
len(knn_test_details[0])

100

In [133]:
knn_test_results['detail'] = knn_test_details

In [134]:
knn_test_results.head()

Unnamed: 0,UserID,detail
0,13,"[(824471, 0.009186704246106236), (7415, 0.0100..."
1,13,"[(976335, 0.011466280723388289), (857891, 0.01..."
2,514,"[(308083, 0.01556519949295787), (389827, 0.015..."
3,514,"[(266863, 0.014065778940661411), (1026333, 0.0..."
4,681,"[(54495, 0.023697970364968235), (54508, 0.0241..."


In [153]:
knn_test_explode = knn_test_results.explode(column='detail', ignore_index=False)

In [154]:
len(knn_test_explode)

1573600

In [155]:
knn_test_explode.head()

Unnamed: 0,UserID,detail
0,13,"(824471, 0.009186704246106236)"
0,13,"(7415, 0.010058654315184712)"
0,13,"(759423, 0.010820599504993589)"
0,13,"(729658, 0.011360977702150832)"
0,13,"(11852, 0.012155924188430256)"


In [156]:
knn_test_explode['JobID'] = knn_test_explode.apply(lambda x: x.detail[0], axis=1)

In [157]:
knn_test_explode.head()

Unnamed: 0,UserID,detail,JobID
0,13,"(824471, 0.009186704246106236)",824471
0,13,"(7415, 0.010058654315184712)",7415
0,13,"(759423, 0.010820599504993589)",759423
0,13,"(729658, 0.011360977702150832)",729658
0,13,"(11852, 0.012155924188430256)",11852


In [158]:
knn_test_explode['cosine_distance'] = knn_test_explode.apply(lambda x: x.detail[1], axis=1)

In [159]:
knn_test_explode.head()

Unnamed: 0,UserID,detail,JobID,cosine_distance
0,13,"(824471, 0.009186704246106236)",824471,0.009187
0,13,"(7415, 0.010058654315184712)",7415,0.010059
0,13,"(759423, 0.010820599504993589)",759423,0.010821
0,13,"(729658, 0.011360977702150832)",729658,0.011361
0,13,"(11852, 0.012155924188430256)",11852,0.012156


In [160]:
mask = knn_test_explode[['UserID','JobID','cosine_distance']].duplicated()
mask

0        False
0        False
0        False
0        False
0        False
         ...  
15735    False
15735    False
15735    False
15735    False
15735    False
Length: 1573600, dtype: bool

In [161]:
# drop duplicate value
knn_test_explode = knn_test_explode[mask==False]

In [162]:
len(knn_test_explode)

1571951

In [163]:
knn_test_explode[['UserID','JobID','cosine_distance']].duplicated().sum()

0

In [167]:
# Sorted values
knn_test_sort = knn_test_explode.groupby('UserID').apply(lambda x: x.sort_values('cosine_distance'))

In [168]:
knn_test_sort.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,UserID,detail,JobID,cosine_distance
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
13,0,13,"(824471, 0.009186704246106236)",824471,0.009187
13,0,13,"(7415, 0.010058654315184712)",7415,0.010059
13,0,13,"(759423, 0.010820599504993589)",759423,0.010821
13,0,13,"(729658, 0.011360977702150832)",729658,0.011361
13,1,13,"(976335, 0.011466280723388289)",976335,0.011466


In [171]:
knn_test_sort[knn_test_sort.UserID==13][:100]

Unnamed: 0_level_0,Unnamed: 1_level_0,UserID,detail,JobID,cosine_distance
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
13,0,13,"(824471, 0.009186704246106236)",824471,0.009187
13,0,13,"(7415, 0.010058654315184712)",7415,0.010059
13,0,13,"(759423, 0.010820599504993589)",759423,0.010821
13,0,13,"(729658, 0.011360977702150832)",729658,0.011361
13,1,13,"(976335, 0.011466280723388289)",976335,0.011466
13,...,...,...,...,...
13,1,13,"(824471, 0.01577473865264667)",824471,0.015775
13,1,13,"(367372, 0.015775473587694933)",367372,0.015775
13,0,13,"(259223, 0.015800905716400027)",259223,0.015801
13,0,13,"(218263, 0.015803079762136907)",218263,0.015803


In [174]:
type(knn_test_sort)

pandas.core.frame.DataFrame

In [176]:
knn_test = knn_test_sort.reset_index(drop=True)

In [177]:
knn_test

Unnamed: 0,UserID,detail,JobID,cosine_distance
0,13,"(824471, 0.009186704246106236)",824471,0.009187
1,13,"(7415, 0.010058654315184712)",7415,0.010059
2,13,"(759423, 0.010820599504993589)",759423,0.010821
3,13,"(729658, 0.011360977702150832)",729658,0.011361
4,13,"(976335, 0.011466280723388289)",976335,0.011466
...,...,...,...,...
1571946,1471988,"(1020168, 0.005910744151707115)",1020168,0.005911
1571947,1471988,"(11217, 0.005917366221147358)",11217,0.005917
1571948,1471988,"(461752, 0.005926877222813043)",461752,0.005927
1571949,1471988,"(101103, 0.005928917855883786)",101103,0.005929


## Build ranking data:

In [172]:
ranking_data = pd.DataFrame(columns = ["UserID","JobID"])

In [179]:
%%time
from tqdm import tqdm
user_ids = []
job_ids = []

for idx, group in tqdm(knn_test.groupby('UserID')):
    candidate_jobs = group.JobID.unique().tolist()
    selected_jobs = candidate_jobs[:100]
    job_ids.extend(selected_jobs)
    user_ids.extend([idx] * 100)

ranking_data.UserID = user_ids
ranking_data.JobID = job_ids

100%|███████████████████████████████████| 3716/3716 [00:00<00:00, 7117.78it/s]


CPU times: user 749 ms, sys: 73.6 ms, total: 822 ms
Wall time: 829 ms


In [180]:
ranking_data

Unnamed: 0,UserID,JobID
0,13,824471
1,13,7415
2,13,759423
3,13,729658
4,13,976335
...,...,...
371595,1471988,324597
371596,1471988,663219
371597,1471988,1046491
371598,1471988,226330


## Add 3 binary features for location matching

In [184]:
# Load clean job data
job_set = pd.read_csv("./data_processed/jobset_clean.csv")
# Load the dataset from Step 3
user_set = pd.read_csv("./data_interim/user_set_cleaned.csv")

In [185]:
def get_city_match(user_id, job_id):
    user = user_set[user_set['UserID'] == user_id].iloc[0]
    job = job_set[job_set['JobID'] == job_id].iloc[0]
    return float(user['City'] == job['City'])

In [186]:
def get_state_match(user_id, job_id):
    user = user_set[user_set['UserID'] == user_id].iloc[0]
    job = job_set[job_set['JobID'] == job_id].iloc[0]
    return float(user['State'] == job['State'])

In [187]:
def get_country_match(user_id, job_id):
    user = user_set[user_set['UserID'] == user_id].iloc[0]
    job = job_set[job_set['JobID'] == job_id].iloc[0]
    return float(user['Country'] == job['Country'])

In [188]:
%%time
ranking_data['City'] = ranking_data.apply(lambda x: get_city_match(x.UserID, x.JobID), axis=1)


CPU times: user 13min 4s, sys: 46.1 s, total: 13min 50s
Wall time: 8min 34s


In [189]:
%%time
ranking_data['State'] = ranking_data.apply(lambda x: get_state_match(x.UserID, x.JobID), axis=1)

CPU times: user 17min 10s, sys: 44.1 s, total: 17min 54s
Wall time: 10min 3s


In [190]:
%%time
ranking_data['Country'] = ranking_data.apply(lambda x: get_country_match(x.UserID, x.JobID), axis=1)


CPU times: user 16min 51s, sys: 43.2 s, total: 17min 34s
Wall time: 9min 55s


In [191]:
ranking_data.head()

Unnamed: 0,UserID,JobID,City,State,Country
0,13,824471,0.0,0.0,1.0
1,13,7415,0.0,0.0,1.0
2,13,759423,0.0,0.0,1.0
3,13,729658,0.0,0.0,1.0
4,13,976335,0.0,0.0,1.0


In [192]:
ranking_data.to_csv('./nb_recsys_tabular/ranking_data_knn_tfidf.csv', header=True, index=False)