## Build ranking dataset for topN recommendation
## Source: LDA features (tabular)
## Method: Knn (nearest neighbor), metrics: hamming distance

- Result: ranking_data: 371,600
    - UserID
    - ItemID
    - City, State, Country

- Process:
    - Use tabular dataset to build KNN models with 100 neighbors (trained on X_train)
    - Get 100 neighbors for each test application (total: 15,776 applications)
    - Reduce duplication of (UserId, ItemID, cosine distance) in the final result
    - Sorted result based on cosine distance
    - Group result by UserID, retain only first 100 JobID for a User
    - Apply checking function for UserID, JobID

In [1]:
# import caffeine
# caffeine.on(display=False)

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Run this for reproduce
# Gets the current working directory
import os
cwd = os.getcwd()
print("Working directory:", cwd)
# Go up one directory from working directory
os.chdir("..")

Working directory: /home/jovyan/1_UT THESIS/CB12_MAIN/nb_recsys_ebm


In [4]:
import pandas as pd
import numpy as np

In [5]:
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_distances

In [6]:
# Load input: train_data_extended, test_data_extended
train_data_extended = pd.read_csv('./xai_recsys/train_data_extended.csv')
test_data_extended = pd.read_csv('./xai_recsys/test_data_extended.csv')

In [7]:
train_data_extended.head()
test_data_extended.head()

Unnamed: 0,UserID,JobID,label,City,State,Country,Split,DegreeType,CurrentlyEmployed,ManagedOthers,WorkHistoryTopic,WorkHistoryLevel,SeniorLevel,ReqTopic,DescTopic,TitTopic
0,13,821691,1,0.0,1.0,1.0,Test,4,1,0,0,3,2.0,5,1,0
1,13,329572,0,0.0,0.0,1.0,Test,4,1,0,0,3,2.0,15,18,0
2,514,131166,1,0.0,0.0,1.0,Test,5,0,0,0,2,2.0,18,1,0
3,514,620304,0,0.0,0.0,1.0,Test,5,0,0,0,2,2.0,18,15,0
4,681,654542,1,0.0,1.0,1.0,Test,1,0,0,0,1,2.0,18,18,0


In [8]:
train_data_extended.columns

Index(['UserID', 'JobID', 'label', 'City', 'State', 'Country', 'Split',
       'DegreeType', 'CurrentlyEmployed', 'ManagedOthers', 'WorkHistoryTopic',
       'WorkHistoryLevel', 'SeniorLevel', 'ReqTopic', 'DescTopic', 'TitTopic'],
      dtype='object')

## Get feature vectors

In [9]:
test_data_extended = test_data_extended.dropna()

In [10]:
train_data_extended = train_data_extended.dropna()

In [11]:
X_train = train_data_extended.drop(columns = ['label','Split','UserID', 'JobID'], axis = 1)
X_test = test_data_extended.drop(columns = ['label','Split','UserID', 'JobID'], axis = 1)

In [12]:
y_train = np.load("./xai_posthoc/y_train_tabular.npy")
y_test = np.load("./xai_posthoc/y_test_tabular.npy")

In [13]:
X_train.head()

Unnamed: 0,City,State,Country,DegreeType,CurrentlyEmployed,ManagedOthers,WorkHistoryTopic,WorkHistoryLevel,SeniorLevel,ReqTopic,DescTopic,TitTopic
0,1.0,1.0,1.0,1,1,0,0,1,4.0,18,2,0
1,1.0,1.0,1.0,1,1,0,0,1,4.0,18,2,0
2,0.0,0.0,1.0,1,1,0,0,1,4.0,0,18,0
3,0.0,0.0,1.0,1,1,0,0,1,4.0,18,15,0
4,1.0,1.0,1.0,1,1,0,0,1,1.0,5,18,0


In [14]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 560759 entries, 0 to 563888
Data columns (total 12 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   City               560759 non-null  float64
 1   State              560759 non-null  float64
 2   Country            560759 non-null  float64
 3   DegreeType         560759 non-null  int64  
 4   CurrentlyEmployed  560759 non-null  int64  
 5   ManagedOthers      560759 non-null  int64  
 6   WorkHistoryTopic   560759 non-null  int64  
 7   WorkHistoryLevel   560759 non-null  int64  
 8   SeniorLevel        560759 non-null  float64
 9   ReqTopic           560759 non-null  int64  
 10  DescTopic          560759 non-null  int64  
 11  TitTopic           560759 non-null  int64  
dtypes: float64(4), int64(8)
memory usage: 55.6 MB


## Build KNN models

In [15]:
# Create an instance of the NearestNeighbors model
knn = NearestNeighbors(metric='hamming', n_neighbors = 100)  # Set the distance metric, e.g., 'euclidean' or 'cosine'

In [16]:
# Step 4: Fit the model to the combined feature matrix
knn.fit(X_train, y_train)

## Get 100 similar items for one test data
- 1 Test application: 1 row in X_test
- 100 similar transaction are derived from X_train using knn models
- Return a dictionary: 
    - key: user_id
    - value: tupple of (job_id, label of that transaction)
- For topN recommendation we only need list of 100 jobs 

In [None]:
query = np.reshape(X_test.loc[0], (1, -1))

In [None]:
distances, indices = knn.kneighbors(query, n_neighbors=100)

In [None]:
distances

In [None]:
indices

In [None]:
indices[0]

In [None]:
# Retrieve the actual items using the indices (Retrieve from X_train)
knn_items = X_train.iloc[indices[0]]
# Print the retrieved items
knn_items.head()

In [None]:
# Retrieve the actual items using the indices (Retrieve from data_train)
knn_items_detail = train_data_extended.iloc[indices[0]]

In [None]:
knn_items_detail.head()

In [None]:
len(knn_items_detail)

In [None]:
knn_items_detail.UserID.values

In [None]:
knn_items_detail.UserID.duplicated().sum()

In [None]:
len(knn_items_detail.UserID)

In [None]:
knn_items_detail[['UserID', 'JobID']].duplicated().sum()

In [None]:
# This may result in less than 100 items in knn_result if there is duplication in userid (In this case result is 69)
knn_result = {}
count = 0
for i in range(len(knn_items_detail.UserID)):
    rec_user = knn_items_detail.UserID.values[i]
    rec_item = knn_items_detail.JobID.values[i]
    rec_label = knn_items_detail.label.values[i]
    #print(rec_item, rec_label)
    zip_value = (rec_item, rec_label)
    knn_result[rec_user] = zip_value
    count +=1

In [None]:
print(count)

In [None]:
len(knn_result)

In [None]:
# gel all items for this user
rec_jobid = [i[0] for i in knn_result.values()]

In [None]:
test_data_extended.loc[0]

In [None]:
knn_result_df = knn_items_detail[['UserID','JobID','label','City','State','Country']]

In [None]:
knn_result_df['cosine_distance'] = distances[0]

In [None]:
knn_result_df

In [None]:
def zipmap(df): return list(zip(*map(df.get, df)))

In [None]:
knn_result_list = zipmap(knn_result_df)

In [None]:
len(knn_result_list)

In [25]:
# knn_indices and knn_distances are 1D array from the result of knn_neighbors
def get_knn_results(knn_indices, knn_distances, select_cols): 
    # Retrieve the actual items using the indices (Retrieve from data_train)
    knn_items_detail = train_data_extended.iloc[knn_indices]
    
    # Select columns to be included in the result: Example: ['JobID']
    knn_result_df = knn_items_detail[select_cols]
    knn_result_df['hamming_distance'] = knn_distances
    
    def zipmap(df): return list(zip(*map(df.get, df)))
    
    knn_result_list = zipmap(knn_result_df)
    return knn_result_list

In [None]:
# Get only jobID
check_knn_list = get_knn_results(knn_indices = indices[0],
                                 knn_distances = distances[0],
                select_cols = ['JobID'])

In [None]:
len(check_knn_list)

In [None]:
check_knn_list

## Get 100 neighbors all test interaction data

In [17]:
%%time
distances_test, indices_test = knn.kneighbors(X_test, 
                                              n_neighbors=100)

CPU times: user 5min 41s, sys: 0 ns, total: 5min 41s
Wall time: 5min 41s


In [18]:
len(indices_test), len(distances_test)

(15640, 15640)

In [19]:
knn_test_results = test_data_extended[['UserID']]

In [20]:
knn_test_results

Unnamed: 0,UserID
0,13
1,13
2,514
3,514
4,681
...,...
15731,1471251
15732,1471251
15733,1471251
15734,1471988


In [21]:
knn_test_results.loc[0]

UserID    13
Name: 0, dtype: int64

In [22]:
indices_test[0]

array([ 38224, 483351, 188915, 382179, 145215, 186389, 243615, 350562,
       550264,   5655,  97369,  97018, 128812, 142397, 156946, 179373,
       219919, 231183, 297793, 496712, 511733, 320303,  64236,  62393,
        63935,   9137,  78043,  98251,   1318, 106783, 112977, 117705,
        44452,  63690, 115157, 119661,   8998,  62009,  41038, 110853,
        15497,  41114,  40870,  42638,  54410,  41533,  51977,  63091,
        67308,  67309,   5656,  42468, 150245,  78053,  80031,  90553,
        21635,  43107,    744,   3420,  63850, 113300,   1320,  71350,
       111200, 114338,  98025, 122177,  62538, 119091,  25163,   8550,
        37267,  44752,  47105,  32622,  13727,  19550,  47611,  41592,
        16587,  15982,  24236,  29491,  38699,  39131,  46756,  42378,
        43610,  41385,  55152,  58690,  54878,  56407,  61383,   4242,
        66419,  67307,  64790,  66164])

In [23]:
distances_test[0]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.08333333, 0.08333333, 0.08333333,
       0.08333333, 0.08333333, 0.08333333, 0.08333333, 0.08333333,
       0.08333333, 0.08333333, 0.08333333, 0.08333333, 0.08333333,
       0.08333333, 0.08333333, 0.08333333, 0.08333333, 0.08333333,
       0.08333333, 0.08333333, 0.08333333, 0.08333333, 0.08333333,
       0.08333333, 0.08333333, 0.08333333, 0.08333333, 0.08333333,
       0.08333333, 0.08333333, 0.08333333, 0.08333333, 0.08333333,
       0.08333333, 0.08333333, 0.08333333, 0.08333333, 0.08333333,
       0.08333333, 0.08333333, 0.08333333, 0.08333333, 0.08333333,
       0.08333333, 0.08333333, 0.08333333, 0.08333333, 0.08333333,
       0.08333333, 0.08333333, 0.08333333, 0.08333333, 0.08333

In [26]:
verify_knn_list = get_knn_results(knn_indices = indices_test[0], knn_distances = distances_test[0],
                select_cols = ['JobID'])

In [27]:
len(verify_knn_list)

100

In [29]:
# len(check_knn_list)

In [32]:
verify_knn_list[:5]

[(828955, 0.0), (341276, 0.0), (614701, 0.0), (568507, 0.0), (852123, 0.0)]

In [33]:
%%time
knn_test_details = []
for i in range(len(knn_test_results)):
    result_list = get_knn_results(knn_indices = indices_test[i], 
                                  knn_distances = distances_test[i],
                select_cols = ['JobID', 'label'])
    knn_test_details.append(result_list)

CPU times: user 12.1 s, sys: 279 ms, total: 12.4 s
Wall time: 12 s


In [34]:
len(knn_test_details)

15640

In [35]:
len(knn_test_details[0])

100

In [36]:
knn_test_results['detail'] = knn_test_details

In [37]:
knn_test_results.head()

Unnamed: 0,UserID,detail
0,13,"[(828955, 1, 0.0), (341276, 1, 0.0), (614701, ..."
1,13,"[(755985, 0, 0.0), (265838, 1, 0.0), (602478, ..."
2,514,"[(261968, 0, 0.0), (196371, 1, 0.0), (164289, ..."
3,514,"[(334090, 1, 0.0), (1056249, 0, 0.0), (128458,..."
4,681,"[(23230, 1, 0.0), (156182, 1, 0.0), (261651, 1..."


In [38]:
knn_test_explode = knn_test_results.explode(column='detail', ignore_index=False)

In [39]:
len(knn_test_explode)

1564000

In [40]:
knn_test_explode.head()

Unnamed: 0,UserID,detail
0,13,"(828955, 1, 0.0)"
0,13,"(341276, 1, 0.0)"
0,13,"(614701, 1, 0.0)"
0,13,"(568507, 1, 0.0)"
0,13,"(852123, 1, 0.0)"


In [41]:
knn_test_explode['JobID'] = knn_test_explode.apply(lambda x: x.detail[0], axis=1)

In [42]:
knn_test_explode.head()

Unnamed: 0,UserID,detail,JobID
0,13,"(828955, 1, 0.0)",828955
0,13,"(341276, 1, 0.0)",341276
0,13,"(614701, 1, 0.0)",614701
0,13,"(568507, 1, 0.0)",568507
0,13,"(852123, 1, 0.0)",852123


In [43]:
knn_test_explode['hamming_distance'] = knn_test_explode.apply(lambda x: x.detail[2], axis=1)

In [44]:
knn_test_explode.head()

Unnamed: 0,UserID,detail,JobID,hamming_distance
0,13,"(828955, 1, 0.0)",828955,0.0
0,13,"(341276, 1, 0.0)",341276,0.0
0,13,"(614701, 1, 0.0)",614701,0.0
0,13,"(568507, 1, 0.0)",568507,0.0
0,13,"(852123, 1, 0.0)",852123,0.0


In [45]:
mask = knn_test_explode[['UserID','JobID','hamming_distance']].duplicated()
mask

0        False
0        False
0        False
0        False
0        False
         ...  
15735    False
15735    False
15735    False
15735    False
15735    False
Length: 1564000, dtype: bool

In [46]:
# drop duplicate value
knn_test_explode = knn_test_explode[mask==False]

In [47]:
# Get the label from knn neighbors (it is POSSIBLE that the label is from other UserID for the same JobID)
knn_test_explode['knn_label'] = knn_test_explode.apply(lambda x: x.detail[1], axis=1)

In [48]:
len(knn_test_explode)

1425303

In [49]:
knn_test_explode[['UserID','JobID','hamming_distance']].duplicated().sum()

0

## Check label of UserID_JobID in the knn_test_explode
Similar to script make_knn_tfidf_evaluation. This script check the label on final ranking data but result in a lot of 1.
Thus, we do the checking at this step to increase the positive label 

In [50]:
# Create interaction data (from original data)
select_cols = ['UserID', 'JobID', 'label']
select_train = train_data_extended[select_cols]
select_test = test_data_extended[select_cols]
interactions_df = pd.concat([select_train,select_test])

In [51]:
len(select_train), len(select_test), len(interactions_df)

(560759, 15640, 576399)

In [52]:
# check_interaction = [(u_id, j_id) for (u_id, j_id) in zip(interactions_df.UserID, interactions_df.JobID)]
# ranking_interaction = [(u_id, j_id) for (u_id, j_id) in zip(knn_test_explode.UserID, knn_test_explode.JobID)]

In [71]:
ranking_data = knn_test_explode.copy()

In [72]:
%%time
# Check if the pair of columns in knn_test_explode exists in interactions_df
check_result = ranking_data[['UserID', 'JobID']].apply(tuple, axis=1).isin(interactions_df[['UserID', 'JobID']].apply(tuple, axis=1))


CPU times: user 10.3 s, sys: 231 ms, total: 10.5 s
Wall time: 10.5 s


In [73]:
len(check_result)

1425303

In [74]:
check_result[:5]

0    False
0    False
0    False
0    False
0    False
dtype: bool

In [75]:
ranking_data

Unnamed: 0,UserID,detail,JobID,hamming_distance,knn_label
0,13,"(828955, 1, 0.0)",828955,0.0,1
0,13,"(341276, 1, 0.0)",341276,0.0,1
0,13,"(614701, 1, 0.0)",614701,0.0,1
0,13,"(568507, 1, 0.0)",568507,0.0,1
0,13,"(852123, 1, 0.0)",852123,0.0,1
...,...,...,...,...,...
15735,1471988,"(188836, 0, 0.0)",188836,0.0,0
15735,1471988,"(514728, 1, 0.0)",514728,0.0,1
15735,1471988,"(597826, 0, 0.0)",597826,0.0,0
15735,1471988,"(226330, 0, 0.0)",226330,0.0,0


In [80]:
%%time
ranking_data = knn_test_explode.copy()
# Merge ranking_data and interaction_df based on UserID, JobID
merged_df = pd.merge(ranking_data, interactions_df, left_on=['UserID', 'JobID'], right_on=['UserID', 'JobID'], how='left')


CPU times: user 551 ms, sys: 75.7 ms, total: 627 ms
Wall time: 619 ms


In [81]:
%%time
# Create column label in ranking_data based on the merged result
ranking_data['label'] = merged_df['label'].fillna(0)
print(ranking_data)

        UserID             detail    JobID  hamming_distance  knn_label  label
0           13   (828955, 1, 0.0)   828955               0.0          1    0.0
0           13   (341276, 1, 0.0)   341276               0.0          1    0.0
0           13   (614701, 1, 0.0)   614701               0.0          1    0.0
0           13   (568507, 1, 0.0)   568507               0.0          1    0.0
0           13   (852123, 1, 0.0)   852123               0.0          1    0.0
...        ...                ...      ...               ...        ...    ...
15735  1471988   (188836, 0, 0.0)   188836               0.0          0    0.0
15735  1471988   (514728, 1, 0.0)   514728               0.0          1    0.0
15735  1471988   (597826, 0, 0.0)   597826               0.0          0    0.0
15735  1471988   (226330, 0, 0.0)   226330               0.0          0    0.0
15735  1471988  (1106055, 0, 0.0)  1106055               0.0          0    0.0

[1425303 rows x 6 columns]
CPU times: user 61.9 ms,

In [82]:
ranking_data.label.value_counts()

label
0.0    1424621
1.0        682
Name: count, dtype: int64

In [None]:
# %%time
# MUCH LONGER TIME
# ranking_label =  []
# # mark_stop = 0 
# for item in ranking_interaction:
#     if item in check_interaction:
#         label_result = interactions_df[(interactions_df.UserID == item[0]) & (interactions_df.JobID == item[1])].reset_index().loc[0]['label']
#         check_interaction.remove(item)
#     else:
#         label_result = 0
#     ranking_label.append(label_result)
# #     mark_stop +=1
# #     if mark_stop ==3:
# #         break

In [94]:
# Sorted values
ranking_data_sort = ranking_data.groupby('UserID').apply(lambda x: x.sort_values('hamming_distance'))

In [95]:
ranking_data_sort.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,UserID,detail,JobID,hamming_distance,knn_label,label
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
13,0,13,"(828955, 1, 0.0)",828955,0.0,1,0.0
13,1,13,"(755985, 0, 0.0)",755985,0.0,0,0.0
13,1,13,"(602478, 0, 0.0)",602478,0.0,0,0.0
13,1,13,"(299880, 0, 0.0)",299880,0.0,0,0.0
13,1,13,"(873212, 0, 0.0)",873212,0.0,0,0.0


In [96]:
ranking_data_sort[ranking_data_sort.UserID==13][:100]

Unnamed: 0_level_0,Unnamed: 1_level_0,UserID,detail,JobID,hamming_distance,knn_label,label
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
13,0,13,"(828955, 1, 0.0)",828955,0.000000,1,0.0
13,1,13,"(755985, 0, 0.0)",755985,0.000000,0,0.0
13,1,13,"(602478, 0, 0.0)",602478,0.000000,0,0.0
13,1,13,"(299880, 0, 0.0)",299880,0.000000,0,0.0
13,1,13,"(873212, 0, 0.0)",873212,0.000000,0,0.0
13,...,...,...,...,...,...,...
13,1,13,"(276030, 0, 0.08333333333333333)",276030,0.083333,0,0.0
13,1,13,"(329106, 0, 0.08333333333333333)",329106,0.083333,0,0.0
13,1,13,"(15092, 0, 0.08333333333333333)",15092,0.083333,0,0.0
13,1,13,"(764500, 1, 0.08333333333333333)",764500,0.083333,1,0.0


In [97]:
ranking_data_sort = ranking_data_sort.reset_index(drop=True)

In [98]:
ranking_data_sort

Unnamed: 0,UserID,detail,JobID,hamming_distance,knn_label,label
0,13,"(828955, 1, 0.0)",828955,0.000000,1,0.0
1,13,"(755985, 0, 0.0)",755985,0.000000,0,0.0
2,13,"(602478, 0, 0.0)",602478,0.000000,0,0.0
3,13,"(299880, 0, 0.0)",299880,0.000000,0,0.0
4,13,"(873212, 0, 0.0)",873212,0.000000,0,0.0
...,...,...,...,...,...,...
1425298,1471988,"(172078, 0, 0.08333333333333333)",172078,0.083333,0,0.0
1425299,1471988,"(566170, 1, 0.08333333333333333)",566170,0.083333,1,0.0
1425300,1471988,"(875497, 1, 0.08333333333333333)",875497,0.083333,1,0.0
1425301,1471988,"(803761, 1, 0.08333333333333333)",803761,0.083333,1,0.0


In [99]:
mask = ranking_data_sort[['UserID','JobID','label']].duplicated()
mask

0          False
1          False
2          False
3          False
4          False
           ...  
1425298    False
1425299    False
1425300    False
1425301    False
1425302    False
Length: 1425303, dtype: bool

In [100]:
# drop duplicate value
ranking_data_sort = ranking_data_sort[mask==False]

In [101]:
ranking_data_sort

Unnamed: 0,UserID,detail,JobID,hamming_distance,knn_label,label
0,13,"(828955, 1, 0.0)",828955,0.000000,1,0.0
1,13,"(755985, 0, 0.0)",755985,0.000000,0,0.0
2,13,"(602478, 0, 0.0)",602478,0.000000,0,0.0
3,13,"(299880, 0, 0.0)",299880,0.000000,0,0.0
4,13,"(873212, 0, 0.0)",873212,0.000000,0,0.0
...,...,...,...,...,...,...
1425298,1471988,"(172078, 0, 0.08333333333333333)",172078,0.083333,0,0.0
1425299,1471988,"(566170, 1, 0.08333333333333333)",566170,0.083333,1,0.0
1425300,1471988,"(875497, 1, 0.08333333333333333)",875497,0.083333,1,0.0
1425301,1471988,"(803761, 1, 0.08333333333333333)",803761,0.083333,1,0.0


In [102]:
knn_shortlisted = ranking_data_sort.groupby('UserID').head(100)

In [103]:
knn_shortlisted.label.value_counts()

label
0.0    368951
1.0       147
Name: count, dtype: int64

In [104]:
knn_shortlisted.to_csv('./nb_recsys_ebm/knn_shortlisted_lda.csv', header=True, index=False)

## Add 3 binary features for location matching

In [106]:
# Load clean job data
job_set = pd.read_csv("./data_processed/jobset_clean.csv")
# Load the dataset from Step 3
user_set = pd.read_csv("./data_interim/user_set_cleaned.csv")

In [107]:
def get_city_match(user_id, job_id):
    user = user_set[user_set['UserID'] == user_id].iloc[0]
    job = job_set[job_set['JobID'] == job_id].iloc[0]
    return float(user['City'] == job['City'])

In [108]:
def get_state_match(user_id, job_id):
    user = user_set[user_set['UserID'] == user_id].iloc[0]
    job = job_set[job_set['JobID'] == job_id].iloc[0]
    return float(user['State'] == job['State'])

In [109]:
def get_country_match(user_id, job_id):
    user = user_set[user_set['UserID'] == user_id].iloc[0]
    job = job_set[job_set['JobID'] == job_id].iloc[0]
    return float(user['Country'] == job['Country'])

In [110]:
%%time
knn_shortlisted['City'] = knn_shortlisted.apply(lambda x: get_city_match(x.UserID, x.JobID), axis=1)


CPU times: user 25min 27s, sys: 2min 1s, total: 27min 29s
Wall time: 9min 24s


In [111]:
%%time
knn_shortlisted['State'] = knn_shortlisted.apply(lambda x: get_state_match(x.UserID, x.JobID), axis=1)

CPU times: user 23min 48s, sys: 1min 18s, total: 25min 7s
Wall time: 8min 36s


In [112]:
%%time
knn_shortlisted['Country'] = knn_shortlisted.apply(lambda x: get_country_match(x.UserID, x.JobID), axis=1)


CPU times: user 24min 48s, sys: 1min 20s, total: 26min 8s
Wall time: 8min 54s


In [113]:
knn_shortlisted = knn_shortlisted[['UserID','JobID','label','City','State','Country']]

In [114]:
knn_shortlisted.head()

Unnamed: 0,UserID,JobID,label,City,State,Country
0,13,828955,0.0,0.0,0.0,1.0
1,13,755985,0.0,0.0,0.0,1.0
2,13,602478,0.0,0.0,0.0,1.0
3,13,299880,0.0,0.0,0.0,1.0
4,13,873212,0.0,0.0,0.0,1.0


In [115]:
knn_shortlisted.info()

<class 'pandas.core.frame.DataFrame'>
Index: 369098 entries, 0 to 1425202
Data columns (total 6 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   UserID   369098 non-null  int64  
 1   JobID    369098 non-null  int64  
 2   label    369098 non-null  float64
 3   City     369098 non-null  float64
 4   State    369098 non-null  float64
 5   Country  369098 non-null  float64
dtypes: float64(4), int64(2)
memory usage: 19.7 MB


In [116]:
knn_shortlisted.label.value_counts()

label
0.0    368951
1.0       147
Name: count, dtype: int64

In [117]:
knn_shortlisted.to_csv('./nb_ranking_data/ranking_data_knn_lda_v2.csv', header=True, index=False)