# Generate potential application - TopN recommendation
## Random sampling with controlled positive labels

Generate rankning dataset for topN recommendation.

- INPUT:
    - user_set_cleaned.csv: warm users with work history
    - work_history_cleaned.csv: work history of warm users
    - dataset_cleaned.csv: warm application for user with work history, already include negative label
    - jobset_clean.csv: clean job dataset
    - tfidf_matrix: TF-IDF transformation matrix on jobs info (title + description + requirement)
    - word_history_tf_matrix: TF-IDF transformation matrix on work historu (all jobs title linked to a user)
- OUTPUT: ranking_data_random.csv
- SUMMARY:
    - Filter test_user and test application
    - For each test user, make sample of 99 jobs from the candidate jobs (candidate jobs = all jobs - applied jobs)
    - 100 labels correspond to a users = 1 (for the first application), 0 for the remaining 99 application
    - Check matching in City, State, Country for new pair of (UserID, JobID)


In [1]:
import caffeine
caffeine.on(display=False)

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import random
import warnings
warnings.filterwarnings("ignore")

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score, f1_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB

In [3]:
# Load pre-processed data
path = "./data_interim/"
user_set = pd.read_csv(path + "user_set_cleaned.csv")
work_history = pd.read_csv(path + "work_history_cleaned.csv")
dataset = pd.read_csv(path + "dataset_cleaned.csv")
# Load clean job set
job_set = pd.read_csv("./data_processed/jobset_clean.csv")

In [4]:
# Load TF-IDF matrix for jobs and user work history
from scipy import sparse 
tfidf_matrix = sparse.load_npz("./data_interim_tfidf/tfidf_matrix.npz")
word_history_tf_matrix = sparse.load_npz("./data_interim_tfidf/work_history_tf_matrix.npz")

In [5]:
test_user = user_set[user_set.Split=="Test"].UserID.values
test_data = dataset[dataset.UserID.isin(test_user)]

In [6]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15736 entries, 10 to 579588
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   UserID   15736 non-null  int64  
 1   JobID    15736 non-null  int64  
 2   label    15736 non-null  int64  
 3   City     15736 non-null  float64
 4   State    15736 non-null  float64
 5   Country  15736 non-null  float64
dtypes: float64(3), int64(3)
memory usage: 860.6 KB


## Build ranking data (small sample)

In [7]:
# Try on small sample
ranking_data_small = pd.DataFrame(columns = ["UserID","JobID","label", "City", "State"])
job_id = job_set.JobID.unique().tolist()
groups = test_data.groupby("UserID")
user_ids = []
job_ids = []
labels = []
City = []
State = []
Country = []

for idx, group in tqdm(groups):
    size = 99
    print('idx: ', idx)
    exist_job = group.JobID.unique().tolist()
    print('len exist_job:', len(exist_job))
    
    # Candidate jobs: existing jobs - applied jobs by user
    candidate_job = [i for i in job_id if i not in exist_job ]
    print('len candidate_job: ', len(candidate_job))
    
    # Create sample job include 99 randomly draw from the candidate job
    sample_job = random.sample(range(0,len(candidate_job)),size)
    print('sample jobs:', sample_job)
    
    user_ids.extend([idx] * (size+1)) # extend user_ids by the size of sample job + 1 => 100 users
    
    # in the job ID list: the first job ID is existing job, 99 remaining job ids is random sample jobs
    job_ids.append(exist_job[0]) # 
    job_ids.extend([candidate_job[i] for i in sample_job])
    
    # Label list: the first one is 1 (as it is existing application), remaining 99 label is 0
    labels.append(1)
    labels.extend([0] * (size))
    
    print('Sample result:')
    print('user_ids: ', user_ids)
    print('job_ids: ', job_ids)
    print('label: ', labels)
    
    City.append(group.City.values[0])
    State.append(group.State.values[0])
    Country.append(group.Country.values[0])
    
    print('City:', City)
    print('State:', State)
    
    jobs = job_set[job_set.JobID.isin([candidate_job[i] for i in sample_job])]
    print('Sample jobs details: ', len(jobs))
    
    print('Check matching City and State:')
    City.extend([0 if i!=group.City.values[0] else a for i in jobs.City.values.tolist()])
    State.extend([0 if i!=group.State.values[0] else a for i in jobs.State.values.tolist()])
    Country.extend([0 if i!=group.State.values[0] else a for i in jobs.Country.values.tolist()])
    print('City final:', City)
    print('State final:', State)
    
    if len(user_ids) >= 20:
        break



  0%|                                                  | 0/3716 [00:00<?, ?it/s]

idx:  13
len exist_job: 2



  0%|                                                  | 0/3716 [00:00<?, ?it/s]

len candidate_job:  1050507
sample jobs: [538198, 188887, 269366, 503546, 706871, 599583, 97518, 445865, 602750, 611030, 541032, 239576, 620728, 499240, 859661, 116193, 380294, 957775, 297381, 703892, 676133, 559529, 93648, 261474, 77292, 424033, 372177, 561015, 845711, 652902, 407727, 287059, 276417, 496881, 923779, 56013, 248283, 148292, 1020092, 449744, 804577, 190409, 92500, 236767, 576059, 642683, 221235, 604633, 276662, 860944, 16178, 520065, 580008, 484107, 821652, 153227, 933571, 480608, 927216, 654672, 140574, 104922, 299589, 489134, 415044, 544743, 546796, 305238, 895090, 874802, 842997, 815709, 12197, 755301, 248001, 953132, 300156, 469865, 945160, 747794, 394702, 456585, 353794, 899928, 350407, 326109, 321515, 818146, 3230, 169367, 658571, 204541, 956914, 629017, 42365, 220686, 363792, 162346, 926456]
Sample result:
user_ids:  [13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 




In [8]:
ranking_data_small.UserID = user_ids
ranking_data_small.JobID = job_ids
ranking_data_small.label = labels
ranking_data_small.City = City
ranking_data_small.State = State
ranking_data_small.Country = Country
ranking_data_small
# ranking_data.to_csv("ranking_data.csv",index=False)

Unnamed: 0,UserID,JobID,label,City,State
0,13,821691,1,0.0,1.0
1,13,1110646,0,0.0,0.0
2,13,804323,0,0.0,0.0
3,13,59598,0,0.0,0.0
4,13,823345,0,0.0,0.0
...,...,...,...,...,...
95,13,178686,0,0.0,0.0
96,13,939218,0,0.0,0.0
97,13,799566,0,0.0,0.0
98,13,690846,0,0.0,0.0


## Build ranking dataset on the whole test users

In [9]:
%%time
ranking_data = pd.DataFrame(columns = ["UserID","JobID","label", "City", "State", "Country"])
job_id = job_set.JobID.unique().tolist()
groups = test_data.groupby("UserID")
user_ids = []
job_ids = []
labels = []
City = []
State = []
Country = []
for idx, group in tqdm(groups):
    size = 99
    exist_job = group.JobID.unique().tolist()
    candidate_job = [i for i in job_id if i not in exist_job ]
    sample_job = random.sample(range(0,len(candidate_job)),size)
    user_ids.extend([idx] * (size+1))
    job_ids.append(exist_job[0])
    job_ids.extend([candidate_job[i] for i in sample_job])
    labels.append(1)
    labels.extend([0] * (size))
    City.append(group.City.values[0])
    State.append(group.State.values[0])
    Country.append(group.State.values[0])
    jobs = job_set[job_set.JobID.isin([candidate_job[i] for i in sample_job])]
    
    City.extend([0 if i!=group.City.values[0] else a for i in jobs.City.values.tolist()])
    State.extend([0 if i!=group.State.values[0] else a for i in jobs.State.values.tolist()])
    Country.extend([0 if i!=group.State.values[0] else a for i in jobs.Country.values.tolist()])
    
ranking_data.UserID = user_ids
ranking_data.JobID = job_ids
ranking_data.label = labels
ranking_data.City = City
ranking_data.State = State
ranking_data.Country = Country
# ranking_data.to_csv("ranking_data.csv",index=False)

100%|███████████████████████████████████████| 3716/3716 [07:25<00:00,  8.33it/s]


CPU times: user 7min 5s, sys: 21.4 s, total: 7min 26s
Wall time: 7min 26s


In [10]:
ranking_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 371600 entries, 0 to 371599
Data columns (total 6 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   UserID   371600 non-null  int64  
 1   JobID    371600 non-null  int64  
 2   label    371600 non-null  int64  
 3   City     371600 non-null  float64
 4   State    371600 non-null  float64
 5   Country  371600 non-null  float64
dtypes: float64(3), int64(3)
memory usage: 17.0 MB


In [11]:
ranking_data.head()

Unnamed: 0,UserID,JobID,label,City,State,Country
0,13,821691,1,0.0,1.0,1.0
1,13,701157,0,0.0,0.0,0.0
2,13,472398,0,0.0,0.0,0.0
3,13,411244,0,0.0,0.0,0.0
4,13,868940,0,0.0,0.0,0.0


In [12]:
ranking_data.to_csv("./nb_ranking_data/ranking_data_random.csv",index=False, header=True)