# Negative Sampling for warm applications experienced users

- Last update: 10.06.23
- Input: 
    - warm_exp_users.csv: list of users with user_id in both work_history and application
    - warm_exp_apps.csv: applications made by warm_exp_users
    - jobset_clean.csv: job list
- Summary:
    - An application has 3 elements: user_id, job_id, label. All of the current applications are label as 1 (positive, meaning that user made application). Negative sampling produce application where user did not apply for the job (label 0)
    - Group applications data by user_id
    - For each user_id, sample job_id(s) from the list of jobs that they haven't apply and assign label 0 to those application (user_id, job_id)
- Source: [Repository PJFNN](https://github.com/doslim/Job-Recommendation-PJFNN) (only perform for user in 1 window) 
- Output: dataset.csv
- NOTE: Total execution time for sampling task: ~ 8.5 hours

In [1]:
import pandas as pd
import numpy as np

In [10]:
import warnings
from tqdm import tqdm
warnings.filterwarnings("ignore")

In [2]:
# user_set: warm_exp_users
# application_record: warm_exp_apps
user_set = pd.read_csv('./data_processed/warm_exp_users.csv')
application_record = pd.read_csv('./data_processed/warm_exp_apps.csv')

In [3]:
user_set.info(), application_record.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208953 entries, 0 to 208952
Data columns (total 15 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   UserID                208953 non-null  int64  
 1   WindowID              208953 non-null  int64  
 2   Split                 208953 non-null  object 
 3   City                  208953 non-null  object 
 4   State                 208676 non-null  object 
 5   Country               208953 non-null  object 
 6   ZipCode               208050 non-null  object 
 7   DegreeType            208953 non-null  object 
 8   Major                 208953 non-null  object 
 9   GraduationDate        164218 non-null  object 
 10  WorkHistoryCount      208953 non-null  int64  
 11  TotalYearsExperience  208953 non-null  float64
 12  CurrentlyEmployed     208953 non-null  object 
 13  ManagedOthers         208953 non-null  object 
 14  ManagedHowMany        208953 non-null  int64  
dtype

(None, None)

In [4]:
# Check data for users and application records in training and testing (Column: Split)
application_record.Split.value_counts(), user_set.Split.value_counts()

(Train    921114
 Test     119556
 Name: Split, dtype: int64,
 Train    200248
 Test       8705
 Name: Split, dtype: int64)

In [5]:
job_set =  pd.read_csv('./data_processed/jobset_clean.csv')

  job_set =  pd.read_csv('./data_processed/jobset_clean.csv')


In [6]:
dataset_sample = pd.DataFrame(columns = ["UserID","JobID","label"])
job_id = job_set.JobID.unique().tolist()
groups = application_record.groupby("UserID")

In [7]:
len(job_id)

1050509

In [8]:
groups.head()

Unnamed: 0,UserID,WindowID,Split,ApplicationDate,JobID
0,72,1,Train,2012-04-02 22:36:43.033,834662
1,72,1,Train,2012-04-07 15:19:58.187,1020903
2,72,1,Train,2012-04-07 17:38:10.137,180313
3,72,1,Train,2012-04-30 20:05:15.293,480634
4,72,1,Train,2012-04-20 02:51:44.997,564184
...,...,...,...,...,...
1040665,1471878,7,Train,2012-06-26 09:01:18.913,405540
1040666,1471878,7,Train,2012-06-26 10:49:55.693,43932
1040667,1471878,7,Train,2012-06-11 12:20:54.323,999915
1040668,1471997,7,Train,2012-06-22 19:38:10.1,56412


In [11]:
# Try negative sampling with small sample
user_ids = []
job_ids = []
labels = []
for idx, group in tqdm(groups): #idx: user_id, group: applications made by 1 user_id
    print('idx: ', idx)
    size = len(group)
    print('size: ', size)
    exist_job = group.JobID.unique().tolist()
    print('len exist_job:', len(exist_job))
    candidate_job = [i for i in job_id if i not in exist_job ]
    print('len candidate_job: ', len(candidate_job))
    sample_job = np.random.randint(0,len(candidate_job),size) # Return random integers from low (inclusive) to high (exclusive)
    print('sample jobs:', sample_job)
    user_ids.extend([idx] * 2 * size)
    exist_job.extend([candidate_job[i] for i in sample_job])
    job_ids.extend(exist_job)
    label = [1] * size
    label.extend([0] * size)
    labels.extend(label)
    print('Sample result:')
    print('user_ids: ', user_ids)
    print('job_ids: ', job_ids)
    print('label: ', labels)
    if len(user_ids) >= 50:
        break
    

  0%|                                      | 1/208953 [00:00<9:45:51,  5.94it/s]

idx:  7
size:  2
len exist_job: 2
len candidate_job:  1050507
sample jobs: [ 82950 495695]
Sample result:
user_ids:  [7, 7, 7, 7]
job_ids:  [309823, 703889, 350167, 758932]
label:  [1, 1, 0, 0]
idx:  9
size:  3
len exist_job: 3
len candidate_job:  1050506
sample jobs: [159993 344844 207401]
Sample result:
user_ids:  [7, 7, 7, 7, 9, 9, 9, 9, 9, 9]
job_ids:  [309823, 703889, 350167, 758932, 809208, 136489, 617374, 681348, 648562, 888419]
label:  [1, 1, 0, 0, 1, 1, 1, 0, 0, 0]
idx:  13
size:  1
len exist_job: 1


  0%|                                      | 3/208953 [00:00<5:52:48,  9.87it/s]

len candidate_job:  1050508
sample jobs: [172406]
Sample result:
user_ids:  [7, 7, 7, 7, 9, 9, 9, 9, 9, 9, 13, 13]
job_ids:  [309823, 703889, 350167, 758932, 809208, 136489, 617374, 681348, 648562, 888419, 821691, 731767]
label:  [1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0]
idx:  14
size:  6
len exist_job: 6
len candidate_job:  1050503
sample jobs: [938277 313595 779687  90973 347916 391543]
Sample result:
user_ids:  [7, 7, 7, 7, 9, 9, 9, 9, 9, 9, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14]
job_ids:  [309823, 703889, 350167, 758932, 809208, 136489, 617374, 681348, 648562, 888419, 821691, 731767, 574999, 372423, 978868, 206046, 787741, 663552, 25411, 407441, 793185, 387369, 676787, 1018640]
label:  [1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]
idx:  16
size:  2
len exist_job: 2


  0%|                                      | 6/208953 [00:00<6:30:18,  8.92it/s]

len candidate_job:  1050507
sample jobs: [171446 305725]
Sample result:
user_ids:  [7, 7, 7, 7, 9, 9, 9, 9, 9, 9, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 16, 16, 16, 16]
job_ids:  [309823, 703889, 350167, 758932, 809208, 136489, 617374, 681348, 648562, 888419, 821691, 731767, 574999, 372423, 978868, 206046, 787741, 663552, 25411, 407441, 793185, 387369, 676787, 1018640, 185492, 747203, 717582, 340330]
label:  [1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0]
idx:  24
size:  7
len exist_job: 7
len candidate_job:  1050502
sample jobs: [574788 988115 248407 674780  50512 502961 980034]
Sample result:
user_ids:  [7, 7, 7, 7, 9, 9, 9, 9, 9, 9, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 16, 16, 16, 16, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24]
job_ids:  [309823, 703889, 350167, 758932, 809208, 136489, 617374, 681348, 648562, 888419, 821691, 731767, 574999, 372423, 978868, 206046, 787741, 663552, 25411, 407441, 7931

  0%|                                      | 6/208953 [00:00<7:29:19,  7.75it/s]

len candidate_job:  1050505
sample jobs: [351817 766810 121261 596366]
Sample result:
user_ids:  [7, 7, 7, 7, 9, 9, 9, 9, 9, 9, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 16, 16, 16, 16, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 26, 26, 26, 26, 26, 26, 26, 26]
job_ids:  [309823, 703889, 350167, 758932, 809208, 136489, 617374, 681348, 648562, 888419, 821691, 731767, 574999, 372423, 978868, 206046, 787741, 663552, 25411, 407441, 793185, 387369, 676787, 1018640, 185492, 747203, 717582, 340330, 1083186, 516837, 507614, 754917, 686406, 1058896, 335132, 285781, 508201, 1060230, 1082546, 214489, 818685, 429512, 584464, 666664, 22643, 964486, 708565, 694513, 518611, 457850]
label:  [1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0]





In [12]:
# Apply on the whole dataset
user_ids = []
job_ids = []
labels = []
for idx, group in tqdm(groups): #idx: user_id, group: applications made by 1 user_id
    size = len(group)
    exist_job = group.JobID.unique().tolist()
    candidate_job = [i for i in job_id if i not in exist_job ]
    sample_job = np.random.randint(0,len(candidate_job),size) # Return random integers from low (inclusive) to high (exclusive)
    user_ids.extend([idx] * 2 * size)
    exist_job.extend([candidate_job[i] for i in sample_job])
    job_ids.extend(exist_job)
    label = [1] * size
    label.extend([0] * size)
    labels.extend(label)
    

dataset_sample.UserID = user_ids
dataset_sample.JobID = job_ids
dataset_sample.label = labels

100%|█████████████████████████████████| 208953/208953 [8:38:53<00:00,  6.71it/s]


In [13]:
dataset_sample.head()

Unnamed: 0,UserID,JobID,label
0,7,309823,1
1,7,703889,1
2,7,566574,0
3,7,481216,0
4,9,809208,1


In [14]:
dataset_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2081340 entries, 0 to 2081339
Data columns (total 3 columns):
 #   Column  Dtype
---  ------  -----
 0   UserID  int64
 1   JobID   int64
 2   label   int64
dtypes: int64(3)
memory usage: 47.6 MB


In [15]:
dataset_sample.label.value_counts()

1    1040670
0    1040670
Name: label, dtype: int64

In [16]:
dataset_sample.to_csv("./data_processed/dataset.csv", index=False, header=True)

In [17]:
pd.read_csv("./data_processed/dataset.csv")

Unnamed: 0,UserID,JobID,label
0,7,309823,1
1,7,703889,1
2,7,566574,0
3,7,481216,0
4,9,809208,1
...,...,...,...
2081335,1472085,209482,0
2081336,1472090,209535,1
2081337,1472090,254881,1
2081338,1472090,999294,0
