# Build feature vector for a random test user
## => Wrap all steps in section 1 as a .py file (make_features.py) to reuse any test user_id

ASSUMPTION
- Warm users (already have information in user_set)
- With work history (already have information in work_history)

INPUT:
- tfidf_matrix: TF-IDF jobs
- word_history_tf_matrix: TF-IDF work history matrix
- list of top 15% popular jobs
- 3 user-defined functions for matching City, State, Countr

In [1]:
import caffeine
caffeine.on(display=False)

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

# 1. Pipeline for building features

## Import existing datasets

In [3]:
# Load clean job data
job_set = pd.read_csv("./data_processed/jobset_clean.csv")
# Load the dataset from Step 3
user_set = pd.read_csv("./data_interim/user_set_cleaned.csv")
dataset = pd.read_csv("./data_interim/dataset_cleaned.csv")
work_history = pd.read_csv('./data_interim/work_history_cleaned.csv')

  job_set = pd.read_csv("./data_processed/jobset_clean.csv")


In [4]:
# Load top 15% popular jobs
top_15_jobs = pd.read_csv('./data_interim/top15_jobs.csv')

In [5]:
len(top_15_jobs)

14721

In [6]:
top_15_jobs.head()

Unnamed: 0,JobID,count_job,freq,cum_freq
0,900797,45,7.8e-05,7.8e-05
1,1050711,41,7.1e-05,0.000148
2,608463,39,6.7e-05,0.000216
3,601126,37,6.4e-05,0.000279
4,802205,36,6.2e-05,0.000342


## Import TF-IDF vectorizers 
- Loading pre-trained vectorizers
- Loading result of existing vector for jobs and work history

FYI: TF-IDF on the combination of Title & Description & Requirements
tf = TfidfVectorizer(analyzer='word',
                     ngram_range=(1, 2),
                     min_df=5, 
                     max_features=100, 
                     stop_words='english')
tfidf_matrix = tf.fit_transform(job_set['word'])

FYI: 
word_history_tf = TfidfVectorizer(analyzer='word',
                                  ngram_range=(1, 2),
                                  min_df=0, 
                                  max_features=50, 
                                  stop_words='english')


In [7]:
import pickle

# For jobs: tf
tf_path = './output_baseline/job_tf.pickle'
tf = pickle.load(open(tf_path, 'rb'))

# For work_history: word_history_tf
work_history_tf_path = './output_baseline/work_history_tf.pickle'
word_history_tf = pickle.load(open(work_history_tf_path, 'rb'))

In [12]:
from scipy import sparse

tfidf_matrix = sparse.load_npz("./output_baseline/tfidf_matrix.npz")
word_history_tf_matrix = sparse.load_npz("./output_baseline/work_history_tf_matrix.npz")

In [13]:
# check shape of the matrix. should be:
# <1050509x100 sparse matrix of type '<class 'numpy.float64'>'
#	with 31754477 stored elements in Compressed Sparse Row format>

tfidf_matrix.shape

(1050509, 100)

In [14]:
# check shape of the matrix. should be:
#<152292x50 sparse matrix of type '<class 'numpy.float64'>'
#	with 327656 stored elements in Compressed Sparse Row format>
word_history_tf_matrix.shape

(152292, 50)

## Build groups dataframe with following columns
- UserID
- JobID
- Check CIty
- Check Country
- Check State

In [15]:
# Build all (User_ID, Job_ID) pairs and calculate matching City, State, Country
def get_city_match(user_id, job_id):
    user = user_set[user_set['UserID'] == user_id].iloc[0]
    job = job_set[job_set['JobID'] == job_id].iloc[0]
    return float(user['City'] == job['City'])
def get_state_match(user_id, job_id):
    user = user_set[user_set['UserID'] == user_id].iloc[0]
    job = job_set[job_set['JobID'] == job_id].iloc[0]
    return float(user['State'] == job['State'])
def get_country_match(user_id, job_id):
    user = user_set[user_set['UserID'] == user_id].iloc[0]
    job = job_set[job_set['JobID'] == job_id].iloc[0]
    return float(user['Country'] == job['Country'])

In [16]:
test_uid = 7
groups = top_15_jobs.copy()
groups['UserID'] = test_uid
groups.drop(columns=['count_job','freq','cum_freq'], axis=1, inplace=True)

In [17]:
groups.head()

Unnamed: 0,JobID,UserID
0,900797,7
1,1050711,7
2,608463,7
3,601126,7
4,802205,7


In [21]:
%%time
groups['City'] = groups.apply(lambda x: get_city_match(x.UserID, x.JobID), axis=1)


CPU times: user 42.1 s, sys: 1.85 s, total: 43.9 s
Wall time: 23.9 s


In [22]:
%%time
groups['State'] = groups.apply(lambda x: get_state_match(x.UserID, x.JobID), axis=1)


CPU times: user 45 s, sys: 1.91 s, total: 46.9 s
Wall time: 25.7 s


In [23]:
%%time
groups['Country'] = groups.apply(lambda x: get_country_match(x.UserID, x.JobID), axis=1)


CPU times: user 42.7 s, sys: 1.83 s, total: 44.5 s
Wall time: 24.6 s


In [24]:
groups.head()

Unnamed: 0,JobID,UserID,City,State,Country
0,900797,7,0.0,0.0,1.0
1,1050711,7,0.0,0.0,1.0
2,608463,7,0.0,0.0,1.0
3,601126,7,0.0,0.0,1.0
4,802205,7,0.0,0.0,1.0


## Build feature vector for given test user id (u_id = 7)

In [25]:
X_rec = np.zeros((1,159))

In [27]:
# Get user profile
u_id = test_uid
user = user_set[user_set.UserID==u_id][["DegreeType", "WorkHistoryCount", 
                                            "TotalYearsExperience", "CurrentlyEmployed", 
                                            "ManagedOthers", "ManagedHowMany"]]
user

Unnamed: 0,DegreeType,WorkHistoryCount,TotalYearsExperience,CurrentlyEmployed,ManagedOthers,ManagedHowMany
50231,1,2,13.0,1,0,0


In [28]:
# Get embedded feature for work_history of this user
u_idx = user.index.values[0]
print(u_idx)
word_history_tf_matrix[u_idx,:].toarray()

50231


array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.467288  , 0.        , 0.        , 0.        ,
        0.69007561, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.55266407, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ]])

In [29]:
# Get user_feature
user_feature = np.concatenate((user.values, word_history_tf_matrix[u_idx,:].toarray()),axis=1)
print('user feature: ', user_feature)

user feature:  [[ 1.          2.         13.          1.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.467288    0.          0.          0.
   0.69007561  0.          0.          0.          0.          0.
   0.          0.          0.55266407  0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.        ]]


In [30]:
# Get list of jobs 
job_id_list = groups.JobID.values
jobs = job_set[job_set.JobID.isin(job_id_list)]
print('jobs applied by the user: ', job_id_list)

jobs applied by the user:  [ 900797 1050711  608463 ...  496068  701594  617190]


In [31]:
jobs

Unnamed: 0,JobID,WindowID,Title,Description,Requirements,City,State,Country,Zip5,StartDate,EndDate
10,15,1,Administrative Assistant,This Administrative Assistant position is resp...,Please refer to the Job Description to view th...,Los Angeles,CA,US,90011,2012-03-09 01:12:16.81,2012-04-08 23:59:59
102,268,1,COMM FIBER TECHNICIAN,COMM FIBER TECHNICIAN BA18666 Min 3 yr exp ins...,Please refer to the Job Description to view th...,Rosedale,MD,US,21237,2012-03-28 07:31:04.24,2012-04-27 23:59:59
193,550,1,MDNOW Urgent Care Front Desk,MDNOW Urgent Care Front Desk - WE ARE GROWING...,Please refer to the Job Description to view th...,Coral Springs,FL,US,33076,2012-04-06 10:05:55.02,2012-05-05 23:59:59
195,556,1,CONSTRUCTION Equipment Operators Pipe Fitters ...,CONSTRUCTION Equipment Operators Pipe Fitters...,Please refer to the Job Description to view th...,Fort Lauderdale,FL,US,,2012-04-07 04:48:59.52,2012-05-06 23:59:59
246,788,1,Driver CDL A or B,Driver CDL A or B CB338115 Romeoville Ne...,Please refer to the Job Description to view th...,Romeoville,IL,US,60446,2012-04-05 05:10:21.52,2012-05-04 23:59:59
...,...,...,...,...,...,...,...,...,...,...,...
1049907,1108911,7,Production Supervisor,Job Summary: Ascend Personnel Services is curr...,Qualifications:To perform this job successfull...,Modesto,CA,US,,2012-05-24 13:07:35.197,2012-06-23 23:59:00
1050132,1112794,7,Customer Service Representatives,Customer Service Representatives Seasonal Be...,Experience using Microsoft Office Suite ap...,South Jordan,UT,US,84095.0,2012-05-29 09:18:15.267,2012-06-28 23:59:00
1050340,1114960,7,Hotel Resort and Restaurant Mangers,Hotel Resort and Restaurant Management Size ...,"Strong leadership skills, Great organization s...",Miami,FL,US,,2012-06-29 02:09:26.583,2012-07-28 23:59:59
1050429,1115793,7,Entry Level Office Assistant,Staffing Firm in Midtown is looking for Entry ...,H.S. Diploma a must; B.A. preferred.Must know ...,New York,NY,US,10017.0,2012-06-04 19:20:45.887,2012-07-03 23:59:00


In [32]:
j_idx = jobs.index.values
f = []

for i in j_idx:
    feature = np.concatenate((user_feature, tfidf_matrix[i,:].toarray()), axis=1).reshape(156,).tolist()
    f.append(feature)
print('combine user_feature and embeded job', len(f))
    
    

combine user_feature and embeded job 14721


In [33]:
# each element in f is  a combine feature
len(f[0])

156

In [34]:
groups[["City","State","Country"]].values.shape

(14721, 3)

In [35]:
np.array(f).shape

(14721, 156)

In [37]:
feature = np.concatenate((groups[["City","State", "Country"]].values, np.array(f)),axis=1)

In [38]:
feature.shape

(14721, 159)

In [39]:
X_rec = np.concatenate((X_rec, feature), axis=0)

In [40]:
X_rec

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.14746725],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.0216625 ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.14582704, 0.        ,
        0.22056316]])

In [60]:
X_rec.shape

(14722, 159)

# 2. Test: Make prediction for the feature using pre-trained binary cls models

In [41]:
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score, f1_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import xgboost as xgb
from xgboost import XGBClassifier
import pickle

In [42]:
import pickle
xgbt = pickle.load(open('./output_baseline/cb12_xgbt.pikle', "rb")) # XGBoost RecSys

In [49]:
Y_pred = xgbt.predict(X_rec)

In [50]:
Y_pred_pr = xgbt.predict_proba(X_rec)

In [51]:
Y_pred_pr

array([[0.54576397, 0.45423606],
       [0.57439405, 0.42560595],
       [0.57439405, 0.42560595],
       ...,
       [0.57439405, 0.42560595],
       [0.57439405, 0.42560595],
       [0.57439405, 0.42560595]], dtype=float32)

In [52]:
Y_pred_pr_max = list(map(max, Y_rec_pr))

In [48]:
job_id_list

array([ 900797, 1050711,  608463, ...,  496068,  701594,  617190])

In [55]:
rec_30items = sorted(
        [
            (ids_j, yprob_j, ypred_j) for yprob_j, ypred_j, ids_j in zip(Y_pred_pr_max, Y_pred, job_id_list)
        ],
        key=lambda x: -x[1]
    )[0:30]

In [56]:
rec_30items

[(863621, 0.5926077, 1),
 (663986, 0.5926077, 1),
 (448679, 0.5926077, 1),
 (795414, 0.5926077, 1),
 (189702, 0.5926077, 1),
 (372860, 0.5926077, 1),
 (611816, 0.5926077, 1),
 (797470, 0.58266145, 1),
 (899195, 0.58266145, 1),
 (882055, 0.58266145, 1),
 (640761, 0.58266145, 1),
 (877393, 0.58266145, 1),
 (24666, 0.58266145, 1),
 (809521, 0.58266145, 1),
 (525201, 0.58266145, 1),
 (695219, 0.58266145, 1),
 (868962, 0.58266145, 1),
 (152361, 0.58266145, 1),
 (282442, 0.58266145, 1),
 (417319, 0.58266145, 1),
 (15592, 0.58266145, 1),
 (930024, 0.58266145, 1),
 (876810, 0.58266145, 1),
 (777706, 0.58266145, 1),
 (1105002, 0.58266145, 1),
 (703408, 0.58266145, 1),
 (177122, 0.58266145, 1),
 (134540, 0.58266145, 1),
 (39740, 0.58266145, 1),
 (220029, 0.58266145, 1)]

In [57]:
rec_20items = sorted(
        [
            (ids_j, yprob_j, ypred_j) for yprob_j, ypred_j, ids_j in zip(Y_pred_pr_max, Y_pred, job_id_list)
        ],
        key=lambda x: -x[1]
    )[0:20]

In [58]:
rec_20items

[(863621, 0.5926077, 1),
 (663986, 0.5926077, 1),
 (448679, 0.5926077, 1),
 (795414, 0.5926077, 1),
 (189702, 0.5926077, 1),
 (372860, 0.5926077, 1),
 (611816, 0.5926077, 1),
 (797470, 0.58266145, 1),
 (899195, 0.58266145, 1),
 (882055, 0.58266145, 1),
 (640761, 0.58266145, 1),
 (877393, 0.58266145, 1),
 (24666, 0.58266145, 1),
 (809521, 0.58266145, 1),
 (525201, 0.58266145, 1),
 (695219, 0.58266145, 1),
 (868962, 0.58266145, 1),
 (152361, 0.58266145, 1),
 (282442, 0.58266145, 1),
 (417319, 0.58266145, 1)]