# Write python files for making features: baseline binary models

In [1]:
import caffeine
caffeine.on(display=False)

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load clean job data
job_set = pd.read_csv("./data_processed/jobset_clean.csv")
# Load the dataset from Step 3
user_set = pd.read_csv("./data_interim/user_set_cleaned.csv")
dataset = pd.read_csv("./data_interim/dataset_cleaned.csv")
work_history = pd.read_csv('./data_interim/work_history_cleaned.csv')

  job_set = pd.read_csv("./data_processed/jobset_clean.csv")


In [4]:
%%writefile make_features.py
'''
# Build feature vector for a random test user
ASSUMPTION:
- Warm users (already have information in user_set)
- With work history (already have information in work_history)

INPUT:
- tfidf_matrix: TF-IDF jobs
- word_history_tf_matrix: TF-IDF work history matrix
- list of top 15% popular jobs
- 3 user-defined functions for matching City, State, Countr
'''
# The following loading inputs should be done prior to running this script 

# # Load clean job data
# job_set = pd.read_csv("./data_processed/jobset_clean.csv")
# # Load the dataset from Step 3
# user_set = pd.read_csv("./data_interim/user_set_cleaned.csv")
# dataset = pd.read_csv("./data_interim/dataset_cleaned.csv")
# work_history = pd.read_csv('./data_interim/work_history_cleaned.csv')

# Load top 15% popular jobs
# top_15_jobs = pd.read_csv('./data_interim/top15_jobs.csv')

#def make_feature(test_uid):
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

'''
Build groups dataframe with following columns
- UserID: one single test user id that need recommendation
- JobID: (all jobs in the top 15% popular that can be pair with UserID)
- City: get_city_match
- State: get_state_match
- Country: get_country_match
'''

def build_group(test_uid, user_set, job_set, top_jobs):
    
#     top_jobs = top_jobs
    groups = top_jobs.copy()
    groups['UserID'] = test_uid
    groups.drop(columns=['count_job','freq','cum_freq'], axis=1, inplace=True)
    
    def get_city_match(user_id, job_id):
        user = user_set[user_set['UserID'] == user_id].iloc[0]
        job = job_set[job_set['JobID'] == job_id].iloc[0]
        return float(user['City'] == job['City'])
    
    def get_state_match(user_id, job_id):
        user = user_set[user_set['UserID'] == user_id].iloc[0]
        job = job_set[job_set['JobID'] == job_id].iloc[0]
        return float(user['State'] == job['State'])
    
    def get_country_match(user_id, job_id):
        user = user_set[user_set['UserID'] == user_id].iloc[0]
        job = job_set[job_set['JobID'] == job_id].iloc[0]
        return float(user['Country'] == job['Country'])
    
    groups['City'] = groups.apply(lambda x: get_city_match(x.UserID, x.JobID), axis=1)
    groups['State'] = groups.apply(lambda x: get_state_match(x.UserID, x.JobID), axis=1)
    groups['Country'] = groups.apply(lambda x: get_country_match(x.UserID, x.JobID), axis=1)
    return groups

'''
Build feature vector using the groups dataframe
'''
# class MakeFeature:
#     def __init__(self, groups, test_uid):
#         self.groups = groups
#         self.test_uid = test_uid
import pickle
from scipy import sparse  

def build_feature(groups, test_uid, user_set, job_set):
    '''
    Import TF-IDF vectorizers 
    - Loading pre-trained vectorizers
    - Loading result of existing vector for jobs and work history
    '''

    # For jobs: tf
    tf_path = './output_baseline/job_tf.pickle'
    tf = pickle.load(open(tf_path, 'rb'))

    # For work_history: word_history_tf
    work_history_tf_path = './output_baseline/work_history_tf.pickle'
    word_history_tf = pickle.load(open(work_history_tf_path, 'rb'))

    tfidf_matrix = sparse.load_npz("./output_baseline/tfidf_matrix.npz")
    word_history_tf_matrix = sparse.load_npz("./output_baseline/work_history_tf_matrix.npz")

    X_rec = np.zeros((1,159))
    # Get user profile
    u_id = test_uid
    user = user_set[user_set.UserID==u_id][["DegreeType", "WorkHistoryCount", 
                                                "TotalYearsExperience", "CurrentlyEmployed", 
                                                "ManagedOthers", "ManagedHowMany"]]
    # Get embedded feature for work_history of this user
    u_idx = user.index.values[0] 
    word_history_tf_matrix[u_idx,:].toarray()

    # Get user_feature
    user_feature = np.concatenate((user.values, word_history_tf_matrix[u_idx,:].toarray()),axis=1)

    # Get list of jobs 
    job_id_list = groups.JobID.values
    jobs = job_set[job_set.JobID.isin(job_id_list)]

    # Loop to build job feature, concat this to the user_feature
    j_idx = jobs.index.values
    f = []
    for i in j_idx:
        feature = np.concatenate((user_feature, tfidf_matrix[i,:].toarray()), axis=1).reshape(156,).tolist()
        f.append(feature)

    # Add matching features for city, state, country
    feature = np.concatenate((groups[["City","State", "Country"]].values, np.array(f)),axis=1)
    #feature.shape

    # Final feature vector
    X_rec = np.concatenate((X_rec, feature), axis=0)
    return X_rec


Writing make_features.py


# 2. TESTING: make features

In [5]:
# Load top 15% popular jobs
top_15_jobs = pd.read_csv('./data_interim/top15_jobs.csv')

In [6]:
test_uid = 7


In [7]:
from make_features import *

In [8]:
groups = build_group(test_uid=7, user_set=user_set, job_set=job_set, top_jobs=top_15_jobs)

In [9]:
groups

Unnamed: 0,JobID,UserID,City,State,Country
0,900797,7,0.0,0.0,1.0
1,1050711,7,0.0,0.0,1.0
2,608463,7,0.0,0.0,1.0
3,601126,7,0.0,0.0,1.0
4,802205,7,0.0,0.0,1.0
...,...,...,...,...,...
14716,1023467,7,0.0,0.0,1.0
14717,1023472,7,0.0,0.0,1.0
14718,496068,7,0.0,0.0,1.0
14719,701594,7,0.0,0.0,1.0


In [10]:
X_rec = build_feature(groups = groups, 
                      test_uid=7, 
                      user_set=user_set, 
                      job_set=job_set)

In [11]:
X_rec

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.14746725],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.0216625 ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.14582704, 0.        ,
        0.22056316]])

In [12]:
X_rec.shape

(14722, 159)

# Test Top-N recommendation

In [13]:
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score, f1_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import xgboost as xgb
from xgboost import XGBClassifier
import pickle

In [14]:
import pickle
xgbt = pickle.load(open('./output_baseline/cb12_xgbt.pikle', "rb")) # XGBoost RecSys

In [15]:
Y_pred = xgbt.predict(X_rec)

In [16]:
Y_pred_pr = xgbt.predict_proba(X_rec)

In [17]:
Y_pred_pr

array([[0.54576397, 0.45423606],
       [0.57439405, 0.42560595],
       [0.57439405, 0.42560595],
       ...,
       [0.57439405, 0.42560595],
       [0.57439405, 0.42560595],
       [0.57439405, 0.42560595]], dtype=float32)

In [18]:
Y_pred_pr_max = list(map(max, Y_pred_pr))

In [20]:
job_id_list = groups.JobID.values

In [21]:
rec_30items = sorted(
        [
            (ids_j, yprob_j, ypred_j) for yprob_j, ypred_j, ids_j in zip(Y_pred_pr_max, Y_pred, job_id_list)
        ],
        key=lambda x: -x[1]
    )[0:30]

In [22]:
rec_30items

[(863621, 0.5926077, 1),
 (663986, 0.5926077, 1),
 (448679, 0.5926077, 1),
 (795414, 0.5926077, 1),
 (189702, 0.5926077, 1),
 (372860, 0.5926077, 1),
 (611816, 0.5926077, 1),
 (797470, 0.58266145, 1),
 (899195, 0.58266145, 1),
 (882055, 0.58266145, 1),
 (640761, 0.58266145, 1),
 (877393, 0.58266145, 1),
 (24666, 0.58266145, 1),
 (809521, 0.58266145, 1),
 (525201, 0.58266145, 1),
 (695219, 0.58266145, 1),
 (868962, 0.58266145, 1),
 (152361, 0.58266145, 1),
 (282442, 0.58266145, 1),
 (417319, 0.58266145, 1),
 (15592, 0.58266145, 1),
 (930024, 0.58266145, 1),
 (876810, 0.58266145, 1),
 (777706, 0.58266145, 1),
 (1105002, 0.58266145, 1),
 (703408, 0.58266145, 1),
 (177122, 0.58266145, 1),
 (134540, 0.58266145, 1),
 (39740, 0.58266145, 1),
 (220029, 0.58266145, 1)]

In [23]:
rec_20items = sorted(
        [
            (ids_j, yprob_j, ypred_j) for yprob_j, ypred_j, ids_j in zip(Y_pred_pr_max, Y_pred, job_id_list)
        ],
        key=lambda x: -x[1]
    )[0:20]

In [24]:
rec_20items

[(863621, 0.5926077, 1),
 (663986, 0.5926077, 1),
 (448679, 0.5926077, 1),
 (795414, 0.5926077, 1),
 (189702, 0.5926077, 1),
 (372860, 0.5926077, 1),
 (611816, 0.5926077, 1),
 (797470, 0.58266145, 1),
 (899195, 0.58266145, 1),
 (882055, 0.58266145, 1),
 (640761, 0.58266145, 1),
 (877393, 0.58266145, 1),
 (24666, 0.58266145, 1),
 (809521, 0.58266145, 1),
 (525201, 0.58266145, 1),
 (695219, 0.58266145, 1),
 (868962, 0.58266145, 1),
 (152361, 0.58266145, 1),
 (282442, 0.58266145, 1),
 (417319, 0.58266145, 1)]