# TopN Recommendation: Perform topN recommendations
- Model: White-box and Black-box models trained on TF-IDF features
 (use vector for X_train, Y_train, X_test, Y_test)
    - White-box: Logistic Regression (logreg), Decision Tree (dt), Gaussian Naive Bayes (nb)
    - Black-box: XGBoost (xgbt), AdaBoost (ada), Linear Discriminant Analysis (lda), Quadratic Discriminant Analysis (qda)
- Source of potential applications: random_ranking_data

OUPTUT: Top 20 applications for each user, ranking based on probability of predicted class

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import random
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Import library for baseline classification models
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score, f1_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

In [3]:
# Load pre-processed data
path = "./data_interim/"
user_set = pd.read_csv(path + "user_set_cleaned.csv")
work_history = pd.read_csv(path + "work_history_cleaned.csv")
dataset = pd.read_csv(path + "dataset_cleaned.csv")
# Load clean job set
job_set = pd.read_csv("./data_processed/jobset_clean.csv")

In [4]:
job_set.head()

Unnamed: 0,JobID,WindowID,Title,Description,Requirements,City,State,Country,Zip5,StartDate,EndDate
0,1,1,Security Engineer Technical Lead,Security Clearance Required: Top Secret Job N...,SKILL SET Network Security tools: Webdefend We...,Washington,DC,US,20531.0,2012-03-07 13:17:01.643,2012-04-06 23:59:59
1,4,1,SAP Business Analyst WM,NO Corp. to Corp resumes are being considered ...,WHAT YOU NEED: Four year college degreeMinimum...,Charlotte,NC,US,28217.0,2012-03-21 02:03:44.137,2012-04-20 23:59:59
2,7,1,P T HUMAN RESOURCES ASSISTANT,P T HUMAN RESOURCES ASSISTANT 1-2 ye...,Please refer to the Job Description to view th...,Winter Park,FL,US,32792.0,2012-03-02 16:36:55.447,2012-04-01 23:59:59
3,8,1,Route Delivery Drivers,CITY BEVERAGES Come to work for the best in th...,Please refer to the Job Description to view th...,Orlando,FL,US,,2012-03-03 09:01:10.077,2012-04-02 23:59:59
4,9,1,Housekeeping,I make sure every part of their day is magica...,Please refer to the Job Description to view th...,Orlando,FL,US,,2012-03-03 09:01:11.88,2012-04-02 23:59:59


In [5]:
# Load ranking data
ranking_data = pd.read_csv('./nb_ranking_data/ranking_data_random.csv')

In [None]:
# # Load TF-IDF matrix for jobs and user work history
# from scipy import sparse 
# tfidf_matrix = sparse.load_npz("./output_baseline/tfidf_matrix.npz")
# word_history_tf_matrix = sparse.load_npz("./output_baseline/work_history_tf_matrix.npz")

In [None]:
# # Load matrix of X_train, Y_train, X_test, Y_test

# X_train = np.load('./output_baseline/X_train_159.npy')
# X_train = np.load('./output_baseline/Y_train_159.npy')
# X_test = np.load('./output_baseline/X_test_159.npy')
# Y_test = np.load('./output_baseline/Y_test_159.npy')

In [6]:
ranking_data

Unnamed: 0,UserID,JobID,label,City,State,Country
0,13,821691,1,0.0,1.0,1.0
1,13,701157,0,0.0,0.0,0.0
2,13,472398,0,0.0,0.0,0.0
3,13,411244,0,0.0,0.0,0.0
4,13,868940,0,0.0,0.0,0.0
...,...,...,...,...,...,...
371595,1471988,185389,0,0.0,0.0,0.0
371596,1471988,1032399,0,0.0,0.0,0.0
371597,1471988,656923,0,0.0,0.0,0.0
371598,1471988,985198,0,0.0,0.0,0.0


In [7]:
test_user = user_set[user_set.Split=="Test"].UserID.values
test_data = dataset[dataset.UserID.isin(test_user)]

In [8]:
groups = ranking_data.groupby("UserID")
len(groups)

3716

In [9]:
len(test_user)

3716

In [10]:
from test_hit_rate_baseline import *

In [12]:
rec_result={}

In [15]:
from topN_baseline import *

# Logistic Regression

In [16]:
# Load a baseline model and test this function
import pickle
model = 'logreg'
model_name = './output_baseline/cb12_logreg.pikle'
logreg = pickle.load(open(model_name, "rb"))

In [17]:
%%time
rec_20 = topN_baseline(model = logreg, N=20, ranking_data=ranking_data, user_set=user_set, job_set=job_set)

100%|██████████| 3716/3716 [01:52<00:00, 33.14it/s]

Wall time: 1min 52s





In [18]:
rec_result[model]= rec_20

In [19]:
rec_result

{'logreg': [[(821691, 0.9460819303209994, 1),
   (129969, 0.5388668921792139, 1),
   (860580, 0.5373474573189638, 1),
   (145443, 0.5300147978578085, 1),
   (1092282, 0.5228867006825255, 1),
   (208055, 0.5221627306716441, 1),
   (701157, 0.5176828903322702, 1),
   (234659, 0.5092400994187875, 1),
   (589357, 0.5030287281714315, 1),
   (448819, 0.49982856175442536, 0),
   (181759, 0.49909247299458764, 0),
   (482235, 0.49676894930862364, 0),
   (1094005, 0.4905521156600156, 0),
   (70771, 0.4901411602734877, 0),
   (598128, 0.4845800344176539, 0),
   (830711, 0.48348228859650866, 0),
   (785527, 0.4826591774620505, 0),
   (85439, 0.4811781413903799, 0),
   (1041876, 0.4806673531995891, 0),
   (949413, 0.48030151188360914, 0)],
  [(925751, 0.4865254513159264, 0),
   (6284, 0.4744662648096991, 0),
   (619462, 0.4628281839762106, 0),
   (1076663, 0.46008556611705104, 0),
   (347601, 0.4544136496118787, 0),
   (209747, 0.44855112737862757, 0),
   (646162, 0.44672736423153486, 0),
   (71316

## Naive Bayes

In [20]:
# Load a baseline model and test this function
import pickle
model = 'nb'
model_name = './output_baseline/cb12_nb.pikle'
nb = pickle.load(open(model_name, "rb"))

In [22]:
%%time
rec_20 = topN_baseline(model = nb, N=20, ranking_data=ranking_data, user_set=user_set, job_set=job_set)

100%|██████████| 3716/3716 [01:52<00:00, 33.17it/s]

Wall time: 1min 52s





In [23]:
rec_result[model]= rec_20

In [24]:
len(rec_result)

2

## Decision Tree

In [25]:
# Load a baseline model and test this function
import pickle
model = 'dt'
model_name = './output_baseline/cb12_dt.pikle'
dt = pickle.load(open(model_name, "rb"))

In [26]:
%%time
rec_20 = topN_baseline(model = dt, 
                       N=20, 
                       ranking_data=ranking_data, 
                       user_set=user_set, 
                       job_set=job_set)

100%|██████████| 3716/3716 [01:55<00:00, 32.19it/s]

Wall time: 1min 55s





In [27]:
rec_result[model]= rec_20

## AdaBoost

In [28]:
# Load a baseline model and test this function
import pickle
model = 'ada'
model_name = './output_baseline/cb12_ada.pikle'
ada = pickle.load(open(model_name, "rb"))

In [29]:
%%time
rec_20 = topN_baseline(model = ada, 
                       N=20, 
                       ranking_data=ranking_data, 
                       user_set=user_set, 
                       job_set=job_set)

100%|██████████| 3716/3716 [03:17<00:00, 18.80it/s]

Wall time: 3min 17s





In [30]:
rec_result[model]= rec_20

## Linear Discriminant Analysis

In [34]:
# Load a baseline model and test this function
import pickle
model = 'lda'
model_name = './output_baseline/cb12_lda.pikle'
lda = pickle.load(open(model_name, "rb"))

In [35]:
%%time
rec_20 = topN_baseline(model = lda, 
                       N=20, 
                       ranking_data=ranking_data, 
                       user_set=user_set, 
                       job_set=job_set)

100%|██████████| 3716/3716 [01:50<00:00, 33.72it/s]

Wall time: 1min 50s





In [36]:
rec_result[model]= rec_20

## Quadratic Discriminant Analysis

In [37]:
# Load a baseline model and test this function
import pickle
model = 'qda'
model_name = './output_baseline/cb12_qda.pikle'
qda = pickle.load(open(model_name, "rb"))

In [38]:
%%time
rec_20 = topN_baseline(model = qda, 
                       N=20, 
                       ranking_data=ranking_data, 
                       user_set=user_set, 
                       job_set=job_set)

100%|██████████| 3716/3716 [02:03<00:00, 30.13it/s]

Wall time: 2min 3s





In [39]:
rec_result[model]= rec_20

## XGBoost RecSys baseline

In [40]:
# Load a baseline model and test this function
import pickle
model = 'xgbt'
model_name = './output_baseline/cb12_xgbt.pikle'
xgbt = pickle.load(open(model_name, "rb"))

In [41]:
%%time
rec_20 = topN_baseline(model = xgbt, 
                       N=20, 
                       ranking_data=ranking_data, 
                       user_set=user_set, 
                       job_set=job_set)

100%|██████████| 3716/3716 [02:23<00:00, 25.91it/s]

Wall time: 2min 23s





In [42]:
rec_result[model]= rec_20

# Summarize all results

In [47]:
len(rec_result)

9

In [48]:
len(rec_result['xgbt'])

3716

In [50]:
import pickle
with open('./output_ranking/baseline_topN_result.pickle', 'wb') as handle:
    pickle.dump(rec_result, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [51]:
# Test loaded result
with open('./output_ranking/baseline_topN_result.pickle', 'rb') as handle:
    loaded_rec_result = pickle.load(handle)

In [53]:
print(rec_result == loaded_rec_result)

True
