# Get recommendations for myfm models - FM_MATCH
## Ranking data: ranking_data_knn_lda_v2.csv
- Run the full list of users in the ranking data (#users < #test_users because some duplicates were dropped)
- Load scripts:
    - Making features: make_features_myfm_ranking.py
    - Getting topN for a given user Id: topN_myfm_single.py

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn import metrics

import myfm

In [2]:
# Project-local helper script that builds the myFM ranking features.
# NOTE(review): the star import hides which names this notebook actually uses.
from make_features_myfm_ranking import * # Import custom script for building features

In [3]:
# Project-local helper script for getting the top-N list of a single user.
# NOTE(review): topN_fm_simple and get_rec_result_df, used in later cells, are
# presumably defined here -- confirm, since the star import does not show it.
from topN_myfm_single import * 

In [4]:
# Reproducibility setup: the relative paths used in later cells are resolved
# from the parent of the directory this notebook starts in.
import os

cwd = os.getcwd()  # directory the kernel was started from
print("Working directory:", cwd)

# Step up one level. The move is relative, so re-running this cell climbs
# one more directory each time.
os.chdir(os.pardir)

Working directory: /home/jovyan/1_UT THESIS/CB12_MAIN/nb_recsys_myfm


In [5]:
import warnings
# Ignore every warning for the rest of the session: no category, message or
# module filter is given, so library deprecation notices are hidden as well.
warnings.filterwarnings('ignore')

In [6]:
# Cleaned job data.
job_set = pd.read_csv("./data_processed/jobset_clean.csv")

# Tables from Step 3, read in the order: users, dataset, work history.
user_set, dataset, work_history = map(
    pd.read_csv,
    (
        "./data_interim/user_set_cleaned.csv",
        "./data_interim/dataset_cleaned.csv",
        './data_interim/work_history_cleaned.csv',
    ),
)


In [7]:
# User IDs belonging to each split, taken from the Split column of user_set.
train_user = user_set.loc[user_set["Split"] == "Train", "UserID"].values
test_user = user_set.loc[user_set["Split"] == "Test", "UserID"].values

# Rows of the dataset whose UserID falls in the respective split.
train_data = dataset.loc[dataset["UserID"].isin(train_user)]
test_data = dataset.loc[dataset["UserID"].isin(test_user)]

In [8]:
# Load the users_fm and jobs_fm tables (neither is referenced again in the
# visible cells of this notebook).
user_fm, job_fm = (
    pd.read_csv(table_path)
    for table_path in ('./data_interim/users_fm.csv', './data_interim/jobs_fm.csv')
)

In [9]:
# Load ranking data (generated by KNN-LDA features)
# This frame supplies the list of UserIDs to score and is passed to
# topN_fm_simple as its `ranking_data` argument.
ranking_data = pd.read_csv('./nb_recsys_ebm/ranking_data_knn_lda_v2.csv')

## Load pre-trained FM models

In [10]:
%%time
import pickle

# Path of the pre-trained FM_match model saved by an earlier training run.
model_name = './output_myfm/fm_match.pikle'

# NOTE: unpickling can execute arbitrary code; only load model files that
# were produced by this project.
# The context manager closes the file handle once the model is loaded
# (the bare open() call previously left it open).
with open(model_name, "rb") as model_file:
    fm_match = pickle.load(model_file)

CPU times: user 8.17 s, sys: 41.5 s, total: 49.7 s
Wall time: 1min 32s


In [11]:
# Distinct UserID values present in the ranking data; recommendations are
# generated for each of them below.
ranking_users = ranking_data.UserID.unique()

In [12]:
# Number of distinct users that will be scored.
len(ranking_users)

3691

# Model: FM_match

In [13]:
%%time
# Top-20 recommendations from the FM_match model, keyed by UserID.
# Arguments that are the same for every user are bundled once up front.
fixed_args = dict(
    fm_type='fm_match',
    fm_model=fm_match,
    ranking_data=ranking_data,
    train_data=train_data,
    N=20,
)

rec_result_match = {}
for u_id in ranking_users:
    rec_20 = topN_fm_simple(u_id=u_id, **fixed_args)
    rec_result_match[u_id] = rec_20

CPU times: user 3h 12min 15s, sys: 5min 29s, total: 3h 17min 45s
Wall time: 3h 17min 32s


In [14]:
%%time
# Stack the per-user result frames into a single table.
# The frames are collected first and concatenated once: calling pd.concat
# inside the loop re-copies every accumulated row on each iteration, and
# seeding the accumulator with an empty frame relies on concat behaviour
# that newer pandas versions deprecate.
result_columns = ['JobID', 'Y_prob', 'Y_pred', 'UserID', 'rank']

per_user_frames = [
    get_rec_result_df(u_id=u_id, rec_N=rec_result_match[u_id])
    for u_id in ranking_users
]

if per_user_frames:
    final_rec_result_match = pd.concat(per_user_frames)
    # Keep the declared columns first, in the declared order; any additional
    # column returned by get_rec_result_df follows them.
    extra_columns = [
        col for col in final_rec_result_match.columns if col not in result_columns
    ]
    final_rec_result_match = final_rec_result_match.reindex(
        columns=result_columns + extra_columns
    )
else:
    # No users to process: produce the same empty, column-only frame.
    final_rec_result_match = pd.DataFrame(columns=result_columns)

# final_rec_result = final_rec_result[['UserID','JobID', 'Y_prob', 'Y_pred', 'rank']]

CPU times: user 11.7 s, sys: 83.7 ms, total: 11.8 s
Wall time: 11.8 s


In [15]:
# Spot-check: stored recommendations for UserID 13
# (tuples presumably ordered as JobID, Y_prob, Y_pred -- confirm against topN_fm_simple).
rec_result_match[13]

[(242040, 0.9903147289318188, 1),
 (110296, 0.9883287849143759, 1),
 (997257, 0.9425317216852747, 1),
 (11974, 0.9404031004283993, 1),
 (601850, 0.6223938432169538, 1),
 (919556, 0.3223811503988055, 0),
 (1057569, 0.3091854383372541, 0),
 (1116150, 0.2824541095294884, 0),
 (764500, 0.24346870312053526, 0),
 (230268, 0.22448414531458094, 0),
 (498712, 0.2061848907864432, 0),
 (842053, 0.20577051054619025, 0),
 (328664, 0.19681714211214277, 0),
 (885417, 0.18730584922826227, 0),
 (258050, 0.18694204774614426, 0),
 (265838, 0.18636001993594442, 0),
 (823659, 0.17914503915353014, 0),
 (916743, 0.16711772076843115, 0),
 (28031, 0.15841269127530463, 0),
 (30091, 0.15362664195684067, 0)]

# Export results

In [16]:
# Write the stacked top-20 table to CSV with a header row and without the index column.
final_rec_result_match.to_csv('./output_topN_myfm/rec20_fm_match_knn.csv', header=True, index=False)

In [17]:
# Export rec result dictionary as a gzip-compressed pickle
import gzip
import pickle
import pickletools

filepath = "./nb_recsys_myfm/rec_result_fm_match_knn.pikle"

# Serialize before opening the target file: if pickling fails, an export
# left by a previous run is not truncated.
pickled = pickle.dumps(rec_result_match)
# pickletools.optimize removes unused PUT opcodes, giving a smaller pickle.
optimized_pickle = pickletools.optimize(pickled)

with gzip.open(filepath, "wb") as f:
    f.write(optimized_pickle)

In [18]:
%%time
# Round-trip check: read the gzip-compressed pickle back from `filepath`.
with gzip.open(filepath, 'rb') as compressed_file:
    loaded_rec_result = pickle.load(compressed_file)

CPU times: user 54.6 ms, sys: 4.1 ms, total: 58.7 ms
Wall time: 59.9 ms


In [19]:
# Same spot-check on the reloaded dictionary; the list shown should match
# the one displayed for rec_result_match[13].
loaded_rec_result[13]

[(242040, 0.9903147289318188, 1),
 (110296, 0.9883287849143759, 1),
 (997257, 0.9425317216852747, 1),
 (11974, 0.9404031004283993, 1),
 (601850, 0.6223938432169538, 1),
 (919556, 0.3223811503988055, 0),
 (1057569, 0.3091854383372541, 0),
 (1116150, 0.2824541095294884, 0),
 (764500, 0.24346870312053526, 0),
 (230268, 0.22448414531458094, 0),
 (498712, 0.2061848907864432, 0),
 (842053, 0.20577051054619025, 0),
 (328664, 0.19681714211214277, 0),
 (885417, 0.18730584922826227, 0),
 (258050, 0.18694204774614426, 0),
 (265838, 0.18636001993594442, 0),
 (823659, 0.17914503915353014, 0),
 (916743, 0.16711772076843115, 0),
 (28031, 0.15841269127530463, 0),
 (30091, 0.15362664195684067, 0)]