# Get recommendations for myfm models - FM
## Ranking data: ranking_data_knn_lda_v2.csv
- Run the full list of users in the ranking data (the number of ranking users is smaller than the number of test users because some duplicates were dropped)
- Load scripts:
    - Making features: make_features_myfm_ranking.py
    - Getting topN for a given user Id: topN_myfm_single.py

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn import metrics

import myfm

In [2]:
from make_features_myfm_ranking import * # Import custom script for building features

In [3]:
from topN_myfm_single import * 

In [4]:
# Reproducibility setup: the relative paths used in this notebook resolve
# from the parent of the notebook's folder, so move up one level first.
import os

cwd = os.getcwd()  # directory the kernel was started in
print("Working directory:", cwd)

# NOTE: not idempotent - re-running this cell climbs one more level.
os.chdir(os.pardir)

Working directory: /home/jovyan/1_UT THESIS/CB12_MAIN/nb_recsys_myfm


In [5]:
import warnings

# Silence every warning category for the rest of the session.
warnings.filterwarnings(action='ignore')

In [6]:
# Cleaned job postings
job_set = pd.read_csv("./data_processed/jobset_clean.csv")

# Interim tables from Step 3, read in the order: users, interactions, work history
user_set, dataset, work_history = map(
    pd.read_csv,
    (
        "./data_interim/user_set_cleaned.csv",
        "./data_interim/dataset_cleaned.csv",
        './data_interim/work_history_cleaned.csv',
    ),
)


In [7]:
# User IDs on each side of the predefined Train/Test split
train_user = user_set.loc[user_set["Split"] == "Train", "UserID"].values
test_user = user_set.loc[user_set["Split"] == "Test", "UserID"].values

# Rows of the dataset belonging to each group of users
train_data = dataset[dataset["UserID"].isin(train_user)]
test_data = dataset[dataset["UserID"].isin(test_user)]

In [8]:
# User-side and job-side tables (presumably the FM feature inputs - confirm)
user_fm, job_fm = (
    pd.read_csv(csv_path)
    for csv_path in ('./data_interim/users_fm.csv', './data_interim/jobs_fm.csv')
)

In [9]:
# Load the ranking data (v2 file, generated from KNN-LDA features).
# Each user's top-N jobs are later selected via topN_fm_simple using this table.
ranking_data = pd.read_csv('./nb_recsys_ebm/ranking_data_knn_lda_v2.csv')

## Load the pre-trained FM model

In [11]:
%%time
import pickle
model_name = './output_myfm/fm.pikle'
fm = pickle.load(open(model_name, "rb"))

CPU times: user 8.24 s, sys: 43.7 s, total: 52 s
Wall time: 1min 46s


In [12]:
# Distinct user IDs present in the ranking data
ranking_users = ranking_data["UserID"].unique()

In [13]:
# Number of distinct users found in the ranking data
len(ranking_users)

3691

# Model: FM

In [14]:
%%time
rec_result_fm = {}
for u_id in ranking_users:
    # print('UserID:', u_id)
    rec_20 = topN_fm_simple(u_id = u_id, 
                          fm_type = 'fm', 
                          fm_model = fm, 
                          ranking_data = ranking_data, 
                          train_data = train_data, 
                          N=20)
    rec_result_fm[u_id] = rec_20

CPU times: user 2h 59min 10s, sys: 5min 20s, total: 3h 4min 31s
Wall time: 3h 4min 20s


In [15]:
# Sanity check: the dictionary holds one entry per user ID in ranking_users
len(rec_result_fm)

3691

In [16]:
# Inspect the recommendation list stored for user 13
# (tuples are presumably (JobID, Y_prob, Y_pred) - confirm against topN_fm_simple)
rec_result_fm[13]

[(328664, 0.7719287811634643, 1),
 (916743, 0.7518047581141348, 1),
 (164436, 0.7407665836999285, 1),
 (997257, 0.7291420201749421, 1),
 (11974, 0.7122105892249021, 1),
 (230268, 0.7083941469402446, 1),
 (1116150, 0.708004811606941, 1),
 (1057569, 0.707714613129258, 1),
 (568507, 0.7070080639231877, 1),
 (601850, 0.7044830107415203, 1),
 (764500, 0.7013997445525804, 1),
 (919556, 0.7008988070251321, 1),
 (842053, 0.6644481093987531, 1),
 (265838, 0.6537678265057497, 1),
 (296344, 0.6424239979466447, 1),
 (28031, 0.6324183180231109, 1),
 (602913, 0.6323106372773917, 1),
 (931979, 0.6309755829085892, 1),
 (614701, 0.6271885520928864, 1),
 (1061072, 0.6243355641671254, 1)]

In [17]:
# Preview the dictionary as a table: one column per user ID, one row per list position
pd.DataFrame(rec_result_fm).head(2)

Unnamed: 0,13,514,681,767,883,1006,1066,1149,2520,2639,...,1463776,1464243,1464260,1470280,1470641,1470705,1470706,1470779,1471251,1471988
0,"(328664, 0.7719287811634643, 1)","(1076104, 0.8122548600882044, 1)","(996746, 0.7629624469886238, 1)","(151832, 0.8017387196552688, 1)","(490779, 0.7825699531053649, 1)","(148525, 0.7772448023462882, 1)","(163278, 0.7271595346481137, 1)","(344145, 0.76809187605935, 1)","(790782, 0.7575044072364716, 1)","(837469, 0.6899364323382742, 1)",...,"(175748, 0.7142357294336719, 1)","(751378, 0.7825850361382813, 1)","(806524, 0.733210532386134, 1)","(171597, 0.7598345990491284, 1)","(808822, 0.7544599678247991, 1)","(277265, 0.7250443611278037, 1)","(872952, 0.8118134515771996, 1)","(8374, 0.7458242904039797, 1)","(800836, 0.8080563969783542, 1)","(140160, 0.7029341969490728, 1)"
1,"(916743, 0.7518047581141348, 1)","(755161, 0.7476940133393549, 1)","(551679, 0.7322549421616368, 1)","(344145, 0.76809187605935, 1)","(1041638, 0.7396966629966971, 1)","(14333, 0.7635368702147671, 1)","(431912, 0.6038793975588058, 1)","(613479, 0.7467525703013174, 1)","(16123, 0.7192079192942163, 1)","(245476, 0.676548377417714, 1)",...,"(632928, 0.6962002517929343, 1)","(1001870, 0.6455799715453978, 1)","(653698, 0.7042287624727274, 1)","(850643, 0.7566263889923929, 1)","(1025396, 0.7270482702423571, 1)","(62836, 0.7198339096409573, 1)","(394969, 0.7994897018278883, 1)","(324196, 0.7427217594382906, 1)","(157903, 0.7924046526961529, 1)","(514728, 0.6847574407628291, 1)"


In [18]:
%%time
final_rec_result = pd.DataFrame(columns = ['JobID', 'Y_prob', 'Y_pred','UserID','rank'])

for u_id in ranking_users:
    
    temp_df = get_rec_result_df(u_id=u_id, 
                                rec_N=rec_result_fm[u_id])
    # print(u_id, len(temp_df))
    final_rec_result = pd.concat([final_rec_result, temp_df])

# final_rec_result = final_rec_result[['UserID','JobID', 'Y_prob', 'Y_pred', 'rank']]

CPU times: user 11.6 s, sys: 91.7 ms, total: 11.7 s
Wall time: 11.7 s


In [19]:
# Total number of recommendation rows across all users
len(final_rec_result)

73820

# Export results

In [20]:
# Write the long-format recommendations to CSV (header row kept, index dropped)
final_rec_result.to_csv('./output_topN_myfm/rec20_fm_knn.csv', index=False)

In [21]:
# Persist the recommendation dictionary as a gzip-compressed, optimized pickle
import gzip, pickle, pickletools

filepath = "./nb_recsys_myfm/rec_result_fm_knn.pikle"
with gzip.open(filepath, "wb") as f:
    # pickletools.optimize rewrites the pickle bytes into a more compact form
    f.write(pickletools.optimize(pickle.dumps(rec_result_fm)))

In [22]:
%%time
with gzip.open(filepath, 'rb') as f:
    p = pickle.Unpickler(f)
    loaded_rec_result = p.load()

CPU times: user 39.2 ms, sys: 15.8 ms, total: 55.1 ms
Wall time: 53.6 ms


In [23]:
# Spot-check the reloaded data: this should match rec_result_fm[13]
loaded_rec_result[13]

[(328664, 0.7719287811634643, 1),
 (916743, 0.7518047581141348, 1),
 (164436, 0.7407665836999285, 1),
 (997257, 0.7291420201749421, 1),
 (11974, 0.7122105892249021, 1),
 (230268, 0.7083941469402446, 1),
 (1116150, 0.708004811606941, 1),
 (1057569, 0.707714613129258, 1),
 (568507, 0.7070080639231877, 1),
 (601850, 0.7044830107415203, 1),
 (764500, 0.7013997445525804, 1),
 (919556, 0.7008988070251321, 1),
 (842053, 0.6644481093987531, 1),
 (265838, 0.6537678265057497, 1),
 (296344, 0.6424239979466447, 1),
 (28031, 0.6324183180231109, 1),
 (602913, 0.6323106372773917, 1),
 (931979, 0.6309755829085892, 1),
 (614701, 0.6271885520928864, 1),
 (1061072, 0.6243355641671254, 1)]