# Get recommendations for myfm models - FM
## Ranking data: ranking_data_random
- Run the full list of users in the ranking data (the number of ranking users is smaller than the number of test users because some duplicates were dropped)
- Load scripts:
    - Making features: make_features_myfm_ranking.py
    - Getting topN for a given user Id: topN_myfm_single.py

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn import metrics

import myfm

In [2]:
from make_features_myfm_ranking import * # Import custom script for building features

In [3]:
from topN_myfm_single import * 

In [4]:
# Run this cell once so that the relative "./..." paths used below resolve correctly
# Get the current working directory
import os
cwd = os.getcwd()
print("Working directory:", cwd)
# Go up one directory from the working directory.
# NOTE(review): this is not idempotent -- re-running the cell moves up one more level each time.
os.chdir("..")

Working directory: /home/jovyan/1_UT THESIS/CB12_MAIN/nb_recsys_myfm


In [5]:
import warnings
# Suppress all warnings for the remainder of the session to keep cell output readable.
warnings.filterwarnings('ignore')

In [6]:
# Load the cleaned job data
job_set = pd.read_csv("./data_processed/jobset_clean.csv")

# Load the cleaned interim tables produced in Step 3:
# user table, main dataset, and work history
user_set = pd.read_csv("./data_interim/user_set_cleaned.csv")
dataset = pd.read_csv("./data_interim/dataset_cleaned.csv")
work_history = pd.read_csv('./data_interim/work_history_cleaned.csv')


In [7]:
# Collect the user IDs belonging to each split, as labelled in user_set["Split"].
train_user = user_set.loc[user_set["Split"] == "Train", "UserID"].values
test_user = user_set.loc[user_set["Split"] == "Test", "UserID"].values

# Keep only the dataset rows whose UserID falls in the corresponding split.
train_data = dataset[dataset["UserID"].isin(train_user)]
test_data = dataset[dataset["UserID"].isin(test_user)]

In [8]:
# Load the user-side and job-side tables prepared for the FM models.
user_fm = pd.read_csv('./data_interim/users_fm.csv')
job_fm = pd.read_csv('./data_interim/jobs_fm.csv')

In [9]:
# Load the ranking data (the "random" variant, ranking_data_random.csv)
ranking_data = pd.read_csv('./nb_recsys_ebm/ranking_data_random.csv')

## Load the pre-trained FM model

In [10]:
%%time
import pickle

# Load the previously fitted FM model from disk.
# NOTE: unpickling can execute arbitrary code -- only load model files from a trusted source.
model_name = './output_myfm/fm.pikle'
# Use a context manager so the file handle is closed even if unpickling fails.
with open(model_name, "rb") as model_file:
    fm = pickle.load(model_file)

CPU times: user 9.49 s, sys: 41.4 s, total: 50.9 s
Wall time: 1min 16s


In [11]:
# Distinct user IDs present in the ranking data.
ranking_users = ranking_data.UserID.unique()

In [12]:
# Number of distinct users in the ranking data.
len(ranking_users)

3716

# Model: FM

In [13]:
%%time
# Map every ranking user to the top-20 list returned by topN_fm_simple for the FM model.
rec_result_fm = {
    user_id: topN_fm_simple(
        u_id=user_id,
        fm_type='fm',
        fm_model=fm,
        ranking_data=ranking_data,
        train_data=train_data,
        N=20,
    )
    for user_id in ranking_users
}

CPU times: user 3h 6min 31s, sys: 5min 39s, total: 3h 12min 10s
Wall time: 3h 11min 58s


In [14]:
# Sanity check: number of users that received a recommendation list.
len(rec_result_fm)

3716

In [15]:
# Inspect the recommendation list stored for user 13.
rec_result_fm[13]

[(638811, 0.7218407096228636, 1),
 (129969, 0.7168436040557893, 1),
 (489470, 0.6719174016253556, 1),
 (821691, 0.6715150121639518, 1),
 (1066465, 0.6646443960053428, 1),
 (561513, 0.6501749968889212, 1),
 (237243, 0.6401492302725701, 1),
 (75809, 0.6371026571800227, 1),
 (855212, 0.6293803811802222, 1),
 (904793, 0.6244986043398364, 1),
 (559170, 0.6207597155294439, 1),
 (350081, 0.6021001225905913, 1),
 (598128, 0.5999332015192613, 1),
 (123120, 0.5963917283239489, 1),
 (383666, 0.5936058631738987, 1),
 (472398, 0.591920651605492, 1),
 (1041876, 0.5911259070823139, 1),
 (482235, 0.5909327414755127, 1),
 (508898, 0.588569717394669, 1),
 (29419, 0.5820600912187534, 1)]

In [16]:
# Preview as a table: one column per user ID, one row per position in that user's list.
pd.DataFrame(rec_result_fm).head(2)

Unnamed: 0,13,514,681,767,883,1006,1066,1149,2520,2639,...,1464243,1464260,1470151,1470280,1470641,1470705,1470706,1470779,1471251,1471988
0,"(638811, 0.7218407096228636, 1)","(131166, 0.7029697745375464, 1)","(1013609, 0.7088708523552077, 1)","(491762, 0.739534449457711, 1)","(46854, 0.6627290482123902, 1)","(1012531, 0.6380059237802229, 1)","(709999, 0.7594332693792967, 1)","(807824, 0.755150274597153, 1)","(930936, 0.6516578858059391, 1)","(316774, 0.6609906188627939, 1)",...,"(917900, 0.7097199805552189, 1)","(906237, 0.7112793538140937, 1)","(768835, 0.6773767752988719, 1)","(510070, 0.7118771337694693, 1)","(44890, 0.6858127807611676, 1)","(813782, 0.7532165057301071, 1)","(994358, 0.6854168493510994, 1)","(597857, 0.7545975199742243, 1)","(232965, 0.7193165051457576, 1)","(1026607, 0.5963615966554502, 1)"
1,"(129969, 0.7168436040557893, 1)","(616523, 0.6814811579151427, 1)","(484750, 0.6930133079204106, 1)","(762663, 0.7375728884484928, 1)","(969100, 0.6370222503712079, 1)","(883643, 0.6356558472412309, 1)","(957610, 0.6722629214914195, 1)","(1040820, 0.66795410712108, 1)","(283997, 0.6361747767833328, 1)","(874575, 0.6547567054955663, 1)",...,"(301543, 0.6931450958890996, 1)","(581451, 0.7053104312511369, 1)","(327471, 0.6655367195267725, 1)","(674314, 0.6971531496058277, 1)","(694177, 0.6660196214188668, 1)","(163278, 0.7271595346481137, 1)","(616988, 0.6713456189871875, 1)","(780847, 0.6870924465278277, 1)","(1053341, 0.6936236883966593, 1)","(320656, 0.5945385723425424, 1)"


In [17]:
%%time
# Flatten the per-user recommendation lists into a single long-format table.
# get_rec_result_df is expected to return one DataFrame per user.
result_columns = ['JobID', 'Y_prob', 'Y_pred', 'UserID', 'rank']

# Build every per-user frame first and concatenate once: calling pd.concat inside
# the loop re-copies the accumulated table on each iteration (quadratic cost).
per_user_frames = [
    get_rec_result_df(u_id=u_id, rec_N=rec_result_fm[u_id])
    for u_id in ranking_users
]

# The leading empty frame pins the column order and keeps the result well-defined
# (an empty table with these columns) when there are no users at all.
final_rec_result = pd.concat(
    [pd.DataFrame(columns=result_columns), *per_user_frames]
)

CPU times: user 11.8 s, sys: 76.1 ms, total: 11.9 s
Wall time: 11.9 s


In [18]:
# Total number of recommendation rows across all users.
len(final_rec_result)

74320

# Export results

In [19]:
# Write the flattened recommendations to CSV, with a header row and without the DataFrame index.
final_rec_result.to_csv('./output_topN_myfm/rec20_fm_random.csv', header=True, index=False)

In [20]:
# Export rec result dictionary
import gzip, pickle, pickletools

filepath = "./nb_recsys_myfm/rec_result_fm_random.pikle"
with gzip.open(filepath, "wb") as f:
    pickled = pickle.dumps(rec_result_fm)
    optimized_pickle = pickletools.optimize(pickled)
    f.write(optimized_pickle)

In [21]:
%%time
# Read the gzip-compressed pickle back to verify the export round-trips.
with gzip.open(filepath, 'rb') as gz_in:
    loaded_rec_result = pickle.load(gz_in)

CPU times: user 51.7 ms, sys: 4.01 ms, total: 55.7 ms
Wall time: 54.7 ms


In [22]:
# Spot-check the reloaded dictionary using the list stored for user 13.
loaded_rec_result[13]

[(638811, 0.7218407096228636, 1),
 (129969, 0.7168436040557893, 1),
 (489470, 0.6719174016253556, 1),
 (821691, 0.6715150121639518, 1),
 (1066465, 0.6646443960053428, 1),
 (561513, 0.6501749968889212, 1),
 (237243, 0.6401492302725701, 1),
 (75809, 0.6371026571800227, 1),
 (855212, 0.6293803811802222, 1),
 (904793, 0.6244986043398364, 1),
 (559170, 0.6207597155294439, 1),
 (350081, 0.6021001225905913, 1),
 (598128, 0.5999332015192613, 1),
 (123120, 0.5963917283239489, 1),
 (383666, 0.5936058631738987, 1),
 (472398, 0.591920651605492, 1),
 (1041876, 0.5911259070823139, 1),
 (482235, 0.5909327414755127, 1),
 (508898, 0.588569717394669, 1),
 (29419, 0.5820600912187534, 1)]