# Get recommendations for myfm models - FM_MATCH
## Ranking data: ranking_data_random
- Run the full list of users in the ranking data (number of users < number of test users, because some duplicates were dropped)
- Load scripts:
    - Making features: make_features_myfm_ranking.py
    - Getting topN for a given user Id: topN_myfm_single.py

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn import metrics

import myfm

In [2]:
from make_features_myfm_ranking import * # Import custom script for building features

In [3]:
from topN_myfm_single import * 

In [4]:
# Reproducibility setup: the relative paths used below ("./data_interim/...",
# "./output_myfm/...") are resolved from the parent of the directory the
# notebook runs in, so step up one level before loading anything.
# NOTE: the change is relative -- re-running this cell moves up one more
# level each time.
import os

cwd = os.getcwd()  # directory the kernel is currently in
print("Working directory:", cwd)
os.chdir(os.pardir)  # os.pardir is ".."

Working directory: /home/jovyan/1_UT THESIS/CB12_MAIN/nb_recsys_myfm


In [5]:
import warnings

# Install a blanket "ignore" filter: every warning category is silenced from
# this point on.
warnings.simplefilter('ignore')

In [6]:
# Cleaned job data.
job_set = pd.read_csv('./data_processed/jobset_clean.csv')

# Interim tables from Step 3: work history, the main dataset and the user set.
work_history = pd.read_csv('./data_interim/work_history_cleaned.csv')
dataset = pd.read_csv('./data_interim/dataset_cleaned.csv')
user_set = pd.read_csv('./data_interim/user_set_cleaned.csv')


In [7]:
# User IDs on each side of the split, taken from the "Split" column of the
# user table.
split_flag = user_set["Split"]
train_user = user_set.loc[split_flag == "Train", "UserID"].values
test_user = user_set.loc[split_flag == "Test", "UserID"].values

# Rows of the dataset that belong to train users / test users.
in_train = dataset["UserID"].isin(train_user)
in_test = dataset["UserID"].isin(test_user)
train_data = dataset[in_train]
test_data = dataset[in_test]

In [8]:
# FM tables for jobs and users (presumably the pre-built feature tables --
# confirm against make_features_myfm_ranking).
job_fm = pd.read_csv("./data_interim/jobs_fm.csv")
user_fm = pd.read_csv("./data_interim/users_fm.csv")

In [9]:
# Ranking data, "random" variant (stored in the EBM notebook folder).
ranking_data = pd.read_csv("./nb_recsys_ebm/ranking_data_random.csv")

## Load pre-trained FM models

In [10]:
%%time
import pickle

# Load the pickled FM_MATCH model.
# NOTE: unpickling can execute arbitrary code -- only load model files that
# you produced yourself or otherwise trust.
model_name = './output_myfm/fm_match.pikle'
# Use a context manager so the file handle is closed as soon as the model is
# read, instead of leaking it for the rest of the session.
with open(model_name, "rb") as model_file:
    fm_match = pickle.load(model_file)

CPU times: user 7.26 s, sys: 35.9 s, total: 43.2 s
Wall time: 1min 9s


In [11]:
# Distinct user IDs present in the ranking data, in order of first appearance.
ranking_users = ranking_data["UserID"].unique()

In [12]:
# Number of distinct users that will be scored.
len(ranking_users)

3716

# Model: FM_match

In [13]:
%%time
# Run topN_fm_simple (N=20) with the FM_MATCH model for every ranking user.
# Results are stored per user as soon as they are produced, keyed by UserID.
shared_args = dict(
    fm_type='fm_match',
    fm_model=fm_match,
    ranking_data=ranking_data,
    train_data=train_data,
    N=20,
)
rec_result_match = {}
for u_id in ranking_users:
    rec_result_match[u_id] = topN_fm_simple(u_id=u_id, **shared_args)

CPU times: user 3h 12min 1s, sys: 5min 32s, total: 3h 17min 34s
Wall time: 3h 17min 20s


In [14]:
%%time
# Flatten the per-user recommendation lists into one long table.
# The per-user frames are collected first and concatenated once: calling
# pd.concat inside the loop re-copies the accumulated frame on every
# iteration, which grows quadratically with the number of users.
rec_columns = ['JobID', 'Y_prob', 'Y_pred', 'UserID', 'rank']
user_frames = [
    get_rec_result_df(u_id=u_id, rec_N=rec_result_match[u_id])
    for u_id in ranking_users
]
# The empty, column-only frame goes first so that the column order -- and the
# result when there are no users at all -- is the same as with the
# incremental concatenation.
final_rec_result_match = pd.concat(
    [pd.DataFrame(columns=rec_columns), *user_frames]
)

# final_rec_result = final_rec_result[['UserID','JobID', 'Y_prob', 'Y_pred', 'rank']]

CPU times: user 11.7 s, sys: 120 ms, total: 11.8 s
Wall time: 11.8 s


In [15]:
# Spot-check: the recommendation list stored for UserID 13.
# Entries look like (JobID, Y_prob, Y_pred) -- TODO confirm against topN_fm_simple.
rec_result_match[13]

[(821691, 0.9443351192244838, 1),
 (508898, 0.6011482223662387, 1),
 (719969, 0.5761096290825414, 1),
 (145443, 0.4056479223542181, 0),
 (855212, 0.3687416980379906, 0),
 (497733, 0.3682171379841186, 0),
 (123120, 0.3651509819564898, 0),
 (524122, 0.36356930679986377, 0),
 (350081, 0.35692296239694815, 0),
 (561513, 0.3552939034391221, 0),
 (598128, 0.3509139494149052, 0),
 (75809, 0.3509051225601515, 0),
 (411244, 0.35079877270034804, 0),
 (93235, 0.3499486601332794, 0),
 (29419, 0.3499173994458538, 0),
 (324559, 0.34874641601731987, 0),
 (1041876, 0.34828179458731146, 0),
 (482235, 0.34826291978908785, 0),
 (237243, 0.3476500026986336, 0),
 (904793, 0.3464156930011535, 0)]

# Export results

In [16]:
# Write the flat recommendation table to CSV (header row, no index column).
final_rec_result_match.to_csv(
    './output_topN_myfm/rec20_fm_match_random.csv',
    index=False,
    header=True,
)

In [17]:
# Export the per-user recommendation dictionary as a gzip-compressed,
# size-optimized pickle.
import gzip
import pickle
import pickletools

# The path stays in `filepath` because it is reused to read the file back.
filepath = "./nb_recsys_myfm/rec_result_fm_match_random.pikle"
with gzip.open(filepath, "wb") as gz_out:
    gz_out.write(pickletools.optimize(pickle.dumps(rec_result_match)))

In [18]:
%%time
# Round-trip check: read the gzip-compressed pickle back from disk.
with gzip.open(filepath, 'rb') as gz_in:
    loaded_rec_result = pickle.load(gz_in)

CPU times: user 46.5 ms, sys: 8.09 ms, total: 54.6 ms
Wall time: 53.1 ms


In [19]:
# Spot-check the reloaded dictionary with the same key (13) that was inspected
# on rec_result_match before the export.
loaded_rec_result[13]

[(821691, 0.9443351192244838, 1),
 (508898, 0.6011482223662387, 1),
 (719969, 0.5761096290825414, 1),
 (145443, 0.4056479223542181, 0),
 (855212, 0.3687416980379906, 0),
 (497733, 0.3682171379841186, 0),
 (123120, 0.3651509819564898, 0),
 (524122, 0.36356930679986377, 0),
 (350081, 0.35692296239694815, 0),
 (561513, 0.3552939034391221, 0),
 (598128, 0.3509139494149052, 0),
 (75809, 0.3509051225601515, 0),
 (411244, 0.35079877270034804, 0),
 (93235, 0.3499486601332794, 0),
 (29419, 0.3499173994458538, 0),
 (324559, 0.34874641601731987, 0),
 (1041876, 0.34828179458731146, 0),
 (482235, 0.34826291978908785, 0),
 (237243, 0.3476500026986336, 0),
 (904793, 0.3464156930011535, 0)]