# Get recommendations for myfm models: FM_SIDE_INFO
## Ranking data: ranking_data_knn_lda_v2.csv
- Run on the full list of users in the ranking data (#users < #test_users because some duplicates were dropped)
- Load scripts:
    - Making features: make_features_myfm_ranking.py
    - Getting topN for a given user Id: topN_myfm_single.py

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn import metrics

import myfm

In [2]:
from make_features_myfm_ranking import * # Import custom script for building features

In [3]:
from topN_myfm_single import * 

In [4]:
# Run this cell to reproduce the results: it re-roots the session one level
# above the notebook folder so the relative "./..." paths below resolve.
# NOTE: each execution moves up one more level, so run it once per kernel.
import os

cwd = os.getcwd()  # directory the kernel was started in
print("Working directory:", cwd)
os.chdir(os.pardir)  # os.pardir is the parent-directory token ("..")

Working directory: /home/jovyan/1_UT THESIS/CB12_MAIN/nb_recsys_myfm


In [5]:
# Install a blanket "ignore" filter so warnings do not clutter cell output.
import warnings

warnings.simplefilter('ignore')

In [6]:
# Cleaned job data.
job_set = pd.read_csv("./data_processed/jobset_clean.csv")

# Cleaned interim tables from Step 3: user set, dataset and work history.
user_set, dataset, work_history = map(
    pd.read_csv,
    (
        "./data_interim/user_set_cleaned.csv",
        "./data_interim/dataset_cleaned.csv",
        './data_interim/work_history_cleaned.csv',
    ),
)


In [7]:
# Partition users by their pre-assigned Split label, then select the dataset
# rows that belong to each group.
is_train_user = user_set["Split"] == "Train"
is_test_user = user_set["Split"] == "Test"
train_user = user_set.loc[is_train_user, "UserID"].values
test_user = user_set.loc[is_test_user, "UserID"].values

train_data = dataset[dataset["UserID"].isin(train_user)]
test_data = dataset[dataset["UserID"].isin(test_user)]

In [8]:
# User and job tables that are later passed to topN_fm_extend as user_fm / job_fm.
user_fm, job_fm = (
    pd.read_csv(csv_path)
    for csv_path in ('./data_interim/users_fm.csv', './data_interim/jobs_fm.csv')
)

In [9]:
# Ranking data generated from the KNN-LDA features.
ranking_csv = './nb_recsys_ebm/ranking_data_knn_lda_v2.csv'
ranking_data = pd.read_csv(ranking_csv)

## Load pre-trained FM model

In [12]:
%%time
import gzip
import pickle
import pickletools

# The pre-trained model is stored as a gzip-compressed pickle.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
filepath = "./output_myfm/fm_side_info.pikle"
with gzip.open(filepath, 'rb') as f:
    fm_side_info = pickle.load(f)

CPU times: user 1min 24s, sys: 35.4 s, total: 1min 59s
Wall time: 2min 44s


In [13]:
# Distinct user IDs present in the ranking data, in order of first appearance.
ranking_users = ranking_data["UserID"].unique()

In [14]:
# Number of distinct users that will be scored (3691 in the recorded run).
len(ranking_users)

3691

# Model: FM_side_info

In [15]:
# Drop the 'Split' column before running the feature builder for the extended model.
# errors='ignore' keeps this cell re-runnable: without it, a second execution
# raises KeyError because the column has already been removed in place.
user_fm.drop(columns=['Split'], inplace=True, errors='ignore')

In [16]:
%%time
# Ask the side-info FM model for N=20 recommendations per ranking user.
# Arguments that are the same for every user are gathered once up front.
shared_args = dict(
    fm_type='fm_side_info',
    fm_model=fm_side_info,
    ranking_data=ranking_data,
    train_data=train_data,
    user_fm=user_fm,
    job_fm=job_fm,
    N=20,
)

# Results are stored user by user, so an interrupted run keeps what it finished.
rec_result_fm_side_info = {}
for u_id in ranking_users:
    rec_20 = topN_fm_extend(u_id=u_id, **shared_args)
    rec_result_fm_side_info[u_id] = rec_20

CPU times: user 3h 10min 5s, sys: 4min 53s, total: 3h 14min 59s
Wall time: 3h 14min 44s


In [17]:
# Persist the recommendation dictionary as a gzip-compressed, optimized pickle.
import gzip
import pickle
import pickletools

filepath = "./nb_recsys_myfm/rec_result_fm_side_info_knn.pikle"
with gzip.open(filepath, "wb") as f:
    # pickletools.optimize strips unused PUT opcodes to shrink the payload.
    f.write(pickletools.optimize(pickle.dumps(rec_result_fm_side_info)))

In [18]:
%%time
# Flatten the per-user recommendations into one long table.
# The frames are collected first and concatenated once. Calling pd.concat inside
# the loop re-copies the accumulated table on every iteration (quadratic work),
# and seeding the result with an empty frame relies on concat dtype behaviour
# that pandas has deprecated.
result_columns = ['JobID', 'Y_prob', 'Y_pred', 'UserID', 'rank']

per_user_frames = [
    get_rec_result_df(u_id=u_id, rec_N=rec_result_fm_side_info[u_id])
    for u_id in ranking_users
]

if per_user_frames:
    final_rec_result_side_info = pd.concat(per_user_frames)
    # Keep the expected columns first, in this order, followed by any extras,
    # matching the layout the empty seed frame used to impose.
    extra_columns = [
        col for col in final_rec_result_side_info.columns
        if col not in result_columns
    ]
    final_rec_result_side_info = final_rec_result_side_info.reindex(
        columns=result_columns + extra_columns
    )
else:
    # No users to process: fall back to an empty table with the expected columns.
    final_rec_result_side_info = pd.DataFrame(columns=result_columns)

CPU times: user 11.5 s, sys: 64.1 ms, total: 11.6 s
Wall time: 11.6 s


In [19]:
# Display the combined recommendation table built in the previous cell.
final_rec_result_side_info

Unnamed: 0,JobID,Y_prob,Y_pred,UserID,rank
0,1116150,0.918094,1,13,0
1,328664,0.912786,1,13,1
2,164436,0.893532,1,13,2
3,997257,0.879731,1,13,3
4,11974,0.867149,1,13,4
...,...,...,...,...,...
15,732410,0.518977,1,1471988,15
16,157325,0.506350,1,1471988,16
17,581928,0.497092,0,1471988,17
18,130608,0.469354,0,1471988,18


# Export results

In [20]:
# Write the table without the index; the header row is written by default.
output_csv = './output_topN_myfm/rec20_fm_side_info_knn.csv'
final_rec_result_side_info.to_csv(output_csv, index=False)