# Get recommendations for myfm models - FM_EXTENDED
## Ranking data: ranking_data_random
- Full test users
- Load scripts:
    - Making features: make_features_myfm_ranking.py
    - Getting the top-N recommendations for a given user ID: topN_myfm_single.py

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn import metrics

import myfm

In [2]:
from make_features_myfm_ranking import *  # Import the custom script for building features

In [3]:
from topN_myfm_single import * 

In [4]:
# Run this cell once to reproduce the results: the relative paths used in the
# cells below are resolved from the parent of the notebook's directory.
import os

# Remember and report the directory the notebook was started from.
cwd = os.getcwd()
print(f"Working directory: {cwd}")

# Step up one level. Re-running this cell moves up one more directory each time.
os.chdir("..")

Working directory: /home/jovyan/1_UT THESIS/CB12_MAIN/nb_recsys_myfm


In [5]:
import warnings

# Silence every warning category for the rest of the session.
warnings.filterwarnings(action="ignore")

In [6]:
# Cleaned job table.
job_set = pd.read_csv("./data_processed/jobset_clean.csv")

# Interim tables (the original note calls these the output of "Step 3"),
# read in the same order as before: users, dataset, work history.
user_set, dataset, work_history = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./data_interim/user_set_cleaned.csv",
        "./data_interim/dataset_cleaned.csv",
        "./data_interim/work_history_cleaned.csv",
    )
)


In [7]:
# User IDs of each split, as flagged by the 'Split' column of the user table.
train_user = user_set.loc[user_set["Split"] == "Train", "UserID"].values
test_user = user_set.loc[user_set["Split"] == "Test", "UserID"].values

# Partition the rows of `dataset` by the split their user belongs to.
train_data = dataset[dataset["UserID"].isin(train_user)]
test_data = dataset[dataset["UserID"].isin(test_user)]

In [8]:
# User-side and job-side tables for the FM model, read in that order.
user_fm, job_fm = (
    pd.read_csv(csv_path)
    for csv_path in ("./data_interim/users_fm.csv", "./data_interim/jobs_fm.csv")
)

In [10]:
# Ranking data that was generated from the KNN-LDA features.
ranking_data_knn = pd.read_csv(
    "./nb_recsys_ebm/ranking_data_knn_lda_v2.csv"
)

In [11]:
# Distinct user IDs that occur in the KNN-LDA ranking data.
rec_users = ranking_data_knn["UserID"].unique()

In [12]:
# Randomly generated ranking data.
ranking_data_random = pd.read_csv(
    "./nb_recsys_ebm/ranking_data_random.csv"
)

In [13]:
# Keep only the random-ranking rows whose user also occurs in the KNN-LDA data.
has_knn_ranking = ranking_data_random["UserID"].isin(rec_users)
ranking_data = ranking_data_random[has_knn_ranking]

In [14]:
# Number of distinct users in the KNN-LDA ranking data.
len(rec_users)

3691

In [15]:
# Number of distinct users left in the filtered random ranking data; this
# should match len(rec_users) if every one of those users has random candidates.
len(ranking_data.UserID.unique())

3691

## Load the pre-trained FM model

In [16]:
%%time
import gzip, pickle, pickletools
# Gzip-compressed pickle that holds the pre-trained FM_EXTENDED model.
filepath = "./output_myfm/fm_extended.pikle"

# SECURITY: unpickling can execute arbitrary code embedded in the file, so only
# load artifacts produced by this project; never point this at untrusted data.
with gzip.open(filepath, 'rb') as f:
    # pickle.load(f) is the documented shorthand for pickle.Unpickler(f).load(),
    # which avoids keeping a throwaway Unpickler object around.
    fm_extended = pickle.load(f)

CPU times: user 1min 17s, sys: 25.8 s, total: 1min 43s
Wall time: 1min 43s


In [17]:
# Distinct user IDs to produce recommendations for.
ranking_users = ranking_data["UserID"].unique()

In [18]:
# Number of users that the recommendation loop below will iterate over.
len(ranking_users)

3691

In [19]:
# Drop the 'Split' column before building features for the extended model.
# The frame is still modified in place. errors='ignore' makes the cell safe to
# re-run: once the column is gone, a second call would otherwise raise KeyError.
# The redundant axis=1 is omitted because columns= already selects the axis.
user_fm.drop(columns=['Split'], inplace=True, errors='ignore')

# Model: FM_extended

In [20]:
%%time
# Arguments that are identical for every user are gathered once.
topn_shared_args = dict(
    fm_type='fm_extended',
    fm_model=fm_extended,
    ranking_data=ranking_data,
    train_data=train_data,
    user_fm=user_fm,
    job_fm=job_fm,
    N=20,
)

# Maps each user ID to whatever topN_fm_extend returns for that user (N=20).
# A plain loop is kept so that entries computed so far remain in the dict if
# this long-running cell is interrupted.
rec_result_fm_extended = {}
for u_id in ranking_users:
    rec_result_fm_extended[u_id] = topN_fm_extend(u_id=u_id, **topn_shared_args)

CPU times: user 3h 35min 15s, sys: 5min 17s, total: 3h 40min 32s
Wall time: 3h 40min 16s


In [21]:
# Export rec result dictionary
import gzip, pickle, pickletools

# Destination of the gzip-compressed pickle holding the per-user results.
filepath = "./nb_recsys_myfm/rec_result_fm_extended_random.pikle"

with gzip.open(filepath, "wb") as gz_out:
    # Serialize, pass the bytes through pickletools.optimize, then write the
    # result into the gzip stream.
    gz_out.write(pickletools.optimize(pickle.dumps(rec_result_fm_extended)))

In [22]:
%%time
# Column layout of the combined table. The concat is seeded with an empty frame
# of this shape, exactly as the original accumulator was, so the result keeps
# this column order and is still a (empty) table when there are no users.
result_columns = ['JobID', 'Y_prob', 'Y_pred', 'UserID', 'rank']
per_user_frames = [pd.DataFrame(columns=result_columns)]

# Collect one frame per user from get_rec_result_df.
for u_id in ranking_users:
    per_user_frames.append(
        get_rec_result_df(u_id=u_id, rec_N=rec_result_fm_extended[u_id])
    )

# Concatenate once at the end: calling pd.concat inside the loop re-copies all
# rows accumulated so far on every iteration, which is quadratic in the number
# of users. Row indices are left as produced by the per-user frames.
final_rec_result_extended = pd.concat(per_user_frames)

CPU times: user 11.7 s, sys: 67.9 ms, total: 11.8 s
Wall time: 11.8 s


In [23]:
# Display the combined recommendation table for a visual check.
final_rec_result_extended

Unnamed: 0,JobID,Y_prob,Y_pred,UserID,rank
0,821691,0.957912,1,13,0
1,508898,0.624375,1,13,1
2,322183,0.555795,1,13,2
3,501864,0.555795,1,13,3
4,890398,0.547108,1,13,4
...,...,...,...,...,...
15,139808,0.465831,0,1471988,15
16,288076,0.465831,0,1471988,16
17,383634,0.465831,0,1471988,17
18,917683,0.463999,0,1471988,18


# Export results

In [24]:
# Write the combined table to CSV with a header row; the row index is not written.
final_rec_result_extended.to_csv(
    './output_topN_myfm/rec20_fm_extended_random.csv',
    index=False,
    header=True,
)

In [None]:
# Scratch cell at the end of the notebook; it only prints a fixed message.
print("hello world")