In [5]:
# Quiet TensorFlow / OpenMP console output.
# These environment variables have to be in place *before* TensorFlow is first
# imported; setting them afterwards is too late for messages emitted while the
# native libraries load. libreco is imported after them for the same reason
# (it is a TensorFlow-based library -- presumably it imports TensorFlow itself).
import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ["KMP_WARNINGS"] = "FALSE"

import numpy as np
import pandas as pd
import tensorflow as tf
from libreco.data import split_by_ratio_chrono, DatasetFeat
from libreco.algorithms import YouTubeRanking

# Python-side TensorFlow logger: only report errors.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

In [53]:
# Read the tab-separated job-application table into a DataFrame.
data = pd.read_csv("../../data/kaggle-job-recommendation/apps_with_item_user_data.tsv", sep="\t")

In [54]:
# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0,UserID,JobID,Label,WindowID,Title,Popularity,DegreeType,Major,WorkHistoryCount,TotalYearsExperience,ManagedHowMany
0,47,169528,1,1,"Resort Host/Marketing Coordinator - Anaheim, CA",2,High School,,3,10.0,0
1,47,284009,1,1,Administrative Assistant,43,High School,,3,10.0,0
2,47,2121,1,1,MEDICAL- FRONT OFFICE,20,High School,,3,10.0,0
3,47,848187,1,1,Administrative Assistant,51,High School,,3,10.0,0
4,47,733748,1,1,Administrative Assistant,7,High School,,3,10.0,0


In [55]:
def split_train_test_by_windowid(df, test_size = 0.2, random_state = None):
    """Split ``df`` into train/test parts, sampling the test rows per WindowID.

    For every distinct value of the ``WindowID`` column a fraction
    ``test_size`` of that window's rows is drawn at random for the test set;
    all remaining rows form the training set.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``WindowID`` column. The index is assumed to be unique,
        because test rows are removed from ``df`` by index label.
    test_size : float, default 0.2
        Fraction of each window's rows to place in the test set.
    random_state : int or numpy RandomState/Generator, optional
        Forwarded to ``DataFrame.sample`` so the split can be reproduced.
        ``None`` (default) keeps the split random on every call.

    Returns
    -------
    (train_df, test_df) : tuple of pandas.DataFrame
    """
    # One groupby pass; sort=False keeps windows in order of first appearance.
    sampled_parts = [
        window_rows.sample(frac = test_size, random_state = random_state)
        for _, window_rows in df.groupby('WindowID', sort = False)
    ]

    # pd.concat rejects an empty list, so an empty input yields two empty frames.
    if not sampled_parts:
        return df.copy(), df.iloc[0:0].copy()

    # Collect the samples of *all* windows. DataFrame.append returns a new
    # frame (and no longer exists in pandas 2.x), so its result must not be
    # discarded -- concatenating once is both correct and cheaper.
    test_df = pd.concat(sampled_parts)
    train_df = df.drop(test_df.index)
    return train_df, test_df

In [56]:
# Map the Kaggle column names onto the names used from here on:
# "user", "item" and "label".
# NOTE (from the original author): the dataset builder throws a ValueError
# when 'user' and 'item' aren't the first two columns of the data.
data.rename(
    columns={"UserID": "user", "JobID": "item", "Label": "label"},
    inplace=True,
)

In [63]:
# Missing values in the numeric user features become 0, so the columns
# contain no NaN afterwards.
for count_col in ("WorkHistoryCount", "TotalYearsExperience", "ManagedHowMany"):
    data[count_col] = data[count_col].fillna(0)

In [64]:
# Pin an explicit dtype for every column used downstream.
# NOTE(review): casting to str may turn a missing value into the literal text
# "nan" (pandas-version dependent), and casting a float column to int drops
# any fractional part -- confirm both are acceptable here.
data = data.astype({
    **dict.fromkeys(
        [
            "user", "item", "label", "WindowID", "Popularity",
            "WorkHistoryCount", "TotalYearsExperience", "ManagedHowMany",
        ],
        int,
    ),
    **dict.fromkeys(["Title", "DegreeType", "Major"], str),
})

In [65]:
# Hold out part of the rows as the test set; the rest is used for training.
train_df, test_df = split_train_test_by_windowid(data, test_size=0.2)

In [66]:
# Feature columns grouped two ways: by kind (sparse vs. dense) and by
# owner (user vs. item). Every feature appears once in each grouping.
sparse_col = [
    "Title",
    "DegreeType",
    "Major",
]
dense_col = [
    "Popularity",
    "WorkHistoryCount",
    "TotalYearsExperience",
    "ManagedHowMany",
]
user_col = [
    "DegreeType",
    "Major",
    "WorkHistoryCount",
    "TotalYearsExperience",
    "ManagedHowMany",
]
item_col = [
    "Title",
    "Popularity",
]



In [67]:
# Preview the training rows; index labels missing from this frame were
# moved to test_df by the split.
train_df.head()

Unnamed: 0,user,item,label,WindowID,Title,Popularity,DegreeType,Major,WorkHistoryCount,TotalYearsExperience,ManagedHowMany
1,47,284009,1,1,Administrative Assistant,43,High School,,3,10,0
2,47,2121,1,1,MEDICAL- FRONT OFFICE,20,High School,,3,10,0
4,47,733748,1,1,Administrative Assistant,7,High School,,3,10,0
5,47,576958,1,1,RECEPTIONIST,21,High School,,3,10,0
6,47,262470,1,1,Account Manager- Customer Service,5,High School,,3,10,0


In [70]:
# Build the training set together with its `data_info` summary, then build
# the test set from the held-out rows.
train_data, data_info = DatasetFeat.build_trainset(
    train_df,
    user_col,
    item_col,
    sparse_col,
    dense_col,
)
test_data = DatasetFeat.build_testset(test_df)

In [71]:
# Add sampled negative items to both splits (training first, then test),
# then print the dataset summary.
for dataset in (train_data, test_data):
    dataset.build_negative_samples(data_info)
print(data_info)

random neg item sampling elapsed: 0.001s
random neg item sampling elapsed: 0.000s
n_users: 155, n_items: 630, data sparsity: 0.6564 %


In [72]:
# Configure the YouTubeRanking model for the ranking task.
ytb_ranking = YouTubeRanking(
    task="ranking",
    data_info=data_info,
    embed_size=16,
    n_epochs=3,
    lr=1e-4,
    batch_size=512,
    use_bn=True,
    hidden_units="128,64,32",
)

In [73]:
# Train on the training set and report the listed metrics on the test set.
ytb_ranking.fit(
    train_data,
    verbose=2,
    shuffle=True,
    eval_data=test_data,
    metrics=["loss", "roc_auc", "precision", "recall", "map", "ndcg"],
)

Training start time: [35m2022-03-15 11:10:44[0m
total params: [33m54,273[0m | embedding params: [33m22,753[0m | network params: [33m31,520[0m


train: 100%|██████████| 3/3 [00:00<00:00,  4.83it/s]


Epoch 1 elapsed: 0.626s
	 [32mtrain_loss: 0.8309[0m


eval_pred: 100%|██████████| 1/1 [00:00<00:00,  6.98it/s]
eval_rec: 100%|██████████| 73/73 [00:00<00:00, 195.21it/s]


	 eval log_loss: 0.7137
	 eval roc_auc: 0.3833
	 eval precision@10: 0.0000
	 eval recall@10: 0.0000
	 eval map@10: 0.0000
	 eval ndcg@10: 0.0000


train: 100%|██████████| 3/3 [00:00<00:00, 81.66it/s]


Epoch 2 elapsed: 0.043s
	 [32mtrain_loss: 0.7607[0m


eval_pred: 100%|██████████| 1/1 [00:00<00:00, 352.88it/s]
eval_rec: 100%|██████████| 73/73 [00:00<00:00, 228.68it/s]


	 eval log_loss: 0.7100
	 eval roc_auc: 0.3990
	 eval precision@10: 0.0000
	 eval recall@10: 0.0000
	 eval map@10: 0.0000
	 eval ndcg@10: 0.0000


train: 100%|██████████| 3/3 [00:00<00:00, 66.61it/s]


Epoch 3 elapsed: 0.047s
	 [32mtrain_loss: 0.7225[0m


eval_pred: 100%|██████████| 1/1 [00:00<00:00, 121.86it/s]
eval_rec: 100%|██████████| 73/73 [00:00<00:00, 241.50it/s]


	 eval log_loss: 0.7061
	 eval roc_auc: 0.4263
	 eval precision@10: 0.0000
	 eval recall@10: 0.0000
	 eval map@10: 0.0000
	 eval ndcg@10: 0.0000


In [75]:
# Show the last rows of the full table.
data.tail()

Unnamed: 0,user,item,label,WindowID,Title,Popularity,DegreeType,Major,WorkHistoryCount,TotalYearsExperience,ManagedHowMany
796,4724,261369,1,1,NDT Technician - Level - II,1,Bachelor's,Media Production,1,10,0
797,4724,1073733,1,1,NDT (non-destructive testing) Engineer,0,Bachelor's,Media Production,1,10,0
798,4724,1091073,1,1,Administrative Assistant,13,Bachelor's,Media Production,1,10,0
799,4724,761456,1,1,EXECUTIVE ASSISTANT,40,Bachelor's,Media Production,1,10,0
800,4724,1115815,1,1,Administrative Specialist,15,Bachelor's,Media Production,1,10,0


In [76]:
# Score one (user, item) pair: user 47 against item 1115815.
known_score = ytb_ranking.predict(user=47, item=1115815)
print("prediction: ", known_score)

# Ask for 7 recommended items for user 47.
known_recs = ytb_ranking.recommend_user(user=47, n_rec=7)
print("recommendation: ", known_recs)

# Cold start: neither this user nor this item is in the data, so the call
# passes cold_start="average".
cold_score = ytb_ranking.predict(
    user="ccc",
    item="not item",
    cold_start="average",
)
print("cold prediction: ", cold_score)

# Cold start: unknown user, with cold_start="popular".
cold_recs = ytb_ranking.recommend_user(
    user="are we good?",
    n_rec=7,
    cold_start="popular",
)
print("cold recommendation: ", cold_recs)

prediction:  [0.48799565]
recommendation:  [(1038309, 0.50337595), (1032277, 0.5025235), (491441, 0.5017492), (844661, 0.5016066), (814970, 0.5014388), (898053, 0.5011532), (25250, 0.50113297)]
[31mDetect 1 unknown interaction(s), position: [0][0m
cold prediction:  [0.46888396]
[31mdetect unknown user: are we good?[0m
cold recommendation:  [28124, 381978, 284009, 512686, 512852, 1054653, 212273]
