# Calculate hit_rate
## Model: EBM models (trained on tabular LDA features)
## Source of potential applications: ranking_data_knn_lda_v2.csv
- Input: Top20 recommendation results by 4 fm models: fm, fm_match, fm_side_info, fm_extended
- Hit rate: Evaluated on different benchmarks: knn_lda_ranking_data, random_ranking_data, actual test_interaction data
- Hit rate @5, @10, @20 for each model

In [1]:
import numpy as np
import pandas as pd

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Run this for reproduce
# Gets the current working directory
import os
cwd = os.getcwd()
print("Working directory:", cwd)
# Go up one directory from working directory
os.chdir("..")

Working directory: /home/jovyan/1_UT THESIS/CB12_MAIN/nb_recsys_myfm


In [4]:
# Load evaluation data based on knn_lda
evaluation_data = pd.read_csv('./nb_recsys_myfm/ranking_data_knn_lda_v2.csv')

In [5]:
evaluation_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 369098 entries, 0 to 369097
Data columns (total 6 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   UserID   369098 non-null  int64  
 1   JobID    369098 non-null  int64  
 2   label    369098 non-null  float64
 3   City     369098 non-null  float64
 4   State    369098 non-null  float64
 5   Country  369098 non-null  float64
dtypes: float64(4), int64(2)
memory usage: 16.9 MB


In [6]:
evaluation_data.label.value_counts()

label
0.0    368951
1.0       147
Name: count, dtype: int64

In [7]:
# Load ranking random data
ranking_data_random = pd.read_csv('./nb_recsys_myfm/ranking_data_random.csv')
ranking_data_random.label.value_counts

<bound method IndexOpsMixin.value_counts of 0         1
1         0
2         0
3         0
4         0
         ..
371595    0
371596    0
371597    0
371598    0
371599    0
Name: label, Length: 371600, dtype: int64>

In [8]:
def get_hit_rate_at_k(recommendations, rankings, K):
    total_users = len(recommendations['UserID'].unique())
    hit_count = 0
    # mark_stop = 0
    for user_id in recommendations['UserID'].unique():
        user_recommendations = recommendations[recommendations['UserID'] == user_id]['JobID'].tolist()[:K]
        actual_job_ids = rankings[(rankings['UserID'] == user_id) & (rankings['label'] == 1)]['JobID'].tolist()

        for rec_job_id in user_recommendations:
            if rec_job_id in actual_job_ids:
                # print('hit detail UserID, JobID: ', user_id, rec_job_id)
                hit_count += 1
                break     
        # mark_stop =+1
        # if mark_stop == 2:
        #     break
    print('final hit_count:', hit_count)
    hit_rate = hit_count / total_users
    return hit_rate

In [9]:
models = []
benchmarks = []
hits_5 = []
hits_10 = []
hits_20 = []

## Load recommendation results

In [10]:
fm_recommendations = pd.read_csv('./output_topN_myfm/rec20_fm_v2.csv')
fm_match_recommendations = pd.read_csv('./output_topN_myfm/rec20_fm_v2.csv')

In [11]:
fm_side_info_recommendations = pd.read_csv('./output_topN_myfm/rec20_fm_side_info_v2.csv')
fm_extended_recommendations = pd.read_csv('./output_topN_myfm/rec20_fm_extended_v2.csv')

# Calculate hit_rate (manual) - Benchmark - ranking random data
Positive items: Only consider label 1

In [12]:
benchmark = 'ranking_data_random'

## Model: fm

In [13]:
%%time
# Calculate hit rate at K (precision at K)
model = 'fm'
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = fm_recommendations, 
                                   rankings = ranking_data_random, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = fm_recommendations, 
                                   rankings = ranking_data_random, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = fm_recommendations, 
                                   rankings = ranking_data_random, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_20}")

final hit_count: 11
Model: fm - Hit rate at 5: 0.002980222162015714
final hit_count: 11
Model: fm - Hit rate at 10: 0.002980222162015714
final hit_count: 18
Model: fm - Hit rate at 20: 0.004876727174207532
CPU times: user 12.6 s, sys: 0 ns, total: 12.6 s
Wall time: 12.6 s


In [14]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: fm_match

In [15]:
%%time
# Calculate hit rate at K (precision at K)
model = 'fm_match'
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = fm_match_recommendations, 
                                   rankings = ranking_data_random, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = fm_match_recommendations, 
                                   rankings = ranking_data_random, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = fm_match_recommendations, 
                                   rankings = ranking_data_random, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_20}")

final hit_count: 11
Model: fm_match - Hit rate at 5: 0.002980222162015714
final hit_count: 11
Model: fm_match - Hit rate at 10: 0.002980222162015714
final hit_count: 18
Model: fm_match - Hit rate at 20: 0.004876727174207532
CPU times: user 11.2 s, sys: 0 ns, total: 11.2 s
Wall time: 11.2 s


In [16]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: fm_side_info - PENDING

In [17]:
%%time
# Calculate hit rate at K (precision at K)
model = 'fm_side_info'
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = fm_side_info_recommendations, 
                                   rankings = ranking_data_random, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = fm_side_info_recommendations, 
                                   rankings = ranking_data_random, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = fm_side_info_recommendations, 
                                   rankings = ranking_data_random, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_20}")

final hit_count: 10
Model: fm_side_info - Hit rate at 5: 0.00270929287455974
final hit_count: 12
Model: fm_side_info - Hit rate at 10: 0.003251151449471688
final hit_count: 18
Model: fm_side_info - Hit rate at 20: 0.004876727174207532
CPU times: user 11.2 s, sys: 0 ns, total: 11.2 s
Wall time: 11.1 s


In [18]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: fm_extended - PENDING

In [19]:
%%time
# Calculate hit rate at K (precision at K)
model = 'fm_extended'
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = fm_extended_recommendations, 
                                   rankings = ranking_data_random, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = fm_extended_recommendations, 
                                   rankings = ranking_data_random, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = fm_extended_recommendations, 
                                   rankings = ranking_data_random, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_20}")

final hit_count: 19
Model: fm_extended - Hit rate at 5: 0.005147656461663506
final hit_count: 21
Model: fm_extended - Hit rate at 10: 0.005689515036575454
final hit_count: 22
Model: fm_extended - Hit rate at 20: 0.005960444324031428
CPU times: user 11.2 s, sys: 0 ns, total: 11.2 s
Wall time: 11.1 s


In [20]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

# Calculate hit_rate (manual) - Benchmark - evaluation data
Positive items: Only consider label 1

In [21]:
benchmark = 'evaluation_knn_lda'

## Model: fm

In [22]:
%%time
# Calculate hit rate at K (precision at K)
model = 'fm'
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = fm_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = fm_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = fm_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_20}")

final hit_count: 4
Model: fm - Hit rate at 5: 0.001083717149823896
final hit_count: 5
Model: fm - Hit rate at 10: 0.00135464643727987
final hit_count: 6
Model: fm - Hit rate at 20: 0.001625575724735844
CPU times: user 10.8 s, sys: 0 ns, total: 10.8 s
Wall time: 10.8 s


In [23]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: fm_match

In [24]:
%%time
# Calculate hit rate at K (precision at K)
model = 'fm_match'
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = fm_match_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = fm_match_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = fm_match_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_20}")

final hit_count: 4
Model: fm_match - Hit rate at 5: 0.001083717149823896
final hit_count: 5
Model: fm_match - Hit rate at 10: 0.00135464643727987
final hit_count: 6
Model: fm_match - Hit rate at 20: 0.001625575724735844
CPU times: user 10.8 s, sys: 0 ns, total: 10.8 s
Wall time: 10.8 s


In [25]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: fm_side_info - PENDING

In [26]:
%%time
# Calculate hit rate at K (precision at K)
model = 'fm_side_info'
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = fm_side_info_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = fm_side_info_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = fm_side_info_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_20}")

final hit_count: 5
Model: fm_side_info - Hit rate at 5: 0.00135464643727987
final hit_count: 5
Model: fm_side_info - Hit rate at 10: 0.00135464643727987
final hit_count: 6
Model: fm_side_info - Hit rate at 20: 0.001625575724735844
CPU times: user 10.8 s, sys: 0 ns, total: 10.8 s
Wall time: 10.8 s


In [27]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: fm_extended - PENDING

In [28]:
%%time
# Calculate hit rate at K (precision at K)
model = 'fm_extended'
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = fm_extended_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = fm_extended_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = fm_extended_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_20}")

final hit_count: 3
Model: fm_extended - Hit rate at 5: 0.000812787862367922
final hit_count: 4
Model: fm_extended - Hit rate at 10: 0.001083717149823896
final hit_count: 6
Model: fm_extended - Hit rate at 20: 0.001625575724735844
CPU times: user 10.8 s, sys: 0 ns, total: 10.8 s
Wall time: 10.8 s


In [29]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

# Calculate hit_rate (manual) - Evaluation data: test_interaction data

In [30]:
benchmark = 'actual_test_interaction'

In [31]:
# Load all interaction data (interaction data used to build FM models)
dataset = pd.read_csv("./data_interim/dataset_cleaned.csv")

In [32]:
dataset.head()

Unnamed: 0,UserID,JobID,label,City,State,Country
0,7,309823,1,1.0,1.0,1.0
1,7,703889,1,1.0,1.0,1.0
2,7,566574,0,0.0,0.0,1.0
3,7,481216,0,0.0,0.0,1.0
4,9,809208,1,1.0,1.0,1.0


In [33]:
user_set = pd.read_csv("./data_interim/user_set_cleaned.csv")

In [34]:
train_user = user_set[user_set.Split=="Train"].UserID.values
test_user = user_set[user_set.Split=="Test"].UserID.values
train_data = dataset[dataset.UserID.isin(train_user)]
test_data = dataset[dataset.UserID.isin(test_user)]

In [35]:
# Positive interaction (from original data)
select_cols = ['UserID', 'JobID', 'label']
actual_interaction_train = train_data[select_cols]
actual_interaction_test = test_data[select_cols]
actual_interaction_df = pd.concat([actual_interaction_train,actual_interaction_test])

In [36]:
len(actual_interaction_train), len(actual_interaction_test)

(563889, 15736)

### Model: fm

In [37]:
%%time
# Calculate hit rate at K (precision at K)
model = 'fm'
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = fm_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - evaluated on actual interaction (test): {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = fm_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - evaluated on actual interaction (test): {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = fm_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - evaluated on actual interaction (test): {hit_20}")

final hit_count: 16
Model: fm - Hit rate at 5 - evaluated on actual interaction (test): 0.004334868599295584
final hit_count: 19
Model: fm - Hit rate at 10 - evaluated on actual interaction (test): 0.005147656461663506
final hit_count: 29
Model: fm - Hit rate at 20 - evaluated on actual interaction (test): 0.007856949336223246
CPU times: user 6.55 s, sys: 2.99 ms, total: 6.55 s
Wall time: 6.55 s


In [38]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

### Model: fm_match

In [39]:
%%time
# Calculate hit rate at K (precision at K)
model = 'fm_match'
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = fm_match_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - evaluated on actual interaction (test): {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = fm_match_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - evaluated on actual interaction (test): {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = fm_match_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - evaluated on actual interaction (test): {hit_20}")

final hit_count: 16
Model: fm_match - Hit rate at 5 - evaluated on actual interaction (test): 0.004334868599295584
final hit_count: 19
Model: fm_match - Hit rate at 10 - evaluated on actual interaction (test): 0.005147656461663506
final hit_count: 29
Model: fm_match - Hit rate at 20 - evaluated on actual interaction (test): 0.007856949336223246
CPU times: user 6.55 s, sys: 7.28 ms, total: 6.56 s
Wall time: 6.56 s


In [40]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: fm_side_info

In [41]:
%%time
# Calculate hit rate at K (precision at K)
model = 'fm_side_info'
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = fm_side_info_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - evaluated on actual interaction (test): {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = fm_side_info_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - evaluated on actual interaction (test): {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = fm_side_info_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - evaluated on actual interaction (test): {hit_20}")

final hit_count: 15
Model: fm_side_info - Hit rate at 5 - evaluated on actual interaction (test): 0.00406393931183961
final hit_count: 20
Model: fm_side_info - Hit rate at 10 - evaluated on actual interaction (test): 0.00541858574911948
final hit_count: 27
Model: fm_side_info - Hit rate at 20 - evaluated on actual interaction (test): 0.007315090761311298
CPU times: user 6.58 s, sys: 0 ns, total: 6.58 s
Wall time: 6.58 s


In [42]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: fm_extended

In [43]:
%%time
# Calculate hit rate at K (precision at K)
model = 'fm_extended'
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = fm_extended_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = fm_extended_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = fm_extended_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_20}")

final hit_count: 31
Model: fm_extended - Hit rate at 5: 0.008398807911135195
final hit_count: 35
Model: fm_extended - Hit rate at 10: 0.00948252506095909
final hit_count: 36
Model: fm_extended - Hit rate at 20: 0.009753454348415064
CPU times: user 6.58 s, sys: 0 ns, total: 6.58 s
Wall time: 6.57 s


In [44]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

# Calculate hit_rate (manual) - Evaluation data: all interaction data

In [45]:
benchmark = 'all_interaction_data'

## Model: fm

In [46]:
%%time
# Calculate hit rate at K (precision at K)
model = 'fm'
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = fm_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - evaluated on actual interaction (all): {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = fm_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - evaluated on actual interaction (all): {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = fm_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - evaluated on actual interaction (all): {hit_20}")

final hit_count: 16
Model: fm - Hit rate at 5 - evaluated on actual interaction (all): 0.004334868599295584
final hit_count: 19
Model: fm - Hit rate at 10 - evaluated on actual interaction (all): 0.005147656461663506
final hit_count: 29
Model: fm - Hit rate at 20 - evaluated on actual interaction (all): 0.007856949336223246
CPU times: user 13.1 s, sys: 6.39 ms, total: 13.1 s
Wall time: 13.1 s


In [47]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: fm_match

In [48]:
%%time
# Calculate hit rate at K (precision at K)
model = 'fm_match'
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = fm_match_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - evaluated on actual interaction (all): {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = fm_match_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - evaluated on actual interaction (all): {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = fm_match_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - evaluated on actual interaction (all): {hit_20}")

final hit_count: 16
Model: fm_match - Hit rate at 5 - evaluated on actual interaction (all): 0.004334868599295584
final hit_count: 19
Model: fm_match - Hit rate at 10 - evaluated on actual interaction (all): 0.005147656461663506
final hit_count: 29
Model: fm_match - Hit rate at 20 - evaluated on actual interaction (all): 0.007856949336223246
CPU times: user 13.2 s, sys: 10.8 ms, total: 13.2 s
Wall time: 13.2 s


In [49]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: side_info

In [50]:
%%time
# Calculate hit rate at K (precision at K)
model = 'fm_side_info'
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = fm_side_info_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - evaluated on actual interaction (all): {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = fm_side_info_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - evaluated on actual interaction (all): {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = fm_side_info_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - evaluated on actual interaction (all): {hit_20}")

final hit_count: 15
Model: fm_side_info - Hit rate at 5 - evaluated on actual interaction (all): 0.00406393931183961
final hit_count: 20
Model: fm_side_info - Hit rate at 10 - evaluated on actual interaction (all): 0.00541858574911948
final hit_count: 27
Model: fm_side_info - Hit rate at 20 - evaluated on actual interaction (all): 0.007315090761311298
CPU times: user 13.2 s, sys: 6.74 ms, total: 13.2 s
Wall time: 13.2 s


In [51]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: fm_extended

In [52]:
%%time
# Calculate hit rate at K (precision at K)
model = 'fm_extended'
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = fm_extended_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - evaluated on actual interaction (all): {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = fm_extended_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - evaluated on actual interaction (all): {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = fm_extended_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - evaluated on actual interaction (all): {hit_20}")

final hit_count: 31
Model: fm_extended - Hit rate at 5 - evaluated on actual interaction (all): 0.008398807911135195
final hit_count: 35
Model: fm_extended - Hit rate at 10 - evaluated on actual interaction (all): 0.00948252506095909
final hit_count: 36
Model: fm_extended - Hit rate at 20 - evaluated on actual interaction (all): 0.009753454348415064
CPU times: user 13.2 s, sys: 11.1 ms, total: 13.2 s
Wall time: 13.2 s


In [53]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

# Summary

In [54]:
len(models), len(benchmarks), len(hits_5), len(hits_10), len(hits_20)

(16, 16, 16, 16, 16)

In [55]:
myfm_hit = pd.DataFrame(columns = ["model","hit_5", "hit_10", "hit_20", 'benchmark'])
myfm_hit.model = models
myfm_hit.benchmark = benchmarks
myfm_hit.hit_5 = hits_5
myfm_hit.hit_10 = hits_10
myfm_hit.hit_20 = hits_20

In [56]:
myfm_hit

Unnamed: 0,model,hit_5,hit_10,hit_20,benchmark
0,fm,0.00298,0.00298,0.004877,ranking_data_random
1,fm_match,0.00298,0.00298,0.004877,ranking_data_random
2,fm_side_info,0.002709,0.003251,0.004877,ranking_data_random
3,fm_extended,0.005148,0.00569,0.00596,ranking_data_random
4,fm,0.001084,0.001355,0.001626,evaluation_knn_lda
5,fm_match,0.001084,0.001355,0.001626,evaluation_knn_lda
6,fm_side_info,0.001355,0.001355,0.001626,evaluation_knn_lda
7,fm_extended,0.000813,0.001084,0.001626,evaluation_knn_lda
8,fm,0.004335,0.005148,0.007857,actual_test_interaction
9,fm_match,0.004335,0.005148,0.007857,actual_test_interaction


In [57]:
myfm_hit.to_csv('./nb_recsys_myfm/myfm_hit_v2.csv', header=True, index=False)

# Data-centric explanation
- Count number of JobID for positive interaction in test data

In [None]:
summary_test_interaction = actual_interaction_test.groupby('UserID').agg(Count_JobID=('JobID', 'count')).reset_index()

In [None]:
summary_test_interaction.sort_values(by = 'Count_JobID', ascending=False)

In [None]:
summary_test_interaction['Count_JobID'].value_counts()

In [None]:
summary_positive_test_interaction = actual_interaction_test[actual_interaction_test.label==1].groupby('UserID').agg(Count_JobID=('JobID', 'count')).reset_index()

In [None]:
summary_positive_test_interaction.sort_values(by = 'Count_JobID', ascending=False)

In [None]:
summary_positive_test_interaction['Count_JobID'].value_counts()

## Baseline pure popularity in the original interaction data
It's good to compare validation metrics against the best unpersonalized recommendations: overall item popularity rankings from the training data
In this dataset, however, all of the baseline hit_rate cannot be calculated because of the way it split Train/Test data.
All of test users are cold_start (i.e. have not appears in training interaction)

In [None]:
def base_hrt(k):
    most_popular = train_data.groupby('JobID')['UserID'].count().sort_values(ascending=False)[:k]
    #most_popular
    test_user_items = test_data.groupby('UserID')['JobID'].apply(set).to_dict()
    train_users = np.sort(train_data.UserID.unique())
    test_user_items = {key: val for key, val in test_user_items.items() if key in set(train_users)}
    base_hrt = np.mean([int(len(set(most_popular.index) & set(val)) > 0) for key, val in test_user_items.items()])
    return base_hrt

In [None]:
base_hit5 = base_hrt(5)

In [None]:
base_hit5

In [None]:
%%time
base_hit10 = base_hrt(10)

In [None]:
base_hit10

In [None]:
%%time
base_hit20 = base_hrt(20)

In [None]:
base_hit20

In [None]:
train_users = np.sort(train_data.UserID.unique())

In [None]:
test_users = np.sort(test_data.UserID.unique())

In [None]:
len(train_users), len(test_users)

In [None]:
cold_start_users = set(test_users) - set(train_users)

In [None]:
len(cold_start_users)

In [None]:

# base_pre = np.mean([len(set(most_popular.index) & set(val)) / len(set(most_popular.index)) for key, val in test_user_items.items()])
# base_rec = np.mean([len(set(most_popular.index) & set(val)) / len(set(val))                for key, val in test_user_items.items()])


In [None]:
def calculate_hit_rate_at_k(recommendations, rankings, K):
    total_users = len(recommendations)
    hits = 0

    for _, rec in recommendations.iterrows():
        user_id = rec['UserID']
        rec_job_ids = rankings.loc[rankings['UserID'] == user_id]['JobID'].tolist()[:K]
        actual_job_ids = rankings.loc[(rankings['UserID'] == user_id) & (rankings['label'] == 1)]['JobID'].tolist()

        for rec_job_id in rec_job_ids:
            if rec_job_id in actual_job_ids:
                hits += 1
                break

    hit_rate = hits / total_users
    return hit_rate

In [None]:
# Revised function PER USER
def get_hit_rate_at_k_v2(recommendations, rankings, K):
    total_users = len(recommendations['UserID'].unique())
    hits = 0

    for user_id in recommendations['UserID'].unique():
        user_recommendations = recommendations[recommendations['UserID'] == user_id].sort_values('rank').head(K)
        user_rankings = rankings[(rankings['UserID'] == user_id) & (rankings['label'] == 1)]['JobID'].tolist()

        for _, rec in user_recommendations.iterrows():
            if rec['JobID'] in user_rankings:
                hits += 1
                break

    hit_rate = hits / total_users
    return hit_rate

In [None]:
#MEAN PRECISION AT K

import numpy as np

def get_mean_precision_at_k(recommendations, rankings, K):
    precisions = []

    for user_id in recommendations['UserID'].unique():
        user_recommendations = recommendations[recommendations['UserID'] == user_id].sort_values('rank').head(K)
        user_rankings = rankings[(rankings['UserID'] == user_id) & (rankings['label'] == 1)]['JobID'].tolist()

        hits = sum(rec['JobID'] in user_rankings for _, rec in user_recommendations.iterrows())
        precision = hits / K
        precisions.append(precision)

    mean_precision = np.mean(precisions)
    return mean_precision

In [None]:
# # Calculate hit rate at K (precision at K)
# K = 5  # Set the desired value of K
# mean_precision_at_5 = get_mean_precision_at_k(recommendations = predicted_recommendations, 
#                                    rankings = evaluation_data, 
#                                    K=5)
# print(f"Mean Precision at {K}: {mean_precision_at_5}")

In [None]:
# hit_rate > mean_precision_at_5

## Calculate mean precision at K sklearn

In [None]:
from sklearn.metrics import precision_score

def get_mean_precision_at_k_sklearn(recommendations, rankings, K):
    mean_precision = 0

    for user_id in recommendations['UserID'].unique():
        user_recommendations = recommendations[recommendations['UserID'] == user_id]['Y_pred'].tolist()[:K]
        actual_labels = rankings[(rankings['UserID'] == user_id)]['label'].tolist()[:K]

        precision = precision_score(actual_labels, user_recommendations, average='binary')
        mean_precision += precision

    mean_precision /= len(recommendations['UserID'].unique())
    return mean_precision

In [None]:
# # Calculate hit rate at K (precision at K)
# K = 5  # Set the desired value of K
# mean_precision_at_5_sklearn = get_mean_precision_at_k_sklearn(recommendations = predicted_recommendations, 
#                                    rankings = evaluation_data, 
#                                    K=5)
# print(f"Mean Precision at {K}: {mean_precision_at_5_sklearn}")