# Calculate hit_rate
## Model: EBM models (trained on tabular TF-IDF vector)
## Ranking data: knn_tfidf_ranking_v2
- Input: Top20 recommendation results by white-box and black-box models
- Hit rate: Evaluated on different benchmarks: knn_tfidf_ranking_data, random_ranking_data, actual test_interaction data
- Hit rate @5, @10, @20 for each model

In [1]:
import numpy as np
import pandas as pd

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Run this for reproduce
# Gets the current working directory
import os
cwd = os.getcwd()
print("Working directory:", cwd)
# Go up one directory from working directory
os.chdir("..")

Working directory: /Users/anhtth/Library/CloudStorage/OneDrive-UniversityofTwente/2023 UT- THESIS/1-Code/0.cb12_main/nb_recsys_tabular


In [4]:
# Load evaluation data
evaluation_data = pd.read_csv('./nb_recsys_tabular/knn_tfidf_ranking_v2.csv')

In [5]:
evaluation_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 371600 entries, 0 to 371599
Data columns (total 6 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   UserID   371600 non-null  int64  
 1   JobID    371600 non-null  int64  
 2   label    371600 non-null  int64  
 3   City     371600 non-null  float64
 4   State    371600 non-null  float64
 5   Country  371600 non-null  float64
dtypes: float64(3), int64(3)
memory usage: 17.0 MB


In [6]:
evaluation_data.label.value_counts()

0    371426
1       174
Name: label, dtype: int64

In [7]:
# Load ranking_data_random
ranking_data_random = pd.read_csv('./nb_recsys_tabular/ranking_data_random.csv')
ranking_data_random.label.value_counts()

0    367884
1      3716
Name: label, dtype: int64

In [8]:
models = []
benchmarks = []
hits_5 = []
hits_10 = []
hits_20 = []

In [9]:
def get_hit_rate_at_k(recommendations, rankings, K):
    total_users = len(recommendations['UserID'].unique())
    hit_count = 0
    # mark_stop = 0
    for user_id in recommendations['UserID'].unique():
        user_recommendations = recommendations[recommendations['UserID'] == user_id]['JobID'].tolist()[:K]
        actual_job_ids = rankings[(rankings['UserID'] == user_id) & (rankings['label'] == 1)]['JobID'].tolist()

        for rec_job_id in user_recommendations:
            if rec_job_id in actual_job_ids:
                # print('hit detail UserID, JobID: ', user_id, rec_job_id)
                hit_count += 1
                break     
        # mark_stop =+1
        # if mark_stop == 2:
        #     break
    print('final hit_count:', hit_count)
    hit_rate = hit_count / total_users
    return hit_rate

# Calculate hit rate using benchmark data: random ranking data

In [10]:
benchmark = 'ranking_data_random'

## Model: logreg

In [11]:
model = 'logreg'
logreg_recommendations = pd.read_csv('./output_topN_tabular/rec_result_logreg_ranking_knn.csv')

In [12]:
%%time
# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = logreg_recommendations, 
                                   rankings = ranking_data_random, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = logreg_recommendations, 
                                   rankings = ranking_data_random, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = logreg_recommendations, 
                                   rankings = ranking_data_random, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_20}")

final hit_count: 63
Model: logreg - Hit rate at 5: 0.016953713670613563
final hit_count: 74
Model: logreg - Hit rate at 10: 0.01991388589881593
final hit_count: 76
Model: logreg - Hit rate at 20: 0.02045209903121636
CPU times: user 13.5 s, sys: 108 ms, total: 13.6 s
Wall time: 13.6 s


In [13]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: dt

In [14]:
model = 'dt'
dt_recommendations = pd.read_csv('./output_topN_tabular/rec_result_dt_ranking_knn.csv')

In [15]:
%%time
# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = dt_recommendations, 
                                   rankings = ranking_data_random, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = dt_recommendations, 
                                   rankings = ranking_data_random, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = dt_recommendations, 
                                   rankings = ranking_data_random, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_20}")

final hit_count: 76
Model: dt - Hit rate at 5: 0.02045209903121636
final hit_count: 76
Model: dt - Hit rate at 10: 0.02045209903121636
final hit_count: 80
Model: dt - Hit rate at 20: 0.021528525296017224
CPU times: user 13.8 s, sys: 131 ms, total: 14 s
Wall time: 14.1 s


In [16]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: XGBoost

In [18]:
%%time
model = 'xgbt'
xgbt_recommendations = pd.read_csv('./output_topN_tabular/rec_result_xgbt_ranking_knn.csv')

# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = xgbt_recommendations, 
                                   rankings = ranking_data_random, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = xgbt_recommendations, 
                                   rankings = ranking_data_random, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = xgbt_recommendations, 
                                   rankings = ranking_data_random, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_20}")

final hit_count: 78
Model: xgbt - Hit rate at 5: 0.02099031216361679
final hit_count: 79
Model: xgbt - Hit rate at 10: 0.021259418729817008
final hit_count: 79
Model: xgbt - Hit rate at 20: 0.021259418729817008
CPU times: user 14.5 s, sys: 155 ms, total: 14.6 s
Wall time: 14.7 s


In [19]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: Naive Bayes

In [20]:
%%time
model = 'nb'
nb_recommendations = pd.read_csv('./output_topN_tabular/rec_result_nb_ranking_knn.csv')

# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = nb_recommendations, 
                                   rankings = ranking_data_random, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = nb_recommendations, 
                                   rankings = ranking_data_random, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = nb_recommendations, 
                                   rankings = ranking_data_random, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_20}")

final hit_count: 58
Model: nb - Hit rate at 5: 0.015608180839612486
final hit_count: 66
Model: nb - Hit rate at 10: 0.01776103336921421
final hit_count: 77
Model: nb - Hit rate at 20: 0.020721205597416577
CPU times: user 14.2 s, sys: 115 ms, total: 14.3 s
Wall time: 14.4 s


In [21]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: AdaBoost

In [22]:
%%time
model = 'ada'
ada_recommendations = pd.read_csv('./output_topN_tabular/rec_result_ada_ranking_knn.csv')

# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = ada_recommendations, 
                                   rankings = ranking_data_random, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = ada_recommendations, 
                                   rankings = ranking_data_random, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = ada_recommendations, 
                                   rankings = ranking_data_random, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_20}")

final hit_count: 65
Model: ada - Hit rate at 5: 0.017491926803013993
final hit_count: 73
Model: ada - Hit rate at 10: 0.019644779332615717
final hit_count: 75
Model: ada - Hit rate at 20: 0.020182992465016147
CPU times: user 13.2 s, sys: 99 ms, total: 13.3 s
Wall time: 13.3 s


In [23]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: lda

In [24]:
%%time
model = 'lda'
lda_recommendations = pd.read_csv('./output_topN_tabular/rec_result_lda_ranking_knn.csv')

# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = lda_recommendations, 
                                   rankings = ranking_data_random, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = lda_recommendations, 
                                   rankings = ranking_data_random, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = lda_recommendations, 
                                   rankings = ranking_data_random, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_20}")

final hit_count: 63
Model: lda - Hit rate at 5: 0.016953713670613563
final hit_count: 74
Model: lda - Hit rate at 10: 0.01991388589881593
final hit_count: 76
Model: lda - Hit rate at 20: 0.02045209903121636
CPU times: user 13.5 s, sys: 115 ms, total: 13.6 s
Wall time: 13.7 s


In [25]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: qda

In [26]:
%%time
model = 'qda'
lda_recommendations = pd.read_csv('./output_topN_tabular/rec_result_qda_ranking_knn.csv')

# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = lda_recommendations, 
                                   rankings = ranking_data_random, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = lda_recommendations, 
                                   rankings = ranking_data_random, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = lda_recommendations, 
                                   rankings = ranking_data_random, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_20}")

final hit_count: 58
Model: qda - Hit rate at 5: 0.015608180839612486
final hit_count: 72
Model: qda - Hit rate at 10: 0.0193756727664155
final hit_count: 77
Model: qda - Hit rate at 20: 0.020721205597416577
CPU times: user 13.6 s, sys: 116 ms, total: 13.8 s
Wall time: 13.8 s


In [27]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

# Calculate hit rate using benchmark data: evaluation_data

In [28]:
benchmark = 'ranking_knn_tidif'

## Model: logreg

In [29]:
model = 'logreg'
logreg_recommendations = pd.read_csv('./output_topN_tabular/rec_result_logreg_ranking_knn.csv')

In [30]:
%%time
# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = logreg_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = logreg_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = logreg_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_20}")

final hit_count: 134
Model: logreg - Hit rate at 5: 0.03606027987082885
final hit_count: 146
Model: logreg - Hit rate at 10: 0.039289558665231435
final hit_count: 149
Model: logreg - Hit rate at 20: 0.04009687836383208
CPU times: user 13 s, sys: 98.1 ms, total: 13.1 s
Wall time: 13.1 s


In [31]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: dt

In [32]:
model = 'dt'
dt_recommendations = pd.read_csv('./output_topN_tabular/rec_result_dt_ranking_knn.csv')

In [33]:
%%time
# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = dt_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = dt_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = dt_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_20}")

final hit_count: 146
Model: dt - Hit rate at 5: 0.039289558665231435
final hit_count: 147
Model: dt - Hit rate at 10: 0.039558665231431644
final hit_count: 157
Model: dt - Hit rate at 20: 0.0422497308934338
CPU times: user 12.9 s, sys: 89.8 ms, total: 13 s
Wall time: 13 s


In [34]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: XGBoost (xgbt)

In [36]:
%%time
model = 'xgbt'
xgbt_recommendations = pd.read_csv('./output_topN_tabular/rec_result_xgbt_ranking_knn.csv')

# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = xgbt_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = xgbt_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = xgbt_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_20}")

final hit_count: 156
Model: xgbt - Hit rate at 5: 0.04198062432723358
final hit_count: 158
Model: xgbt - Hit rate at 10: 0.042518837459634015
final hit_count: 158
Model: xgbt - Hit rate at 20: 0.042518837459634015
CPU times: user 13 s, sys: 99.6 ms, total: 13.1 s
Wall time: 13.1 s


In [37]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: Naive Bayes

In [38]:
%%time
model = 'nb'
nb_recommendations = pd.read_csv('./output_topN_tabular/rec_result_nb_ranking_knn.csv')

# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = nb_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = nb_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = nb_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_20}")

final hit_count: 128
Model: nb - Hit rate at 5: 0.03444564047362755
final hit_count: 135
Model: nb - Hit rate at 10: 0.036329386437029064
final hit_count: 147
Model: nb - Hit rate at 20: 0.039558665231431644
CPU times: user 12.8 s, sys: 93.2 ms, total: 12.9 s
Wall time: 12.9 s


In [39]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: AdaBoost

In [40]:
%%time
model = 'ada'
ada_recommendations = pd.read_csv('./output_topN_tabular/rec_result_ada_ranking_knn.csv')

# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = ada_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = ada_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = ada_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_20}")

final hit_count: 134
Model: ada - Hit rate at 5: 0.03606027987082885
final hit_count: 146
Model: ada - Hit rate at 10: 0.039289558665231435
final hit_count: 148
Model: ada - Hit rate at 20: 0.03982777179763186
CPU times: user 15.2 s, sys: 1.21 s, total: 16.4 s
Wall time: 16.6 s


In [41]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: lda

In [42]:
%%time
model = 'lda'
lda_recommendations = pd.read_csv('./output_topN_tabular/rec_result_lda_ranking_knn.csv')

# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = lda_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = lda_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = lda_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_20}")

final hit_count: 134
Model: lda - Hit rate at 5: 0.03606027987082885
final hit_count: 146
Model: lda - Hit rate at 10: 0.039289558665231435
final hit_count: 149
Model: lda - Hit rate at 20: 0.04009687836383208
CPU times: user 14.7 s, sys: 868 ms, total: 15.6 s
Wall time: 15.7 s


In [43]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: qda

In [44]:
%%time
model = 'qda'
qda_recommendations = pd.read_csv('./output_topN_tabular/rec_result_qda_ranking_knn.csv')

# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = qda_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = qda_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = qda_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_20}")

final hit_count: 125
Model: qda - Hit rate at 5: 0.03363832077502691
final hit_count: 143
Model: qda - Hit rate at 10: 0.038482238966630784
final hit_count: 149
Model: qda - Hit rate at 20: 0.04009687836383208
CPU times: user 13.9 s, sys: 167 ms, total: 14.1 s
Wall time: 14.2 s


In [45]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

# Calculate hit_rate (manual) - Evaluation data: test_interaction data

In [46]:
# Load all interaction data (LDA tabular with side information)
train_data_extended = pd.read_csv('./xai_recsys/train_data_extended.csv')
test_data_extended = pd.read_csv('./xai_recsys/test_data_extended.csv')

In [47]:
# Positive interaction (from original data)
select_cols = ['UserID', 'JobID', 'label','Split']
actual_interaction_train = train_data_extended[select_cols]
actual_interaction_test = test_data_extended[select_cols]
actual_interaction_df = pd.concat([actual_interaction_train,actual_interaction_test])

In [48]:
len(actual_interaction_train), len(actual_interaction_test)

(563889, 15736)

In [49]:
benchmark = 'actual_test_interaction'

## Model: logreg

In [50]:
%%time
model = 'logreg'
logreg_recommendations = pd.read_csv('./output_topN_tabular/rec_result_logreg_ranking_knn.csv')

# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = logreg_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - actual test interaction: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = logreg_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - actual test interaction: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = logreg_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - actual test interaction: {hit_20}")

final hit_count: 134
Model: logreg - Hit rate at 5 - actual test interaction: 0.03606027987082885
final hit_count: 146
Model: logreg - Hit rate at 10 - actual test interaction: 0.039289558665231435
final hit_count: 149
Model: logreg - Hit rate at 20 - actual test interaction: 0.04009687836383208
CPU times: user 8.91 s, sys: 89.9 ms, total: 9 s
Wall time: 9.09 s


In [51]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: dt

In [52]:
%%time
model = 'dt'
dt_recommendations = pd.read_csv('./output_topN_tabular/rec_result_dt_ranking_knn.csv')

# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = dt_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - actual test interaction: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = dt_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - actual test interaction: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = dt_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - actual test interaction: {hit_20}")

final hit_count: 146
Model: dt - Hit rate at 5 - actual test interaction: 0.039289558665231435
final hit_count: 147
Model: dt - Hit rate at 10 - actual test interaction: 0.039558665231431644
final hit_count: 157
Model: dt - Hit rate at 20 - actual test interaction: 0.0422497308934338
CPU times: user 8.54 s, sys: 65.4 ms, total: 8.61 s
Wall time: 8.68 s


In [53]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: xgbt

In [54]:
%%time
model = 'xgbt'
xgbt_recommendations = pd.read_csv('./output_topN_tabular/rec_result_xgbt_ranking_knn.csv')

# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = xgbt_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - actual test interaction: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = xgbt_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - actual test interaction: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = xgbt_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - actual test interaction: {hit_20}")

final hit_count: 156
Model: xgbt - Hit rate at 5 - actual test interaction: 0.04198062432723358
final hit_count: 158
Model: xgbt - Hit rate at 10 - actual test interaction: 0.042518837459634015
final hit_count: 158
Model: xgbt - Hit rate at 20 - actual test interaction: 0.042518837459634015
CPU times: user 8.9 s, sys: 90.4 ms, total: 8.99 s
Wall time: 9.06 s


In [55]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: nb

In [56]:
%%time
model = 'nb'
nb_recommendations = pd.read_csv('./output_topN_tabular/rec_result_nb_ranking_knn.csv')

# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = nb_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - actual test interaction: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = nb_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - actual test interaction: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = nb_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - actual test interaction: {hit_20}")

final hit_count: 128
Model: nb - Hit rate at 5 - actual test interaction: 0.03444564047362755
final hit_count: 135
Model: nb - Hit rate at 10 - actual test interaction: 0.036329386437029064
final hit_count: 147
Model: nb - Hit rate at 20 - actual test interaction: 0.039558665231431644
CPU times: user 8.81 s, sys: 87.6 ms, total: 8.9 s
Wall time: 9 s


In [57]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: ada (to update)

In [58]:
%%time
model = 'ada'
ada_recommendations = pd.read_csv('./output_topN_tabular/rec_result_ada_ranking_knn.csv')

# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = ada_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - actual test interaction: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = ada_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - actual test interaction: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = ada_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - actual test interaction: {hit_20}")

final hit_count: 134
Model: ada - Hit rate at 5 - actual test interaction: 0.03606027987082885
final hit_count: 146
Model: ada - Hit rate at 10 - actual test interaction: 0.039289558665231435
final hit_count: 148
Model: ada - Hit rate at 20 - actual test interaction: 0.03982777179763186
CPU times: user 8.51 s, sys: 60.4 ms, total: 8.57 s
Wall time: 8.63 s


In [59]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: lda

In [60]:
%%time
model = 'lda'
lda_recommendations = pd.read_csv('./output_topN_tabular/rec_result_lda_ranking_knn.csv')

# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = lda_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - actual test interaction: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = lda_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - actual test interaction: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = lda_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - actual test interaction: {hit_20}")

final hit_count: 134
Model: lda - Hit rate at 5 - actual test interaction: 0.03606027987082885
final hit_count: 146
Model: lda - Hit rate at 10 - actual test interaction: 0.039289558665231435
final hit_count: 149
Model: lda - Hit rate at 20 - actual test interaction: 0.04009687836383208
CPU times: user 9.08 s, sys: 102 ms, total: 9.18 s
Wall time: 9.27 s


In [61]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: qda

In [62]:
%%time
model = 'qda'
qda_recommendations = pd.read_csv('./output_topN_tabular/rec_result_qda_ranking_knn.csv')

# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = qda_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - actual test interaction: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = qda_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - actual test interaction: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = qda_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - actual test interaction: {hit_20}")

final hit_count: 125
Model: qda - Hit rate at 5 - actual test interaction: 0.03363832077502691
final hit_count: 143
Model: qda - Hit rate at 10 - actual test interaction: 0.038482238966630784
final hit_count: 149
Model: qda - Hit rate at 20 - actual test interaction: 0.04009687836383208
CPU times: user 9 s, sys: 68.1 ms, total: 9.07 s
Wall time: 9.13 s


In [63]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

# Calculate hit_rate (manual) - Evaluation data: all interaction data

In [64]:
benchmark = 'all_interaction_data'

## Model: logreg

In [65]:
%%time
model = 'logreg'
logreg_recommendations = pd.read_csv('./output_topN_tabular/rec_result_logreg_ranking_knn.csv')

# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = logreg_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - all actual interaction: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = logreg_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - all actual interaction: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = logreg_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - all actual interaction: {hit_20}")

final hit_count: 134
Model: logreg - Hit rate at 5 - all actual interaction: 0.03606027987082885
final hit_count: 146
Model: logreg - Hit rate at 10 - all actual interaction: 0.039289558665231435
final hit_count: 149
Model: logreg - Hit rate at 20 - all actual interaction: 0.04009687836383208
CPU times: user 18.9 s, sys: 958 ms, total: 19.8 s
Wall time: 20 s


In [66]:
models.append(model)
benchmarks.append(benchmark)

hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: dt

In [67]:
%%time
model = 'dt'
dt_recommendations = pd.read_csv('./output_topN_tabular/rec_result_dt_ranking_knn.csv')

# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = dt_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - all actual interaction: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = dt_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - all actual interaction: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = dt_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - all actual interaction: {hit_20}")

final hit_count: 146
Model: dt - Hit rate at 5 - all actual interaction: 0.039289558665231435
final hit_count: 147
Model: dt - Hit rate at 10 - all actual interaction: 0.039558665231431644
final hit_count: 157
Model: dt - Hit rate at 20 - all actual interaction: 0.0422497308934338
CPU times: user 19.2 s, sys: 272 ms, total: 19.5 s
Wall time: 19.8 s


In [68]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: xgbt

In [69]:
%%time
model = 'xgbt'
xgbt_recommendations = pd.read_csv('./output_topN_tabular/rec_result_xgbt_ranking_knn.csv')

# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = xgbt_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - all actual interaction: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = xgbt_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - all actual interaction: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = xgbt_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - all actual interaction: {hit_20}")

final hit_count: 156
Model: xgbt - Hit rate at 5 - all actual interaction: 0.04198062432723358
final hit_count: 158
Model: xgbt - Hit rate at 10 - all actual interaction: 0.042518837459634015
final hit_count: 158
Model: xgbt - Hit rate at 20 - all actual interaction: 0.042518837459634015
CPU times: user 18.2 s, sys: 201 ms, total: 18.4 s
Wall time: 18.5 s


In [70]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: nb

In [71]:
%%time
model = 'nb'
nb_recommendations = pd.read_csv('./output_topN_tabular/rec_result_nb_ranking_knn.csv')

# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = nb_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - all actual interaction: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = nb_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - all actual interaction: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = nb_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - all actual interaction: {hit_20}")

final hit_count: 128
Model: nb - Hit rate at 5 - all actual interaction: 0.03444564047362755
final hit_count: 135
Model: nb - Hit rate at 10 - all actual interaction: 0.036329386437029064
final hit_count: 147
Model: nb - Hit rate at 20 - all actual interaction: 0.039558665231431644
CPU times: user 19.1 s, sys: 228 ms, total: 19.3 s
Wall time: 19.6 s


In [72]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: ada (to update)

In [73]:
%%time
model = 'ada'
ada_recommendations = pd.read_csv('./output_topN_tabular/rec_result_ada_ranking_knn.csv')

# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = ada_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - all actual interaction: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = ada_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - all actual interaction: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = ada_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - all actual interaction: {hit_20}")

final hit_count: 134
Model: ada - Hit rate at 5 - all actual interaction: 0.03606027987082885
final hit_count: 146
Model: ada - Hit rate at 10 - all actual interaction: 0.039289558665231435
final hit_count: 148
Model: ada - Hit rate at 20 - all actual interaction: 0.03982777179763186
CPU times: user 19.1 s, sys: 718 ms, total: 19.8 s
Wall time: 20 s


In [74]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: lda

In [75]:
%%time
model = 'lda'
lda_recommendations = pd.read_csv('./output_topN_tabular/rec_result_lda_ranking_knn.csv')

# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = lda_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - all actual interaction: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = lda_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - all actual interaction: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = lda_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - all actual interaction: {hit_20}")

final hit_count: 134
Model: lda - Hit rate at 5 - all actual interaction: 0.03606027987082885
final hit_count: 146
Model: lda - Hit rate at 10 - all actual interaction: 0.039289558665231435
final hit_count: 149
Model: lda - Hit rate at 20 - all actual interaction: 0.04009687836383208
CPU times: user 18.7 s, sys: 747 ms, total: 19.5 s
Wall time: 19.6 s


In [76]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: qda

In [77]:
%%time
model = 'qda'
qda_recommendations = pd.read_csv('./output_topN_tabular/rec_result_qda_ranking_knn.csv')

# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = qda_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - all actual interaction: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = qda_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - all actual interaction: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = qda_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - all actual interaction: {hit_20}")

final hit_count: 125
Model: qda - Hit rate at 5 - all actual interaction: 0.03363832077502691
final hit_count: 143
Model: qda - Hit rate at 10 - all actual interaction: 0.038482238966630784
final hit_count: 149
Model: qda - Hit rate at 20 - all actual interaction: 0.04009687836383208
CPU times: user 20.2 s, sys: 227 ms, total: 20.4 s
Wall time: 21.3 s


In [78]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

# Summarize all results

In [79]:
len(models), len(benchmarks), len(hits_5), len(hits_10), len(hits_20)

(28, 28, 28, 28, 28)

In [80]:
baseline_hit = pd.DataFrame(columns = ["model","hit_5", "hit_10", "hit_20", 'benchmark'])
baseline_hit.model = models
baseline_hit.benchmark = benchmarks
baseline_hit.hit_5 = hits_5
baseline_hit.hit_10 = hits_10
baseline_hit.hit_20 = hits_20

In [81]:
baseline_hit

Unnamed: 0,model,hit_5,hit_10,hit_20,benchmark
0,logreg,0.016954,0.019914,0.020452,ranking_data_random
1,dt,0.020452,0.020452,0.021529,ranking_data_random
2,xgbt,0.02099,0.021259,0.021259,ranking_data_random
3,nb,0.015608,0.017761,0.020721,ranking_data_random
4,ada,0.017492,0.019645,0.020183,ranking_data_random
5,lda,0.016954,0.019914,0.020452,ranking_data_random
6,qda,0.015608,0.019376,0.020721,ranking_data_random
7,logreg,0.03606,0.03929,0.040097,ranking_knn_tidif
8,dt,0.03929,0.039559,0.04225,ranking_knn_tidif
9,xgbt,0.041981,0.042519,0.042519,ranking_knn_tidif


In [82]:
baseline_hit.to_csv('./output_topN_tabular/hit_rate_tfidf_ranking_knn.csv', header=True, index=False)

# Data-centric explanation
- Count number of JobID for positive interaction in test data

In [83]:
summary_test_interaction = actual_interaction_test.groupby('UserID').agg(Count_JobID=('JobID', 'count')).reset_index()

In [84]:
summary_test_interaction.sort_values(by = 'Count_JobID', ascending=False)

Unnamed: 0,UserID,Count_JobID
2663,1068676,8
609,239542,8
594,234125,8
1671,704657,8
3482,1385260,8
...,...,...
1769,739270,2
1767,738840,2
1766,738662,2
1763,736903,2


In [85]:
summary_test_interaction['Count_JobID'].value_counts()

2    1457
4     955
6     715
8     589
Name: Count_JobID, dtype: int64

In [86]:
summary_positive_test_interaction = actual_interaction_test[actual_interaction_test.label==1].groupby('UserID').agg(Count_JobID=('JobID', 'count')).reset_index()

In [87]:
summary_positive_test_interaction.sort_values(by = 'Count_JobID', ascending=False)

Unnamed: 0,UserID,Count_JobID
2663,1068676,4
609,239542,4
594,234125,4
1671,704657,4
3482,1385260,4
...,...,...
1769,739270,1
1767,738840,1
1766,738662,1
1763,736903,1


In [88]:
summary_positive_test_interaction['Count_JobID'].value_counts()

1    1457
2     955
3     715
4     589
Name: Count_JobID, dtype: int64