# CALCULATE HIT RATE - TopN Recommendation
- Model: White-box and Black-box models trained on TF-IDF features
 (use vector for X_train, Y_train, X_test, Y_test)
    - White-box: Logistic Regression (logreg), Decision Tree (dt), Gaussian Naive Bayes (nb)
    - Black-box: XGBoost (xgbt), AdaBoost (ada), Linear Discriminant Analysis (lda), Quadratic Discriminant Analysis (qda)
- Source of potential applications: random_ranking_data

INPUT: Top 20 applications in dictionary form
OUTPUT: 
- Top 20 results in dataframe for hit rate calculation
- Hit_rate @ 5, 10, 20 of all recsys

In [1]:
import numpy as np
import pandas as pd

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Run this for reproduce
# Gets the current working directory
import os
cwd = os.getcwd()
print("Working directory:", cwd)
# Go up one directory from working directory
os.chdir("..")

Working directory: /home/jovyan/1_UT THESIS/CB12_MAIN/nb_recsys_tabular


In [4]:
# Load evaluation data (by knn_tfidf)
evaluation_data = pd.read_csv('./nb_recsys_tfidf_vector/evaluation_data_knn_tfidf.csv')

In [5]:
evaluation_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 371600 entries, 0 to 371599
Data columns (total 6 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   UserID   371600 non-null  int64  
 1   JobID    371600 non-null  int64  
 2   label    371600 non-null  int64  
 3   City     371600 non-null  float64
 4   State    371600 non-null  float64
 5   Country  371600 non-null  float64
dtypes: float64(3), int64(3)
memory usage: 17.0 MB


In [6]:
evaluation_data.label.value_counts()

label
0    371600
Name: count, dtype: int64

In [7]:
# Load ranking_data_random
ranking_data_random = pd.read_csv('./nb_recsys_tfidf_vector/ranking_data_random.csv')
ranking_data_random.label.value_counts()

label
0    367884
1      3716
Name: count, dtype: int64

In [8]:
models = []
benchmarks = []
hits_5 = []
hits_10 = []
hits_20 = []

In [9]:
def get_hit_rate_at_k(recommendations, rankings, K):
    total_users = len(recommendations['UserID'].unique())
    hit_count = 0
    # mark_stop = 0
    for user_id in recommendations['UserID'].unique():
        user_recommendations = recommendations[recommendations['UserID'] == user_id]['JobID'].tolist()[:K]
        actual_job_ids = rankings[(rankings['UserID'] == user_id) & (rankings['label'] == 1)]['JobID'].tolist()

        for rec_job_id in user_recommendations:
            if rec_job_id in actual_job_ids:
                # print('hit detail UserID, JobID: ', user_id, rec_job_id)
                hit_count += 1
                break     
        # mark_stop =+1
        # if mark_stop == 2:
        #     break
    print('final hit_count:', hit_count)
    hit_rate = hit_count / total_users
    return hit_rate

# Load topN recommendatin result

In [10]:
import pickle
# Test loaded result
with open('./output_ranking/baseline_topN_result.pickle', 'rb') as handle:
    rec_result = pickle.load(handle)

In [11]:
len(rec_result)

9

In [12]:
user_ids = list(ranking_data_random.groupby('UserID').UserID.unique().astype('int'))

In [13]:
def get_rec_result_df(user_index, rec_result):
    rec_20_df = pd.DataFrame(rec_result[user_index])
    rec_cols = ['JobID', 'Y_prob', 'Y_pred']
    rec_20_df.columns = rec_cols
    rec_20_df['UserID'] = user_ids[user_index]
    rec_20_df['rank'] = rec_20_df.groupby('UserID').cumcount()
    return rec_20_df

# Calculate hit rate using benchmark data: random ranking data

In [14]:
benchmark = 'ranking_data_random'

## Model: logreg

In [15]:
model = 'logreg'
rec20_logreg = rec_result[model]

In [16]:
get_rec_result_df(0, rec20_logreg)

Unnamed: 0,JobID,Y_prob,Y_pred,UserID,rank
0,821691,0.946082,1,13,0
1,129969,0.538867,1,13,1
2,860580,0.537347,1,13,2
3,145443,0.530015,1,13,3
4,1092282,0.522887,1,13,4
5,208055,0.522163,1,13,5
6,701157,0.517683,1,13,6
7,234659,0.50924,1,13,7
8,589357,0.503029,1,13,8
9,448819,0.499829,0,13,9


In [17]:
final_rec_result = pd.DataFrame(columns = ['JobID', 'Y_prob', 'Y_pred','UserID','rank'])

for user_index in range(len(user_ids)):
    
    temp_df = get_rec_result_df(user_index = user_index, 
                                rec_result=rec20_logreg)
    # print(u_id, len(temp_df))
    final_rec_result = pd.concat([final_rec_result, temp_df])

final_rec_result = final_rec_result[['UserID','JobID', 'Y_prob', 'Y_pred', 'rank']]

In [18]:
logreg_recommendations = final_rec_result.copy()

In [19]:
logreg_recommendations

Unnamed: 0,UserID,JobID,Y_prob,Y_pred,rank
0,13,821691,0.946082,1,0
1,13,129969,0.538867,1,1
2,13,860580,0.537347,1,2
3,13,145443,0.530015,1,3
4,13,1092282,0.522887,1,4
...,...,...,...,...,...
15,1471988,858776,0.456862,0,15
16,1471988,51720,0.456746,0,16
17,1471988,805692,0.456715,0,17
18,1471988,919721,0.456324,0,18


## Model: 'dt'

In [22]:
model = 'dt'
rec20_dt = rec_result[model]

In [23]:
final_rec_result = pd.DataFrame(columns = ['JobID', 'Y_prob', 'Y_pred','UserID','rank'])

for user_index in range(len(user_ids)):
    
    temp_df = get_rec_result_df(user_index = user_index, 
                                rec_result=rec20_dt)
    # print(u_id, len(temp_df))
    final_rec_result = pd.concat([final_rec_result, temp_df])

final_rec_result = final_rec_result[['UserID','JobID', 'Y_prob', 'Y_pred', 'rank']]

In [24]:
dt_recommendations = final_rec_result.copy()

## Model: XGBoost

In [27]:
model = 'xgbt'
rec20_xgbt = rec_result[model]

In [28]:
final_rec_result = pd.DataFrame(columns = ['JobID', 'Y_prob', 'Y_pred','UserID','rank'])

for user_index in range(len(user_ids)):
    
    temp_df = get_rec_result_df(user_index = user_index, 
                                rec_result=rec20_xgbt)
    # print(u_id, len(temp_df))
    final_rec_result = pd.concat([final_rec_result, temp_df])

final_rec_result = final_rec_result[['UserID','JobID', 'Y_prob', 'Y_pred', 'rank']]

In [29]:
xgbt_recommendations = final_rec_result.copy()

## Model: Naive Bayes

In [32]:
model = 'nb'
rec20_nb = rec_result[model]

In [33]:
final_rec_result = pd.DataFrame(columns = ['JobID', 'Y_prob', 'Y_pred','UserID','rank'])

for user_index in range(len(user_ids)):
    
    temp_df = get_rec_result_df(user_index = user_index, 
                                rec_result=rec20_nb)
    # print(u_id, len(temp_df))
    final_rec_result = pd.concat([final_rec_result, temp_df])

final_rec_result = final_rec_result[['UserID','JobID', 'Y_prob', 'Y_pred', 'rank']]

In [34]:
nb_recommendations = final_rec_result.copy()

## Model: AdaBoost

In [37]:
model = 'ada'
rec20_ada = rec_result[model]

In [38]:
final_rec_result = pd.DataFrame(columns = ['JobID', 'Y_prob', 'Y_pred','UserID','rank'])

for user_index in range(len(user_ids)):
    
    temp_df = get_rec_result_df(user_index = user_index, 
                                rec_result=rec20_ada)
    # print(u_id, len(temp_df))
    final_rec_result = pd.concat([final_rec_result, temp_df])

final_rec_result = final_rec_result[['UserID','JobID', 'Y_prob', 'Y_pred', 'rank']]

In [39]:
ada_recommendations = final_rec_result.copy()

## Model: lda

In [42]:
model = 'lda'
rec20_lda = rec_result[model]

In [43]:
final_rec_result = pd.DataFrame(columns = ['JobID', 'Y_prob', 'Y_pred','UserID','rank'])

for user_index in range(len(user_ids)):
    
    temp_df = get_rec_result_df(user_index = user_index, 
                                rec_result=rec20_lda)
    # print(u_id, len(temp_df))
    final_rec_result = pd.concat([final_rec_result, temp_df])

final_rec_result = final_rec_result[['UserID','JobID', 'Y_prob', 'Y_pred', 'rank']]

In [44]:
lda_recommendations = final_rec_result.copy()

## Model: qda

In [47]:
model = 'qda'
rec20_qda = rec_result[model]

In [48]:
final_rec_result = pd.DataFrame(columns = ['JobID', 'Y_prob', 'Y_pred','UserID','rank'])

for user_index in range(len(user_ids)):
    
    temp_df = get_rec_result_df(user_index = user_index, 
                                rec_result=rec20_qda)
    # print(u_id, len(temp_df))
    final_rec_result = pd.concat([final_rec_result, temp_df])

final_rec_result = final_rec_result[['UserID','JobID', 'Y_prob', 'Y_pred', 'rank']]

In [49]:
qda_recommendations = final_rec_result.copy()

In [52]:
benchmark = 'evaluation_knn_tidif'

## Model: logreg

In [53]:
model = 'logreg'

In [54]:
%%time
# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = logreg_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = logreg_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = logreg_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_20}")

final hit_count: 0
Model: logreg - Hit rate at 5: 0.0
final hit_count: 0
Model: logreg - Hit rate at 10: 0.0
final hit_count: 0
Model: logreg - Hit rate at 20: 0.0
CPU times: user 1min 12s, sys: 19.5 ms, total: 1min 12s
Wall time: 1min 12s


In [55]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: dt

In [56]:
model = 'dt'

In [57]:
%%time
# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = dt_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = dt_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = dt_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_20}")

final hit_count: 0
Model: dt - Hit rate at 5: 0.0
final hit_count: 0
Model: dt - Hit rate at 10: 0.0
final hit_count: 0
Model: dt - Hit rate at 20: 0.0
CPU times: user 1min 11s, sys: 23.7 ms, total: 1min 11s
Wall time: 1min 11s


In [58]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: XGBoost (xgbt)

In [59]:
%%time
model = 'xgbt'
# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = xgbt_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = xgbt_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = xgbt_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_20}")

final hit_count: 0
Model: xgbt - Hit rate at 5: 0.0
final hit_count: 0
Model: xgbt - Hit rate at 10: 0.0
final hit_count: 0
Model: xgbt - Hit rate at 20: 0.0
CPU times: user 1min 10s, sys: 19.8 ms, total: 1min 10s
Wall time: 1min 10s


In [60]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: Naive Bayes

In [61]:
%%time
model = 'nb'
# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = nb_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = nb_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = nb_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_20}")

final hit_count: 0
Model: nb - Hit rate at 5: 0.0
final hit_count: 0
Model: nb - Hit rate at 10: 0.0
final hit_count: 0
Model: nb - Hit rate at 20: 0.0
CPU times: user 1min 12s, sys: 15.7 ms, total: 1min 12s
Wall time: 1min 12s


In [62]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: AdaBoost

In [63]:
%%time
model = 'ada'

# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = ada_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = ada_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = ada_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_20}")

final hit_count: 0
Model: ada - Hit rate at 5: 0.0
final hit_count: 0
Model: ada - Hit rate at 10: 0.0
final hit_count: 0
Model: ada - Hit rate at 20: 0.0
CPU times: user 1min 13s, sys: 15.7 ms, total: 1min 13s
Wall time: 1min 13s


In [64]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: lda

In [65]:
%%time
model = 'lda'
# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = lda_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = lda_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = lda_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_20}")

final hit_count: 0
Model: lda - Hit rate at 5: 0.0
final hit_count: 0
Model: lda - Hit rate at 10: 0.0
final hit_count: 0
Model: lda - Hit rate at 20: 0.0
CPU times: user 1min 10s, sys: 19.8 ms, total: 1min 11s
Wall time: 1min 11s


In [66]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: qda

In [67]:
%%time
model = 'qda'

# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = qda_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = qda_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = qda_recommendations, 
                                   rankings = evaluation_data, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k}: {hit_20}")

final hit_count: 0
Model: qda - Hit rate at 5: 0.0
final hit_count: 0
Model: qda - Hit rate at 10: 0.0
final hit_count: 0
Model: qda - Hit rate at 20: 0.0
CPU times: user 1min 10s, sys: 15.9 ms, total: 1min 10s
Wall time: 1min 10s


In [68]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

# Export all df of rec result for re-use

In [69]:
logreg_recommendations.to_csv('./output_topN_tabular/rec_result_logreg_random_ranking.csv', header=True, index=False)
dt_recommendations.to_csv('./output_topN_tabular/rec_result_dt_random_ranking.csv', header=True, index=False)
nb_recommendations.to_csv('./output_topN_tabular/rec_result_nb_random_ranking.csv', header=True, index=False)
xgbt_recommendations.to_csv('./output_topN_tabular/rec_result_xgbt_random_ranking.csv', header=True, index=False)
lda_recommendations.to_csv('./output_topN_tabular/rec_result_lda_random_ranking.csv', header=True, index=False)
qda_recommendations.to_csv('./output_topN_tabular/rec_result_qda_random_ranking.csv', header=True, index=False)

In [70]:
# Load all interaction data (LDA tabular with side information)
train_data_extended = pd.read_csv('./xai_recsys/train_data_extended.csv')
test_data_extended = pd.read_csv('./xai_recsys/test_data_extended.csv')

In [71]:
# Positive interaction (from original data)
select_cols = ['UserID', 'JobID', 'label','Split']
actual_interaction_train = train_data_extended[select_cols]
actual_interaction_test = test_data_extended[select_cols]
actual_interaction_df = pd.concat([actual_interaction_train,actual_interaction_test])

In [72]:
len(actual_interaction_train), len(actual_interaction_test)

(563889, 15736)

In [73]:
benchmark = 'actual_test_interaction'

## Model: logreg

In [74]:
%%time
model = 'logreg'

# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = logreg_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - actual test interaction: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = logreg_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - actual test interaction: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = logreg_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - actual test interaction: {hit_20}")

final hit_count: 3253
Model: logreg - Hit rate at 5 - actual test interaction: 0.8754036598493004
final hit_count: 3273
Model: logreg - Hit rate at 10 - actual test interaction: 0.8807857911733046
final hit_count: 3329
Model: logreg - Hit rate at 20 - actual test interaction: 0.8958557588805167
CPU times: user 1min 7s, sys: 7.93 ms, total: 1min 7s
Wall time: 1min 7s


In [75]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: dt

In [76]:
%%time
model = 'dt'
# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = dt_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - actual test interaction: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = dt_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - actual test interaction: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = dt_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - actual test interaction: {hit_20}")

final hit_count: 3298
Model: dt - Hit rate at 5 - actual test interaction: 0.88751345532831
final hit_count: 3298
Model: dt - Hit rate at 10 - actual test interaction: 0.88751345532831
final hit_count: 3304
Model: dt - Hit rate at 20 - actual test interaction: 0.8891280947255114
CPU times: user 1min 10s, sys: 15.9 ms, total: 1min 10s
Wall time: 1min 10s


In [77]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: xgbt

In [78]:
%%time
model = 'xgbt'

# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = xgbt_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - actual test interaction: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = xgbt_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - actual test interaction: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = xgbt_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - actual test interaction: {hit_20}")

final hit_count: 3716
Model: xgbt - Hit rate at 5 - actual test interaction: 1.0
final hit_count: 3716
Model: xgbt - Hit rate at 10 - actual test interaction: 1.0
final hit_count: 3716
Model: xgbt - Hit rate at 20 - actual test interaction: 1.0
CPU times: user 1min 9s, sys: 35.8 ms, total: 1min 9s
Wall time: 1min 9s


In [79]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: nb

In [80]:
%%time
model = 'nb'
# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = nb_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - actual test interaction: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = nb_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - actual test interaction: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = nb_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - actual test interaction: {hit_20}")

final hit_count: 3262
Model: nb - Hit rate at 5 - actual test interaction: 0.8778256189451022
final hit_count: 3298
Model: nb - Hit rate at 10 - actual test interaction: 0.88751345532831
final hit_count: 3355
Model: nb - Hit rate at 20 - actual test interaction: 0.9028525296017222
CPU times: user 1min 4s, sys: 15.9 ms, total: 1min 4s
Wall time: 1min 4s


In [81]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: lda

In [82]:
%%time
model = 'lda'

# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = lda_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - actual test interaction: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = lda_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - actual test interaction: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = lda_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - actual test interaction: {hit_20}")

final hit_count: 3251
Model: lda - Hit rate at 5 - actual test interaction: 0.8748654467168999
final hit_count: 3273
Model: lda - Hit rate at 10 - actual test interaction: 0.8807857911733046
final hit_count: 3327
Model: lda - Hit rate at 20 - actual test interaction: 0.8953175457481163
CPU times: user 1min 9s, sys: 3.93 ms, total: 1min 9s
Wall time: 1min 9s


In [83]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: qda

In [84]:
%%time
model = 'qda'

# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = qda_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - actual test interaction: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = qda_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - actual test interaction: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = qda_recommendations, 
                                   rankings = actual_interaction_test, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - actual test interaction: {hit_20}")

final hit_count: 3260
Model: qda - Hit rate at 5 - actual test interaction: 0.8772874058127018
final hit_count: 3294
Model: qda - Hit rate at 10 - actual test interaction: 0.8864370290635092
final hit_count: 3365
Model: qda - Hit rate at 20 - actual test interaction: 0.9055435952637244
CPU times: user 1min 6s, sys: 7.89 ms, total: 1min 6s
Wall time: 1min 6s


In [85]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

In [86]:
benchmark = 'all_interaction_data'

## Model: logreg

In [87]:
%%time
model = 'logreg'

# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = logreg_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - all actual interaction: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = logreg_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - all actual interaction: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = logreg_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - all actual interaction: {hit_20}")

final hit_count: 3253
Model: logreg - Hit rate at 5 - all actual interaction: 0.8754036598493004
final hit_count: 3273
Model: logreg - Hit rate at 10 - all actual interaction: 0.8807857911733046
final hit_count: 3329
Model: logreg - Hit rate at 20 - all actual interaction: 0.8958557588805167
CPU times: user 1min 18s, sys: 27.9 ms, total: 1min 18s
Wall time: 1min 18s


In [88]:
models.append(model)
benchmarks.append(benchmark)

hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: dt

In [89]:
%%time
model = 'dt'

# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = dt_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - all actual interaction: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = dt_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - all actual interaction: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = dt_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - all actual interaction: {hit_20}")

final hit_count: 3298
Model: dt - Hit rate at 5 - all actual interaction: 0.88751345532831
final hit_count: 3298
Model: dt - Hit rate at 10 - all actual interaction: 0.88751345532831
final hit_count: 3304
Model: dt - Hit rate at 20 - all actual interaction: 0.8891280947255114
CPU times: user 1min 17s, sys: 23.9 ms, total: 1min 17s
Wall time: 1min 17s


In [90]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: xgbt

In [91]:
%%time
model = 'xgbt'

# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = xgbt_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - all actual interaction: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = xgbt_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - all actual interaction: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = xgbt_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - all actual interaction: {hit_20}")

final hit_count: 3716
Model: xgbt - Hit rate at 5 - all actual interaction: 1.0
final hit_count: 3716
Model: xgbt - Hit rate at 10 - all actual interaction: 1.0
final hit_count: 3716
Model: xgbt - Hit rate at 20 - all actual interaction: 1.0
CPU times: user 1min 18s, sys: 11.9 ms, total: 1min 18s
Wall time: 1min 18s


In [92]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: nb

In [93]:
%%time
model = 'nb'

# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = nb_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - all actual interaction: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = nb_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - all actual interaction: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = nb_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - all actual interaction: {hit_20}")

final hit_count: 3262
Model: nb - Hit rate at 5 - all actual interaction: 0.8778256189451022
final hit_count: 3298
Model: nb - Hit rate at 10 - all actual interaction: 0.88751345532831
final hit_count: 3355
Model: nb - Hit rate at 20 - all actual interaction: 0.9028525296017222
CPU times: user 1min 18s, sys: 28 ms, total: 1min 18s
Wall time: 1min 18s


In [94]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: lda

In [95]:
%%time
model = 'lda'

# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = lda_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - all actual interaction: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = lda_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - all actual interaction: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = lda_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - all actual interaction: {hit_20}")

final hit_count: 3251
Model: lda - Hit rate at 5 - all actual interaction: 0.8748654467168999
final hit_count: 3273
Model: lda - Hit rate at 10 - all actual interaction: 0.8807857911733046
final hit_count: 3327
Model: lda - Hit rate at 20 - all actual interaction: 0.8953175457481163
CPU times: user 1min 16s, sys: 11.9 ms, total: 1min 16s
Wall time: 1min 16s


In [96]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

## Model: qda

In [97]:
%%time
model = 'qda'

# Calculate hit rate at K (precision at K)
k = 5  # Set the desired value of K
hit_5 = get_hit_rate_at_k(recommendations = qda_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - all actual interaction: {hit_5}")

k = 10  # Set the desired value of K
hit_10 = get_hit_rate_at_k(recommendations = qda_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - all actual interaction: {hit_10}")

k = 20  # Set the desired value of K
hit_20 = get_hit_rate_at_k(recommendations = qda_recommendations, 
                                   rankings = actual_interaction_df, 
                                   K=k)
print(f"Model: {model} - Hit rate at {k} - all actual interaction: {hit_20}")

final hit_count: 3260
Model: qda - Hit rate at 5 - all actual interaction: 0.8772874058127018
final hit_count: 3294
Model: qda - Hit rate at 10 - all actual interaction: 0.8864370290635092
final hit_count: 3365
Model: qda - Hit rate at 20 - all actual interaction: 0.9055435952637244
CPU times: user 1min 19s, sys: 40 ms, total: 1min 19s
Wall time: 1min 19s


In [98]:
models.append(model)
benchmarks.append(benchmark)
hits_5.append(hit_5)
hits_10.append(hit_10)
hits_20.append(hit_20)

In [99]:
len(models), len(benchmarks), len(hits_5), len(hits_10), len(hits_20)

(26, 26, 26, 26, 26)

In [100]:
baseline_hit = pd.DataFrame(columns = ["model","hit_5", "hit_10", "hit_20", 'benchmark'])
baseline_hit.model = models
baseline_hit.benchmark = benchmarks
baseline_hit.hit_5 = hits_5
baseline_hit.hit_10 = hits_10
baseline_hit.hit_20 = hits_20

In [101]:
baseline_hit

Unnamed: 0,model,hit_5,hit_10,hit_20,benchmark
0,logreg,0.875404,0.880786,0.895856,ranking_data_random
1,dt,0.887513,0.887513,0.889128,ranking_data_random
2,xgbt,1.0,1.0,1.0,ranking_data_random
3,nb,0.877826,0.887513,0.902853,ranking_data_random
4,ada,0.874327,0.880517,0.896125,ranking_data_random
5,lda,0.874865,0.880786,0.895318,ranking_data_random
6,qda,0.874865,0.880786,0.895318,ranking_data_random
7,logreg,0.0,0.0,0.0,evaluation_knn_tidif
8,dt,0.0,0.0,0.0,evaluation_knn_tidif
9,xgbt,0.0,0.0,0.0,evaluation_knn_tidif


In [120]:
baseline_hit.to_csv('./output_topN_tabular/hit_rate_tfidf_ranking_random.csv', header=True, index=False)