In [36]:
import numpy as np
import pandas as pd
import random
import pickle
import json

<h2>Tuning the # of candidate items</h2>

In [29]:
# Location and name of the reranked recommendations analysed in this section.
recs_folder = "/home/diego/chat-reranking/experiments/goodreads/recs/reranked"
recs_name = "chatgpt0613-div-p1-pzt-fold_0_50_20"
top_n = 10

# Read the JSON file; the context manager closes the handle even if parsing fails.
with open(f"{recs_folder}/{recs_name}.json", encoding="utf-8") as f:
    data = json.load(f)

In [30]:
# For each entry, record the deepest baseline position (index in entry['recs'])
# reached by any of the reranked items, then summarise over all entries.
print("Tune the length of the candidate set: chatgpt")
max_pos = []
for entry in data:
    base = entry['recs']
    re_ranked = entry['reranked_recs']
    if len(re_ranked) > 0:
        # Reranked items that are absent from the baseline list count as
        # position 0, so they can never raise the per-entry maximum.
        pos = [base.index(item) if item in base else 0 for item in re_ranked]
        max_pos.append(max(pos))

print(f"Average max reranking pos: {np.mean(max_pos)}")
print(f"Average std reranking pos: {np.std(max_pos)}")

Tune the length of the candidate set: chatgpt
Average max reranking pos: 36.689655172413794
Average std reranking pos: 14.692487989764354


In [28]:
# Same analysis as for the ChatGPT reranker, applied to one baseline reranker:
# for each user, find the deepest baseline position reached by a reranked item.
rerankers = ["MMR-pzt-fold_0_50_20", "RxQuAD-pzt-fold_0_50_20", "xQuAD-pzt-fold_0_50_20", "Random-pzt-fold_0_50_20"]
reranker_name = rerankers[3]  # which baseline reranker to analyse
max_users = 300               # only the first 300 users (in file order) are analysed
candidate_size = 50           # only the first 50 baseline items per user are considered
rec_columns = ["userid", "itemid", "rating"]

print(f"Tune the length of the candidate set: baseline reranker {reranker_name}")
base_recs = pd.read_csv("/home/diego/chat-reranking/experiments/goodreads/recs/baselines/pzt-fold_0_50_20",
                        names=rec_columns, sep="\t")
re_ranked = pd.read_csv(f"/home/diego/chat-reranking/experiments/goodreads/recs/reranked/{reranker_name}",
                        names=rec_columns, sep="\t")

# Group each frame once instead of re-scanning it with a boolean mask for every user.
# sort=False keeps users in order of first appearance; rows keep their file order.
base_by_user = {
    uid: items.tolist()[:candidate_size]
    for uid, items in base_recs.groupby("userid", sort=False)["itemid"]
}
reranked_by_user = {
    uid: items.tolist()
    for uid, items in re_ranked.groupby("userid", sort=False)["itemid"]
}

max_pos = []
for userid in re_ranked["userid"].unique()[:max_users]:
    # A user without baseline recs gets an empty list, so base.index() below
    # raises ValueError exactly as it would for any item missing from the baseline.
    base = base_by_user.get(userid, [])
    recs = reranked_by_user[userid]

    pos = [base.index(r) for r in recs]
    max_pos.append(max(pos))
print(f"Average max reranking pos: {np.mean(max_pos)}")
print(f"Average std reranking pos: {np.std(max_pos)}")

Tune the length of the candidate set: baseline reranker Random-pzt-fold_0_50_20
Average max reranking pos: 45.45333333333333
Average std reranking pos: 3.9800949179078073


<h2>Convert rec files for Ranksys</h2>

In [47]:
# Location and name of the reranked recommendations to convert.
recs_folder = "/home/diego/chat-reranking/experiments/goodreads/recs/reranked"
recs_name = "chatgpt0613-div-p4-pzt-fold_0"
top_n = 10

# Read the JSON file; the context manager closes the handle even if parsing fails.
with open(f"{recs_folder}/{recs_name}.json", encoding="utf-8") as f:
    data = json.load(f)

In [48]:
# Load the fold-0 training interactions from a tab-separated file;
# column names are supplied explicitly.
train_folder = "/home/diego/chat-reranking/experiments/goodreads/fold_0/train_data.csv"
training_data = pd.read_csv(
    train_folder,
    sep="\t",
    names=["userid", "itemid", "rating"],
)

Some recommendations might contain
- fewer than 10 items
- items that are from the training set

We need to remove these items from the recommendations (and count them).

In [49]:
# Keep, per user, only the reranked items that also appear in that user's
# candidate set (entry['recs']), and count how many recommendations exist
# before and after this pruning.
# NOTE(review): items from the training data are not filtered here; presumably
# the candidate set already excludes them — confirm.
recs = {}
candidate_set = {}
tot_rec_before_pruning = 0
tot_rec_after_pruning = 0
for entry in data:
    userid = entry['userid']
    reranked = entry['reranked_recs']
    candidate_set[userid] = entry['recs']
    tot_rec_before_pruning += len(reranked)
    # Build a new list instead of deleting while iterating: deleting in place
    # skips the element that follows each removed item, and it would also
    # mutate `data`, so re-running the cell would report different counts.
    recs[userid] = [item for item in reranked if item in candidate_set[userid]]
    tot_rec_after_pruning += len(recs[userid])
print(f"# recs before pruning: {tot_rec_before_pruning}")
print(f"# recs after pruning: {tot_rec_after_pruning}")

# recs before pruning: 4878
# recs after pruning: 4857


In [50]:
# Summarise list lengths: how many users ended up with fewer than 10
# recommendations, how many with none, and the mean list length.
avg_len = [len(user_recs) for user_recs in recs.values()]
count = sum(1 for n_recs in avg_len if n_recs < 10)
count_none = sum(1 for n_recs in avg_len if n_recs == 0)
print(f"# of users with less than 10 recommendations: {count}")
print(f"# of users with no recommendations: {count_none}")
print(f"avg number of recommendations per user: {np.mean(avg_len)}")

# of users with less than 10 recommendations: 72
# of users with no recommendations: 6
avg number of recommendations per user: 9.714


In [51]:
# Write the final recommendations, one "userid<TAB>itemid<TAB>score" line per item.
# When exclude_violating_recs is True, users with fewer than top_n
# recommendations are skipped and counted; when False, their shorter lists
# are written as well.
exclude_violating_recs = True
excluded = 0
with open(f"{recs_folder}/{recs_name}", "w") as f:
    for userid, user_recs in recs.items():
        if exclude_violating_recs and len(user_recs) < top_n:
            excluded += 1
            continue
        # Scores only encode the rank order: the first item gets top_n and
        # each following item one less.
        f.writelines(
            f"{userid}\t{item}\t{float(top_n) - rank}\n"
            for rank, item in enumerate(user_recs)
        )
print("Printed!")
print(f"# of excluded users: {excluded}")

Printed!
# of excluded users: 72
