In [1]:
import json
import re
import pickle
import pandas as pd
import random

Helper that pads an incomplete recommendation list with random items drawn from the candidate set.

In [2]:
def add_random_recs(initial_recs, candidate_set, userid, top_n=10) -> (list, int):
    """Validate a reranked list and pad it to ``top_n`` items with random candidates.

    Args:
        initial_recs: Item ids parsed from the model output, in ranked order.
        candidate_set: Iterable of item ids that are valid recommendations.
            Ids in ``initial_recs`` that are not in it are discarded.
        userid: Only used in the error message when padding is impossible.
        top_n: Length of the returned list. Defaults to 10, the value that was
            previously hard-coded.

    Returns:
        ``(recs, n_added)`` where ``recs`` holds exactly ``top_n`` distinct ids
        (the valid initial ones first, then the random fill) and ``n_added`` is
        the number of randomly drawn ids.

    Raises:
        ValueError: if ``candidate_set`` does not contain enough unused ids to
            reach ``top_n``. Previously this case printed debug output and then
            failed on an unbound local variable.
    """
    # Distinct candidates in their original order. Unlike iterating a set, this
    # order is stable, so seeding `random` makes the padding reproducible.
    candidates = list(dict.fromkeys(candidate_set))
    allowed = set(candidates)

    # Keep valid ids only, and drop repeats so the final list has no duplicates.
    kept = []
    seen = set()
    for item in initial_recs:
        if item in allowed and item not in seen:
            seen.add(item)
            kept.append(item)

    # The model sometimes returns more items than requested.
    kept = kept[:top_n]

    n_missing = top_n - len(kept)
    if n_missing == 0:
        return kept, 0

    # When padding is needed nothing was truncated above, so `seen` is exactly
    # the set of ids already in `kept`.
    pool = [item for item in candidates if item not in seen]
    if len(pool) < n_missing:
        raise ValueError(
            f"user {userid}: need {n_missing} more recommendations but only "
            f"{len(pool)} unused candidates are available"
        )

    sampled = random.sample(pool, n_missing)
    return kept + sampled, n_missing

<h2>Re-parse recs for Llama 2 (Goodreads)</h2>

In [366]:
recs_folder = "/home/diego/chat-reranking/experiments/goodreads/recs/reranked"
recs_name = "instructgpt-div-p6-pzt-fold_0"
top_n = 10

# Read the reranked recommendations. The context manager closes the file even
# if the JSON is malformed and json.load raises.
with open(f"{recs_folder}/{recs_name}.json") as f:
    data = json.load(f)

In [367]:
# Pick the title->id mapping that matches the prompt variant encoded in the
# run name: the listed prompt ids use the "name + genres" mapping, every other
# run uses the plain-name mapping.
# NOTE(review): these are substring tests, so "p5" would also match a run name
# containing e.g. "p51" -- confirm prompt ids never collide like that.
if any(tag in recs_name for tag in ("p11", "p12", "p5", "p6")):
    mapping_path = "/home/diego/chat-reranking/experiments/goodreads/itemnamegenres_to_id.pkl"
else:
    mapping_path = "/home/diego/chat-reranking/experiments/goodreads/itemname_to_id.pkl"

# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
with open(mapping_path, 'rb') as fp:
    itemname_to_id = pickle.load(fp)

In [368]:
def clean_name_goodreads(name: str, prompt:str) -> (int, str):
    """Repair a book title emitted by the model so it can be looked up.

    Args:
        name: Title as it appears in the model output.
        prompt: Prompt text that was sent to the model.

    Returns:
        ``(fixed, title)``. ``fixed`` is 1 when one of the rewrite rules below
        fired and 0 otherwise; ``title`` is the (possibly rewritten) title.
    """
    # A title that occurs verbatim in the prompt is returned untouched.
    if name in prompt:
        return 0, name

    title = name.replace('"', "")

    # Rules are tried in this order and the first one that applies wins.

    # "Title (#3)" -> "Title"
    head, series_marker, _ = title.partition(" (#")
    if series_marker:
        return 1, head

    # Any other parenthesis: keep what precedes the first " (".
    if "(" in title:
        return 1, title.partition(" (")[0]

    # "Title by Author" -> "Title", only when " by " occurs exactly once.
    if title.count(" by ") == 1:
        return 1, title.partition(" by ")[0]

    # "Title - Genres" -> "Title (genres)", only for exactly one " - ".
    # (In prompts p11/p12 the list of genres is given after a dash.)
    if title.count(" - ") == 1:
        book, _, genres = title.partition(" - ")
        return 1, f"{book} ({genres.lower()})"

    return 0, title

def parse_raw_output_goodreads(raw_output:str, prompt:str=None, name_to_id:dict=None) -> list:
    """Extract the ranked item ids from the raw text produced by the model.

    Two line formats are recognised:

    * ``N-> Title``: everything after the first ``"-> "`` is the title.
    * ``N. Title`` / ``N) Title``: a list number at the start of the line.

    Each title is normalised with ``clean_name_goodreads`` and then looked up;
    lines that do not match a format or whose title is unknown are skipped.

    Args:
        raw_output: Full text returned by the model.
        prompt: Prompt that produced ``raw_output``. When omitted, the
            notebook global ``entry["prompt"]`` is used, as before.
        name_to_id: Mapping from title to item id. When omitted, the notebook
            global ``itemname_to_id`` is used, as before.

    Returns:
        Item ids in the order they appear in ``raw_output``.
    """
    # Backward compatibility: existing callers pass only `raw_output` and rely
    # on the loop variable `entry` and the mapping loaded in an earlier cell.
    if prompt is None:
        prompt = entry["prompt"]
    if name_to_id is None:
        name_to_id = itemname_to_id

    # The list number must open the line (optionally after bullet characters)
    # and be followed by a literal "." or ")". The previous pattern used an
    # unescaped "." and matched anywhere in the line, so digits inside a title
    # ("2. Catch-22: A Novel") cut the title short.
    numbered_line = r"\W*\d{1,3}[.)]\s+(.*)"

    reranked_recs = []
    for line in raw_output.splitlines():
        _, arrow, after_arrow = line.partition("-> ")
        if arrow:
            item_name = after_arrow
        else:
            match = re.match(numbered_line, line)
            if match is None:
                continue
            item_name = match.group(1)

        _, cleaned_name = clean_name_goodreads(item_name, prompt)

        # Unknown titles are skipped explicitly instead of via a blanket
        # `except Exception`, so unrelated errors are no longer hidden.
        if cleaned_name in name_to_id:
            reranked_recs.append(name_to_id[cleaned_name])
    return reranked_recs

In [369]:
new_data = []  # rewritten entries, one per input entry
n_random_added = 0
for entry in data:
    # Re-derive the ranking from the raw model text. The loop variable has to
    # stay named `entry`: the parser reads entry["prompt"] as a global.
    new_recs = parse_raw_output_goodreads(entry["raw_gpt_outputs"])

    # Fill incomplete rankings with random items from the candidate list.
    # NOTE(review): no random seed is set in the visible cells, so the fill
    # differs between runs -- confirm whether that is intended.
    padded_recs, m = add_random_recs(new_recs, entry["recs"], entry["userid"])
    n_random_added += m

    # Shallow copy of the original entry with the ranking replaced.
    new_entry = dict(entry)
    new_entry['reranked_recs'] = padded_recs
    new_data.append(new_entry)
print(f"# of random recommendations: {n_random_added}")

# of random recommendations: 182


In [370]:
# Persist the re-parsed entries as a JSON array of records, using the same
# file name as the input run.
out_folder = "/home/diego/chat-reranking/experiments/goodreads/recs/reranked_final"
out_path = f'{out_folder}/{recs_name}.json'
df = pd.DataFrame.from_dict(new_data)
df.to_json(out_path, orient="records")

In [184]:
recs_folder = "/home/diego/chat-reranking/experiments/goodreads/recs/reranked_final"
recs_name = "Llama-2-7b-chat-hf-div-p7-pzt-fold_0"
top_n = 10

# Read a finalized run back for inspection. The context manager closes the
# file even if json.load raises.
with open(f"{recs_folder}/{recs_name}.json") as f:
    data = json.load(f)

In [185]:
# Sanity check: print the user id of entries that still have fewer than 10
# recommendations, stopping after the first 10 offenders.
n_take = 10
for entry in data:
    if len(entry["reranked_recs"]) >= 10:
        continue
    print(entry["userid"])
    n_take -= 1
    if n_take == 0:
        break

<h2>Re-parse recs for Llama 2 (anime)</h2>

In [3]:
recs_folder = "/home/diego/chat-reranking/experiments/anime/recs/reranked/"
# recs_name = "Llama-2-13b-chat-hf-div-p9-pzt-fold_0"
recs_name = "chatgpt0613-div-p21-pzt-fold_0"
top_n = 10

# Read the reranked recommendations. The context manager closes the file even
# if the JSON is malformed and json.load raises.
with open(f"{recs_folder}/{recs_name}.json") as f:
    data = json.load(f)

In [4]:
# Pick the title->id mapping that matches the prompt variant encoded in the
# run name: "name + genres", "name + plot", or the plain-name mapping.
# NOTE(review): these are substring tests, so "p5" would also match a run name
# containing e.g. "p51" -- confirm prompt ids never collide like that.
if any(tag in recs_name for tag in ("p11", "p12", "p5", "p6")):
    mapping_path = "/home/diego/chat-reranking/experiments/anime/itemnamegenres_to_id.pkl"
elif any(tag in recs_name for tag in ("p21", "p22")):
    mapping_path = "/home/diego/chat-reranking/experiments/anime/itemnameplot_to_id.pkl"
else:
    mapping_path = "/home/diego/chat-reranking/experiments/anime/itemname_to_id.pkl"

# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
with open(mapping_path, 'rb') as fp:
    itemname_to_id = pickle.load(fp)

In [5]:
def clean_name_anime(name: str, prompt:str) -> (int, str):
    """Repair an anime title emitted by the model so it can be looked up.

    Args:
        name: Title as it appears in the model output.
        prompt: Prompt text that was sent to the model.

    Returns:
        ``(fixed, title)``. ``fixed`` is 1 when a parenthesised suffix was
        removed and 0 when the title is returned unchanged.
    """
    # A title that occurs verbatim in the prompt is returned untouched.
    if name in prompt:
        return 0, name

    if "(" in name:
        # Keep the text before the first parenthesis. Trailing whitespace is
        # stripped rather than blindly dropping one character, which used to
        # eat a letter when no space preceded "(" ("Naruto(TV)" -> "Narut").
        return 1, name.split("(")[0].rstrip()

    return 0, name

def parse_raw_output_anime(raw_output:str, prompt:str=None, name_to_id:dict=None) -> list:
    """Extract the ranked item ids from the raw text produced by the model.

    Two line formats are recognised:

    * ``N-> Title``: everything after the first ``"-> "`` is the title.
    * ``N. Title`` / ``N) Title``: a list number at the start of the line.

    Each title is normalised with ``clean_name_anime`` and then looked up;
    lines that do not match a format or whose title is unknown are skipped.

    Args:
        raw_output: Full text returned by the model.
        prompt: Prompt that produced ``raw_output``. When omitted, the
            notebook global ``entry["prompt"]`` is used, as before.
        name_to_id: Mapping from title to item id. When omitted, the notebook
            global ``itemname_to_id`` is used, as before.

    Returns:
        Item ids in the order they appear in ``raw_output``.
    """
    # Backward compatibility: existing callers pass only `raw_output` and rely
    # on the loop variable `entry` and the mapping loaded in an earlier cell.
    if prompt is None:
        prompt = entry["prompt"]
    if name_to_id is None:
        name_to_id = itemname_to_id

    # The list number must open the line (optionally after bullet characters)
    # and be followed by a literal "." or ")". The previous pattern used an
    # unescaped "." and matched anywhere in the line, so digits inside a title
    # could cut the title short.
    numbered_line = r"\W*\d{1,3}[.)]\s+(.*)"

    reranked_recs = []
    for line in raw_output.splitlines():
        _, arrow, after_arrow = line.partition("-> ")
        if arrow:
            item_name = after_arrow
        else:
            match = re.match(numbered_line, line)
            if match is None:
                continue
            item_name = match.group(1)

        _, cleaned_name = clean_name_anime(item_name, prompt)

        # Unknown titles are skipped explicitly instead of via a blanket
        # `except Exception`, so unrelated errors are no longer hidden.
        if cleaned_name in name_to_id:
            reranked_recs.append(name_to_id[cleaned_name])
    return reranked_recs

In [6]:
new_data = []  # rewritten entries, one per input entry
n_random_added = 0
for entry in data:
    # Re-derive the ranking from the raw model text. The loop variable has to
    # stay named `entry`: the parser reads entry["prompt"] as a global.
    new_recs = parse_raw_output_anime(entry["raw_gpt_outputs"])

    # Fill incomplete rankings with random items from the candidate list.
    # NOTE(review): no random seed is set in the visible cells, so the fill
    # differs between runs -- confirm whether that is intended.
    padded_recs, m = add_random_recs(new_recs, entry["recs"], entry["userid"])
    n_random_added += m

    # Shallow copy of the original entry with the ranking replaced.
    new_entry = dict(entry)
    new_entry['reranked_recs'] = padded_recs
    new_data.append(new_entry)
print(f"# of random recommendations: {n_random_added}")

# of random recommendations: 1437


In [38]:
# Persist the re-parsed entries as a JSON array of records, using the same
# file name as the input run.
out_folder = "/home/diego/chat-reranking/experiments/anime/recs/reranked_final"
out_path = f'{out_folder}/{recs_name}.json'
df = pd.DataFrame.from_dict(new_data)
df.to_json(out_path, orient="records")

In [229]:
recs_folder = "/home/diego/chat-reranking/experiments/anime/recs/reranked_final"
recs_name = "Llama-2-7b-chat-hf-div-p7-pzt-fold_0"
top_n = 10

# Read a finalized run back for inspection. The context manager closes the
# file even if json.load raises.
with open(f"{recs_folder}/{recs_name}.json") as f:
    data = json.load(f)

In [230]:
# Sanity check: print the user id of entries that still have fewer than 10
# recommendations, stopping after the first 10 offenders.
n_take = 10
for entry in data:
    if len(entry["reranked_recs"]) >= 10:
        continue
    print(entry["userid"])
    n_take -= 1
    if n_take == 0:
        break

<h2>Re-parse recs for GPT (anime plots)</h2>

In [39]:
recs_folder = "/home/diego/chat-reranking/experiments/anime/recs/reranked/"
recs_name = "chatgpt0613-div-p21-pzt-fold_0"
top_n = 10

# Read the reranked recommendations. The context manager closes the file even
# if the JSON is malformed and json.load raises.
with open(f"{recs_folder}/{recs_name}.json") as f:
    data = json.load(f)

In [40]:
# Both anime mappings are needed in this section: the "name + plot" one and
# the plain-name one.
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
plot_mapping_path = "/home/diego/chat-reranking/experiments/anime/itemnameplot_to_id.pkl"
name_mapping_path = "/home/diego/chat-reranking/experiments/anime/itemname_to_id.pkl"

with open(plot_mapping_path, 'rb') as fp:
    itemnameplot_to_id = pickle.load(fp)
with open(name_mapping_path, 'rb') as fp:
    itemname_to_id = pickle.load(fp)

In [41]:
def clean_name_anime(name: str, prompt:str) -> (int, str):
    """Normalise a title emitted by the model before looking it up.

    This redefines the ``clean_name_anime`` from the earlier anime section.

    Two repairs are applied: ``": "`` is collapsed to ``":"`` and trailing
    spaces are removed. Both are now applied together; previously a title
    containing ``": "`` kept its trailing space, and an empty title raised
    ``IndexError``.

    Args:
        name: Title as it appears in the model output.
        prompt: Unused; kept so the signature matches the other cleaners.

    Returns:
        ``(fixed, title)`` where ``fixed`` is 1 if the title was changed and 0
        otherwise.
    """
    cleaned = name.replace(": ", ":").rstrip(" ")
    return int(cleaned != name), cleaned

def parse_raw_output_anime(raw_output:str) -> list:
    """Extract the ranked item ids from ``N-> Title`` lines of the model output.

    This redefines the ``parse_raw_output_anime`` from the earlier anime
    section. It reads the notebook globals ``itemname_to_id``,
    ``itemnameplot_to_id`` and ``entry``.

    For every line containing ``"-> "`` the text between the first and second
    arrow is looked up, first in the plain-name mapping and then in the
    name+plot mapping. If neither knows it, the title is cleaned with
    ``clean_name_anime`` and looked up again. Lines that still cannot be
    resolved, or that raise, are printed. If fewer than 10 ids were resolved,
    the full list of lines is printed as well.

    Returns:
        The resolved item ids in output order.
    """
    n_fixed = 0
    output_lines = raw_output.splitlines()
    resolved_ids = []
    for line in output_lines:
        try:
            pieces = line.split("-> ")
            if len(pieces) <= 1:
                continue
            item_name = pieces[1]

            # Exact match: plain-name mapping takes precedence.
            if item_name in itemname_to_id:
                resolved_ids.append(itemname_to_id[item_name])
                continue
            if item_name in itemnameplot_to_id:
                resolved_ids.append(itemnameplot_to_id[item_name])
                continue

            # No exact match: retry with the cleaned title.
            fixed, cleaned_name = clean_name_anime(item_name, entry["prompt"])
            n_fixed += fixed
            if cleaned_name in itemname_to_id:
                resolved_ids.append(itemname_to_id[cleaned_name])
            elif cleaned_name in itemnameplot_to_id:
                resolved_ids.append(itemnameplot_to_id[cleaned_name])
            else:
                # Report the line that could not be mapped to an item id.
                print(line)
        except Exception as e:
            print(line)
            continue
    if len(resolved_ids) < 10:
        print(output_lines)
    return resolved_ids

In [42]:
new_data = []  # rewritten entries, one per input entry
n_random_added = 0
for entry in data:
    # Re-derive the ranking from the raw model text. The loop variable has to
    # stay named `entry`: the parser reads entry["prompt"] as a global.
    new_recs = parse_raw_output_anime(entry["raw_gpt_outputs"])

    # Fill incomplete rankings with random items from the candidate list.
    # NOTE(review): no random seed is set in the visible cells, so the fill
    # differs between runs -- confirm whether that is intended.
    padded_recs, m = add_random_recs(new_recs, entry["recs"], entry["userid"])
    n_random_added += m

    # Shallow copy of the original entry with the ranking replaced.
    new_entry = dict(entry)
    new_entry['reranked_recs'] = padded_recs
    new_data.append(new_entry)
print(f"# of random recommendations: {n_random_added}")

8-> Re:ZERO - Starting Life in Another World-
['1-> Attack on Titan', '2-> Fullmetal Alchemist: Brotherhood', '3-> Code Geass: Lelouch of the Rebellion', '4-> Sword Art Online', '5-> Demon Slayer: Kimetsu no Yaiba', '6-> One Punch Man', '7-> My Hero Academia', '8-> Re:ZERO - Starting Life in Another World-', '9-> Naruto: Shippuden', '10-> Food Wars! Shokugeki no Soma']
3-> Re:ZERO - Starting Life in Another World- Season 2
['1-> Attack on Titan Season 3 Part 2', '2-> Demon Slayer: Kimetsu no Yaiba', '3-> Re:ZERO - Starting Life in Another World- Season 2', '4-> The Promised Neverland', '5-> Sword Art Online: Alicization', '6-> Noragami Aragoto', '7-> Death Note', '8-> Haikyu!! 2nd Season', '9-> Toradora!', '10-> My Teen Romantic Comedy SNAFU Climax!']
3-> Re:ZERO - Starting Life in Another World- Season 2
['1-> Kaguya-sama: Love is War Season 2', '2-> Hyouka', '3-> Re:ZERO - Starting Life in Another World- Season 2', '4-> My Teen Romantic Comedy SNAFU Climax!', '5-> March Comes In Like

In [43]:
# Persist the re-parsed entries as a JSON array of records, using the same
# file name as the input run.
out_folder = "/home/diego/chat-reranking/experiments/anime/recs/reranked_final"
out_path = f'{out_folder}/{recs_name}.json'
df = pd.DataFrame.from_dict(new_data)
df.to_json(out_path, orient="records")

In [44]:
recs_folder = "/home/diego/chat-reranking/experiments/anime/recs/reranked_final"
top_n = 10

# Read the finalized run back for inspection; `recs_name` is the value set in
# an earlier cell. The context manager closes the file even if json.load raises.
with open(f"{recs_folder}/{recs_name}.json") as f:
    data = json.load(f)

In [45]:
# Sanity check: print the user id of entries that still have fewer than 10
# recommendations, stopping after the first 10 offenders.
n_take = 10
for entry in data:
    if len(entry["reranked_recs"]) >= 10:
        continue
    print(entry["userid"])
    n_take -= 1
    if n_take == 0:
        break

<h2>Re-parse recs for GPT (Goodreads plots)</h2>

In [32]:
recs_folder = "/home/diego/chat-reranking/experiments/goodreads/recs/reranked/"
recs_name = "chatgpt0613-div-p21-pzt-fold_0"
top_n = 10

# Read the reranked recommendations. The context manager closes the file even
# if the JSON is malformed and json.load raises.
with open(f"{recs_folder}/{recs_name}.json") as f:
    data = json.load(f)

In [33]:
# Both Goodreads mappings are needed in this section: the "name + plot" one
# and the plain-name one.
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
plot_mapping_path = "/home/diego/chat-reranking/experiments/goodreads/itemnameplot_to_id.pkl"
name_mapping_path = "/home/diego/chat-reranking/experiments/goodreads/itemname_to_id.pkl"

with open(plot_mapping_path, 'rb') as fp:
    itemnameplot_to_id = pickle.load(fp)
with open(name_mapping_path, 'rb') as fp:
    itemname_to_id = pickle.load(fp)

In [34]:
def clean_name_anime(name: str, prompt:str) -> (int, str):
    """Normalise a title emitted by the model before looking it up.

    NOTE(review): the name says "anime", but this copy sits in the
    Goodreads-plots section and shadows the earlier definitions of the same
    name.

    Two repairs are applied: ``": "`` is collapsed to ``":"`` and trailing
    spaces are removed. Both are now applied together; previously a title
    containing ``": "`` kept its trailing space, and an empty title raised
    ``IndexError``.

    Args:
        name: Title as it appears in the model output.
        prompt: Unused; kept so the signature matches the other cleaners.

    Returns:
        ``(fixed, title)`` where ``fixed`` is 1 if the title was changed and 0
        otherwise.
    """
    cleaned = name.replace(": ", ":").rstrip(" ")
    return int(cleaned != name), cleaned

def parse_raw_output_anime(raw_output:str) -> list:
    """Extract the ranked item ids from ``N-> Title`` lines of the model output.

    NOTE(review): the name says "anime", but this copy sits in the
    Goodreads-plots section and shadows the earlier definitions of the same
    name. It reads the notebook globals ``itemname_to_id``,
    ``itemnameplot_to_id`` and ``entry``.

    For every line containing ``"-> "`` the text between the first and second
    arrow is looked up, first in the plain-name mapping and then in the
    name+plot mapping. If neither knows it, the title is cleaned with
    ``clean_name_anime`` and looked up again. Lines that still cannot be
    resolved, or that raise, are printed. If fewer than 10 ids were resolved,
    the full list of lines is printed as well.

    Returns:
        The resolved item ids in output order.
    """
    n_fixed = 0
    output_lines = raw_output.splitlines()
    resolved_ids = []
    for line in output_lines:
        try:
            pieces = line.split("-> ")
            if len(pieces) <= 1:
                continue
            item_name = pieces[1]

            # Exact match: plain-name mapping takes precedence.
            if item_name in itemname_to_id:
                resolved_ids.append(itemname_to_id[item_name])
                continue
            if item_name in itemnameplot_to_id:
                resolved_ids.append(itemnameplot_to_id[item_name])
                continue

            # No exact match: retry with the cleaned title.
            fixed, cleaned_name = clean_name_anime(item_name, entry["prompt"])
            n_fixed += fixed
            if cleaned_name in itemname_to_id:
                resolved_ids.append(itemname_to_id[cleaned_name])
            elif cleaned_name in itemnameplot_to_id:
                resolved_ids.append(itemnameplot_to_id[cleaned_name])
            else:
                # Report the line that could not be mapped to an item id.
                print(line)
        except Exception as e:
            print(line)
            continue
    if len(resolved_ids) < 10:
        print(output_lines)
    return resolved_ids

In [35]:
new_data = []  # rewritten entries, one per input entry
n_random_added = 0
for entry in data:
    # Re-derive the ranking from the raw model text. The loop variable has to
    # stay named `entry`: the parser reads entry["prompt"] as a global.
    new_recs = parse_raw_output_anime(entry["raw_gpt_outputs"])

    # Fill incomplete rankings with random items from the candidate list.
    # NOTE(review): no random seed is set in the visible cells, so the fill
    # differs between runs -- confirm whether that is intended.
    padded_recs, m = add_random_recs(new_recs, entry["recs"], entry["userid"])
    n_random_added += m

    # Shallow copy of the original entry with the ranking replaced.
    new_entry = dict(entry)
    new_entry['reranked_recs'] = padded_recs
    new_data.append(new_entry)
print(f"# of random recommendations: {n_random_added}")

5-> Cry, the Beloved Country: A Novel of South Africa
['1-> The Terror', '2-> The Daodejing of Laozi', '3-> The Witch of Blackbird Pond and Related Readings', '4-> The Secret Life of Bees', '5-> Cry, the Beloved Country: A Novel of South Africa', '6-> In Cold Blood', '7-> The Letters of J.R.R. Tolkien', '8-> Fates Worse Than Death', "9-> Marley & Me: Life and Love with the World's Worst Dog", '10-> The Truth About the Drug Companies: How They Deceive Us and What to Do About It']
8-> The Color Purple [A collection of three novels by Alice Walker that explore themes of race, gender, and identity.]
["1-> Cumbres borrascosas [The Spanish translation of Emily Brontë's classic novel 'Wuthering Heights', which tells a tale of love, revenge, and passion.]", '2-> Dracula [The iconic vampire Count Dracula terrorizes Victorian England as a group of individuals try to stop his reign of darkness.]', '3-> The Art of War by Sun Tzu [A classic book on military strategy and tactics.]', '4-> The Catcher

In [36]:
# Persist the re-parsed entries as a JSON array of records, using the same
# file name as the input run.
out_folder = "/home/diego/chat-reranking/experiments/goodreads/recs/reranked_final"
out_path = f'{out_folder}/{recs_name}.json'
df = pd.DataFrame.from_dict(new_data)
df.to_json(out_path, orient="records")

In [37]:
recs_folder = "/home/diego/chat-reranking/experiments/goodreads/recs/reranked_final"
top_n = 10

# Read the finalized run back for inspection; `recs_name` is the value set in
# an earlier cell. The context manager closes the file even if json.load raises.
with open(f"{recs_folder}/{recs_name}.json") as f:
    data = json.load(f)

In [38]:
# Sanity check: print the user id of entries that still have fewer than 10
# recommendations, stopping after the first 10 offenders.
n_take = 10
for entry in data:
    if len(entry["reranked_recs"]) >= 10:
        continue
    print(entry["userid"])
    n_take -= 1
    if n_take == 0:
        break