In [2]:
import json
import re
import pickle
import pandas as pd
import random

Method to add random recommendations from the candidate set when recommendation set is incomplete

In [7]:
def add_random_recs(initial_recs, candidate_set, userid) -> (list, int):
    
    initial_recs = [a for a in initial_recs if a in candidate_set]
    
    # a strange case where more than 10 items are recommeded
    if len(initial_recs) > 10:
        initial_recs = initial_recs[:10]

    if len(initial_recs) == 10:
        return initial_recs, 0
    
    new_candidates = list(set(candidate_set).difference(initial_recs))
    nn = 10 - len(initial_recs)
    random.shuffle(new_candidates)
    try:
        sampled = random.sample(new_candidates, nn)
    except Exception as e:
        print(userid)
        print(nn)
        print(new_candidates)
    
    assert len(initial_recs + sampled) == 10
    
    return initial_recs + sampled, nn

<h2> Re-parse recs for InstructGPT and Chatgpt (anime)<h2>

In [165]:
recs_folder = "/home/diego/chat-reranking/experiments/anime/recs/reranked"
recs_name = "chatgpt0613-div-p1-pzt-fold_0"
top_n = 10

# read JSON file
f = open(f"{recs_folder}/{recs_name}.json")
data = json.load(f)
f.close()

In [166]:
if "p11" in recs_name or "p12" in recs_name or "p5" in recs_name or "p6" in recs_name:
    with open(f"/home/diego/chat-reranking/experiments/anime/itemnamegenres_to_id.pkl", 'rb') as fp:
        itemname_to_id = pickle.load(fp)
else:
    with open(f"/home/diego/chat-reranking/experiments/anime/itemname_to_id.pkl", 'rb') as fp:
        itemname_to_id = pickle.load(fp)

In [167]:
def clean_name_anime(name: str, prompt:str) -> (int, str):
    
    # check first if name is in prompt
    if name in prompt:
        return 0, name
    
    if "(" in name:
        return 1, name.split("(")[0][:-1]
    
   #  these commented advanced parsing can improve the results! 
    if ": " in name:
        return 1, name.replace(": ", ":")
 
    print(name)
    return 0, name

def parse_raw_output_anime(raw_output:str) -> (list, int):
    n_fixed = 0
    lines = raw_output.splitlines()
    reranked_recs = []
    for line in lines:
        try:
            if len(line.split("-> ")) > 1:
                item_name = line.split("-> ")[1]
                
                # clean name
                fixed, cleaned_name = clean_name_anime(item_name, entry["prompt"])
                n_fixed += fixed
                reranked_recs.append(itemname_to_id[cleaned_name])
                continue

            if len(re.split('1. |2. |3. |4. |5. |6. |7. |8. |9. |10. ', line)) > 0:
                item_name = re.split('1. |2. |3. |4. |5. |6. |7. |8. |9. |10. ', line)[1]
                
                # clean name
                fixed, cleaned_name = clean_name_anime(item_name, entry["prompt"])
                n_fixed += fixed
                reranked_recs.append(itemname_to_id[cleaned_name])
        except Exception as e:
            continue
    # print(f"fixed: {n_fixed} records")
    return reranked_recs, n_fixed

In [168]:
new_data = []  # store here the new json data
n_random_added = 0
total_fixed = 0
for entry in data:
    new_entry = {}
    for k in entry:
        new_entry[k] = entry[k]
        
    # recompute reranked recommendations based on the raw output
    new_recs, n_fixed = parse_raw_output_anime(entry["raw_gpt_outputs"])
    total_fixed += n_fixed
    
    # add random recommendations if the recommendation set is incomplete
    new_entry['reranked_recs'], m = add_random_recs(new_recs, entry["recs"], entry["userid"])
    n_random_added += m
    
    # print new recommendation set
    new_data.append(new_entry)

print(f"# of random recommendations: {n_random_added}")
print(f"# of fixed recommendations: {total_fixed}")

Re:ZERO - Starting Life in Another World - Season 2
Re:ZERO - Starting Life in Another World-
Re:ZERO - Starting Life in Another World- Season 2
A Place Further Than the Universe
Re:ZERO - Starting Life in Another World-
Re:ZERO - Starting Life in Another World-
# of random recommendations: 136
# of fixed recommendations: 272


In [169]:
out_folder = "/home/diego/chat-reranking/experiments/anime/recs/reranked_final"
df = pd.DataFrame.from_dict(new_data)
df.to_json(f'{out_folder}/{recs_name}.json', orient="records")

<h2> Re-parse recs for InstructGPT and Chatgpt (goodreads)<h2>

In [43]:
recs_folder = "/home/diego/chat-reranking/experiments/goodreads/recs/reranked"
recs_name = "chatgpt0613-div-p21-pzt-fold_0"
top_n = 10

# read JSON file
f = open(f"{recs_folder}/{recs_name}.json")
data = json.load(f)
f.close()

In [44]:
if "p11" in recs_name or "p12" in recs_name or "p5" in recs_name or "p6" in recs_name:
    with open(f"/home/diego/chat-reranking/experiments/goodreads/itemnamegenres_to_id.pkl", 'rb') as fp:
        itemname_to_id = pickle.load(fp)
else:
    with open(f"/home/diego/chat-reranking/experiments/goodreads/itemname_to_id.pkl", 'rb') as fp:
        itemname_to_id = pickle.load(fp)

In [45]:
def clean_name_goodreads(name: str, prompt:str) -> (int, str):
    
    # check first if name is in prompt
    if name in prompt:
        return 0, name
    
    if "(" in name:
        print(name)
        return 1, name.split("(")[0][:-1]
    
    if "[" in name:
        print(name)
        return 1, name.split("[")[0][:-1]
    
   #  these commented advanced parsing can improve the results! 
    if ": " in name:
        print(name)
        return 1, name.replace(": ", ":")
 
    print(name)
    return 0, name

def parse_raw_output_goodreads(raw_output:str) -> (list, int):
    n_fixed = 0
    lines = raw_output.splitlines()
    reranked_recs = []
    for line in lines:
        try:
            if len(line.split("-> ")) > 1:
                item_name = line.split("-> ")[1]
                
                # clean name
                fixed, cleaned_name = clean_name_goodreads(item_name, entry["prompt"])
                n_fixed += fixed
                reranked_recs.append(itemname_to_id[cleaned_name])
                continue

            if len(re.split('1. |2. |3. |4. |5. |6. |7. |8. |9. |10. ', line)) > 0:
                item_name = re.split('1. |2. |3. |4. |5. |6. |7. |8. |9. |10. ', line)[1]
                
                # clean name
                fixed, cleaned_name = clean_name_anime(item_name, entry["prompt"])
                n_fixed += fixed
                reranked_recs.append(itemname_to_id[cleaned_name])
        except Exception as e:
            continue
    # print(f"fixed: {n_fixed} records")
    return reranked_recs, n_fixed

In [46]:
new_data = []  # store here the new json data
n_random_added = 0
total_fixed = 0
for entry in data:
    new_entry = {}
    for k in entry:
        new_entry[k] = entry[k]
        
    # recompute reranked recommendations based on the raw output
    new_recs, n_fixed = parse_raw_output_goodreads(entry["raw_gpt_outputs"])
    total_fixed += n_fixed
    
    # add random recommendations if the recommendation set is incomplete
    new_entry['reranked_recs'], m = add_random_recs(new_recs, entry["recs"], entry["userid"])
    n_random_added += m
    
    # print new recommendation set
    new_data.append(new_entry)

print(f"# of random recommendations: {n_random_added}")
print(f"# of fixed recommendations: {total_fixed}")

The Catcher in the Rye [A collection of materials and analysis related to J.D. Salinger's iconic novel 'The Catcher in the Rye.']
The No. 1 Ladies' Detective Agency (No. 1 Ladies' Detective Agency, #1)
The Devil's Broker: Seeking Gold, God, and Glory in Fourteenth-Century Italy
The Turn of the Screw [A governess becomes convinced that the children she cares for are being haunted by ghosts.]
The Catcher in the Rye [A collection of materials and analysis related to J.D. Salinger's iconic novel 'The Catcher in the Rye.']
The Aeneid: Selections from Books 1, 2, 4, 6, 10, 12 [Selected passages from Virgil's Aeneid.]
The No. 1 Ladies' Detective Agency (No. 1 Ladies' Detective Agency, #1)
Murasaki Shikibu: The Tale of Genji (Landmarks of World Literature)
The Color Purple [A novel about a woman named Emma Bovary who seeks passion and excitement outside of her marriage.]
The Lord of the Rings Trilogy Jigsaw Book - Collectors Edition
Murasaki Shikibu: The Tale of Genji (Landmarks of World Liter

In [42]:
out_folder = "/home/diego/chat-reranking/experiments/goodreads/recs/reranked_final"
df = pd.DataFrame.from_dict(new_data)
df.to_json(f'{out_folder}/{recs_name}.json', orient="records")

<h2> Re-parse recs for LLama2 (Goodreads)<h2>

In [53]:
recs_folder = "/home/diego/chat-reranking/experiments/goodreads/recs/reranked/llama2"
recs_name = "Llama-2-7b-chat-hf-div-p11-pzt-fold_0"
top_n = 10

# read JSON file
f = open(f"{recs_folder}/{recs_name}.json")
data = json.load(f)
f.close()

In [54]:
~if "p11" in recs_name or "p12" in recs_name or "p5" in recs_name or "p6" in recs_name:
    with open(f"/home/diego/chat-reranking/experiments/goodreads/itemnamegenres_to_id.pkl", 'rb') as fp:
        itemname_to_id = pickle.load(fp)
else:
    with open(f"/home/diego/chat-reranking/experiments/goodreads/itemname_to_id.pkl", 'rb') as fp:
        itemname_to_id = pickle.load(fp)

SyntaxError: invalid syntax (3579598413.py, line 1)

In [55]:
def clean_name_goodreads(name: str, prompt:str) -> (int, str):
    
    # check first if name is in prompt
    if name in prompt:
        return 0, name
    
    name = name.replace('"', "")
    
    # otherwise fix all the malformed output
    if " (#" in name:
        return 1, name.split(" (#")[0]
    
    if "(" in name:
        return 1, name.split(" (")[0]
    
    if "by" in name:
        splitted = name.split(" by ")
        if len(splitted) == 2:
            return 1, splitted[0]
        
    if "-" in name: #  in prompt p11,p12 the list of genres are given after dash
        splitted = name.split(" - ")
        if len(splitted) == 2:
            new_out = f"{splitted[0]} ({splitted[1].lower()})"
            # print(new_out)
            return 1, new_out
    # print(name)
    return 0, name

def parse_raw_output_goodreads(raw_output:str) -> (list, int):
    n_fixed = 0
    lines = raw_output.splitlines()
    reranked_recs = []
    for line in lines:
        try:
            if len(line.split("-> ")) > 1:
                item_name = line.split("-> ")[1]
                
                # clean name
                fixed, cleaned_name = clean_name_goodreads(item_name, entry["prompt"])
                n_fixed += fixed
                reranked_recs.append(itemname_to_id[cleaned_name])
                continue

            if len(re.split('1. |2. |3. |4. |5. |6. |7. |8. |9. |10. ', line)) > 0:
                item_name = re.split('1. |2. |3. |4. |5. |6. |7. |8. |9. |10. ', line)[1]
                
                # clean name
                fixed, cleaned_name = clean_name_goodreads(item_name, entry["prompt"])
                n_fixed += fixed
                reranked_recs.append(itemname_to_id[cleaned_name])
        except Exception as e:
            continue
    # print(f"fixed: {n_fixed} records")
    return reranked_recs, n_fixed

In [56]:
new_data = []  # store here the new json data
n_random_added = 0
total_fixed = 0
for entry in data:
    new_entry = {}
    for k in entry:
        new_entry[k] = entry[k]
        
    # recompute reranked recommendations based on the raw output
    new_recs, n_fixed = parse_raw_output_goodreads(entry["raw_gpt_outputs"])
    total_fixed += n_fixed
    
    # add random recommendations if the recommendation set is incomplete
    new_entry['reranked_recs'], m = add_random_recs(new_recs, entry["recs"], entry["userid"])
    n_random_added += m
    
    # print new recommendation set
    new_data.append(new_entry)
print(f"# of random recommendations: {n_random_added}")
print(f"# of fixed recommendations: {total_fixed}")

# of random recommendations: 4354
# of fixed recommendations: 1118


In [57]:
out_folder = "/home/diego/chat-reranking/experiments/goodreads/recs/reranked_final"
df = pd.DataFrame.from_dict(new_data)
df.to_json(f'{out_folder}/{recs_name}.json', orient="records")

In [184]:
recs_folder = "/home/diego/chat-reranking/experiments/goodreads/recs/reranked_final"
recs_name = "Llama-2-7b-chat-hf-div-p7-pzt-fold_0"
top_n = 10

# read JSON file
f = open(f"{recs_folder}/{recs_name}.json")
data = json.load(f)
f.close()

In [185]:
n_take = 10
for entry in data:
    if len(entry["reranked_recs"]) < 10:
        print(entry["userid"])
        n_take -= 1
    if n_take == 0:
        break

<h2> Re-parse recs for LLama2 (anime)<h2>

In [9]:
recs_folder = "/home/diego/chat-reranking/experiments/anime/recs/reranked/"
recs_name = "Llama-2-13b-chat-hf-div-p32-pzt-fold_0"
top_n = 10

# read JSON file
f = open(f"{recs_folder}/{recs_name}.json")
data = json.load(f)
f.close()

In [10]:
if "p11" in recs_name or "p12" in recs_name or "p5" in recs_name or "p6" in recs_name:
    with open(f"/home/diego/chat-reranking/experiments/anime/itemnamegenres_to_id.pkl", 'rb') as fp:
        itemname_to_id = pickle.load(fp)
# elif "p31" in recs_name or "p32" in recs_name:
#     with open(f"/home/diego/chat-reranking/experiments/anime/itemnameplot_to_id.pkl", 'rb') as fp:
#         itemname_to_id = pickle.load(fp)
else:
    with open(f"/home/diego/chat-reranking/experiments/anime/itemname_to_id.pkl", 'rb') as fp:
        itemname_to_id = pickle.load(fp)

In [11]:
def clean_name_anime(name: str, prompt:str) -> (int, str):
    
    # check first if name is in prompt
    if name in prompt:
        return 0, name
    
    if "(" in name:
        return 1, name.split("(")[0][:-1]
    
    if "[" in name:
        print(name.split("[")[0][:-1])
        return 1, name.split("[")[0][:-1] 
    
   #  these commented advanced parsing can improve the results! 
    if ": " in name:
        return 1, name.replace(": ", ":")
    # 
    if " - " in name:
        return 1, name.split(" - ")[0]
 
    print(name)
    return 0, name

def parse_raw_output_anime(raw_output:str) -> (list, int):
    n_fixed = 0
    lines = raw_output.splitlines()
    reranked_recs = []
    for line in lines:
        try:
            if len(line.split("-> ")) > 1:
                item_name = line.split("-> ")[1]
                
                # clean name
                fixed, cleaned_name = clean_name_anime(item_name, entry["prompt"])
                n_fixed += fixed
                reranked_recs.append(itemname_to_id[cleaned_name])
                continue

            if len(re.split('1. |2. |3. |4. |5. |6. |7. |8. |9. |10. ', line)) > 0:
                item_name = re.split('1. |2. |3. |4. |5. |6. |7. |8. |9. |10. ', line)[1]
                
                # clean name
                fixed, cleaned_name = clean_name_anime(item_name, entry["prompt"])
                n_fixed += fixed
                reranked_recs.append(itemname_to_id[cleaned_name])
        except Exception as e:
            continue
    # print(f"fixed: {n_fixed} records")
    return reranked_recs, n_fixed

In [12]:
new_data = []  # store here the new json data
n_random_added = 0
total_fixed = 0
for entry in data:
    new_entry = {}
    for k in entry:
        new_entry[k] = entry[k]
        
    # recompute reranked recommendations based on the raw output
    new_recs, n_fixed = parse_raw_output_anime(entry["raw_gpt_outputs"])
    total_fixed += n_fixed
    
    # add random recommendations if the recommendation set is incomplete
    new_entry['reranked_recs'], m = add_random_recs(new_recs, entry["recs"], entry["userid"])
    n_random_added += m
    
    # print new recommendation set
    new_data.append(new_entry)
print(f"# of random recommendations: {n_random_added}")
print(f"# of fixed recommendations: {total_fixed}")

anime, taking into account the diversity of genres and themes:
anime, taking into account the diversity of genres and themes:
anime are diverse in terms of genre, tone, and themes, offering a variety of stories and perspectives for the user to explore.
anime, with a focus on diversity:
anime, taking into account the goal of maximizing diversity in the list:
anime are diverse in terms of genre, tone, and themes, offering something for
anime, with a focus on diversity:
list of anime based on the provided list, with a focus on maximizing diversity:
anime from the provided list, with the goal of maximizing diversity in the list:
anime on this list offer a mix of action, drama, comedy, and supernatural elements, and they cover a range of topics such as friendship, sacrifice, and personal growth. I hope you find this list helpful and enjoyable!
anime, taking into account the goal of maximizing diversity in the list:
anime, taking into account the goal of maximizing diversity in the list:
ani

In [13]:
out_folder = "/home/diego/chat-reranking/experiments/anime/recs/reranked_final"
df = pd.DataFrame.from_dict(new_data)
df.to_json(f'{out_folder}/{recs_name}.json', orient="records")

In [24]:
recs_folder = "/home/diego/chat-reranking/experiments/anime/recs/reranked_final"
top_n = 10

# read JSON file
f = open(f"{recs_folder}/{recs_name}.json")
data = json.load(f)
f.close()

In [25]:
n_take = 10
for entry in data:
    if len(entry["reranked_recs"]) < 10:
        print(entry["userid"])
        n_take -= 1
    if n_take == 0:
        break

<h2> Re-parse recs for GPT (anime plots)<h2>

In [17]:
recs_folder = "/home/diego/chat-reranking/experiments/anime/recs/reranked/"
recs_name = "instructgpt-div-p21-pzt-fold_0"
top_n = 10

# read JSON file
f = open(f"{recs_folder}/{recs_name}.json")
data = json.load(f)
f.close()

In [18]:
with open(f"/home/diego/chat-reranking/experiments/anime/itemnameplot_to_id.pkl", 'rb') as fp:
    itemnameplot_to_id = pickle.load(fp)
with open(f"/home/diego/chat-reranking/experiments/anime/itemname_to_id.pkl", 'rb') as fp:
    itemname_to_id = pickle.load(fp)

In [19]:
def clean_name_anime(name: str, prompt:str) -> (int, str):
    
    if ": " in name:
        return 1, name.replace(": ", ":")
    
    if name[-1] == " ":
        # print(name)
        return 1, name[:-1]
 
    return 0, name

def parse_raw_output_anime(raw_output:str) -> (list, int):
    n_fixed = 0
    lines = raw_output.splitlines()
    reranked_recs = []
    for line in lines:
        try:
            if len(line.split("-> ")) > 1:
                item_name = line.split("-> ")[1]
                
                if item_name in itemname_to_id:
                    reranked_recs.append(itemname_to_id[item_name])
                elif item_name in itemnameplot_to_id:
                    reranked_recs.append(itemnameplot_to_id[item_name])
                else:
                    # try to clean the name
                    fixed, cleaned_name = clean_name_anime(item_name, entry["prompt"])
                    n_fixed += fixed
                    if cleaned_name in itemname_to_id:
                        reranked_recs.append(itemname_to_id[cleaned_name])
                    elif cleaned_name in itemnameplot_to_id:
                        reranked_recs.append(itemnameplot_to_id[cleaned_name])
                    else:
                        # pass
                        print(line)
                        # print(itemname_to_id[item_name])
                        # print(len(cleaned_name))
        except Exception as e:
            print(line)
            continue
    if len(reranked_recs) < 10:
        print(lines)
    # print(f"# of random recommendations: {n_random_added}")
    # print(f"fixed: {n_fixed} records")
    return reranked_recs, n_fixed

In [20]:
new_data = []  # store here the new json data
n_random_added = 0
total_fixed = 0
for entry in data:
    new_entry = {}
    for k in entry:
        new_entry[k] = entry[k]
        
    # recompute reranked recommendations based on the raw output
    new_recs, n_fix = parse_raw_output_anime(entry["raw_gpt_outputs"])
    total_fixed += n_fix 
    
    # add random recommendations if the recommendation set is incomplete
    new_entry['reranked_recs'], m = add_random_recs(new_recs, entry["recs"], entry["userid"])
    n_random_added += m
    
    # print new recommendation set
    new_data.append(new_entry)
print(f"# of random recommendations: {n_random_added}")
print(f"# of fixed recommendations: {total_fixed}")

2-> Anohana: The Flower We Saw That Day
['', '1-> Steins;Gate', '2-> Anohana: The Flower We Saw That Day', '3-> Terror in Resonance', '4-> The World God Only Knows', '5-> My Teen Romantic Comedy SNAFU TOO!', '6-> Say "I Love You".', '7-> Beyond the Boundary', '8-> Charlotte', '9-> The Kawai Complex Guide to Manors and Hostel Behavior', '10-> Oreimo 2']
8-> Re:ZERO - Starting Life in Another World-
['', '1-> Attack on Titan', '2-> Fullmetal Alchemist: Brotherhood', '3-> Sword Art Online', '4-> Naruto: Shippuden', '5-> My Hero Academia', '6-> Demon Slayer: Kimetsu no Yaiba', '7-> Food Wars! Shokugeki no Soma', '8-> Re:ZERO - Starting Life in Another World-', '9-> The Irregular at Magic High School', '10-> Code Geass: Lelouch of the Rebellion']
4-> Re:ZERO - Starting Life in Another World- Season 2
['', '1-> Gurren Lagann', '2-> Mob Psycho 100 II', '3-> Kaguya-sama: Love is War', '4-> Re:ZERO - Starting Life in Another World- Season 2', '5-> KILL la KILL', '6-> The Rising of the Shield He

In [21]:
out_folder = "/home/diego/chat-reranking/experiments/anime/recs/reranked_final"
df = pd.DataFrame.from_dict(new_data)
df.to_json(f'{out_folder}/{recs_name}.json', orient="records")

In [144]:
recs_folder = "/home/diego/chat-reranking/experiments/anime/recs/reranked_final"
top_n = 10

# read JSON file
f = open(f"{recs_folder}/{recs_name}.json")
data = json.load(f)
f.close()

In [145]:
n_take = 10
for entry in data:
    if len(entry["reranked_recs"]) < 10:
        print(entry["userid"])
        n_take -= 1
    if n_take == 0:
        break

<h2> Re-parse recs for GPT (goodreads plots)<h2>

In [250]:
recs_folder = "/home/diego/chat-reranking/experiments/goodreads/recs/reranked/"
recs_name = "instructgpt-div-p22-pzt-fold_0"
top_n = 10

# read JSON file
f = open(f"{recs_folder}/{recs_name}.json")
data = json.load(f)
f.close()

In [251]:
with open(f"/home/diego/chat-reranking/experiments/goodreads/itemnameplot_to_id.pkl", 'rb') as fp:
    itemnameplot_to_id = pickle.load(fp)
with open(f"/home/diego/chat-reranking/experiments/goodreads/itemname_to_id.pkl", 'rb') as fp:
    itemname_to_id = pickle.load(fp)

In [252]:
def clean_name_anime(name: str, prompt:str) -> (int, str):
    
    if ": " in name:
        return 1, name.replace(": ", ":")
    
    if name[-1] == " ":
        # print(name)
        return 1, name[:-1]
 
    return 0, name

def parse_raw_output_anime(raw_output:str) -> (list, int):
    n_fixed = 0
    lines = raw_output.splitlines()
    reranked_recs = []
    for line in lines:
        try:
            if len(line.split("-> ")) > 1:
                item_name = line.split("-> ")[1]
                
                if item_name in itemname_to_id:
                    reranked_recs.append(itemname_to_id[item_name])
                elif item_name in itemnameplot_to_id:
                    reranked_recs.append(itemnameplot_to_id[item_name])
                else:
                    # try to clean the name
                    fixed, cleaned_name = clean_name_anime(item_name, entry["prompt"])
                    n_fixed += fixed
                    if cleaned_name in itemname_to_id:
                        reranked_recs.append(itemname_to_id[cleaned_name])
                    elif cleaned_name in itemnameplot_to_id:
                        reranked_recs.append(itemnameplot_to_id[cleaned_name])
                    else:
                        # pass
                        print(line)
                        # print(itemname_to_id[item_name])
                        # print(len(cleaned_name))
        except Exception as e:
            print(line)
            continue
    if len(reranked_recs) < 10:
        print(lines)
    # print(f"# of random recommendations: {n_random_added}")
    # print(f"fixed: {n_fixed} records")
    return reranked_recs, n_fixed

In [253]:
new_data = []  # store here the new json data
n_random_added = 0
total_fixed = 0
for entry in data:
    new_entry = {}
    for k in entry:
        new_entry[k] = entry[k]
        
    # recompute reranked recommendations based on the raw output
    new_recs, n_fixed = parse_raw_output_anime(entry["raw_gpt_outputs"])
    total_fixed += n_fixed
    
    # add random recommendations if the recommendation set is incomplete
    new_entry['reranked_recs'], m = add_random_recs(new_recs, entry["recs"], entry["userid"])
    n_random_added += m
    
    # print new recommendation set
    new_data.append(new_entry)
print(f"# of random recommendations: {n_random_added}")
print(f"# of fixed recommendations: {total_fixed}")

10-> The Prodigal Sons and Material Girls: How Not to Be Your Child's ATM
['1-> The Secret Life of Bees', '2-> Theban Plays', '3-> The Iliad', '4-> The Catcher In The Rye', '5-> The Great Redwall Feast', '6-> The ELEGANT UNIVERSE S.S.', '7-> The Brooklyn Follies', '8-> The Compleat Works of Wllm Shkspr', '9-> The Prodigal Son (Roger the Chapman, #15)', "10-> The Prodigal Sons and Material Girls: How Not to Be Your Child's ATM"]
8-> Murasaki Shikibu: The Tale of Genji (Landmarks of World Literature)
['1-> The Tale Of Genji', '2-> Theban Plays', '3-> The Persian War', '4-> V.', '5-> Triss (Redwall, #15)', '6-> Great Jones Street', '7-> The Complete Nonsense of Edward Lear', '8-> Murasaki Shikibu: The Tale of Genji (Landmarks of World Literature)', '9-> Prince of Ice (Tale of the Demon World, #3)', '10-> The Winter King (The Arthur Books, #1)']
10-> The Terror: A Novel
['', '1-> The Terror', '2-> The Inner Life of Martin Frost', '3-> The Witch of Blackbird Pond and Related Readings', '4->

In [254]:
out_folder = "/home/diego/chat-reranking/experiments/goodreads/recs/reranked_final"
df = pd.DataFrame.from_dict(new_data)
df.to_json(f'{out_folder}/{recs_name}.json', orient="records")

In [255]:
recs_folder = "/home/diego/chat-reranking/experiments/goodreads/recs/reranked_final"
top_n = 10

# read JSON file
f = open(f"{recs_folder}/{recs_name}.json")
data = json.load(f)
f.close()

In [38]:
n_take = 10
for entry in data:
    if len(entry["reranked_recs"]) < 10:
        print(entry["userid"])
        n_take -= 1
    if n_take == 0:
        break

<h2> Re-parse recs for LLama2 (goodreads plots)<h2>

In [238]:
recs_folder = "/home/diego/chat-reranking/experiments/goodreads/recs/reranked/"
recs_name = "Llama-2-7b-chat-hf-div-p32-pzt-fold_0"
top_n = 10

# read JSON file
f = open(f"{recs_folder}/{recs_name}.json")
data = json.load(f)
f.close()

In [239]:
# if "p11" in recs_name or "p12" in recs_name or "p5" in recs_name or "p6" in recs_name:
#     with open(f"/home/diego/chat-reranking/experiments/anime/itemnamegenres_to_id.pkl", 'rb') as fp:
#         itemname_to_id = pickle.load(fp)
# # elif "p31" in recs_name or "p32" in recs_name:
# #     print("here")
# #     with open(f"/home/diego/chat-reranking/experiments/goodreads/itemnameplot_to_id.pkl", 'rb') as fp:
# #         itemname_to_id = pickle.load(fp)
# else:
#     with open(f"/home/diego/chat-reranking/experiments/goodreads/itemname_to_id.pkl", 'rb') as fp:
#         itemname_to_id = pickle.load(fp)
with open(f"/home/diego/chat-reranking/experiments/goodreads/itemnameplot_to_id.pkl", 'rb') as fp:
    itemnameplot_to_id = pickle.load(fp)
with open(f"/home/diego/chat-reranking/experiments/goodreads/itemname_to_id.pkl", 'rb') as fp:
    itemname_to_id = pickle.load(fp)

In [240]:
def clean_name_goodreads(name: str, prompt:str) -> (int, str):
    
    # check first if name is in prompt
    # if name in prompt:
    #     return 0, name
    
    # name = name.replace('"', "")
    # 
    # otherwise fix all the malformed output


    # if " (" in name:
    #     return 1, name.split(" (")[0]

    # if "by" in name:
    #     splitted = name.split(" by ")
    #     if len(splitted) == 2:
    #         return 1, splitted[0]

    # if "[" in name:
    #     return 1, name.split(" [")[0]
    
    if " - " in name: #  in prompt p11,p12 the list of genres are given after dash
        # print(name)
        splitted = name.split(" - ")
        if splitted == 2:
            if " (#" in splitted[0]:
                return 1, splitted.split(" (#")[0]
            else:
                return 1, splitted[0]        
        
    # if "(#" in name:
    #     return 1, name.split(" (#")[0]
        # print(len(name))
        # if len(splitted) == 2:
        #     print(splitted[0])
        #     # new_out = f"{splitted[0]} ({splitted[1].lower()})"
        #     if "(" in splitted[0]:
        #         print(splitted[0])
        #         return 1, splitted[0].split(" (")[0] 
        #     # return 1, splitted[0]
        # else:
        #     return 1, splitted.split(" (")[0]
    
    # if name in prompt:
    #     return 0, name
    # 
    if " (" in name:
        return 1, name.split(" (")[0]
    if " [" in name:
        print(name)
        return 1, name.split(" [")[0]
    
    
    # print(name)
    return 0, name

def parse_raw_output_goodreads(raw_output:str) -> (list, int):
    n_fixed = 0
    lines = raw_output.splitlines()
    reranked_recs = []
    for line in lines:
        try:
            if len(re.split('1. |2. |3. |4. |5. |6. |7. |8. |9. |10. ', line)) > 0:
                item_name = re.split('1. |2. |3. |4. |5. |6. |7. |8. |9. |10. ', line)[1]
                
                # clean name
                if item_name in itemname_to_id:
                    # print("diego")
                    reranked_recs.append(itemname_to_id[item_name])
                elif item_name in itemnameplot_to_id:
                    # print("maria")
                    reranked_recs.append(itemnameplot_to_id[item_name])
                else:
                    # try to clean the name
                    fixed, cleaned_name = clean_name_goodreads(item_name, entry["prompt"])
                    n_fixed += fixed
                    if cleaned_name in itemname_to_id:
                        reranked_recs.append(itemname_to_id[cleaned_name])
                    elif cleaned_name in itemnameplot_to_id:
                        reranked_recs.append(itemnameplot_to_id[cleaned_name])
                    else:
                        # pass
                        print(item_name)
                        print(line)
                        # print(itemname_to_id[item_name])
                        # print(len(cleaned_name))

        except Exception as e:
            continue
    # print(f"fixed: {n_fixed} records")
    return reranked_recs, n_fixed

In [241]:
new_data = []  # store here the new json data
n_random_added = 0
total_fixed = 0
for entry in data:
    new_entry = {}
    for k in entry:
        new_entry[k] = entry[k]
        
    # recompute reranked recommendations based on the raw output
    new_recs, n_fixed = parse_raw_output_goodreads(entry["raw_gpt_outputs"])
    total_fixed += n_fixed
    
    # add random recommendations if the recommendation set is incomplete
    new_entry['reranked_recs'], m = add_random_recs(new_recs, entry["recs"], entry["userid"])
    n_random_added += m
    
    # print new recommendation set
    new_data.append(new_entry)
print(f"# of random recommendations: {n_random_added}")
print(f"# of fixed recommendations: {total_fixed}")

books from the list above, in the format requested.
Please provide the top 10 books from the list above, in the format requested.
books from the list above, in the format requested.
Please provide the top 10 books from the list above, in the format requested.
books from the list, in the format requested.
Please provide the top 10 books from the list, in the format requested.
Diversity of genres and topics (books with a wider range of genres and topics are ranked higher)
1. Diversity of genres and topics (books with a wider range of genres and topics are ranked higher)
Popularity and recognition (books that are more widely recognized and popular are ranked higher)
2. Popularity and recognition (books that are more widely recognized and popular are ranked higher)
Critical acclaim and awards (books that have received critical acclaim and awards are ranked higher)
3. Critical acclaim and awards (books that have received critical acclaim and awards are ranked higher)
recommendation list in 

In [242]:
out_folder = "/home/diego/chat-reranking/experiments/goodreads/recs/reranked_final"
df = pd.DataFrame.from_dict(new_data)
df.to_json(f'{out_folder}/{recs_name}.json', orient="records")