In [11]:
import pandas as pd
import pickle
from ast import literal_eval

In [20]:
def load_data(dataset_dir: str,
              fold: str
              ) -> (pd.DataFrame, pd.DataFrame, dict):

    # load dataset
    dir_fold = f"{dataset_dir}fold_{fold}"
    data = pd.read_csv(f"{dir_fold}/data.csv")

    data.support = data.support.apply(literal_eval)
    data.test_set = data.test_set.apply(literal_eval)
    data.remaining_pos = data.remaining_pos.apply(literal_eval)
    data.movieId = data.movieId.apply(literal_eval)
    data.pos = data.pos.apply(literal_eval)
    data.neg = data.neg.apply(literal_eval)
    data.candidates = data.candidates.apply(literal_eval)

    # load items
    items = pd.read_csv(f"{dataset_dir}items.csv")

    with open(f"{dataset_dir}itemid_to_name.pkl", 'rb') as fp:
        itemid_to_name = pickle.load(fp)

    return data, itemid_to_name, items


def prepare_df_evaluation(dataset: pd.DataFrame,
                          recs: dict,
                          max_cutoff: int
                          ) -> pd.DataFrame:

    users = list(recs.keys())
    ratings_column, rec_column, test_column = [], [], []
    for user in recs:
        user_df = dataset[dataset['userId'] == user]
        ratings_column.append(list(user_df['movieId'].values)[0])
        test_column.append(list(user_df['test_set'].values)[0])
        rec_column.append([el[0] for el in recs[user]])

    df = pd.DataFrame(data={"userId": users,
                            "ratings": ratings_column,
                            "test_set": test_column,
                            "recs": rec_column})

    for k in range(5, max_cutoff + 1, 5):
        df[f"hits@{k}"] = df.apply(lambda x: list(set(x["recs"][:k]).intersection(x["test_set"])), axis=1)

    return df

def jaccard(list1, list2):
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    return float(intersection) / union

def _ild(recs: list,
         cutoff: int,
         items: pd.DataFrame
         ) -> float:
    score = 0
    for i, x in enumerate(recs[:cutoff-1]):
        for y in recs[i + 1:cutoff]:
            score += 1 - jaccard(items.loc[x]["genres"], items.loc[y]["genres"])
    return score/(cutoff*(cutoff-1))


def _eild(hits: list,
          cutoff: int,
          items: pd.DataFrame
          ) -> float:
    if len(hits) < 2:
        return 0
    else:
        score = 0
        for i, x in enumerate(hits[:cutoff-1]):
            for y in hits[i + 1:cutoff]:
                score += 1 - jaccard(items.loc[x]["genres"], items.loc[y]["genres"])
    return score/(cutoff*(cutoff-1))

def eild(df_div: pd.DataFrame,
         items: pd.DataFrame
         ) -> pd.DataFrame:
    """Expected Intra-List Diversity (based on genres and Jaccard distance)"""

    df_div[f"eild@{5}"] = df_div.apply(lambda x: _eild(x[f"hits@{5}"], 5, items), axis=1)
    df_div[f"eild@{10}"] = df_div.apply(lambda x: _eild(x[f"hits@{10}"], 10, items), axis=1)

    return df_div

def ild(df_div: pd.DataFrame,
         items: pd.DataFrame
         ) -> pd.DataFrame:
    """Expected Intra-List Diversity (based on genres and Jaccard distance)"""

    df_div[f"ild@{5}"] = df_div.apply(lambda x: _ild(x["recs"], 5, items), axis=1)
    df_div[f"ild@{10}"] = df_div.apply(lambda x: _ild(x["recs"], 10, items), axis=1)

    return df_div

def ild_pair(pair, features):
    return 1 - jaccard(features.loc[pair[0]]["genres"], features.loc[pair[1]]["genres"])

In [21]:
# from itertools import combinations
# def find_most_similar_movies(list_of_movies, features):
#
#     comb = list(combinations(list_of_movies, 2))
#     scores = []
#     for c in comb:
#         scores.append(ild_pair(c, features))
#     return comb(scores.index(max(scores)))
#
# find_most_similar_movies([110, 457, 318, 1, 377, 34, 32, 380, 21, 367], df_items)

In [13]:
evaluation_dir = "/home/diego/lamore/results/zero_shot_recs/"
results_name = "pop_rec-fold_0.pkl"
dataset_dir = "/home/diego/lamore/datasets/folds/"
fold = "0"

# load recs
recs_name = f"{evaluation_dir}{results_name}"
with open(f"{recs_name}", 'rb') as fp:
    rec_list = pickle.load(fp)

data, itemid_to_name, df_items = load_data(dataset_dir, fold)
itemname_to_id = {v: k for k, v in itemid_to_name.items()}
df_items = df_items.set_index("movieId")

df_evaluation = prepare_df_evaluation(data, rec_list, 30)

In [35]:
from itertools import combinations
def find_most_similar_movies(list_of_movies, features):

    comb = list(combinations(list_of_movies, 2))
    scores = []
    for c in comb:
        scores.append(ild_pair(c, features))
    max_i = scores.index(max(scores))
    print(max(scores))
    print(max_i)
    print()
    print(comb)
    print(comb.index((1,32)))
    print(scores[comb.index((1,32))])
    print()

    for jj in scores:
        if jj > 0.8085106382978724:
            print(jj)
    print()
    print(len(comb))
    return comb[max_i]

find_most_similar_movies([110, 457, 318, 1, 377, 34, 32, 380, 21, 367], df_items)

0.8723404255319149
21

[(110, 457), (110, 318), (110, 1), (110, 377), (110, 34), (110, 32), (110, 380), (110, 21), (110, 367), (457, 318), (457, 1), (457, 377), (457, 34), (457, 32), (457, 380), (457, 21), (457, 367), (318, 1), (318, 377), (318, 34), (318, 32), (318, 380), (318, 21), (318, 367), (1, 377), (1, 34), (1, 32), (1, 380), (1, 21), (1, 367), (377, 34), (377, 32), (377, 380), (377, 21), (377, 367), (34, 32), (34, 380), (34, 21), (34, 367), (32, 380), (32, 21), (32, 367), (380, 21), (380, 367), (21, 367)]
26
0.8085106382978724

0.8518518518518519
0.85
0.8333333333333334
0.8723404255319149
0.8181818181818181
0.8113207547169812

45


(318, 380)

In [29]:
df_items.loc[318]

title     Shawshank Redemption, The (1994)
genres                           ['Drama']
name              The Shawshank Redemption
Name: 318, dtype: object

In [30]:
df_items.loc[380]

title                                 True Lies (1994)
genres    ['Action', 'Adventure', 'Comedy', 'Romance']
name                                         True Lies
Name: 380, dtype: object

In [14]:
df_ild = ild(df_evaluation.copy(deep=True), df_items.copy(deep=True))
df_ild

Unnamed: 0,userId,ratings,test_set,recs,hits@5,hits@10,hits@15,hits@20,hits@25,hits@30,ild@5,ild@10
0,581,"[3, 1409, 1265, 733, 736, 597, 1454, 3257, 326...","[3809, 892, 2875, 1721, 2125]","[480, 260, 110, 457, 296, 318, 1, 377, 34, 32,...",[],[],[],[],[],[],0.370045,0.372836
1,15,"[3421, 648, 3354, 2485, 141, 2126, 2058, 3798,...","[3489, 593, 1198, 500, 628, 1466, 2694, 1909, ...","[296, 1, 377, 34, 32, 380, 21, 357, 39, 150, 2...",[],[],[],[],[500],"[344, 500]",0.361114,0.358851
2,543,"[2987, 2990, 3793, 2054, 1257, 724, 725, 1259,...","[1293, 1387, 2170, 1291, 3508, 707, 2420, 2448...","[34, 357, 150, 223, 551, 364, 597, 11, 111, 53...",[],"[364, 597]","[10, 364, 597]","[161, 36, 10, 364, 594, 597]","[161, 36, 10, 364, 594, 597]","[161, 36, 10, 364, 141, 594, 597]",0.360674,0.346309
3,166,"[2987, 1248, 1175, 1179, 574, 648, 1320, 3863,...","[3546, 2731, 2529, 585, 1198, 299, 923, 30, 55...","[110, 457, 318, 1, 377, 34, 32, 380, 21, 367, ...",[],[],[],[],[500],[500],0.368154,0.363216


In [15]:
df_eild = eild(df_evaluation.copy(deep=True), df_items.copy(deep=True))
df_eild

Unnamed: 0,userId,ratings,test_set,recs,hits@5,hits@10,hits@15,hits@20,hits@25,hits@30,eild@5,eild@10
0,581,"[3, 1409, 1265, 733, 736, 597, 1454, 3257, 326...","[3809, 892, 2875, 1721, 2125]","[480, 260, 110, 457, 296, 318, 1, 377, 34, 32,...",[],[],[],[],[],[],0,0.0
1,15,"[3421, 648, 3354, 2485, 141, 2126, 2058, 3798,...","[3489, 593, 1198, 500, 628, 1466, 2694, 1909, ...","[296, 1, 377, 34, 32, 380, 21, 357, 39, 150, 2...",[],[],[],[],[500],"[344, 500]",0,0.0
2,543,"[2987, 2990, 3793, 2054, 1257, 724, 725, 1259,...","[1293, 1387, 2170, 1291, 3508, 707, 2420, 2448...","[34, 357, 150, 223, 551, 364, 597, 11, 111, 53...",[],"[364, 597]","[10, 364, 597]","[161, 36, 10, 364, 594, 597]","[161, 36, 10, 364, 594, 597]","[161, 36, 10, 364, 141, 594, 597]",0,0.007971
3,166,"[2987, 1248, 1175, 1179, 574, 648, 1320, 3863,...","[3546, 2731, 2529, 585, 1198, 299, 923, 30, 55...","[110, 457, 318, 1, 377, 34, 32, 380, 21, 367, ...",[],[],[],[],[500],[500],0,0.0


In [16]:
userid = 166
k = 5
n_candidates = 10

In [17]:
len(rec_list[userid])
# rec_list[userid]

50

In [19]:
j = 1
for ii in rec_list[userid][:n_candidates]:
    print(f"{j}. {itemid_to_name[ii[0]]} - {ii[0]}")
    j += 1

1. Braveheart - 110
2. The Fugitive - 457
3. The Shawshank Redemption - 318
4. Toy Story - 1
5. Speed - 377
6. Babe - 34
7. Twelve Monkeys - 32
8. True Lies - 380
9. Get Shorty - 21
10. The Mask - 367


In [61]:
# the prompt is made by the following:
# 1. give the top-k provided by the popularity recommender
# 2. give the candidate list to re-rank
# 3. ask the model to increase diversity of the top-k

<h2>ChatGPT</h2>

You will be given a ranked list of 10 candidate movies, delimited by triple backticks.
Your task is to select a list of 5 movies from this candidate list such that this new list exihibit the highest diversity based on movie genres.

Use the following format for the output:
1. <movie name>
2. <movie name>
3. <movie name>
4. <movie name>
5. <movie name>

```
1. Jurassic Park
2. Star Wars: Episode IV - A New Hope
3. Braveheart
4. The Fugitive
5. Pulp Fiction
6. The Shawshank Redemption
7. Toy Story
8. Speed
9. Babe
10. Twelve Monkeys
```
1. Jurassic Park
2. Braveheart
3. Pulp Fiction
4. Toy Story
5. Twelve Monkeys

In [127]:
# user 528
# ILD = 0.37
# EILD = 0
user = 528
names = ["Jurassic Park", "Braveheart", "Pulp Fiction", "Toy Story", "Twelve Monkeys"]
user_recs = [itemname_to_id[name] for name in names]

div_score = _ild(user_recs, 5, df_items.copy(deep=True))
print(f"ILD:{div_score}")

eild_score = _eild(user_recs, 5, df_items.copy(deep=True))
print(f"EILD:{eild_score}")

ILD:0.3686382345087235
EILD:0.3686382345087235


(SAME PROMPT)...
1. Pulp Fiction
2. Toy Story
3. Twelve Monkeys
4. Apollo 13
5. Clueless

In [128]:
# user 15
# ILD = 0.3611
# EILD = 0
names = ["Pulp Fiction", "Toy Story", "Twelve Monkeys", "Apollo 13", "Clueless"]
user_recs = [itemname_to_id[name] for name in names]

div_score = _ild(user_recs, 5, df_items.copy(deep=True))
print(f"ILD:{div_score}")

eild_score = _eild(user_recs, 5, df_items.copy(deep=True))
print(f"EILD:{eild_score}")

ILD:0.365795902285264
EILD:0.365795902285264


(SAME PROMPT)...
1. Babe
2. Apollo 13
3. Clerks
4. The Nightmare Before Christmas
5. Taxi Driver

In [129]:
# user 543
# ILD = 0.3606
# EILD = 0
names = ["Babe", "Apollo 13", "Clerks", "The Nightmare Before Christmas", "Taxi Driver"]
user_recs = [itemname_to_id[name] for name in names]

div_score = _ild(user_recs, 5, df_items.copy(deep=True))
print(f"ILD:{div_score}")

eild_score = _eild(user_recs, 5, df_items.copy(deep=True))
print(f"EILD:{eild_score}")

ILD:0.36932608251219207
EILD:0.36932608251219207


(SAME PROMPT)...
1. Braveheart
2. Toy Story
3. Babe
4. Twelve Monkeys
5. The Mask

In [130]:
# user 166
# ILD = 0.3681
# EILD = 0
names = ["Toy Story", "Babe", "The Mask", "Braveheart", "Twelve Monkeys"]
user_recs = [itemname_to_id[name] for name in names]

div_score = _ild(user_recs, 5, df_items.copy(deep=True))
print(f"ILD:{div_score}")

eild_score = _eild(user_recs, 5, df_items.copy(deep=True))
print(f"EILD:{eild_score}")

ILD:0.3574727379420747
EILD:0.3574727379420747


A total 2 users out of 4 have improved their ILD.

<h2>Similarity experiments</h2>

In [22]:
find_most_similar_movies([110, 457, 318, 1, 377, 34, 32, 380, 21, 367], df_items)

TypeError: 'list' object is not callable

In [2]:
# import io
output = "1. Jurassic Park\n 2. Star Wars: Episode IV - A New Hope\n 3. Braveheart\n 4. Toy Story\n 5. Twelve Monkeys"
lines = output.splitlines()

In [3]:
for line in lines:
    print(line.split(". ")[1])

Jurassic Park
Star Wars: Episode IV - A New Hope
Braveheart
Toy Story
Twelve Monkeys


<h2>hugging chat</h2>

In [None]:
PROMPT

You will be given a ranked list of movies of length 10, delimited by triple backticks.
Your task is to re-rank such list and provide the top 5 movies such that they exihibit the highest diversity based on movie genres.

```
1. Jurassic Park
2. Star Wars: Episode IV - A New Hope
3. Braveheart
4. The Fugitive
5. Pulp Fiction
6. The Shawshank Redemption
7. Toy Story
8. Speed
9. Babe
10. Twelve Monkeys
```

OUT
Here you go, sir/madam. This should hopefully fulfill your criteria for the request.
1.	Toy Story (1995) Animation
2.	Babe (1995) Drama/Comedy
3.	Pulp Fiction (1994) Crime Thriller
4.	The Fugitive (1993) Action/Drama/Thriller
5.	Jurassic Park (1993) Science Fiction Adventure Thriller
6.	Twelve Monkeys (1995) Mystery Sci-Fi
7.	Braveheart (1995) Historical Epic War Drama Romance
8.	Star Wars: Episode IV - A New Hope (1977) Space Opera
9.	The Shawshank Redemption (1994) Drama
10.	Speed (1994) Action Thriller

In [None]:
You will be given a ranked list of movies of length 10, delimited by triple backticks.
Your task is to re-rank such a list and provide a new list such that the top 5 movies exhibit the highest diversity based on movie genres. Print only the top 5 list and do not show the movie genres.

```
1. Jurassic Park
2. Star Wars: Episode IV - A New Hope
3. Braveheart
4. The Fugitive
5. Pulp Fiction
6. The Shawshank Redemption
7. Toy Story
8. Speed
9. Babe
10. Twelve Monkeys
```

Certainly! Here is the reordered list without showing any genres or additional details:

Top 5 Movies Based on Genre Diversity:

[Movie1, Movie2, Movie3, Movie4, Movie5]

Note: These movies may contain diverse genre elements but do not necessarily represent the entire breadth of filmmaking techniques, styles, and themes. Further exploration and analysis beyond this list would be required to gain deeper insights into the world of cinema. Is there anything else I can assist you with?


In [None]:
You will be given a ranked list of movies of length 10, delimited by triple backticks.
Your task is to re-rank such a list such that the top 5 movies exhibit the highest diversity based on movie genres. Print only the top 5 list (using the same movie names) and do not show their movie genres.

```
1. Jurassic Park
2. Star Wars: Episode IV - A New Hope
3. Braveheart
4. The Fugitive
5. Pulp Fiction
6. The Shawshank Redemption
7. Toy Story
8. Speed
9. Babe
10. Twelve Monkeys

Understood! Without further ado, here's my reordering:

[Movie1, Movie2, Movie3, Movie4, Movie5]

Remember, each entry refers to a specific movie from the original list you provided. These rankings aim to offer a more balanced selection of films across different genres while maintaining overall quality, audience appeal, and cinematic impact. Keep watching great movies! Any other assistance you require?

In [None]:
You will be given a ranked list of movies of length 10, delimited by triple backticks.
Your task is to re-rank such list and provide the top 5 movies such that they exihibit the highest diversity based on movie genres.

```
1. Jurassic Park
2. Star Wars: Episode IV - A New Hope
3. Braveheart
4. The Fugitive
5. Pulp Fiction
6. The Shawshank Redemption
7. Toy Story
8. Speed
9. Babe
10. Twelve Monkeys
```



With pleasure! I present to you, dear friend, the Top Five movies in your ranked list that display the most outstanding genre diversity:

Movie1, Movie2, Movie3, Movie4, Movie5

This list offers an exciting mix of various genres—from action and drama to comedy and animation—ensuring entertainment options for every taste. Remember, variety enriches life. So, why wait? Indulge yourself in these fabulous flicks today. And always remember, I'm just a command away whenever you want to explore similar lists tailored to your interests. We movie connoisseurs must stick together, right?
