In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from openai import OpenAI
from key import OPENAI_API_KEY
client = OpenAI(api_key=OPENAI_API_KEY)

In [2]:
movie_latest_small=pd.read_csv('../../data/processed_movie_latest_small.csv')
movie_latest_small.head()

Unnamed: 0,userId,movieId,title,genres,rating,timestamp,tag
0,1,1,Toy Story (1995),"Adventure, Animation, Children, Comedy, Fantasy",4.0,964982703,
1,1,3,Grumpier Old Men (1995),"Comedy, Romance",4.0,964981247,
2,1,6,Heat (1995),"Action, Crime, Thriller",4.0,964982224,
3,1,47,Seven (a.k.a. Se7en) (1995),"Mystery, Thriller",5.0,964983815,
4,1,50,"Usual Suspects, The (1995)","Crime, Mystery, Thriller",5.0,964982931,


In [3]:
# for each user, find his top-most rated movies
N_TOP_MOVIES = 10
def get_top_recent_movies(movies_df, user_id, num_movies=N_TOP_MOVIES):
    user_ratings = movies_df[movies_df['userId'] == user_id]
    user_ratings = user_ratings.sort_values(by=['rating', 'timestamp'], ascending=[False, False])
    return user_ratings.head(num_movies)

# get the top 5 movies for all users
top_movies = pd.DataFrame()
for user_id in movie_latest_small['userId'].unique():
    top_movies = pd.concat([top_movies, get_top_recent_movies(movie_latest_small, user_id, 10)])
top_movies.reset_index(drop=True, inplace=True)
top_movies.head(15)

Unnamed: 0,userId,movieId,title,genres,rating,timestamp,tag
0,1,553,Tombstone (1993),"Action, Drama, Western",5.0,964984153,
1,1,157,Canadian Bacon (1995),"Comedy, War",5.0,964984100,
2,1,1298,Pink Floyd: The Wall (1982),"Drama, Musical",5.0,964984086,
3,1,3053,"Messenger: The Story of Joan of Arc, The (1999)","Drama, War",5.0,964984086,
4,1,3448,"Good Morning, Vietnam (1987)","Comedy, Drama, War",5.0,964984054,
5,1,151,Rob Roy (1995),"Action, Drama, Romance, War",5.0,964984041,
6,1,1224,Henry V (1989),"Action, Drama, Romance, War",5.0,964984018,
7,1,527,Schindler's List (1993),"Drama, War",5.0,964984002,
8,1,5060,M*A*S*H (a.k.a. MASH) (1970),"Comedy, Drama, War",5.0,964984002,
9,1,3147,"Green Mile, The (1999)","Crime, Drama",5.0,964983873,


In [4]:
import re

def extract_movie_titles(response):
    # Extract lines that match the format [Title (Year)]
    pattern = r"\[.*\]"
    titles = re.findall(pattern, response)
    return [title.strip("[]") for title in titles]

In [5]:
""" Number of users to run the recommender on
    (Use a small sample for faster results)
    If you want to run the recommender on all users, uncomment the last line
"""
# num_users = 5

# Uncomment below to run on all users
num_users = len(movie_latest_small['userId'].unique())
print(f"Running the recommender on {num_users} users")

Running the recommender on 610 users


In [6]:
def openai_response(prompts, num_users, output_file='responses.json'):

    responses = []
    for i in range(num_users):
        response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages= [{ 'role':'user','content' : prompts[i]}],
        temperature=0,
        )
        responses.append(response)

    # parse the responses
    responses_recommendations = [extract_movie_titles(response.choices[0].message.content) for response in responses]

    # write the responses to a file
    with open(output_file, 'w') as f:
        for i in range(num_users):
            f.write(str(responses_recommendations[i]) + '\n')
        
        print(f"Response written to {output_file}")
    return responses_recommendations

# calculate the hit rate
# hit rate for the recommendations
def hit_rate(recommendation, actual):
    hit = 0
    for rec in recommendation:
        if rec in actual:
            hit += 1
    return hit/len(recommendation)

# NDCG for the recommendations
def ndcg(recommendation, actual):
    dcg = 0
    for i, rec in enumerate(recommendation):
        if rec in actual:
            dcg += 1/np.log2(i+2)
    idcg = 0
    for i in range(len(actual)):
        idcg += 1/np.log2(i+2)
    return dcg/idcg


def eval_response(responses_recommendations, selected_movie_data, original_movie_data, num_users):
    hit_rates = []
    ndcg_scores = []
    for i in range(num_users):
        user_id = selected_movie_data['userId'].unique()[i]
        user_movies = original_movie_data[original_movie_data['userId']==user_id]
        user_watched_movies = user_movies['title'].values
        # print(f"User {user_id} watched movies: {user_watched_movies}")
        recommendation = responses_recommendations[i]
        # print(f"Recommendation: {recommendation}")
        hit_rate_ = hit_rate(recommendation, user_watched_movies)
        # print(f"Hit Rate: {hit_rate_}")
        hit_rates.append(hit_rate_)
        ndcg_ = ndcg(recommendation, user_watched_movies)
        # print(f"NDCG: {ndcg_}")
        ndcg_scores.append(ndcg_)

    avg_hit_rate = np.mean(hit_rates)
    avg_ndcg = np.mean(ndcg_scores)

    print(f"Average Hit Rate: {avg_hit_rate}")
    print(f"Average NDCG: {avg_ndcg}")

    return hit_rates, ndcg_scores


In [7]:
# read the responses from the json file
def read_responses(output_file):
    responses_recommendations = []
    with open(output_file, 'r') as f:
        for line in f:
            responses_recommendations.append(eval(line))
    return responses_recommendations

## Add similar user

In [8]:
user_ratings = top_movies.drop_duplicates(['userId', 'title']).pivot(index='userId', \
                                                                columns='title',\
                                                            values='rating').fillna(0)
user_ratings.head()

title,'Salem's Lot (2004),'Til There Was You (1997),(500) Days of Summer (2009),1-900 (06) (1994),10 Cloverfield Lane (2016),10 Things I Hate About You (1999),101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),102 Dalmatians (2000),10th & Wolf (2006),...,Young Frankenstein (1974),"Young Victoria, The (2009)",Your Highness (2011),Zero Dark Thirty (2012),Zombieland (2009),Zoolander (2001),Zulu (1964),[REC] (2007),eXistenZ (1999),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
from scipy.sparse import csr_matrix
sparse_user_ratings = csr_matrix(user_ratings)
NUM_SIMILAR_USERS = 10
NUM_CANDIDADTES = 20

In [10]:
def get_similar_users(sparse_user_ratings, index, n = 10):
    """
    Get the n most similar users to each user
    Input:
        sparse_user_ratings: scipy sparse csr_matrix
        index: list of users
        n: number of similar users to return for each user
    Output:
        list of lists containing the n most similar users to each user
    """
    df = pd.DataFrame(cosine_similarity(sparse_user_ratings) - np.identity(sparse_user_ratings.shape[0]), \
                        index = index, columns = index)
    return df.apply(lambda x: list(x.sort_values(ascending = False).head(n).index), axis=1)

similar_users = get_similar_users(sparse_user_ratings, user_ratings.index,NUM_SIMILAR_USERS)

In [11]:
def get_candidate_movies(similar_users, dataframe, n=20):
    """
    Get the top-n candidate movies for each user based on the movies 
    rated by similar users.
    
    Parameters:
    similar_users (Series): A pandas Series where each entry is a list of user_ids that 
                            are similar to a specific user.
    dataframe (DataFrame): The pandas DataFrame containing user ratings with columns 'user_id', 
                           'movie_title', and 'rating'.
    n (int): The number of top-rated movies to return for each user.
    
    Returns:
    Series: A pandas Series where each entry is a list of n movie titles recommended for each user.
    """
    # Define a helper function to get top-n movies for a single user
    def get_top_movies(user_ids):
        # Filter the dataframe for movies rated by similar users
        filtered_movies = dataframe[dataframe['userId'].isin(user_ids)]
        # Group by movie title, sum the ratings, and get the top-n movies
        top_movies = (
            filtered_movies.groupby('title')['rating']
            .sum()
            .sort_values(ascending=False)
            .head(n)
            .index
            .tolist()
        )
        return top_movies

    # Apply the helper function to each entry in similar_users
    return similar_users.apply(get_top_movies)

candidate_movies = get_candidate_movies(similar_users, top_movies, NUM_CANDIDADTES)
candidate_movies.head()

userId
1    [Schindler's List (1993), Forrest Gump (1994),...
2    [Dark Knight, The (2008), Wolf of Wall Street,...
3    [Road Warrior, The (Mad Max 2) (1981), Indiana...
4    [Monty Python and the Holy Grail (1975), Monty...
5    [Dances with Wolves (1990), Schindler's List (...
dtype: object

In [12]:
# add to top_movies 
top_movies['candidate_movies'] = top_movies['userId'].map(candidate_movies)
top_movies.head()

Unnamed: 0,userId,movieId,title,genres,rating,timestamp,tag,candidate_movies
0,1,553,Tombstone (1993),"Action, Drama, Western",5.0,964984153,,"[Schindler's List (1993), Forrest Gump (1994),..."
1,1,157,Canadian Bacon (1995),"Comedy, War",5.0,964984100,,"[Schindler's List (1993), Forrest Gump (1994),..."
2,1,1298,Pink Floyd: The Wall (1982),"Drama, Musical",5.0,964984086,,"[Schindler's List (1993), Forrest Gump (1994),..."
3,1,3053,"Messenger: The Story of Joan of Arc, The (1999)","Drama, War",5.0,964984086,,"[Schindler's List (1993), Forrest Gump (1994),..."
4,1,3448,"Good Morning, Vietnam (1987)","Comedy, Drama, War",5.0,964984054,,"[Schindler's List (1993), Forrest Gump (1994),..."


In [13]:
def generate_prompt_from_similar_users(df, user_id, num, num_candidate):
    df_user = df[df['userId'] == user_id]
    movies_list = '\n'.join([f"Movie {i+1}: {row['title']}, Rating: {row['rating']}" for i, row in df_user.iterrows()])
    
    # Assuming all rows for the user have the same candidate_movies list
    candidate_movies = df_user['candidate_movies'].iloc[0]

    prompt_template = '''
I am user {user_id}. My most recent {num} top-rated movies are listed below along with their ratings:
{movies_list}
Here are {num_candidate} top-rated movies from users with similar taste:
{candidate_movies}
Please recommend 10 movies similar to these that I haven't watched yet. Format your response as a list of movie titles, using brackets around the titles and separating each title with a new line for easy parsing.
Example format:
Here are the 10 movies recommended for you: 
[Midnight Cowboy (1969)]
[Lost in Translation (2003)]
[etc.]
Answer:
'''

    prompt = prompt_template.format(user_id=user_id, num=num, movies_list=movies_list, num_candidate=num_candidate, candidate_movies=candidate_movies)
    return prompt

In [14]:
prompts_similar_users = []
for user_id in top_movies['userId'].unique():
    prompt_temp = generate_prompt_from_similar_users(top_movies, user_id, N_TOP_MOVIES, NUM_CANDIDADTES)
    prompts_similar_users.append(prompt_temp)

print(prompts_similar_users[0])


I am user 1. My most recent 10 top-rated movies are listed below along with their ratings:
Movie 1: Tombstone (1993), Rating: 5.0
Movie 2: Canadian Bacon (1995), Rating: 5.0
Movie 3: Pink Floyd: The Wall (1982), Rating: 5.0
Movie 4: Messenger: The Story of Joan of Arc, The (1999), Rating: 5.0
Movie 5: Good Morning, Vietnam (1987), Rating: 5.0
Movie 6: Rob Roy (1995), Rating: 5.0
Movie 7: Henry V (1989), Rating: 5.0
Movie 8: Schindler's List (1993), Rating: 5.0
Movie 9: M*A*S*H (a.k.a. MASH) (1970), Rating: 5.0
Movie 10: Green Mile, The (1999), Rating: 5.0
Here are 20 top-rated movies from users with similar taste:
["Schindler's List (1993)", 'Forrest Gump (1994)', 'Shawshank Redemption, The (1994)', 'Hoop Dreams (1994)', 'Green Mile, The (1999)', 'Fight Club (1999)', 'Four Weddings and a Funeral (1994)', 'Braveheart (1995)', 'Usual Suspects, The (1995)', 'Rob Roy (1995)', 'M*A*S*H (a.k.a. MASH) (1970)', 'Jurassic Park (1993)', 'Seven (a.k.a. Se7en) (1995)', 'Kingpin (1996)', 'Jeffrey 

In [15]:
prompt_similar_user_rec = openai_response(prompts_similar_users, num_users, output_file='responses_similar_user_10_20_3.json')

hit_rates_3, ndcg_scores_3 = eval_response(prompt_similar_user_rec, top_movies, movie_latest_small, num_users)

Response written to responses_simliar_user_10_20_3.json
Average Hit Rate: 0.14311475409836066
Average NDCG: 0.03337717631356322


0.3usd 15min