In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from openai import OpenAI
from key import OPENAI_API_KEY
client = OpenAI(api_key=OPENAI_API_KEY)

In [6]:
movie_100k=pd.read_csv('../../data/processed_movie100k.csv')
movie_100k.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,movie_title,genres,avg_rating
0,1,1,5,874965758,Toy Story (1995),"Animation, Children",5.0
1,1,2,3,876893171,GoldenEye (1995),"Action, Adventure, Thriller",3.0
2,1,3,4,878542960,Four Rooms (1995),Thriller,4.0
3,1,4,3,876893119,Get Shorty (1995),Action,3.0
4,1,5,3,889751712,Copycat (1995),Thriller,3.0


In [14]:
# for each user, find his top n rated movies, find his most recent watched n movies

def get_user_top_n_movies(user_id, n):
    user_data = movie_100k[movie_100k['user_id']==user_id]
    user_data = user_data.sort_values(by='avg_rating', ascending=False)
    return user_data.head(n)

def get_user_recent_n_movies(user_id, n):
    user_data = movie_100k[movie_100k['user_id']==user_id]
    user_data = user_data.sort_values(by='timestamp', ascending=False)
    return user_data.head(n)

user_movie_data = pd.DataFrame()
for user_id in movie_100k['user_id'].unique():
    user_top_n_movies = get_user_top_n_movies(user_id, 5)
    user_recent_n_movies = get_user_recent_n_movies(user_id, 5)
    # add an indicator for most recent watched movies or top rated movies
    user_top_n_movies['type'] = 'top'
    user_recent_n_movies['type'] = 'recent'
    user_movie_data = pd.concat([user_movie_data, user_top_n_movies, user_recent_n_movies])

user_movie_data.drop_duplicates(inplace=True)
user_movie_data.reset_index(drop=True, inplace=True)
user_movie_data.head(10)

Unnamed: 0,user_id,movie_id,rating,timestamp,movie_title,genres,avg_rating,type
0,1,1,5,874965758,Toy Story (1995),"Animation, Children",5.0,top
1,1,119,5,876893098,Maya Lin: A Strong Clear Vision (1994),unknown,5.0,top
2,1,172,5,874965478,"Empire Strikes Back, The (1980)","Action, Adventure, Romance, Sci-Fi, War",5.0,top
3,1,171,5,889751711,Delicatessen (1991),Sci-Fi,5.0,top
4,1,170,5,876892856,Cinema Paradiso (1988),Romance,5.0,top
5,1,102,2,889751736,"Aristocats, The (1970)","Animation, Children",2.0,recent
6,1,74,1,889751736,Faster Pussycat! Kill! Kill! (1965),Action,1.0,recent
7,1,256,4,889751712,When the Cats Away (Chacun cherche son chat) (...,Romance,4.0,recent
8,1,5,3,889751712,Copycat (1995),Thriller,3.0,recent
9,1,111,5,889751711,"Truth About Cats & Dogs, The (1996)",Romance,5.0,recent


## Basic prompt

In [26]:
prompt_1 = '''
I am user {user_id}, My top rated movies are {top_movies}. Out of 5, I rated them {rates} repectively. 
Recommend me 10 movies similar to these but I haven't watched yet. The format of the response should be a list of movie titles.
Please strictly follow the format: use brackets around the movies you recommend and separate the titles by new lines so I can easily parse them.
(Format Example: Here are the 10 movies recommended for you: [Midnight Cowboy (1969){new_line}Lost in Translation (2003){new_line}etc.]) 
Answer: 
'''

In [29]:
prompts_1 = []
for user_id in user_movie_data['user_id'].unique():
    user_movies = user_movie_data[user_movie_data['user_id']==user_id]
    top_movies = user_movies[user_movies['type']=='top']
    rates = top_movies['avg_rating'].values
    top_movies = top_movies['movie_title'].values
    top_movies = ', '.join(top_movies)
    rates = ', '.join([str(rate) for rate in rates])
    new_line = '\n'
    prompt = prompt_1.format(user_id=user_id, top_movies=top_movies, rates=rates, new_line=new_line)
    prompts_1.append(prompt)


In [30]:
""" Number of users to run the recommender on
    (Use a small sample for faster results)
    If you want to run the recommender on all users, uncomment the last line
"""
num_users = 5

# Uncomment below to run on all users
# num_users = user_movies.shape[0]

In [35]:
# call the OpenAI API
responses_1 = []
for i in range(num_users):
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages= [{ 'role':'user','content' : prompts_1[i]}],
        temperature=0.5,
    )
    responses_1.append(response)
    

In [36]:
responses_1

[ChatCompletion(id='chatcmpl-9iScfEEBeTpfHeSYE0BGUIKCiZj8N', choices=[Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content="Here are 10 movies recommended for you: \n[The Shawshank Redemption (1994)\nForrest Gump (1994)\nSchindler's List (1993)\nThe Green Mile (1999)\nThe Godfather (1972)\nGoodfellas (1990)\nPulp Fiction (1994)\nThe Silence of the Lambs (1991)\nThe Matrix (1999)\nInception (2010)]", role='assistant', function_call=None, tool_calls=None), logprobs=None)], created=1720383485, model='gpt-3.5-turbo-0125', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=87, prompt_tokens=183, total_tokens=270)),
 ChatCompletion(id='chatcmpl-9iScgYYSrUws5IIIAkDgDTLJI6DEN', choices=[Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content="Here are the 10 movies recommended for you: \n[Pulp Fiction (1994)\nForrest Gump (1994)\nThe Shawshank Redemption (1994)\nAmerican Beauty (1999)\nEternal Sunshine of the Sp

In [38]:
responses_1_recommendations = [response.choices[0].message.content.split("[")[1].split("]")[0].split("\n") for response in responses_1]

In [51]:
# hit rate for the recommendations
def hit_rate(recommendation, actual):
    hit = 0
    for rec in recommendation:
        if rec in actual:
            hit += 1
    return hit/len(recommendation)

# NDCG for the recommendations
def ndcg(recommendation, actual):
    dcg = 0
    for i, rec in enumerate(recommendation):
        if rec in actual:
            dcg += 1/np.log2(i+2)
    idcg = 0
    for i in range(len(actual)):
        idcg += 1/np.log2(i+2)
    return dcg/idcg

In [52]:
hit_rates = []
ndcgs = []
for i in range(num_users):
    user_id = user_movie_data['user_id'].unique()[i]
    user_movies = movie_100k[movie_100k['user_id']==user_id]
    user_watched_movies = user_movies['movie_title'].values
    print(f"User {user_id} watched movies: {user_watched_movies}")
    recommendation = responses_1_recommendations[i]
    print(f"Recommendation: {recommendation}")
    hit_rate_ = hit_rate(recommendation, user_watched_movies)
    print(f"Hit Rate: {hit_rate_}")
    hit_rates.append(hit_rate_)
    ndcg_ = ndcg(recommendation, user_watched_movies)
    print(f"NDCG: {ndcg_}")
    ndcgs.append(ndcg_)

User 1 watched movies: ['Toy Story (1995)' 'GoldenEye (1995)' 'Four Rooms (1995)'
 'Get Shorty (1995)' 'Copycat (1995)'
 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)'
 'Twelve Monkeys (1995)' 'Babe (1995)' 'Dead Man Walking (1995)'
 'Richard III (1995)' 'Seven (Se7en) (1995)' 'Usual Suspects, The (1995)'
 'Mighty Aphrodite (1995)' 'Postino, Il (1994)'
 "Mr. Holland's Opus (1995)" 'French Twist (Gazon maudit) (1995)'
 'From Dusk Till Dawn (1996)' 'White Balloon, The (1995)'
 "Antonia's Line (1995)" 'Angels and Insects (1995)'
 'Muppet Treasure Island (1996)' 'Braveheart (1995)' 'Taxi Driver (1976)'
 'Rumble in the Bronx (1995)' 'Birdcage, The (1996)'
 'Brothers McMullen, The (1995)' 'Bad Boys (1995)' 'Apollo 13 (1995)'
 'Batman Forever (1995)' 'Belle de jour (1967)' 'Crimson Tide (1995)'
 'Crumb (1994)' 'Desperado (1995)' 'Doom Generation, The (1995)'
 'Free Willy 2: The Adventure Home (1995)' 'Mad Love (1995)'
 'Nadja (1994)' 'Net, The (1995)' 'Strange Days (1995)'
 'To Wong F

In [56]:
np.mean(hit_rates)

0.1

In [57]:
np.mean(ndcgs)

0.018336780675374344