# Film Finder GPT featuring RAG + Reflection
###### CSTU GPT Application with professor Yang Sun
###### Final Project 8/31/2024
###### Team 4

- Justin Chen
- Xu Liu

#### Set up initial movie dataframe

In [3]:
import pandas
import ast

df = pandas.read_csv('movies_metadata.csv', low_memory=False) # movies_metadata.csv from https://www.kaggle.com/datasets/rohan4050/movie-recommendation-data

df = df[['title', 'vote_average', 'release_date', 'genres', 'overview']]
df = df.rename(columns={'vote_average': 'rating'})

df = df[df.rating > 0.0]
df = df.dropna().reset_index(drop=True)

df.genres = df.genres.apply(ast.literal_eval)
df.genres = df.genres.apply(lambda x: [genre['name'] for genre in x])

n_movies = 2 ** 14 # use subset of movies to save time and money on embeddings
df = df[:n_movies]

print('initial dataframe:')
df.tail(3)

initial dataframe:


Unnamed: 0,title,rating,release_date,genres,overview
16381,Elite Squad: The Enemy Within,7.5,2010-10-08,"[Drama, Action, Crime]",After a bloody invasion of the BOPE in the Hig...
16382,Melissa P.,4.1,2005-11-18,[Drama],"An adolescent girl, living with her mother and..."
16383,Mesrine: Public Enemy #1,7.3,2008-11-19,"[Action, Thriller, Crime, Drama]","The story of Jacques Mesrine, France's public ..."


#### Add "movie_text" and "n_tokens" columns to dataframe

In [5]:
import tiktoken

compose_movie_text = lambda row: f"Title: {row.title}. " \
    f"Rating: {row.rating}. " \
    f"Release Date: {row.release_date}. " \
    f"Genres: {', '.join(row.genres)}. " \
    f"Overview: {row.overview}"

df['movie_text'] = df.apply(compose_movie_text, axis=1)

embedding_model = "text-embedding-3-small"
encoding = tiktoken.encoding_for_model(embedding_model)

df['n_tokens'] = df.movie_text.map(lambda x: len(encoding.encode(x)))

max_tokens = 8000  # text-embedding-3-small input limit is 8191 tokens
df = df[df.n_tokens <= max_tokens].reset_index(drop=True)

print('updated dataframe with "movie_text" and "n_tokens" columns:')
df.tail(3)

updated dataframe with "movie_text" and "n_tokens" columns:


Unnamed: 0,title,rating,release_date,genres,overview,movie_text,n_tokens
16381,Elite Squad: The Enemy Within,7.5,2010-10-08,"[Drama, Action, Crime]",After a bloody invasion of the BOPE in the Hig...,Title: Elite Squad: The Enemy Within. Rating: ...,129
16382,Melissa P.,4.1,2005-11-18,[Drama],"An adolescent girl, living with her mother and...",Title: Melissa P.. Rating: 4.1. Release Date: ...,55
16383,Mesrine: Public Enemy #1,7.3,2008-11-19,"[Action, Thriller, Crime, Drama]","The story of Jacques Mesrine, France's public ...",Title: Mesrine: Public Enemy #1. Rating: 7.3. ...,96


#### Get embedding vector for each movie_text (in maximum size batches)

In [7]:
import os
import time

import openai

client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

token_limit = 142000 # 150,000 max tokens per batch
embeddings = []

n_movies = len(df)
i = 0

while i < n_movies:
    token_count = 0
    i_start = i
    
    while token_count < token_limit and i < n_movies:
        token_count += df.n_tokens[i]
        i += 1
    
    batch_movie_texts = df.movie_text[i_start:i]
    batch_response = client.embeddings.create(input=batch_movie_texts, model=embedding_model)
    batch_embeddings = list(map(lambda x: x.embedding, batch_response.data))
    embeddings.extend(batch_embeddings)
 
    print(f'{len(embeddings)} embeddings saved so far. | {len(batch_embeddings)} movie texts embedded this batch, costing {token_count} tokens.')
    time.sleep(5.0)
    
print(f'finished getting all {i} embeddings.')

df['embeddings'] = embeddings
df.to_pickle(f'movies-{n_movies}-{int(time.time())}.pkl') # make hard copy of completed DataFrame (in case run out of OpenAI API requests)

print('completed dataframe with "embeddings" column:')
df.tail(3)

1378 embeddings saved so far. | 1378 movie texts embedded this batch, costing 142004 tokens.
2753 embeddings saved so far. | 1375 movie texts embedded this batch, costing 142013 tokens.
4162 embeddings saved so far. | 1409 movie texts embedded this batch, costing 142045 tokens.
5544 embeddings saved so far. | 1382 movie texts embedded this batch, costing 142027 tokens.
6924 embeddings saved so far. | 1380 movie texts embedded this batch, costing 142075 tokens.
8297 embeddings saved so far. | 1373 movie texts embedded this batch, costing 142129 tokens.
9632 embeddings saved so far. | 1335 movie texts embedded this batch, costing 142102 tokens.
11029 embeddings saved so far. | 1397 movie texts embedded this batch, costing 142014 tokens.
12452 embeddings saved so far. | 1423 movie texts embedded this batch, costing 142068 tokens.
13835 embeddings saved so far. | 1383 movie texts embedded this batch, costing 142151 tokens.
15196 embeddings saved so far. | 1361 movie texts embedded this bat

Unnamed: 0,title,rating,release_date,genres,overview,movie_text,n_tokens,embeddings
16381,Elite Squad: The Enemy Within,7.5,2010-10-08,"[Drama, Action, Crime]",After a bloody invasion of the BOPE in the Hig...,Title: Elite Squad: The Enemy Within. Rating: ...,129,"[-0.009515637531876564, 0.02617114782333374, 0..."
16382,Melissa P.,4.1,2005-11-18,[Drama],"An adolescent girl, living with her mother and...",Title: Melissa P.. Rating: 4.1. Release Date: ...,55,"[0.03766604885458946, 0.040201932191848755, -0..."
16383,Mesrine: Public Enemy #1,7.3,2008-11-19,"[Action, Thriller, Crime, Drama]","The story of Jacques Mesrine, France's public ...",Title: Mesrine: Public Enemy #1. Rating: 7.3. ...,96,"[-0.023602256551384926, 0.02166825160384178, -..."


#### Save point you can load by uncommenting code and running a fresh kernel starting from this cell
##### Loads serialized dataframe from disk into memory to skip embedding generation and OpenAI API calls

In [9]:
# import pandas
# import openai

# df = pandas.read_pickle('movies-16384-1724889754.pkl')
# client = openai.OpenAI()

# embedding_model = "text-embedding-3-small"
# embeddings = df.embeddings.tolist()

# df.tail()

#### Load (text, embedding) pairs into FAISS vectorstore

In [11]:
import faiss

from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.docstore.in_memory import InMemoryDocstore

llm = ChatOpenAI(model="gpt-4o-mini")

index = faiss.IndexFlatL2(len(embeddings[0]))
vector_store = FAISS(
    embedding_function=OpenAIEmbeddings(),
    index=index,
    docstore= InMemoryDocstore(),
    index_to_docstore_id={}
)

texts = df.movie_text.tolist()
text_embedding_pairs = zip(texts, embeddings)

vector_store.add_embeddings(text_embedding_pairs)

print(vector_store.index.d, vector_store.index.ntotal)

1536 16384


#### Set up RAG chain which takes the user request and retrieves associated movie texts as context
##### Puts request and context into a prompt template, converting it to a real prompt
###### Returns the prompt's message list which is used as our initial state for beginning LangGraph execution (a list of 1 HumanMessage)

In [13]:
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

n_retrieved, n_recommended = 6, 2

initial_prompt_template = ChatPromptTemplate([
    ("human",
        "request: {request}\n\n"\
        "context in the form of " + str(n_retrieved) + " movie descriptions:\n"\
        "{context}\n\n"\
        "Your response: ")
])

retriever = vector_store.as_retriever(search_kwargs={'k': n_retrieved})
format_docs = lambda docs: "\n\n".join(doc.page_content for doc in docs)

rl_get_initial_state = RunnableLambda(lambda prompt_value: prompt_value.messages)

rag_chain = (
    {"request": RunnablePassthrough(), "context": retriever | format_docs}
    | initial_prompt_template
    | rl_get_initial_state
)

rag_chain.invoke('some funny comedies')[0].pretty_print()


request: some funny comedies

context in the form of 6 movie descriptions:
Title: Daft Punk's Electroma. Rating: 6.5. Release Date: 2006-03-24. Genres: Science Fiction. Overview: Follows the history of two robots, the members of Daft Punk, on their quest to become human.

Title: The magnetic man. Rating: 5.3. Release Date: 2009-10-16. Genres: Documentary, Music, Foreign. Overview: A look at the life and music of Pekka Streng who died at the age of 26 in 1975.

Title: Dirty Deeds. Rating: 5.8. Release Date: 2005-08-26. Genres: Comedy, Drama. Overview: An American Pie-like teen comedy in which a high school senior tries to become the first student in years to complete the Dirty Deeds, an outrageous series of challenges that must be completed by the Homecoming banquet at 9 a.m.

Title: Crank: High Voltage. Rating: 5.9. Release Date: 2009-04-16. Genres: Action, Thriller, Crime. Overview: Chelios faces a Chinese mobster who has stolen his nearly indestructible heart and replaced it with a 

#### Set up generate and reflect chains to create movie recommendation and reflection articles

In [15]:
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage, BaseMessage

gen_prompt = ChatPromptTemplate([
    SystemMessage(content=f"""You are an expert at recommending movies to users.
You will receive a request from a user describing the type of movie the user would like to watch. 
You will also receive context in the form of {n_retrieved} movie descriptions corresponding to the user's request.
Respond to the request by recommending the user {n_recommended} movies from the {n_retrieved} movie descriptions, along with your reasons for choosing these movies.
Also mention which out of these {n_recommended} movies you recommend the most.
If you are provided critique about your movie recommendation article, then revise it by incorporating the provided feedback.
Try to raise the score from the critique messages by as much as possible"""),
    MessagesPlaceholder(variable_name="messages"),
])

ref_prompt = ChatPromptTemplate([
    SystemMessage(content="""You are the editor in chief of a highly acclaimed pop culture magazine.
You will be be given an AI generated message that recommends movies for a particular request.
Score the movie recommendation message from 1 to 10 based on its overall quality, along with your reasoning.
Suggest the three best ways to improve the score without changing which movies were recommended."""),
    MessagesPlaceholder(variable_name="messages"),
])

generate = gen_prompt | llm
reflect = ref_prompt | llm

#### Define graph structure to automate reflection loop, finishing once the original recommendation article is revised 3 times

In [17]:
from langgraph.graph import END, MessageGraph
from typing import List, Sequence

def generation_node(state: Sequence[BaseMessage]):
    return generate.invoke({"messages": state})

def reflection_node(state: Sequence[BaseMessage]):
    res = reflect.invoke({"messages": state})
    return HumanMessage(content=res.content)

# Stop after the third revision.
# Review the state message history, giving special attention to the progression of scores in the reflection articles
def should_continue(state: List[BaseMessage]):
    if len(state) >= 8:
    	return END
    return "reflect"

builder = MessageGraph()
builder.add_node("generate", generation_node)
builder.add_node("reflect", reflection_node)

builder.add_conditional_edges("generate", should_continue)
builder.add_edge("reflect", "generate")
builder.set_entry_point("generate")

graph = builder.compile()

#### Compare initial and final state after traversing through the MessageGraph                                                       

In [19]:
request = 'Recommend me something that will take me on an emotional rollercoaster.'

initial_state = rag_chain.invoke(request)

print(f'initial number of messages in state: {len(initial_state)}')

initial number of messages in state: 1


In [20]:
final_state = graph.invoke(initial_state)

print(f'final number of messages in state: {len(final_state)}')

for msg in final_state:
    msg.pretty_print()

final number of messages in state: 8

request: Recommend me something that will take me on an emotional rollercoaster.

context in the form of 6 movie descriptions:
Title: Moulin Rouge. Rating: 6.5. Release Date: 1952-12-23. Genres: Drama, Music. Overview: Fictional account of French artist Henri de Toulouse-Lautrec.

Title: Daft Punk's Electroma. Rating: 6.5. Release Date: 2006-03-24. Genres: Science Fiction. Overview: Follows the history of two robots, the members of Daft Punk, on their quest to become human.

Title: The Tall Guy. Rating: 5.7. Release Date: 1989-02-01. Genres: Comedy, Romance. Overview: An American actor in England tries to find love and work.

Title: Tenure. Rating: 6.1. Release Date: 2009-10-24. Genres: Comedy. Overview: Despite his outstanding intellect, associate professor Charlie Thurber is a chronic underachiever and has never received university tenure. Aided by his nutty best friend, Charlie launches a final effort to make the grade at Gray College. But a bea