In [5]:
import pandas as pd
import sqlite3
import random
from backend.data_models.config import DATABASE_URL
from sklearn.metrics.pairwise import cosine_similarity
pd.set_option('display.max_colwidth', None)

def get_db_connection():
    return sqlite3.connect('/Users/ardsnijders/Documents/cineville_scraper/backend/db.sqlite3', check_same_thread=False)

In [6]:
def get_movies():
    conn = get_db_connection()
    query = """
        SELECT *
        FROM movies
    """

    df = pd.read_sql(query, conn)
    conn.close()
    # df["formatted_day"] = df["show_datetime"].dt.strftime("%A (%b %d)")
    # df["title"] = df["title"].str.title()
    # df["cinema"] = df["cinema"].str.title()
    return df

In [8]:
df = get_movies()

In [13]:
import pandas as pd
import numpy as np
import ast
import openai
from api_key import API_KEY
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

class OpenAIMovieEmbedder:
    def __init__(self, df, openai_api_key):
        self.df = df
        self.client = openai.OpenAI(api_key=openai_api_key)  # Correct new API usage

    @staticmethod
    def safe_concat(*args):
        """
        Safely concatenates multiple text fields, ignoring NaN/None values.
        """
        return " ".join(str(arg) for arg in args if pd.notna(arg) and arg)

    @staticmethod
    def parse_keywords(keywords):
        """
        Safely parse keywords stored as a string representation of a list.
        """
        try:
            parsed = ast.literal_eval(keywords)
            return " ".join(parsed) if isinstance(parsed, list) else ""
        except (ValueError, SyntaxError):
            return ""

    def prepare_text(self):
        """
        Prepare the text to be embedded, ensuring safe parsing.
        """
        def safe_parse_list(val):
            if isinstance(val, str):  # Only parse if it's a string
                try:
                    parsed = ast.literal_eval(val)
                    return ", ".join(parsed) if isinstance(parsed, list) else ""
                except (ValueError, SyntaxError):
                    return ""
            return ""  # Return empty string for non-strings

        self.df["text_to_embed"] = self.df.apply(
            lambda row: f"Title: {row.get('title', '')} "
                        f"Director: {row.get('director', '')} "
                        f"Genres: {safe_parse_list(row.get('genres', ''))} "
                        f"Plot: {row.get('plot', '')} "
                        f"Keywords: {self.parse_keywords(row.get('keywords', ''))}",
            axis=1,
        )

    def generate_embeddings(self):
        """
        Generate OpenAI embeddings for each movie and store them in the DataFrame.
        """

        def get_embedding(text):
            response = self.client.embeddings.create(
                model="text-embedding-ada-002",
                input=text
            )
            return response.data[0].embedding  # Updated response structure

        tqdm.pandas(desc="Embedding movies with OpenAI")
        self.df["embedding"] = self.df["text_to_embed"].progress_apply(get_embedding)

    def save_embeddings(self, output_path):
        """
        Save embeddings to a CSV file.
        """
        self.df.to_csv(output_path, index=False)
        print(f"✅ Movie dataset embedded and saved to {output_path}")

    def load_embeddings(self, csv_path):
        """
        Load embedded movie data.
        """
        self.df = pd.read_csv(csv_path)
        self.df["embedding"] = self.df["embedding"].apply(lambda x: np.array(eval(x)))

    def get_mood_recommendations(self, user_query, top_k=5, rerank_top_n=20):
        """
        Retrieves movies using hybrid search (embedding similarity + keyword matching).
        """
        if "embedding" not in self.df.columns:
            raise ValueError("❌ Movie embeddings are not loaded. Run `load_embeddings()` first.")

        # 1️⃣ **Semantic Search (Embedding Similarity)**
        query_embedding = openai.Embedding.create(
            model="text-embedding-ada-002",
            input=user_query
        )["data"][0]["embedding"]

        similarities = cosine_similarity([query_embedding], np.stack(self.df["embedding"].values))[0]
        self.df["similarity"] = similarities

        # 2️⃣ **Keyword Matching (Simple Hybrid Search)**
        keyword_matches = self.df["text_to_embed"].str.contains(user_query, case=False, na=False).astype(int)
        self.df["keyword_score"] = keyword_matches * 0.1  # Weight keyword match lower than embeddings

        # 3️⃣ **Select Top Candidates**
        self.df["hybrid_score"] = self.df["similarity"] + self.df["keyword_score"]
        candidates = self.df.nlargest(rerank_top_n, "hybrid_score")[["title", "plot", "hybrid_score"]]

        # 4️⃣ **Reranking with GPT-4 Turbo**
        rerank_input = "\n\n".join([
            f"{i+1}. Title: {row['title']}\nPlot: {row['plot']}"
            for i, row in candidates.iterrows()
        ])
        prompt = f"Given the query '{user_query}', rank these movies by relevance:\n{rerank_input}"

        response = openai.ChatCompletion.create(
            model="gpt-4-turbo",
            messages=[{"role": "system", "content": "You are a movie recommendation assistant."},
                      {"role": "user", "content": prompt}]
        )

        rerank_output = response["choices"][0]["message"]["content"]
        ranked_titles = [line.split(". ")[1] for line in rerank_output.split("\n") if ". " in line]

        # 5️⃣ **Return Top Results**
        results = candidates[candidates["title"].isin(ranked_titles)].head(top_k)
        return results[["title", "plot"]]

In [14]:
# Initialize OpenAI-based embedder (replace with your actual API key)
embedder = OpenAIMovieEmbedder(df, openai_api_key=API_KEY)

# Step 1: Prepare and embed data
embedder.prepare_text()
embedder.generate_embeddings()
embedder.save_embeddings("movies_with_openai_embeddings.csv")

Embedding movies with OpenAI:   1%|          | 1/157 [00:03<08:01,  3.09s/it]


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [None]:
# Step 2: Load embeddings and run queries
embedder.load_embeddings("movies_with_openai_embeddings.csv")
output = embedder.get_mood_recommendations("Dark, mysterious, dangerous, threatening", top_k=10)
print(output)

In [128]:
# Step 2: Load embeddings and run queries
embedder.load_embeddings("movies_with_embeddings.csv")
output = embedder.get_mood_recommendations("Beauty, relationships, drama, sexy, romantic", top_k=20)
output.head(20)

Unnamed: 0,title,plot
5,bridget jones: mad about the boy,"Bridget Jones navigates life as a widow and single mum with the help of her family, friends, and former lover, Daniel. Back to work and on the apps, she&apos;s pursued by a younger man and maybe - just maybe - her son&apos;s science teacher."
75,companion,A weekend getaway with friends at a remote cabin turns into chaos after it&apos;s revealed that one of the guests is not what they seem.
53,belle de jour,A frigid young housewife decides to spend her midweek afternoons as a prostitute.
15,the discreet charm of the bourgeoisie,"A surreal, virtually plotless series of dreams centered around six middle-class people and their consistently interrupted attempts to have a meal together."
137,polarized,"Lisa, an aspiring songwriter, whose farming family has suffered foreclosure is forced to work at a new, &apos;urban farm&apos; where she meets Dalia. Her casual racism leads her to be fired but the women end up drawn into a passionate affair."
7,hard truths,Ongoing exploration of the contemporary world with a tragicomic study of human strengths and weaknesses.
61,kasaba,The story of a family living in a small godforsaken town in Turkey seen through the eyes of children and dealing with the growing complexity when one becomes an adult.
141,romeo + juliet,Shakespeare&apos;s famous play is updated to the hip modern suburb of Verona still retaining its original dialogue.
48,when the light breaks,"Una grapples with grief while harboring a secret, unable to fully express her emotions, as she navigates challenging events swirling around her."
117,juf braaksel en de geniale ontsnapping,


In [122]:
embedder.df.iloc[26].text_to_embed

'Title: no other land Director: ["Yuval Abraham", "Basel Adra", "Hamdan Ballal"] Genres: Documentary, Back to top Plot: This film made by a Palestinian-Israeli collective shows the destruction of the occupied West Bank&apos;s Masafer Yatta by Israeli soldiers and the alliance which develops between the Palestinian activist Basel and Israeli journalist Y... Keywords: israeli occupation,house destruction,journalist,destruction,destruction of school'