In [2]:
import pandas as pd
import sqlite3
import random
from backend.data_models.config import DATABASE_URL
from sklearn.metrics.pairwise import cosine_similarity
pd.set_option('display.max_colwidth', None)

def get_db_connection():
    return sqlite3.connect('/Users/ardsnijders/Documents/cineville_scraper/backend/db.sqlite3', check_same_thread=False)

In [3]:
def get_movies():
    conn = get_db_connection()
    query = """
        SELECT *
        FROM movies
    """

    df = pd.read_sql(query, conn)
    conn.close()
    # df["formatted_day"] = df["show_datetime"].dt.strftime("%A (%b %d)")
    # df["title"] = df["title"].str.title()
    # df["cinema"] = df["cinema"].str.title()
    return df

In [4]:
df = get_movies()


In [61]:
import pandas as pd
import numpy as np
import ast
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, CrossEncoder
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi  # BM25 for lexical search
import html


class MovieEmbedder:
    def __init__(
        self,
        df,
        embed_model="sentence-transformers/all-mpnet-base-v2",
        rerank_model="BAAI/bge-reranker-large",
        hybrid_weight=0.5,
    ):  
        """Initialize the MovieEmbedder class."""
        self.embed_model = SentenceTransformer(embed_model)
        self.rerank_model = CrossEncoder(rerank_model)
        self.df = df
        self.hybrid_weight = hybrid_weight  
        self.bm25 = None  

    @staticmethod
    def parse_keywords(keywords):
        """Safely parse keyword lists stored as strings."""
        try:
            parsed = ast.literal_eval(keywords)
            return " ".join(parsed) if isinstance(parsed, list) else ""
        except (ValueError, SyntaxError):
            return ""

    def prepare_text(self):
        """Prepare structured text for embeddings with better formatting. Drops rows where 'plot' is NaN."""
        
        def safe_parse_list(value):
            """Safely parse a list-like string or return an empty list."""
            if pd.isna(value) or not isinstance(value, str):
                return []
            try:
                parsed = ast.literal_eval(value)
                return parsed if isinstance(parsed, list) else []
            except (ValueError, SyntaxError):
                return []

        def clean_director(director):
            """Clean director field, removing list-like artifacts."""
            if isinstance(director, str):
                try:
                    parsed = ast.literal_eval(director)
                    if isinstance(parsed, list):
                        return ", ".join(parsed)  # Convert list to a clean string
                except (ValueError, SyntaxError):
                    pass
            return director.strip()

        def format_entry(row):
            """Format text dynamically for better embeddings."""
            parts = []

            # Movie title and genre
            title = row.get("title", "").strip()
            genres = [g for g in safe_parse_list(row.get("genres", "")) if g.lower() != "back to top"]  # Remove "Back to top"
            if title and genres:
                parts.append(f"{title.title()} is a {', '.join(genres)} film.")
            elif title:
                parts.append(f"{title.title()} is a film.")
            elif genres:
                parts.append(f"This is a {', '.join(genres)} film.")

            # Director and actors
            director = clean_director(row.get("director", ""))
            actors = safe_parse_list(row.get("actors", ""))
            if director and actors:
                parts.append(f"It is directed by {director} and stars {', '.join(actors)}.")
            elif director:
                parts.append(f"It is directed by {director}.")
            elif actors:
                parts.append(f"It stars {', '.join(actors)}.")

            # Rating
            rating = row.get("rating", "")
            rating_count = row.get("rating_count", "")
            if rating and rating_count:
                parts.append(f"The movie has a rating of {rating} based on {rating_count} reviews.")

            # Plot (this is required, so it should not be NaN)
            plot = str(row.get("plot", "")).strip()  # Convert NaN to an empty string
            if plot:
                parts.append(f"Plot: {html.unescape(plot)}")  # Decode HTML entities
            else:
                return None  # If plot is missing, remove this row

            # Keywords
            keywords = self.parse_keywords(row.get("keywords", ""))
            if keywords:
                parts.append(f"Important themes include: {keywords}.")

            return " ".join(parts)

        # Drop rows where 'plot' is NaN before applying transformations
        self.df = self.df.dropna(subset=["plot"])

        # Apply formatting to each row, filtering out any None results
        self.df["text_to_embed"] = self.df.apply(format_entry, axis=1)
        self.df = self.df.dropna(subset=["text_to_embed"])  # Drop any rows where formatting failed


        # Drop rows where 'plot' is NaN before applying transformations
        self.df = self.df.dropna(subset=["plot"])

        # Apply formatting to each row, filtering out any None results
        self.df["text_to_embed"] = self.df.apply(format_entry, axis=1)
        self.df = self.df.dropna(subset=["text_to_embed"])  # Drop any rows where formatting failed


    def generate_embeddings(self):
        """Generate sentence embeddings and store them in the DataFrame."""
        tqdm.pandas(desc="Embedding movies")
        self.df["embedding"] = self.df["text_to_embed"].progress_apply(
            lambda x: self.embed_model.encode(x).tolist()
        )

    def save_embeddings(self, output_path):
        """Save the DataFrame with embeddings to a CSV file."""
        self.df.to_csv(output_path, index=False)
        print(f"✅ Movie dataset saved to {output_path}")

    def load_embeddings(self, csv_path):
        """Load movie embeddings from a CSV file."""
        self.df = pd.read_csv(csv_path)
        self.df["embedding"] = self.df["embedding"].apply(lambda x: np.array(eval(x)))

    def get_mood_recommendations(self, user_query, top_k=5, rerank_top_n=20):
        """
        Retrieves movies using embedding similarity and reranks them with a cross-encoder.
        Now reranks using `text_to_embed` for consistency.
        """
        if self.df is None or "embedding" not in self.df.columns:
            raise ValueError("❌ Movie embeddings are not loaded. Run `load_embeddings()` first.")

        # Encode user query
        query_embedding = self.embed_model.encode(user_query)

        # Compute cosine similarity
        similarities = cosine_similarity([query_embedding], np.stack(self.df["embedding"].values))[0]
        self.df["similarity"] = similarities

        # Select top N candidates for reranking
        candidates = self.df.nlargest(rerank_top_n, "similarity")

        # Use `text_to_embed` for reranking (ensures consistency)
        query_movie_pairs = [(user_query, text) for text in candidates["text_to_embed"].tolist()]
        rerank_scores = self.rerank_model.predict(query_movie_pairs)

        # Update candidates with rerank scores and return top_k results
        candidates["rerank_score"] = rerank_scores
        results = candidates.nlargest(top_k, "rerank_score")[["title", "plot", "text_to_embed"]]

        return results



In [62]:
# Example Usage
embedder = MovieEmbedder(df, 
                         embed_model="sentence-transformers/all-mpnet-base-v2",
                         rerank_model="cross-encoder/ms-marco-MiniLM-L-12-v2")

# Step 1: Load and process data
embedder.prepare_text()
embedder.generate_embeddings()
embedder.save_embeddings("movies_with_embeddings.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df["text_to_embed"] = self.df.apply(format_entry, axis=1)
Embedding movies: 100%|██████████| 152/152 [00:20<00:00,  7.57it/s]


✅ Movie dataset saved to movies_with_embeddings.csv


In [63]:
# Step 2: Load embeddings and run queries
embedder.load_embeddings("movies_with_embeddings.csv")
output = embedder.get_mood_recommendations("Supernatural, fun and comical", top_k=5)
output
# embedder.df[['title', 'plot', 'text_to_embed', 'similarity']].sort_values(by='similarity', ascending=False).head(10)

Unnamed: 0,title,plot,text_to_embed
125,wicked (ov),"Elphaba, a young woman ridiculed for her green skin, and Galinda, a popular girl, become friends at Shiz University in the Land of Oz. After an encounter with the Wonderful Wizard of Oz, their friendship reaches a crossroads.","Wicked (Ov) is a Fairy Tale, Pop Musical, Fantasy, Musical, Romance film. It is directed by Jon M. Chu and stars Cynthia Erivo, Ariana Grande, Jeff Goldblum. The movie has a rating of 7.6 based on 132210.0 reviews. Plot: Elphaba, a young woman ridiculed for her green skin, and Galinda, a popular girl, become friends at Shiz University in the Land of Oz. After an encounter with the Wonderful Wizard of Oz, their friendship reaches a crossroads. Important themes include: wicked witch of the west character,wizard of oz character,magic,glinda the good character,bullying."
85,nosferatu,"A gothic tale of obsession between a haunted young woman and the terrifying vampire infatuated with her, causing untold horror in its wake.","Nosferatu is a Dark Fantasy, Supernatural Horror, Vampire Horror, Fantasy, Horror, Mystery film. It is directed by Robert Eggers and stars Lily-Rose Depp, Nicholas Hoult, Bill Skarsgård. The movie has a rating of 7.3 based on 163204.0 reviews. Plot: A gothic tale of obsession between a haunted young woman and the terrifying vampire infatuated with her, causing untold horror in its wake. Important themes include: female nudity,vampire,vampire horror,dark fantasy,gothic horror."
86,twin peaks: fire walk with me,"Laura Palmer&apos;s harrowing final days are chronicled one year after the murder of Teresa Banks, a resident of Twin Peaks&apos; neighboring town.","Twin Peaks: Fire Walk With Me is a Erotic Thriller, Psychological Drama, Psychological Horror, Psychological Thriller, Supernatural Horror, Suspense Mystery, Teen Drama, Teen Horror, Tragedy, Drama film. It is directed by David Lynch and stars Sheryl Lee, Ray Wise, Mädchen Amick. The movie has a rating of 7.3 based on 111041.0 reviews. Plot: Laura Palmer's harrowing final days are chronicled one year after the murder of Teresa Banks, a resident of Twin Peaks' neighboring town. Important themes include: incest,nonlinear timeline,double life,rape,surrealism."
36,chungking express,"Two melancholic Hong Kong policemen fall in love: one with a mysterious female underworld figure, the other with a beautiful and ethereal waitress at a late-night restaurant he frequents.","Chungking Express is a Comedy, Crime, Drama, Mystery, Romance film. It is directed by Wong Kar-Wai and stars Brigitte Lin, Takeshi Kaneshiro, Tony Leung Chiu-wai. The movie has a rating of 8.0 based on 102137.0 reviews. Plot: Two melancholic Hong Kong policemen fall in love: one with a mysterious female underworld figure, the other with a beautiful and ethereal waitress at a late-night restaurant he frequents. Important themes include: drug smuggling,hong kong,daydream,loneliness,spoken inner thoughts."
122,the monkey,"When twin brothers Bill and Hal find their father&apos;s old monkey toy in the attic, a series of gruesome deaths start. The siblings decide to throw the toy away and move on with their lives, growing apart over the years.","The Monkey is a Dark Comedy, Splatter Horror, Horror film. It is directed by Osgood Perkins and stars Theo James, Tatiana Maslany, Christian Convery. The movie has a rating of 6.5 based on 10224.0 reviews. Plot: When twin brothers Bill and Hal find their father's old monkey toy in the attic, a series of gruesome deaths start. The siblings decide to throw the toy away and move on with their lives, growing apart over the years. Important themes include: gruesome deaths,twin brothers,toy,toy monkey,monkey."


In [21]:
embedder.df.iloc[75].text_to_embed

'Title: companion Director: ["Drew Hancock"] Actors: ["Sophie Thatcher", "Jack Quaid", "Lukas Gage"] Rating: 7.1 Rating Count: 42446.0 Genres: Artificial Intelligence, Dark Comedy, Psychological Thriller, Sci-Fi, Thriller, Back to top Plot: A weekend getaway with friends at a remote cabin turns into chaos after it&apos;s revealed that one of the guests is not what they seem. Keywords: gay,psychological thriller,attempted rape,robot human relationship,one word title'