Import the libraries

In [19]:
import json
import sqlite3
import pandas as pd
from langchain_openai import ChatOpenAI
import os
import re
import chromadb
from openai import OpenAI
from langchain_community.callbacks import get_openai_callback
from langchain.schema import HumanMessage
from IPython.display import clear_output

Connect to sqlite db

In [20]:
cols = ['showname','first_airing','imdb','lang','description']
def fetch_rows(limit: int, offset: int):
    conn = sqlite3.connect("movie_db.sqlite")
    cursor = conn.cursor()
    executor = cursor.execute(f'select {",".join(cols)} from tvmaze where metadata is null and description is not null order by showname asc limit {limit} offset {offset}')
    data = executor.fetchall()
    cursor.close()
    conn.close()
    return data

pd.DataFrame(fetch_rows(100, 0), columns=cols)

Unnamed: 0,showname,first_airing,imdb,lang,description
0,'Allo 'Allo!,1982-12-30,tt0086659,English,"<p>In this spoof of World War II, René Artois ..."
1,'Orrible,2001-09-10,tt0299233,English,<p>Paul Clark is a cab driver and wannabe smal...
2,'Run Sbit,2016-04-01,,Welsh,<p>Satirical comedy series in a fly on the wal...
3,'Til Death Do Us Part,2007-03-19,,English,<p><b>'Til Death Do Us Part</b> is murder-myst...
4,'Til Death Do Us Part,2019-07-09,tt10553838,English,<p><b>'Til Death Do Us Part</b> follows lovers...
...,...,...,...,...,...
95,A Sister's All You Need.,2017-10-08,tt9731082,Japanese,<p>This is the story about the daily life of a...
96,A Small Light,2023-05-01,tt17921714,English,<p><b>A Small Light</b> follows twenty-somethi...
97,A Soldier's Heart,2020-01-20,tt12132812,Tagalog,<p>The story of seven individuals who chose th...
98,A Special Meal of the Weirdo 'Nara',2017-05-24,tt6962506,Korean,"<p>When you feel low, you must go to the meat ..."


Prepare Labeling Agent

In [21]:
# Initialize a ChatOpenAI model
llm = ChatOpenAI(
    model="gpt-4o-mini",
    api_key=os.getenv('OPENAI_KEY'),
    # model="deepseek-r1-distill-llama-8b",
    # openai_api_base='http://127.0.0.1:1234/v1',
)

Tools for extracting metadata

In [22]:
def generate_prompt(values: list[str]):
    with open(f'./prompts/movie_metadata_extraction.txt', 'r') as file:
        data = file.read().rstrip()
        index=0
            
        while(data.find(f"!<INPUT_{index}>") > 0):
            data = data.replace(f"!<INPUT_{index}>", values[index] if len(values) > index is not None else '')
            index+=1
        return data

    return None;

Define job to process data

In [23]:
def process_job(limit: int, offset: int) -> list[dict]:
    results = []
    completed = 0
    data = fetch_rows(limit, offset)
    total_tokens = 0
    total_cached_tokens = 0
    total_costs = 0
    for row in data:
        if(row[4] is not None and len(row[4]) > 0):
            with get_openai_callback() as cb:  
                prompt = generate_prompt([row[4]])
                output = llm.invoke([HumanMessage(content=prompt)])
                total_tokens+=cb.total_tokens
                total_costs+=cb.total_cost
                total_cached_tokens+=cb.prompt_tokens_cached
                json_str = re.findall(r"```json(.*?)```", output.text(),re.DOTALL)
                if(len(json_str) > 0):
                    try:
                        structured_data = json.loads(json_str[0])
                        conn = sqlite3.connect("movie_db.sqlite")
                        cursor = conn.cursor()
                        showname:str = row[0].replace("'", "\\'")
                        cursor.execute(f'update tvmaze set metadata = \'{json.dumps(structured_data)}\' where showname = \'{showname}\'')
                        conn.commit()
                        cursor.close()
                        conn.close()
                        results.append(structured_data)
                    except Exception  as e:
                        # print(row[0])
                        # print(e)
                        pass
                completed+=1
                clear_output(wait=True)
                print(f"{completed}/{limit} done ({round(completed/limit, 2) * 100}%)")
                print(f"total token use: {total_tokens}")
                print(f"total cost: ${total_costs}")
                print('-------------')
                print(json_str)
            
    return results;

Process the data

In [None]:
results = process_job(0,0)

1682/10000 done (17.0%)
total token use: 446990
total cost: $0.11899559999999992
-------------
['\n{\n  "locations": ["British isles"],\n  "characters": ["Dan Snow"],\n  "time period": ["11th-century"],\n  "events": ["Norman invasion"],\n  "sentimental": ["entertaining", "educational"],\n  "genre": ["documentary"]\n}\n']


KeyboardInterrupt: 

Define tools for vector embeddings

In [17]:
client = OpenAI(api_key="fake", base_url='http://127.0.0.1:1234/v1')
chroma_client = chromadb.PersistentClient(path="./chroma_db")  
collection = chroma_client.get_or_create_collection(name="movies")

def get_embedding(text):
    response = client.embeddings.create(model="text-embedding-nomic-embed-text-v1.5", input=text)
    return response.data[0].embedding

def db_to_embeddings(limit: int, offset: int):
    cols = ['showname','first_airing','imdb','lang','description', 'metadata']
    conn = sqlite3.connect("movie_db.sqlite")
    cursor = conn.cursor()
    executor = cursor.execute(f'select {",".join(cols)} from tvmaze where metadata is not null and processed is null order by showname asc limit {limit} offset {offset}')
    rows = executor.fetchall()
    completed = 0

    for row in rows:
        try:
            title = row[0]
            description=  row[4]
            id = f"{title}-{row[2]}"
            text = f"title: {title}, description: {description}, " 
            language = row[3] 

            metadata : dict = json.loads(row[5])
            parsed_metadata: dict = {}
            
            for key in metadata.keys():
                value = metadata[key]
                if(len(value) == 0):
                    continue
                if(type(metadata[key]) is list):
                    text += f"{key}: {','.join(value)}, "
                    for x in value:
                        x_str: str=x
                        val_key = f"{key}_{x_str.lower().replace(' ','_')}"
                        parsed_metadata[val_key] = True
                if(type(metadata[key]) is int or type(value) is str):
                    text += f"{key}: {value}, "
                    parsed_metadata[key] = value

            embedding = get_embedding(text)
            parsed_metadata["title"] = title if title is not None else ''
            parsed_metadata["language"] = language if language is not None else ''
            parsed_metadata["aired"] = row[1] if row[1] is not None else ''
            # print(parsed_metadata)
            # print(text)
            collection.add(
                ids=[id],
                documents=[text],
                embeddings=[embedding],
                metadatas=[parsed_metadata]
            )
            
            cursor = conn.cursor()
            cursor.execute(f'update tvmaze set processed = true where showname = \'{title}\'')
            conn.commit()
            completed+=1
        except Exception as e:
            print(e)
            pass
        clear_output(wait=True)
        print(f"{completed}/{limit} done ({round(completed/limit, 2) * 100}%)")
    cursor.close()
    conn.close()
      

Get word embeddings

In [27]:
db_to_embeddings(10000,0)

0/10000 done (0.0%)


Testing query

In [18]:
query_text = "japanese romantic"
query_embedding = get_embedding(query_text)

results = collection.query(
    query_embeddings=[query_embedding],
    # where={
    #     "genre": {'$eq': 'romantic'}
    # },
    n_results=5
)

for movie in results["documents"][0]:
    print(f"{movie}")

99 Years of Love - Japanese Americans <p>The story follows a family of Japanese immigrants who crossed over to America 99 years ago. Kusanagi plays both the young Hiramatsu Chokichi (later taken over by Nakai) and his son, Ichiro. When the war breaks out the Japanese immigrants face racism and segregation. Ichiro pledges his alliance to America and gets sent to Europe, second son Jiro stays back with Ichiro's beloved Shinobu (who he has a crush on) and tries to protect his parents' farm. Their two sisters Shizu and Sachie are sent back to Japan and have to experience the horrors of war, one in Hiroshima and the other in Okinawa.</p>
#Remolove: Futsuu no Koi wa Jado <p>Mimi Ozakura is an industrial physician who has never been in a relationship. To Mimi, protecting the health of the employees is the most important thing and romance comes after that. Mimi later gets to know someone through social media. Although Mimi does not know the identity of the person, he is actually someone who is