In [33]:
import logging
import pandas as pd
from scrapers.filmladder import FilmladderScraper
from daily_pipeline import assign_ids_cinemas, assign_ids_screenings, extract_unique_movies
from daily_pipeline import add_cineville_tag, add_imdb_links
# from db.database import save_movies, save_screenings, save_cinemas
# from external.imdb_api import fetch_imdb_metadata
%load_ext autoreload
%autoreload 2

logging.basicConfig(level=logging.INFO)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
logging.info("Starting daily data pipeline...")

# 1️⃣ Scrape Filmladder (returns two DataFrames)
filmladder = FilmladderScraper()
screenings_df, cinemas_df = filmladder.run()

# 2️⃣ Assign IDs
screenings_df = assign_ids_screenings(screenings_df)
cinemas_df = add_cineville_tag(assign_ids_cinemas(cinemas_df))

# 3️⃣ Extract and fetch IMDb metadata for unique movies
movies_df = extract_unique_movies(screenings_df)

# Add intermediate step: only fetch imdb links / metadata for films that do not yet have one
movies_df = add_imdb_links(movies_df)

# # 4️⃣ Store data in the database
# save_movies(movies_df)
# save_screenings(screenings_df)
# save_cinemas(cinemas_df)

# logging.info("Daily data pipeline completed.")

INFO:root:Starting daily data pipeline...


In [None]:
# Example Usage:
# df = pd.read_csv("movies.csv")  # Assuming a CSV with a 'movie_link' column
from scrapers.imdb import IMDBScraper
scraper = IMDBScraper(headless=True)
result_df = scraper.run(movies_df['movie_link'].tolist())

INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Driver [/Users/ardsnijders/.wdm/drivers/chromedriver/mac64/133.0.6943.127/chromedriver-mac-x64/chromedriver] found in cache
Scraping Progress: 100%|██████████| 160/160 [05:02<00:00,  1.89s/it]


                                            movie_link  \
0    https://www.filmladder.nl/film/favoriten-2024/...   
1    https://www.filmladder.nl/film/merckx-2025/pop...   
2    https://www.filmladder.nl/film/a-complete-unkn...   
3    https://www.filmladder.nl/film/alpha-2024/popu...   
4    https://www.filmladder.nl/film/anora-2024/popu...   
..                                                 ...   
155  https://www.filmladder.nl/film/gallipoli-1981/...   
156  https://www.filmladder.nl/film/marcel-the-shel...   
157  https://www.filmladder.nl/film/memoir-of-a-sna...   
158  https://www.filmladder.nl/film/platoon-1986/po...   
159  https://www.filmladder.nl/film/the-seed-of-the...   

                                 imdb_link  
0    https://www.imdb.com/title/tt31015543  
1    https://www.imdb.com/title/tt35488509  
2    https://www.imdb.com/title/tt11563598  
3    https://www.imdb.com/title/tt28797114  
4    https://www.imdb.com/title/tt28607951  
..                               

In [32]:
test_df = movies_df.sample(5).copy().drop(columns='imdb_link')

from daily_pipeline import add_imdb_links
test_df = add_imdb_links(test_df)
test_df

INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Driver [/Users/ardsnijders/.wdm/drivers/chromedriver/mac64/133.0.6943.127/chromedriver-mac-x64/chromedriver] found in cache
Scraping Progress: 100%|██████████| 5/5 [00:09<00:00,  2.00s/it]


Unnamed: 0,movie_id,title,year,movie_link,imdb_link
0,c840317b2b,in the mood for love,2000,https://www.filmladder.nl/film/in-the-mood-for...,https://www.imdb.com/title/tt0118694
1,82ed88983a,no other land,2024,https://www.filmladder.nl/film/no-other-land-2...,https://www.imdb.com/title/tt30953759
2,eeef3a9e14,la haine,1995,https://www.filmladder.nl/film/la-haine-1995/p...,https://www.imdb.com/title/tt0113247
3,6551ca291e,love fail repeat,2024,https://www.filmladder.nl/film/love-fail-repea...,https://www.imdb.com/title/tt31487517
4,9ddc608efd,the pianist,2002,https://www.filmladder.nl/film/the-pianist-200...,https://www.imdb.com/title/tt0253474


In [8]:
from scrapers.letterboxd import LetterboxdScraper
lb_scraper = LetterboxdScraper()
watchlist_df = lb_scraper.run()


INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Driver [/Users/ardsnijders/.wdm/drivers/chromedriver/mac64/133.0.6943.126/chromedriver-mac-x64/chromedriver] found in cache


In [9]:
from utils.helpers import normalize_and_hash
watchlist_df["movie_id"] = watchlist_df.apply(lambda row: normalize_and_hash(row["title"], row["year"]), axis=1)
filtered_screenings = screenings_df[screenings_df['movie_id'].isin(watchlist_df['movie_id'])]
filtered_screenings.sort_values(by='rating', ascending=False).head(5)

Unnamed: 0,cinema_name,title,year,show_datetime,ticket_url,rating,poster_url,movie_id,cinema_id
673,filmhallen,the deer hunter,1978,2025-02-27T20:30:00+01:00,https://www.filmladder.nl/kaartjes/1207655603,8.1★,https://assets.filmladder.nl/uploads/imdb_post...,15d50b88bb,5269ad1c5d
359,eye,winter sleep,2014,2025-03-02T14:45:00+01:00,https://www.filmladder.nl/kaartjes/1208683385,8.0★,https://assets.filmladder.nl/uploads/depot_ima...,8a872e068f,9857853deb
5,cinecenter,a complete unknown,2024,2025-02-25T16:10:00+01:00,https://www.filmladder.nl/kaartjes/1208540078,7.6★,https://assets.filmladder.nl/uploads/depot_ima...,e7bc07ba2a,0f979e20b9
1744,pathé city,a complete unknown,2024,2025-02-28T19:40:00+01:00,https://www.filmladder.nl/kaartjes/1208819960,7.6★,https://assets.filmladder.nl/uploads/depot_ima...,e7bc07ba2a,bb2d3a934c
1750,pathé city,a complete unknown,2024,2025-03-03T17:00:00+01:00,https://www.filmladder.nl/kaartjes/1208819966,7.6★,https://assets.filmladder.nl/uploads/depot_ima...,e7bc07ba2a,bb2d3a934c


In [11]:
movies_df

Unnamed: 0,movie_id,title,year
0,378909531a,favoriten,2024
1,430904b120,merckx,2025
2,e7bc07ba2a,a complete unknown,2024
3,ff1d83f7ce,alpha,2024
4,172e369640,anora,2024
...,...,...,...
155,ff1c2c304c,gallipoli,1981
156,74c1d8b113,marcel the shell with shoes on,2021
157,3927581269,memoir of a snail,2024
158,42e553e73d,platoon,1986


In [52]:
path = 'external_data/imdb_data/title.basics.tsv.gz'
imdb_data = pd.read_csv(path, sep='\t')

  imdb_data = pd.read_csv(path, sep='\t')


In [54]:
imdb_data = imdb_data[imdb_data["titleType"] == "movie"].copy()

In [60]:
import pandas as pd
from thefuzz import process

def add_imdb_id_fuzzy(movies_df, imdb_movies_df, threshold=90):
    """
    Matches movies in movies_df with IMDb movies using exact and fuzzy matching.

    Args:
        movies_df (pd.DataFrame): DataFrame with movie titles and years.
        imdb_movies_df (pd.DataFrame): IMDb DataFrame with tconst, primaryTitle, and originalTitle.
        threshold (int): Similarity threshold for fuzzy matching (default: 90).

    Returns:
        pd.DataFrame: movies_df with an added 'imdb_id' column.
    """
    movies_df = movies_df.copy()

    # Prepare IMDb lookup keys (lowercased, dashes instead of spaces)
    imdb_movies_df["primary_key"] = imdb_movies_df["primaryTitle"].str.lower().str.replace(" ", "-", regex=True) + "-" + imdb_movies_df["startYear"].astype(str)
    imdb_movies_df["original_key"] = imdb_movies_df["originalTitle"].str.lower().str.replace(" ", "-", regex=True) + "-" + imdb_movies_df["startYear"].astype(str)

    # Exact match dictionaries
    primary_lookup = dict(zip(imdb_movies_df["primary_key"], imdb_movies_df["tconst"]))
    original_lookup = dict(zip(imdb_movies_df["original_key"], imdb_movies_df["tconst"]))

    # Fuzzy matching lists
    primary_titles = imdb_movies_df["primary_key"].tolist()
    original_titles = imdb_movies_df["original_key"].tolist()

    def find_imdb_id(row):
        """Finds the best matching IMDb ID using exact and fuzzy matching."""
        title_key = row["title_year"]

        # Step 1: Try exact match
        imdb_id = primary_lookup.get(title_key) or original_lookup.get(title_key)
        if imdb_id:
            return imdb_id

        # Step 2: Try fuzzy matching on primaryTitle
        best_match, score = process.extractOne(title_key, primary_titles)
        if score >= threshold:
            return primary_lookup.get(best_match)

        # Step 3: Try fuzzy matching on originalTitle
        best_match, score = process.extractOne(title_key, original_titles)
        if score >= threshold:
            return original_lookup.get(best_match)

        return None  # No match found

    # Apply the matching function
    movies_df["imdb_id"] = movies_df.apply(find_imdb_id, axis=1)

    return movies_df


# Example usage
movies_with_imdb = add_imdb_id_fuzzy(movies_df, imdb_data)

KeyboardInterrupt: 

In [58]:
len(movies_with_imdb[movies_with_imdb.imdb_id.isna()])

43

In [30]:
# Filter only movies
imdb_data = imdb_data[imdb_data["titleType"] == "movie"].copy()

# Create name_year column
# Create name_year column: lowercase + replace spaces with dashes + append startYear
imdb_data["title_year"] = (
    imdb_data["primaryTitle"]
    .str.lower()
    .str.replace(r"\s+", "-", regex=True)  # Replace spaces with dashes
    + "-" + imdb_data["startYear"]
)

movies_df["title_year"] = (
    movies_df["title"]
    .str.lower()
    .str.replace(r"\s+", "-", regex=True)  # Replace spaces with dashes
    + "-" + movies_df["year"]
)

In [38]:
import pandas as pd

def add_imdb_id(movies_df, imdb_movies_df):
    """
    Matches movies in movies_df with IMDb movies using the title_year key
    and adds the corresponding IMDb ID (tconst).
    
    Args:
        movies_df (pd.DataFrame): DataFrame with movie titles and years.
        imdb_movies_df (pd.DataFrame): IMDb DataFrame with tconst and name_year.

    Returns:
        pd.DataFrame: movies_df with an added 'imdb_id' column.
    """
    # Ensure both DataFrames have 'title_year' and 'name_year' as lowercase + dash-formatted
    movies_df = movies_df.copy()
    imdb_movies_df = imdb_movies_df.copy()

    # Rename IMDb columns for easier merging
    imdb_movies_df = imdb_movies_df.rename(columns={"tconst": "imdb_id"})

    # Merge on title_year (name_year from IMDb)
    merged_df = movies_df.merge(imdb_movies_df[["title_year", "imdb_id"]], on="title_year", how="left")

    return merged_df

# Example usage
movies_with_imdb = add_imdb_id(movies_df, imdb_data)

# Save the updated DataFrame
# movies_with_imdb.to_csv("movies_with_imdb.csv", index=False)
