In [11]:
import logging

import pandas as pd

from daily_pipeline import assign_ids_cinemas, assign_ids_screenings, extract_unique_movies
from daily_pipeline import add_cineville_tag
from scrapers.filmladder import FilmladderScraper
from scrapers.letterboxd import LetterboxdScraper
from utils.helpers import normalize_and_hash
# from db.database import save_movies, save_screenings, save_cinemas
# from external.imdb_api import fetch_imdb_metadata
# Load IPython's autoreload extension and set mode 2, which reloads imported
# modules before each cell runs so edits to the project's .py files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Configure the root logger to emit records at INFO level and above.
logging.basicConfig(level=logging.INFO)

In [2]:
logging.info("Starting daily data pipeline...")

# Step 1: scrape Filmladder. run() is unpacked into a screenings frame and a
# cinemas frame.
filmladder = FilmladderScraper()
screenings_df, cinemas_df = filmladder.run()

# Step 2: attach IDs to both frames; the cinemas frame is additionally passed
# through add_cineville_tag once its IDs are assigned.
screenings_df = screenings_df.pipe(assign_ids_screenings)
cinemas_df = add_cineville_tag(cinemas_df.pipe(assign_ids_cinemas))

# Step 3: build the table of unique movies from the screenings.
movies_df = screenings_df.pipe(extract_unique_movies)

# TODO: steps below are not wired up yet (metadata enrichment + persistence).
# NOTE(review): the commented-out import at the top of the notebook is named
# fetch_imdb_metadata, while the call below uses fetch_metadata -- reconcile
# the two before enabling.
# movies_df = fetch_metadata(movies_df)
#
# Step 4: store data in the database
# save_movies(movies_df)
# save_screenings(screenings_df)
# save_cinemas(cinemas_df)
#
# logging.info("Daily data pipeline completed.")

INFO:root:Starting daily data pipeline...
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Driver [/Users/ardsnijders/.wdm/drivers/chromedriver/mac64/133.0.6943.126/chromedriver-mac-x64/chromedriver] found in cache


In [3]:
# Preview the first five cinemas.
# NOTE(review): in the rendered output the `website` values begin with the
# base URL twice ("https://www.filmladder.nlhttps://www.filmladde...") and
# `address` is blank for every row shown -- looks like an upstream scraper
# issue; verify before persisting this frame.
cinemas_df.head(n=5)

Unnamed: 0,name,location,address,website,cinema_id
0,bijlmerbios,Amsterdam,,https://www.filmladder.nlhttps://www.filmladde...,cd51b85716
1,cinecenter,Amsterdam,,https://www.filmladder.nlhttps://www.filmladde...,0f979e20b9
2,cinema de vlugt,Amsterdam,,https://www.filmladder.nlhttps://www.filmladde...,921dfa19dd
3,de balie,Amsterdam,,https://www.filmladder.nlhttps://www.filmladde...,72491188f7
4,de uitkijk,Amsterdam,,https://www.filmladder.nlhttps://www.filmladde...,e8a281e73a


In [4]:
# Preview up to five random screenings. The fixed random_state makes the
# preview identical across re-runs, and capping n at the frame length avoids
# the ValueError that sample() raises when fewer than five rows were scraped.
screenings_df.sample(n=min(5, len(screenings_df)), random_state=0)

Unnamed: 0,cinema_name,title,year,show_datetime,ticket_url,rating,poster_url,movie_id,cinema_id
2847,the movies,the brutalist,2024,2025-03-03T14:15:00+01:00,https://www.filmladder.nl/kaartjes/1208783746,7.7★,https://assets.filmladder.nl/uploads/depot_ima...,6e064a5a9d,f1713e87bf
2601,rialto vu,maria,2024,2025-02-26T15:45:00+01:00,https://www.filmladder.nl/kaartjes/1208629617,6.4★,https://assets.filmladder.nl/uploads/depot_ima...,923a1081d3,e0c1757e61
2463,pathé tuschinski,the brutalist 2d,2024,2025-03-03T19:30:00+01:00,https://www.filmladder.nl/kaartjes/1208820270,7.7★,https://assets.filmladder.nl/uploads/depot_ima...,fb2f809034,d1556dc097
577,filmhallen,maria,2024,2025-02-26T13:00:00+01:00,https://www.filmladder.nl/kaartjes/1208562401,6.4★,https://assets.filmladder.nl/uploads/depot_ima...,923a1081d3,5269ad1c5d
820,kriterion,looney tunes: the day the earth blew up (ov),2024,2025-02-27T16:00:00+01:00,https://www.filmladder.nl/kaartjes/1208785801,6.7★,https://assets.filmladder.nl/uploads/filmladde...,b2a0458b81,8a70f32178


In [5]:
# Preview up to five random unique movies. The fixed random_state makes the
# preview identical across re-runs, and capping n at the frame length avoids
# the ValueError that sample() raises when fewer than five rows are present.
movies_df.sample(n=min(5, len(movies_df)), random_state=0)

Unnamed: 0,movie_id,title,year
35,1bf38e41a9,becoming led zeppelin,2025
141,fede234231,peter pan's neverland nightmare,2024
14,aa43df76dd,queer,2024
7,d07fcf4d8f,en fanfare,2024
80,2fce1f09d1,julie zwijgt,2024


In [6]:
# Scrape Letterboxd into a watchlist frame. LetterboxdScraper is imported in
# the notebook's top import cell rather than mid-notebook, so all dependencies
# are visible in one place and the notebook survives Restart & Run All.
# The matching cell further down reads the `title` and `year` columns of the
# result.
lb_scraper = LetterboxdScraper()
watchlist_df = lb_scraper.run()


INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Driver [/Users/ardsnijders/.wdm/drivers/chromedriver/mac64/133.0.6943.126/chromedriver-mac-x64/chromedriver] found in cache


In [20]:
def _rating_to_number(ratings: pd.Series) -> pd.Series:
    """Convert rating labels such as "7.7★" into floats for sorting.

    Values that cannot be parsed (missing or malformed ratings) become NaN,
    which sort_values places last by default.
    """
    cleaned = ratings.astype(str).str.replace("★", "", regex=False).str.strip()
    return pd.to_numeric(cleaned, errors="coerce")


# Give every watchlist entry a movie_id computed from its title and year with
# normalize_and_hash, so it can be compared with screenings_df["movie_id"].
# NOTE(review): presumably this is the same hashing used when the screening
# IDs were assigned -- confirm in daily_pipeline.
watchlist_df["movie_id"] = watchlist_df.apply(
    lambda row: normalize_and_hash(row["title"], row["year"]), axis=1
)

# Keep only the screenings whose movie is on the watchlist.
on_watchlist = screenings_df["movie_id"].isin(watchlist_df["movie_id"])
filtered_screenings = screenings_df[on_watchlist]

# `rating` holds strings like "8.1★"; sorting them directly is lexicographic
# (so "10.0★" would rank below "7.6★"). Sort on the numeric value instead.
filtered_screenings.sort_values(
    by="rating", key=_rating_to_number, ascending=False
).head(5)

Unnamed: 0,cinema_name,title,year,show_datetime,ticket_url,rating,poster_url,movie_id,cinema_id
673,filmhallen,the deer hunter,1978,2025-02-27T20:30:00+01:00,https://www.filmladder.nl/kaartjes/1207655603,8.1★,https://assets.filmladder.nl/uploads/imdb_post...,15d50b88bb,5269ad1c5d
359,eye,winter sleep,2014,2025-03-02T14:45:00+01:00,https://www.filmladder.nl/kaartjes/1208683385,8.0★,https://assets.filmladder.nl/uploads/depot_ima...,8a872e068f,9857853deb
5,cinecenter,a complete unknown,2024,2025-02-25T16:10:00+01:00,https://www.filmladder.nl/kaartjes/1208540078,7.6★,https://assets.filmladder.nl/uploads/depot_ima...,e7bc07ba2a,0f979e20b9
1744,pathé city,a complete unknown,2024,2025-02-28T19:40:00+01:00,https://www.filmladder.nl/kaartjes/1208819960,7.6★,https://assets.filmladder.nl/uploads/depot_ima...,e7bc07ba2a,bb2d3a934c
1750,pathé city,a complete unknown,2024,2025-03-03T17:00:00+01:00,https://www.filmladder.nl/kaartjes/1208819966,7.6★,https://assets.filmladder.nl/uploads/depot_ima...,e7bc07ba2a,bb2d3a934c
