In [2]:
import logging
import pandas as pd
from scrapers.filmladder import FilmladderScraper
from daily_pipeline import assign_ids_cinemas, assign_ids_screenings, extract_unique_movies
# from db.database import save_movies, save_screenings, save_cinemas
# from external.imdb_api import fetch_imdb_metadata

# Configure the root logger at INFO level so the pipeline's logging.info(...)
# messages are shown; the cell outputs below also show INFO records from the
# "WDM" logger (chromedriver lookup) surfacing through this configuration.
logging.basicConfig(level=logging.INFO)

In [3]:
logging.info("Starting daily data pipeline...")

# Step 1: run the Filmladder scraper. run() is unpacked into two objects:
# the screenings table and the cinemas table (both displayed in later cells).
filmladder = FilmladderScraper()
screenings_df, cinemas_df = filmladder.run()

# Step 2: assign IDs. The tables displayed later carry `movie_id`/`cinema_id`
# columns on screenings and a `cinema_id` column on cinemas.
# NOTE(review): both names are rebound to the helpers' return values, so the
# raw scraper output is no longer reachable after this point.
screenings_df = assign_ids_screenings(screenings_df)
cinemas_df = assign_ids_cinemas(cinemas_df)

# Step 3: extract the unique movies from the screenings table.
# Only extraction happens here; the IMDb metadata fetch below is disabled.
movies_df = extract_unique_movies(screenings_df)

# TODO: re-enable the metadata fetch once it is available.
# NOTE(review): the disabled import at the top of the notebook names
# `fetch_imdb_metadata`, while this call uses `fetch_metadata` - reconcile the
# two names before uncommenting.
# movies_df = fetch_metadata(movies_df)

# # Step 4 (disabled): store data in the database. Requires the commented-out
# # `db.database` import at the top of the notebook.
# save_movies(movies_df)
# save_screenings(screenings_df)
# save_cinemas(cinemas_df)

# logging.info("Daily data pipeline completed.")

INFO:root:Starting daily data pipeline...
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Driver [/Users/ardsnijders/.wdm/drivers/chromedriver/mac64/133.0.6943.126/chromedriver-mac-x64/chromedriver] found in cache


In [10]:
# Preview the first five cinemas (head() defaults to n=5).
# NOTE(review): in the recorded output every `website` value starts with
# "https://www.filmladder.nlhttps://www.filmladde..." - the base URL looks to be
# prepended to an already-absolute link. This presumably originates in
# FilmladderScraper; confirm and fix it there.
cinemas_df.head()

Unnamed: 0,name,location,address,website,cinema_id
0,bijlmerbios,Amsterdam,,https://www.filmladder.nlhttps://www.filmladde...,cd51b85716
1,cinecenter,Amsterdam,,https://www.filmladder.nlhttps://www.filmladde...,0f979e20b9
2,cinema de vlugt,Amsterdam,,https://www.filmladder.nlhttps://www.filmladde...,921dfa19dd
3,de balie,Amsterdam,,https://www.filmladder.nlhttps://www.filmladde...,72491188f7
4,de uitkijk,Amsterdam,,https://www.filmladder.nlhttps://www.filmladde...,e8a281e73a


In [12]:
# Spot-check up to five random screenings.
# - random_state pins the draw so the same rows are shown for the same data
#   (an unseeded sample changes on every run).
# - n is clamped to the frame length because sample(5) raises ValueError when
#   fewer than five rows were scraped.
screenings_df.sample(n=min(5, len(screenings_df)), random_state=42)

Unnamed: 0,cinema_name,title,year,show_datetime,ticket_url,rating,poster_url,movie_id,cinema_id
443,filmhallen,a real pain,2024,2025-02-28T21:30:00+01:00,https://www.filmladder.nl/kaartjes/1208783577,7.1★,https://assets.filmladder.nl/uploads/depot_ima...,39dc3501fc,5269ad1c5d
2422,pathé tuschinski,maria,2024,2025-03-02T15:20:00+01:00,https://www.filmladder.nl/kaartjes/1208820318,6.4★,https://assets.filmladder.nl/uploads/depot_ima...,923a1081d3,d1556dc097
1394,pathé arena,captain america: brave new world 3d,2025,2025-03-02T21:10:00+01:00,https://www.filmladder.nl/kaartjes/1208822075,6.1★,https://assets.filmladder.nl/uploads/depot_ima...,fd8c69f8be,a8798dbb51
1924,pathé city,we live in time,2024,2025-03-01T18:20:00+01:00,https://www.filmladder.nl/kaartjes/1208819941,7.0★,https://assets.filmladder.nl/uploads/depot_ima...,1f438778fd,bb2d3a934c
1048,pathé amsterdam noord,captain america: brave new world 2d,2025,2025-02-25T19:45:00+01:00,https://www.filmladder.nl/kaartjes/1208828158,6.1★,https://assets.filmladder.nl/uploads/depot_ima...,2d20728f3d,bd8a87e0e2


In [13]:
# Spot-check up to five random unique movies.
# - random_state pins the draw so the same rows are shown for the same data
#   (an unseeded sample changes on every run).
# - n is clamped to the frame length because sample(5) raises ValueError when
#   the frame holds fewer than five movies.
movies_df.sample(n=min(5, len(movies_df)), random_state=42)

Unnamed: 0,movie_id,title,year
52,ae3c1f373d,all we imagine as light,2024
37,ff336d10d5,ernest cole: lost and found,2024
86,2e499709ef,juffrouw pots,2024
81,8cfdad6838,the best years of our lives,1946
49,cf70ba044b,unknown pleasures,2002


In [5]:
# Run the Letterboxd scraper and keep its result as `watchlist_df`; the next
# cell reads its "title" and "year" columns to derive a movie_id per entry.
# The recorded output shows a chromedriver lookup (WDM log lines) when this runs.
# NOTE(review): this import sits mid-notebook, and the cell's execution count
# (In [5]) is lower than the display cells above it (In [10]-In [13]), so the
# notebook was executed out of order - confirm it survives Restart & Run All.
from scrapers.letterboxd import LetterboxdScraper
lb_scraper = LetterboxdScraper()
watchlist_df = lb_scraper.run()


INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Driver [/Users/ardsnijders/.wdm/drivers/chromedriver/mac64/133.0.6943.126/chromedriver-mac-x64/chromedriver] found in cache


In [18]:
from utils.helpers import normalize_and_hash

# Derive a movie_id for every watchlist entry from its (title, year) pair.
# Presumably this is the same hashing that produced screenings_df["movie_id"],
# since the two columns are matched below - TODO confirm against daily_pipeline.
# A zipped comprehension replaces the row-wise apply: it avoids building a
# Series per row and also works when the watchlist is empty.
watchlist_df["movie_id"] = [
    normalize_and_hash(title, year)
    for title, year in zip(watchlist_df["title"], watchlist_df["year"])
]

# Keep only the screenings whose movie appears on the watchlist.
filtered_screenings = screenings_df[
    screenings_df["movie_id"].isin(watchlist_df["movie_id"])
]

# Show the matches, best rated first. `rating` holds strings such as "8.1★",
# so a plain sort is lexicographic ("10.0★" would rank below "7.1★"). The sort
# key strips the star and compares numerically; values that cannot be parsed
# become NaN and are placed last. The displayed column itself is unchanged.
filtered_screenings.sort_values(
    by="rating",
    ascending=False,
    key=lambda ratings: pd.to_numeric(
        ratings.astype(str).str.rstrip("★"), errors="coerce"
    ),
)

Unnamed: 0,cinema_name,title,year,show_datetime,ticket_url,rating,poster_url,movie_id,cinema_id
673,filmhallen,the deer hunter,1978,2025-02-27T20:30:00+01:00,https://www.filmladder.nl/kaartjes/1207655603,8.1★,https://assets.filmladder.nl/uploads/imdb_post...,15d50b88bb,5269ad1c5d
359,eye,winter sleep,2014,2025-03-02T14:45:00+01:00,https://www.filmladder.nl/kaartjes/1208683385,8.0★,https://assets.filmladder.nl/uploads/depot_ima...,8a872e068f,9857853deb
5,cinecenter,a complete unknown,2024,2025-02-25T16:10:00+01:00,https://www.filmladder.nl/kaartjes/1208540078,7.6★,https://assets.filmladder.nl/uploads/depot_ima...,e7bc07ba2a,0f979e20b9
1744,pathé city,a complete unknown,2024,2025-02-28T19:40:00+01:00,https://www.filmladder.nl/kaartjes/1208819960,7.6★,https://assets.filmladder.nl/uploads/depot_ima...,e7bc07ba2a,bb2d3a934c
1750,pathé city,a complete unknown,2024,2025-03-03T17:00:00+01:00,https://www.filmladder.nl/kaartjes/1208819966,7.6★,https://assets.filmladder.nl/uploads/depot_ima...,e7bc07ba2a,bb2d3a934c
...,...,...,...,...,...,...,...,...,...
964,lab111,wild at heart,1990,2025-02-26T10:45:00+01:00,https://www.filmladder.nl/kaartjes/1208525025,7.2★,https://assets.filmladder.nl/uploads/imdb_post...,5a01f4e4ca,2d9401c277
2640,rialto vu,wild at heart,1990,2025-02-28T18:30:00+01:00,https://www.filmladder.nl/kaartjes/1208786251,7.2★,https://assets.filmladder.nl/uploads/imdb_post...,5a01f4e4ca,e0c1757e61
2641,rialto vu,wild at heart,1990,2025-03-02T17:15:00+01:00,https://www.filmladder.nl/kaartjes/1208787987,7.2★,https://assets.filmladder.nl/uploads/imdb_post...,5a01f4e4ca,e0c1757e61
815,kriterion,"goodbye, dragon inn",2003,2025-03-01T16:30:00+01:00,https://www.filmladder.nl/kaartjes/1208633261,7.1★,https://assets.filmladder.nl/uploads/imdb_post...,6eed39a5cf,8a70f32178
