In [2]:
import logging
import pandas as pd
from scrapers.filmladder import FilmladderScraper
from daily_pipeline import assign_ids_cinemas, assign_ids_screenings, extract_unique_movies
# from db.database import save_movies, save_screenings, save_cinemas
# from external.imdb_api import fetch_imdb_metadata

# Configure the root logger at INFO so the logging.info() progress messages in this notebook are shown.
# Side effect visible in the cell outputs: INFO records from other loggers (e.g. "WDM") are printed as well.
logging.basicConfig(level=logging.INFO)

In [3]:
# Daily pipeline, run step by step: scrape -> assign IDs -> extract unique movies.
# The metadata-fetch and database-store steps further down are currently commented out.
logging.info("Starting daily data pipeline...")

# 1️⃣ Scrape Filmladder (returns two DataFrames: screenings and cinemas, unpacked in that order)
filmladder = FilmladderScraper()
screenings_df, cinemas_df = filmladder.run()

# 2️⃣ Assign IDs (each frame is rebound to the value returned by its helper)
screenings_df = assign_ids_screenings(screenings_df)
cinemas_df = assign_ids_cinemas(cinemas_df)

# 3️⃣ Extract the unique movies from the screenings (the IMDb metadata fetch below is disabled)
movies_df = extract_unique_movies(screenings_df)

# NOTE(review): the commented-out import at the top of the notebook names `fetch_imdb_metadata`,
# while the call below uses `fetch_metadata` — reconcile the two before re-enabling this step.
# movies_df = fetch_metadata(movies_df)

# # 4️⃣ Store data in the database
# save_movies(movies_df)
# save_screenings(screenings_df)
# save_cinemas(cinemas_df)

# logging.info("Daily data pipeline completed.")

INFO:root:Starting daily data pipeline...
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Driver [/Users/ardsnijders/.wdm/drivers/chromedriver/mac64/133.0.6943.126/chromedriver-mac-x64/chromedriver] found in cache


In [9]:
# Inspect the cinemas table after ID assignment (note the `cinema_id` column in the output).
# NOTE(review): in the recorded output, `website` begins with "https://www.filmladder.nl" twice in a row
# and `address` is empty — it looks like a base URL is being prepended to an already-absolute link;
# verify in the scraper. This cell's execution count (9) is also out of sequence with its neighbours,
# so the output may not reflect a clean top-to-bottom run.
cinemas_df

Unnamed: 0,name,location,address,website,cinema_id
0,bijlmerbios,Amsterdam,,https://www.filmladder.nlhttps://www.filmladde...,cd51b85716
1,cinecenter,Amsterdam,,https://www.filmladder.nlhttps://www.filmladde...,0f979e20b9
2,cinema de vlugt,Amsterdam,,https://www.filmladder.nlhttps://www.filmladde...,921dfa19dd
3,de balie,Amsterdam,,https://www.filmladder.nlhttps://www.filmladde...,72491188f7
4,de uitkijk,Amsterdam,,https://www.filmladder.nlhttps://www.filmladde...,e8a281e73a
5,eye,Amsterdam,,https://www.filmladder.nlhttps://www.filmladde...,9857853deb
6,fc hyena,Amsterdam,,https://www.filmladder.nlhttps://www.filmladde...,e120609169
7,filmhallen,Amsterdam,,https://www.filmladder.nlhttps://www.filmladde...,5269ad1c5d
8,filmhuis cavia,Amsterdam,,https://www.filmladder.nlhttps://www.filmladde...,5fed520d94
9,het ketelhuis,Amsterdam,,https://www.filmladder.nlhttps://www.filmladde...,7b3e5dbf17


In [5]:
# Run the Letterboxd scraper and keep whatever run() returns as `watchlist_df`
# (later cells read its "title" and "year" columns).
# NOTE(review): this import sits mid-notebook; consider moving it into the import cell at the top
# so a fresh "Restart & Run All" surfaces every dependency up front.
from scrapers.letterboxd import LetterboxdScraper
lb_scraper = LetterboxdScraper()
watchlist_df = lb_scraper.run()


INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Driver [/Users/ardsnijders/.wdm/drivers/chromedriver/mac64/133.0.6943.126/chromedriver-mac-x64/chromedriver] found in cache


In [6]:
from utils.helpers import normalize_and_hash

# Give every watchlist entry a `movie_id` computed from its title and year; the
# following cell matches it against `screenings_df['movie_id']`.
#
# Built with a plain comprehension over the two columns rather than a row-wise
# `DataFrame.apply(axis=1)`: on an empty watchlist, apply can hand back a DataFrame
# instead of a Series, which cannot be assigned to a single column. The
# comprehension yields an empty column in that case, and it also avoids
# constructing a Series object for every row.
watchlist_df["movie_id"] = [
    normalize_and_hash(title, year)
    for title, year in zip(watchlist_df["title"], watchlist_df["year"])
]

In [7]:
# Keep only the screenings whose movie_id also appears in the watchlist.
on_watchlist = screenings_df['movie_id'].isin(watchlist_df['movie_id'])
filtered_screenings = screenings_df.loc[on_watchlist]

print(filtered_screenings)


     cinema_name               title  year              show_datetime  \
5     cinecenter  a complete unknown  2024  2025-02-25T16:10:00+01:00   
6     cinecenter  a complete unknown  2024  2025-02-25T18:05:00+01:00   
7     cinecenter  a complete unknown  2024  2025-02-25T20:55:00+01:00   
8     cinecenter  a complete unknown  2024  2025-02-26T13:20:00+01:00   
9     cinecenter  a complete unknown  2024  2025-02-26T16:10:00+01:00   
...          ...                 ...   ...                        ...   
2749  the movies  a complete unknown  2024  2025-03-02T18:15:00+01:00   
2750  the movies  a complete unknown  2024  2025-03-03T12:30:00+01:00   
2751  the movies  a complete unknown  2024  2025-03-03T15:30:00+01:00   
2752  the movies  a complete unknown  2024  2025-03-03T18:30:00+01:00   
2753  the movies  a complete unknown  2024  2025-03-03T21:20:00+01:00   

                                         ticket_url rating  \
5     https://www.filmladder.nl/kaartjes/1208540078   7.6★   