### Pipeline, step by step

In [1]:
import sys
import os

# Put the current working directory on the import path (presumably so the
# project-local `data_pipelines` / `backend` packages resolve — confirm the
# notebook is launched from the project root).
# The membership check keeps this cell idempotent: re-running it no longer
# stacks duplicate entries onto sys.path.
project_root = os.getcwd()
if project_root not in sys.path:
    sys.path.append(project_root)

import logging
import pandas as pd
from data_pipelines.scrapers.filmladder import FilmladderScraper
from data_pipelines.scrapers.imdb import IMDBFetcher, IMDBScraper
from data_pipelines.daily_pipeline import extract_unique_movies, get_new_movies
from data_pipelines.daily_pipeline import process_cinemas, process_screenings, process_enriched_movies
from backend.data_models.save_to_db import save_all_to_db

# Load IPython's autoreload extension and switch it to mode 2, which reloads
# modules before each cell executes so edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Configure the root logger to emit records at INFO level and above.
logging.basicConfig(level=logging.INFO)

In [2]:
# Announce the run before constructing the scraper so this record precedes any
# log lines emitted while the scraper starts up.
logging.info("Starting daily data pipeline...")

# 1️⃣ Scrape Filmladder (returns two DataFrames)
# The pair returned by run() is unpacked into the screenings table and the
# cinemas table, which the processing and save cells below consume.
filmladder = FilmladderScraper()
screenings_df, cinemas_df = filmladder.run()

INFO:root:Starting daily data pipeline...
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:There is no [mac64] chromedriver "134.0.6998.88" for browser google-chrome "134.0.6998" in cache
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:WebDriver version 134.0.6998.88 selected
INFO:WDM:Modern chrome version https://storage.googleapis.com/chrome-for-testing-public/134.0.6998.88/mac-x64/chromedriver-mac-x64.zip
INFO:WDM:About to download new driver from https://storage.googleapis.com/chrome-for-testing-public/134.0.6998.88/mac-x64/chromedriver-mac-x64.zip
INFO:WDM:Driver downloading response is 200
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Driver has been saved in cache [/Users/ardsnijders/.wdm/drivers/chromedriver/mac64/134.0.6998.88]


In [3]:
# 2️⃣ Remove duplicates (imax, 2d, etc), add unique ids
# NOTE(review): both names are rebound to their processed versions, so
# re-running this cell feeds already-processed frames back into the helpers.
# Re-run the scrape cell first if this step needs to be repeated.
screenings_df = process_screenings(screenings_df)
cinemas_df = process_cinemas(cinemas_df)

/Users/ardsnijders/Documents/cineville_scraper/backend


In [4]:
# Derive the movie table from the processed screenings via
# extract_unique_movies, then preview its first five rows (head's default).
scraped_movies = extract_unique_movies(screenings_df)
scraped_movies.head()

Unnamed: 0,movie_id,title,year,movie_link
0,ae3c1f373d,all we imagine as light,2024.0,https://www.filmladder.nl/film/all-we-imagine-...
1,2e81eff841,hard truths,2024.0,https://www.filmladder.nl/film/hard-truths-202...
2,62892044b0,maria,2017.0,https://www.filmladder.nl/film/maria-by-callas...
3,c357c60cb8,a complete unknown,,https://www.filmladder.nl/film/a-complete-unkn...
4,ff1d83f7ce,alpha,2024.0,https://www.filmladder.nl/film/alpha-2024/popu...


In [5]:
# Widen pandas' text display to 1000 characters, then ask get_new_movies which
# of the scraped movies are new and show the result.
pd.options.display.width = 1000
new_movies = get_new_movies(scraped_movies=scraped_movies)
new_movies

Scraped: 'all we imagine as light' | Matched: 'all we imagine as light' (Existing DB) | Score: 100.00 | Found: True
Scraped: 'hard truths' | Matched: 'hard truths' (Existing DB) | Score: 100.00 | Found: True
Scraped: 'maria' | Matched: 'maria' (Existing DB) | Score: 100.00 | Found: True
Scraped: 'a complete unknown' | Matched: 'a complete unknown' (Existing DB) | Score: 100.00 | Found: True
Scraped: 'alpha' | Matched: 'alpha' (Existing DB) | Score: 100.00 | Found: True
Scraped: 'anora' | Matched: 'anora' (Existing DB) | Score: 100.00 | Found: True
Scraped: 'bridget jones: mad about the boy' | Matched: 'bridget jones: mad about the boy' (Existing DB) | Score: 100.00 | Found: True
Scraped: 'en fanfare' | Matched: 'en fanfare' (Existing DB) | Score: 100.00 | Found: True
Scraped: 'i'm still here' | Matched: 'i'm still here' (Existing DB) | Score: 100.00 | Found: True
Scraped: 'la prisonnière de bordeaux' | Matched: 'la prisonnière de bordeaux' (Existing DB) | Score: 100.00 | Found: True
Sc

Unnamed: 0,movie_id,title,year,movie_link
15,7044139f9c,the most precious of cargoes,2024.0,https://www.filmladder.nl/film/the-most-precio...
16,2562df1c05,vermiglio,2024.0,https://www.filmladder.nl/film/vermiglio-2024/...
65,c573773899,napló - diary for my children,1984.0,https://www.filmladder.nl/film/naplo-diary-for...
69,7df1debfb5,the host,2006.0,https://www.filmladder.nl/film/the-host-2006/p...
93,aba0fa43e3,gomorra,2008.0,https://www.filmladder.nl/film/gomorra-2008/po...
95,3b466f5876,my own private idaho,,https://www.filmladder.nl/film/my-own-private-...
118,bcf1690a39,pina,2011.0,https://www.filmladder.nl/film/pina-2011/popup...
127,65f6e8fc8e,novocaine 2d,,https://www.filmladder.nl/film/novocaine-2025-...
136,8da77f53af,novocaine 4dx,,https://www.filmladder.nl/film/novocaine-2025-...
140,74077e7959,vaiana 2 (ov),2024.0,https://www.filmladder.nl/film/vaiana-2-ov-202...


In [6]:
# Run IMDBFetcher (headless=True) over the new movies. Presumably this adds the
# `imdb_link` column that the later merge keys on — confirm against IMDBFetcher.
#
# The fetcher gets its own name instead of `scraper`: the next cell binds
# `scraper` to an IMDBScraper and a later cell reads `scraper.metadata_results`,
# so re-running this cell out of order must not swap a different object type in
# under that name.
#
# NOTE(review): `new_movies` is rebound to the fetcher's output, so a re-run
# passes that output back in. Re-create `new_movies` first if that matters.
imdb_fetcher = IMDBFetcher(headless=True)
new_movies = imdb_fetcher.run(new_movies)

INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Driver [/Users/ardsnijders/.wdm/drivers/chromedriver/mac64/134.0.6998.88/chromedriver-mac-x64/chromedriver] found in cache
Scraping Progress: 100%|██████████| 14/14 [00:29<00:00,  2.10s/it]


In [7]:
# Run the IMDb metadata scraper (headless=True) over the new movies.
# `scraper` must stay bound to this IMDBScraper instance: the following cell
# reads `scraper.metadata_results` from it.
# NOTE(review): the value assigned to `enriched_new_movies` here is overwritten
# by the following cell before anything reads it — confirm whether this
# assignment is still needed.
scraper = IMDBScraper(headless=True)
enriched_new_movies = scraper.run(new_movies)

INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Driver [/Users/ardsnijders/.wdm/drivers/chromedriver/mac64/134.0.6998.88/chromedriver-mac-x64/chromedriver] found in cache
Scraping IMDb Metadata: 100%|██████████| 13/13 [01:16<00:00,  5.91s/it]


In [8]:
# Build a frame from the scraper's collected metadata, left-join it onto the new
# movies by `imdb_link` (rows without a metadata match are kept), and hand the
# joined frame to process_enriched_movies.
metadata_df = pd.DataFrame(scraper.metadata_results)
enriched_new_movies = process_enriched_movies(
    pd.merge(new_movies, metadata_df, how="left", on="imdb_link")
)

In [9]:

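# TODO: Letterboxd watchlist scraping is disabled in this run; uncomment the
# lines below to enable it.
# from data_pipelines.scrapers.letterboxd import LetterboxdScraper
# lb_scraper = LetterboxdScraper()
# watchlist_df = lb_scraper.run()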

In [10]:
# Display the enriched movie table for a visual check before it is saved below.
enriched_new_movies

Unnamed: 0,movie_id,title,year,movie_link,imdb_link,imdb_year,rating,genres,content_rating,duration,director,writers,actors,rating_count,plot,release_date,keywords,poster_url,trailer_url
0,7044139f9c,the most precious of cargoes,2024,https://www.filmladder.nl/film/the-most-precio...,https://www.imdb.com/title/tt10462154,2024,7.2,"[Animation, Drama, Back to top]",12A,PT1H21M,[Michel Hazanavicius],"[Jean-Claude Grumberg, Michel Hazanavicius]","[Dominique Blanc, Grégory Gadebois, Denis Poda...",655.0,"In war-torn times, a poor woodcutter and his w...",2024-11-20,"[snow,forest,woodcutter,death,family relations...",https://m.media-amazon.com/images/M/MV5BM2MxM2...,https://www.imdb.com/video/imdb/vi541771289
1,2562df1c05,vermiglio,2024,https://www.filmladder.nl/film/vermiglio-2024/...,https://www.imdb.com/title/tt28618488,2024,6.9,"[Drama, History, Back to top]",15,PT1H59M,[Maura Delpero],[Maura Delpero],"[Tommaso Ragno, Roberta Rovelli, Martina Scrinzi]",3340.0,"1944, Vermiglio, a remote mountain village. Th...",2024-09-19,"[world war two,italian alps,remote village,fam...",https://m.media-amazon.com/images/M/MV5BMTc1OT...,https://www.imdb.com/video/imdb/vi1844693273
2,c573773899,napló - diary for my children,1984,https://www.filmladder.nl/film/naplo-diary-for...,https://www.imdb.com/title/tt0084388,1984,7.4,"[Biography, Drama, Back to top]",PG,PT1H46M,[Márta Mészáros],[Márta Mészáros],"[Zsuzsa Czinkóczi, Ágnes Csere, Anna Polony]",1069.0,Having lost her parents to Stalin&apos;s purge...,1984-05-03,"[central europe,parents,f rated,title directed...",https://m.media-amazon.com/images/M/MV5BZGRkMT...,https://www.imdb.com/video/imdb/vi2121450265
3,7df1debfb5,the host,2006,https://www.filmladder.nl/film/the-host-2006/p...,https://www.imdb.com/title/tt0468492,2006,7.1,"[Kaiju, Monster Horror, Psychological Drama, T...",15,PT2H,[Bong Joon Ho],"[Bong Joon Ho, Won-jun Ha, Chul-hyun Baek]","[Song Kang-ho, Byun Hee-Bong, Park Hae-il]",137347.0,A monster emerges from Seoul&apos;s Han River ...,2006-11-10,"[abduction,revenge,rescue,panic,cult film]",https://m.media-amazon.com/images/M/MV5BMTQzYz...,https://www.imdb.com/video/imdb/vi1234043161
4,aba0fa43e3,gomorra,2008,https://www.filmladder.nl/film/gomorra-2008/po...,https://www.imdb.com/title/tt0929425,2008,7.0,"[True Crime, Crime, Drama, Back to top]",15,PT2H17M,[Matteo Garrone],"[Roberto Saviano, Maurizio Braucci, Ugo Chiti]","[Gianfelice Imparato, Salvatore Abbruzzese, To...",52094.0,Scampia Vele is the Corbusian architecture whi...,2008-10-10,"[italy,unzipping pants,world trade center manh...",https://m.media-amazon.com/images/M/MV5BMTM2ND...,https://www.imdb.com/video/imdb/vi1803074073
5,3b466f5876,my own private idaho,1992,https://www.filmladder.nl/film/my-own-private-...,https://www.imdb.com/title/tt0102494,1992,6.9,"[Coming-of-Age, Road Trip, Drama, Back to top]",18,PT1H44M,[Gus Van Sant],"[Gus Van Sant, William Shakespeare]","[River Phoenix, Keanu Reeves, James Russo]",64131.0,Two best friends living on the streets of Port...,1992-03-27,"[portland oregon,narcolepsy,idaho,gay sex,male...",https://m.media-amazon.com/images/M/MV5BZjkxY2...,https://www.imdb.com/video/imdb/vi1536688409
6,bcf1690a39,pina,2011,https://www.filmladder.nl/film/pina-2011/popup...,https://www.imdb.com/title/tt1440266,2011,7.6,"[Documentary, Music, Back to top]",U,PT1H43M,[Wim Wenders],[Wim Wenders],"[Pina Bausch, Regina Advento, Malou Airaudo]",16295.0,A tribute to the late German choreographer Pin...,2011-02-24,"[choreographer,modern dance,artistic creation,...",https://m.media-amazon.com/images/M/MV5BMTMyMT...,https://www.imdb.com/video/imdb/vi3067387417
7,65f6e8fc8e,novocaine 2d,2025,https://www.filmladder.nl/film/novocaine-2025-...,https://www.imdb.com/title/tt29603959,2025,,"[Dark Comedy, One-Person Army Action, Superher...",15,PT1H50M,"[Dan Berk, Robert Olsen]",[Lars Jacobson],"[Jack Quaid, Amber Midthunder, Ray Nicholson]",,"When the girl of his dreams is kidnapped, a ma...",2025-03-28,"[superhero action,one person army action,bank ...",https://m.media-amazon.com/images/M/MV5BYjIwOD...,https://www.imdb.com/video/imdb/vi4208904473
8,74077e7959,vaiana 2 (ov),2024,https://www.filmladder.nl/film/vaiana-2-ov-202...,https://www.imdb.com/title/tt13622970,2024,6.8,"[Computer Animation, Sea Adventure, Adventure,...",PG,PT1H40M,"[David G. Derrick Jr., Jason Hand, Dana Ledoux...","[Jared Bush, Dana Ledoux Miller, Bek Smith]","[Auli&apos;i Cravalho, Dwayne Johnson, Hualala...",80924.0,After receiving an unexpected call from her wa...,2024-11-29,"[island,ocean,disney princess,boat,sailing]",https://m.media-amazon.com/images/M/MV5BYzQ5Yj...,https://www.imdb.com/video/imdb/vi3932079641
9,7f69ba5a43,shadows of forgotten ancestors,1965,https://www.filmladder.nl/film/shadows-of-forg...,https://www.imdb.com/title/tt0058642,1965,7.8,"[Drama, Romance, Back to top]",,PT1H37M,[Sergei Parajanov],"[Ivan Chendej, Mikhail Kotsiubinsky, Sergei Pa...","[Ivan Mikolaychuk, Larisa Kadochnikova, Tatyan...",8984.0,A timeless Carpathian story - the young Ivan f...,1965-09-04,"[hutsul history,ukrainian,ukraine,carpathian m...",https://m.media-amazon.com/images/M/MV5BZDBiMm...,https://www.imdb.com/video/imdb/vi507757337


In [11]:
# Pass the cinemas, enriched movies and screenings frames to save_all_to_db in
# a single call.
# Disabled alternative kept for reference — it would replace missing values
# with None before saving:
# enriched_new_movies = enriched_new_movies.where(pd.notna(enriched_new_movies), None)
frames_to_save = {
    "cinemas_df": cinemas_df,
    "movies_df": enriched_new_movies,
    "screenings_df": screenings_df,
}
save_all_to_db(**frames_to_save)