### Pipeline, step by step

In [1]:
import logging
import os
import sys

# Make the working directory importable; this must run before the project imports below.
sys.path.append(os.getcwd())

import pandas as pd
from data_pipelines.scrapers.filmladder import FilmladderScraper
from data_pipelines.scrapers.imdb import IMDBFetcher, IMDBScraper
from data_pipelines.daily_pipeline import extract_unique_movies
from data_pipelines.daily_pipeline import get_new_movies
from data_pipelines.daily_pipeline import process_cinemas, process_screenings, process_enriched_movies
from data_pipelines.save_to_db import save_movies, save_screenings, save_cinemas
from data_pipelines.save_to_db import get_existing_movies
from data_pipelines.save_to_db import save_all_to_db

# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to the pipeline code take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Emit INFO-level log records in the cell output.
logging.basicConfig(level=logging.INFO)

In [2]:
# Step 1: scrape Filmladder.
# `run()` hands back a pair that is unpacked into the screenings frame and the
# cinemas frame, in that order.
logging.info("Starting daily data pipeline...")

filmladder = FilmladderScraper()
scraped_frames = filmladder.run()
screenings_df, cinemas_df = scraped_frames

INFO:root:Starting daily data pipeline...
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Driver [/Users/ardsnijders/.wdm/drivers/chromedriver/mac64/133.0.6943.141/chromedriver-mac-x64/chromedriver] found in cache


In [3]:
# Step 2: remove duplicates (imax, 2d, etc) and add unique ids.
# Both names are rebound in a single tuple assignment: if either helper raises,
# neither frame is replaced, so the cell can simply be re-run without feeding an
# already-processed `screenings_df` back into `process_screenings`.
# NOTE(review): a successful re-run still passes processed frames back in;
# confirm the helpers are idempotent, or re-run the scrape cell first.
screenings_df, cinemas_df = (
    process_screenings(screenings_df),
    process_cinemas(cinemas_df),
)

/Users/ardsnijders/Documents/cineville_scraper/backend


In [4]:
# Derive the movie table from the processed screenings.
# Presumably one row per distinct movie, going by the helper's name (TODO confirm).
scraped_movies = extract_unique_movies(screenings_df)
# Bare last expression: rendered as a (truncated) table by the notebook.
scraped_movies

Unnamed: 0,movie_id,title,year,movie_link
0,378909531a,favoriten,2024,https://www.filmladder.nl/film/favoriten-2024/...
1,430904b120,merckx,2025,https://www.filmladder.nl/film/merckx-2025/pop...
2,e7bc07ba2a,a complete unknown,2024,https://www.filmladder.nl/film/a-complete-unkn...
3,ff1d83f7ce,alpha,2024,https://www.filmladder.nl/film/alpha-2024/popu...
4,172e369640,anora,2024,https://www.filmladder.nl/film/anora-2024/popu...
...,...,...,...,...
150,d6ba2e07ad,from here to eternity,1953,https://www.filmladder.nl/film/from-here-to-et...
151,3927581269,memoir of a snail,2024,https://www.filmladder.nl/film/memoir-of-a-sna...
152,767834fde2,paths of glory,1957,https://www.filmladder.nl/film/paths-of-glory-...
153,42e553e73d,platoon,1986,https://www.filmladder.nl/film/platoon-1986/po...


In [5]:
# Step 3: select the movies that still need enrichment.
# `get_new_movies` is imported in the notebook's import cell.
# NOTE(review): presumably this filters out movies already stored in the
# database; verify in daily_pipeline.py.

# Widen pandas' text display width. This is a global option and stays in effect
# for the following cells.
pd.set_option('display.width', 1000)

new_movies = get_new_movies(scraped_movies=scraped_movies)
new_movies

No existing Movies table found


Unnamed: 0,movie_id,title,year,movie_link
0,378909531a,favoriten,2024,https://www.filmladder.nl/film/favoriten-2024/...
1,430904b120,merckx,2025,https://www.filmladder.nl/film/merckx-2025/pop...
2,e7bc07ba2a,a complete unknown,2024,https://www.filmladder.nl/film/a-complete-unkn...
3,ff1d83f7ce,alpha,2024,https://www.filmladder.nl/film/alpha-2024/popu...
4,172e369640,anora,2024,https://www.filmladder.nl/film/anora-2024/popu...
...,...,...,...,...
150,d6ba2e07ad,from here to eternity,1953,https://www.filmladder.nl/film/from-here-to-et...
151,3927581269,memoir of a snail,2024,https://www.filmladder.nl/film/memoir-of-a-sna...
152,767834fde2,paths of glory,1957,https://www.filmladder.nl/film/paths-of-glory-...
153,42e553e73d,platoon,1986,https://www.filmladder.nl/film/platoon-1986/po...


In [6]:
# Step 4: look up an IMDb link for each new movie (adds the `imdb_link` column).
# `IMDBFetcher` is imported in the notebook's import cell.
#
# The lookup drives a headless Chrome session and takes several minutes, and its
# result is written back to `new_movies`. Skip it when the column is already
# present so an accidental re-run does not repeat the fetch on an
# already-enriched frame. Re-run the previous cell to reset `new_movies` and
# force a fresh lookup.
#
# The fetcher gets its own name because a later cell binds `scraper` to a
# different class (IMDBScraper).
if "imdb_link" not in new_movies.columns:
    imdb_fetcher = IMDBFetcher(headless=True)
    new_movies = imdb_fetcher.run(new_movies)

INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Driver [/Users/ardsnijders/.wdm/drivers/chromedriver/mac64/133.0.6943.141/chromedriver-mac-x64/chromedriver] found in cache
Scraping Progress: 100%|██████████| 155/155 [04:54<00:00,  1.90s/it]


Unnamed: 0,movie_id,title,year,movie_link,imdb_link
0,378909531a,favoriten,2024,https://www.filmladder.nl/film/favoriten-2024/...,https://www.imdb.com/title/tt31015543
1,430904b120,merckx,2025,https://www.filmladder.nl/film/merckx-2025/pop...,https://www.imdb.com/title/tt35488509
2,e7bc07ba2a,a complete unknown,2024,https://www.filmladder.nl/film/a-complete-unkn...,https://www.imdb.com/title/tt11563598
3,ff1d83f7ce,alpha,2024,https://www.filmladder.nl/film/alpha-2024/popu...,https://www.imdb.com/title/tt28797114
4,172e369640,anora,2024,https://www.filmladder.nl/film/anora-2024/popu...,https://www.imdb.com/title/tt28607951
...,...,...,...,...,...
152,d6ba2e07ad,from here to eternity,1953,https://www.filmladder.nl/film/from-here-to-et...,https://www.imdb.com/title/tt0045793
153,3927581269,memoir of a snail,2024,https://www.filmladder.nl/film/memoir-of-a-sna...,https://www.imdb.com/title/tt23770030
154,767834fde2,paths of glory,1957,https://www.filmladder.nl/film/paths-of-glory-...,https://www.imdb.com/title/tt0050825
155,42e553e73d,platoon,1986,https://www.filmladder.nl/film/platoon-1986/po...,https://www.imdb.com/title/tt0091763


In [40]:
# Last ten rows after ordering by `year`. pandas places NaN last by default, so
# rows with a missing year end up in this slice when there are any.
# NOTE(review): the count of ten is hard-coded; confirm it matches the number of
# rows that actually lack a year.
missing_mv = new_movies.sort_values(by='year').iloc[-10:]

In [41]:
# Trial run of the IMDb metadata scraper on the small `missing_mv` sample before
# committing to the full scrape.
# `IMDBScraper` comes from the notebook's import cell, so this cell works on a
# fresh kernel when the notebook is run top to bottom.
test_scraper = IMDBScraper()
mv_with_year = test_scraper.run(missing_mv)
# Debugging aid (disabled): inspect the last page the scraper loaded.
# test_scraper.page_source

INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Driver [/Users/ardsnijders/.wdm/drivers/chromedriver/mac64/133.0.6943.141/chromedriver-mac-x64/chromedriver] found in cache
Scraping IMDb Metadata: 100%|██████████| 10/10 [00:52<00:00,  5.21s/it]


In [42]:
# Show the frame returned by the trial scrape of the ten-row sample.
mv_with_year
# Alternative view (disabled): the scraper's raw metadata records ordered by IMDb year.
# pd.DataFrame(scraper.metadata_results).sort_values(by='imdb_year')

Unnamed: 0,movie_id,title_x,year,movie_link,imdb_link,title_y,original_title,imdb_year,rating,genres,...,duration,director,writers,actors,rating_count,plot,release_date,keywords,poster_url,trailer_url
0,b649021912,i'm still here,,https://www.filmladder.nl/film/i-m-still-here-...,https://www.imdb.com/title/tt14961016,Ainda Estou Aqui,Ainda Estou Aqui,2025,8.7,"[Docudrama, Period Drama, Political Drama, Bio...",...,PT2H17M,[Walter Salles],"[Murilo Hauser, Heitor Lorega, Marcelo Rubens ...","[Fernanda Torres, Fernanda Montenegro, Selton ...",80142,A mother is forced to reinvent herself when he...,2025-02-21,"[dictatorship,year 1971,mother,husband,based o...",https://m.media-amazon.com/images/M/MV5BM2FjMj...,https://www.imdb.com/video/imdb/vi3697657369
1,94aec9fbed,maria,,https://www.filmladder.nl/film/maria-2024-4451...,https://www.imdb.com/title/tt22893404,Maria,Maria,2025,6.4,"[Period Drama, Tragedy, Biography, Drama, Musi...",...,PT2H4M,[Pablo Larraín],[Steven Knight],"[Angelina Jolie, Pierfrancesco Favino, Alba Ro...",16168,"Maria Callas, the world&apos;s greatest opera ...",2025-01-10,"[maria callas character,marilyn monroe charact...",https://m.media-amazon.com/images/M/MV5BNWJjMz...,https://www.imdb.com/video/imdb/vi2663172377
2,c6242d7fdd,the last journey,,https://www.filmladder.nl/film/the-last-journe...,https://www.imdb.com/title/tt30454602,Den sista resan,Den sista resan,2024,7.9,"[Documentary, Back to top]",...,PT1H35M,"[Filip Hammar, Fredrik Wikingsson]",[],"[Filip Hammar, Fredrik Wikingsson, Lars Hammar]",4841,Renowned Swedish TV-duo Filip and Fredrik emba...,2024-03-01,[],https://m.media-amazon.com/images/M/MV5BZTU0Yz...,https://www.imdb.com/video/imdb/vi2997012761
3,c3da9655b4,chungking express,,https://www.filmladder.nl/film/chungking-expre...,https://www.imdb.com/title/tt0109424,Chung Hing sam lam,Chung Hing sam lam,1995,8.0,"[Comedy, Crime, Drama, Mystery, Romance, Back ...",...,PT1H42M,[Wong Kar-Wai],[Wong Kar-Wai],"[Brigitte Lin, Takeshi Kaneshiro, Tony Leung C...",102137,Two melancholic Hong Kong policemen fall in lo...,1995-09-15,"[drug smuggling,hong kong,daydream,loneliness,...",https://m.media-amazon.com/images/M/MV5BNDQ4Zm...,https://www.imdb.com/video/imdb/vi594918937
4,271defa604,the wild pear tree,,https://www.filmladder.nl/film/the-wild-pear-t...,https://www.imdb.com/title/tt6628102,Ahlat Agaci,Ahlat Agaci,2018,8.0,"[Drama, Back to top]",...,PT3H8M,[Nuri Bilge Ceylan],"[Ebru Ceylan, Nuri Bilge Ceylan, Akin Aksu]","[Dogu Demirkol, Murat Cemcir, Bennu Yildirimlar]",29248,An unpublished writer returns to his hometown ...,2018-11-30,"[anatolia,home town,snow,gambling debt,provinc...",https://m.media-amazon.com/images/M/MV5BOWU1Yj...,https://www.imdb.com/video/imdb/vi1009891865
5,d70058f126,el ángel exterminador,,https://www.filmladder.nl/film/el-angel-exterm...,https://www.imdb.com/title/tt0056732,El ángel exterminador,El ángel exterminador,1962,8.0,"[Dark Comedy, Drama, Fantasy, Back to top]",...,PT1H35M,[Luis Buñuel],"[Luis Buñuel, Luis Alcoriza]","[Silvia Pinal, Jacqueline Andere, Enrique Rambal]",36688,The guests at an upper-class dinner party find...,NaT,"[class differences,social satire,surrealism,co...",https://m.media-amazon.com/images/M/MV5BZTk0Yj...,https://www.imdb.com/video/imdb/vi990887449
6,1bc91ab815,the elephant man,,https://www.filmladder.nl/film/the-elephant-ma...,https://www.imdb.com/title/tt0080678,The Elephant Man,The Elephant Man,1980,8.2,"[Docudrama, Period Drama, Tragedy, Biography, ...",...,PT2H4M,[David Lynch],"[Christopher De Vore, Eric Bergren, David Lynch]","[Anthony Hopkins, John Hurt, Anne Bancroft]",270267,A Victorian surgeon rescues a heavily disfigur...,1980-10-16,"[dignity,disability,human exploitation,hospita...",https://m.media-amazon.com/images/M/MV5BMGE3MD...,https://www.imdb.com/video/imdb/vi1763229209
7,ce10a6d4f3,viridiana,,https://www.filmladder.nl/film/viridiana-29999...,https://www.imdb.com/title/tt0055601,Viridiana,Viridiana,1961,8.0,"[Period Drama, Drama, Back to top]",...,PT1H31M,[Luis Buñuel],"[Julio Alejandro, Luis Buñuel, Benito Pérez Ga...","[Silvia Pinal, Francisco Rabal, Fernando Rey]",26944,"Viridiana, a young nun about to take her final...",NaT,"[nun,novice,da vinci last supper,suspected rap...",https://m.media-amazon.com/images/M/MV5BYjlkMW...,https://www.imdb.com/video/imdb/vi1008516633
8,f4f3f81cae,taxi driver,,https://www.filmladder.nl/film/taxi-driver-197...,https://www.imdb.com/title/tt0075314,Taxi Driver,Taxi Driver,1976,8.2,"[Psychological Drama, Crime, Drama, Back to top]",...,PT1H54M,[Martin Scorsese],[Paul Schrader],"[Robert De Niro, Jodie Foster, Cybill Shepherd]",967606,A mentally unstable veteran works as a nightti...,1976-08-19,"[taxi driver,loner,street life,child prostitut...",https://m.media-amazon.com/images/M/MV5BZDNhMG...,https://www.imdb.com/video/imdb/vi474987289
9,22f36fba4c,romeo + juliet,,https://www.filmladder.nl/film/romeo-juliet-19...,https://www.imdb.com/title/tt0117509,Romeo + Juliet,Romeo + Juliet,1997,6.7,"[Romantic Epic, Teen Romance, Tragedy, Tragic ...",...,PT2H,[Baz Luhrmann],"[William Shakespeare, Craig Pearce, Baz Luhrmann]","[Leonardo DiCaprio, Claire Danes, John Leguizamo]",250583,Shakespeare&apos;s famous play is updated to t...,1997-03-28,"[female nudity,sex scene,suicide,forbidden lov...",https://m.media-amazon.com/images/M/MV5BZjBhYj...,https://www.imdb.com/video/imdb/vi58720537


In [16]:
# Eyeball the fifty rows that sort last by `year` (NaN, if any, is placed at the end).
new_movies.sort_values(by='year').iloc[-50:]

Unnamed: 0,movie_id,title,year,movie_link,imdb_link
9,ef69b61e26,la prisonnière de bordeaux,2024.0,https://www.filmladder.nl/film/la-prisonniere-...,https://www.imdb.com/title/tt30851085
7,2e81eff841,hard truths,2024.0,https://www.filmladder.nl/film/hard-truths-202...,https://www.imdb.com/title/tt11891850
6,d07fcf4d8f,en fanfare,2024.0,https://www.filmladder.nl/film/en-fanfare-2024...,https://www.imdb.com/title/tt30327451
4,172e369640,anora,2024.0,https://www.filmladder.nl/film/anora-2024/popu...,https://www.imdb.com/title/tt28607951
3,ff1d83f7ce,alpha,2024.0,https://www.filmladder.nl/film/alpha-2024/popu...,https://www.imdb.com/title/tt28797114
2,e7bc07ba2a,a complete unknown,2024.0,https://www.filmladder.nl/film/a-complete-unkn...,https://www.imdb.com/title/tt11563598
33,e2a2174d6f,sing sing,2024.0,https://www.filmladder.nl/film/sing-sing-2024/...,https://www.imdb.com/title/tt28479262
34,74627b6a96,straatcoaches vs aliens,2024.0,https://www.filmladder.nl/film/straatcoaches-v...,https://www.imdb.com/title/tt31452296
32,742690e667,red de jungle,2024.0,https://www.filmladder.nl/film/red-de-jungle-2...,https://www.imdb.com/title/tt10695672
156,20a1327ff2,the seed of the sacred fig,2024.0,https://www.filmladder.nl/film/the-seed-of-the...,https://www.imdb.com/title/tt32178949


In [8]:
# Step 5a: scrape IMDb metadata for every new movie (long-running: a headless
# browser visits each title page).
# `IMDBScraper` is imported in the notebook's import cell.
# The instance is kept in `scraper` because the next cell reads
# `scraper.metadata_results`.
scraper = IMDBScraper(headless=True)
enriched_new_movies = scraper.run(new_movies)

INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Driver [/Users/ardsnijders/.wdm/drivers/chromedriver/mac64/133.0.6943.141/chromedriver-mac-x64/chromedriver] found in cache
Scraping IMDb Metadata: 100%|██████████| 147/147 [14:35<00:00,  5.95s/it]


In [11]:
# Step 5b: attach the scraped metadata to the movie rows and post-process.
# The processing helpers are already imported in the notebook's import cell, so
# they are not re-imported here.
metadata_df = pd.DataFrame(scraper.metadata_results)

# Left join on the IMDb link keeps every movie, including those without metadata.
# `enriched_new_movies` is rebound once, with the fully processed frame, so a
# failure in post-processing never leaves a half-finished frame under that name.
enriched_new_movies = process_enriched_movies(
    new_movies.merge(metadata_df, on="imdb_link", how="left")
)

In [49]:
# Optional checkpoint (disabled): pickle the three frames so the database-save
# step can be retried without repeating the scrapes.
# Paths are written relative to the working directory instead of a
# machine-specific absolute path. This assumes the kernel runs from the
# project's backend/ folder; adjust if not.
# cinemas_df.to_pickle('data_pipelines/temp_data/cinemas.pkl')
# enriched_new_movies.to_pickle('data_pipelines/temp_data/enriched_new_movies.pkl')
# screenings_df.to_pickle('data_pipelines/temp_data/screenings_df.pkl')

In [3]:
# Optional checkpoint restore (disabled): reload the pickled frames instead of
# re-running the scrapes. Only unpickle files you created yourself, because
# loading a pickle can execute arbitrary code.
# Paths assume the kernel runs from the project's backend/ folder; adjust if not.
# import pandas as pd

# cinemas_df = pd.read_pickle('data_pipelines/temp_data/cinemas.pkl')
# enriched_new_movies = pd.read_pickle('data_pipelines/temp_data/enriched_new_movies.pkl')
# screenings_df = pd.read_pickle('data_pipelines/temp_data/screenings_df.pkl')

Error saving data: (builtins.ValueError) cannot convert float NaN to integer
[SQL: INSERT INTO movies (movie_id, title, year, movie_link, imdb_link, imdb_year, rating, genres, content_rating, duration, director, writers, actors, rating_count, plot, release_date, keywords, poster_url, trailer_url) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)]
[parameters: [{'keywords': ['deception,christian muslim conflict,dance,musical scene,village'], 'rating_count': 13782.0, 'release_date': datetime.date(2012, 6, 22), ... (665 characters truncated) ... ttps://www.imdb.com/title/tt1772424', 'writers': ['Nadine Labaki', 'Jihad Hojeily', 'Rodney El Haddad'], 'genres': ['Comedy', 'Drama', 'Back to top']}, {'keywords': ['grandmother,father,mother,child,children'], 'rating_count': 12216.0, 'release_date': NaT, 'rating': 8.0, 'imdb_year': '', 'actors': ['P ... (786 characters truncated) ... s://www.imdb.com/title/tt31392609', 'writers': ['Pat Boonnitipat', 'Thodsapon Thiptinnakorn'], 'ge

In [12]:
# Step 6: persist cinemas, enriched movies and screenings.
# `save_all_to_db` is imported in the notebook's import cell.
#
# A previous save attempt failed with "cannot convert float NaN to integer":
# missing values reached the INSERT as float NaN / NaT. Cast to object first so
# that `where` can store real None values (on a numeric column None would be
# coerced straight back to NaN); None is what gets written as NULL.
# NOTE(review): assumes the movies table columns are nullable and that
# save_all_to_db does not re-cast dtypes itself; confirm in save_to_db.py.
movies_for_db = enriched_new_movies.astype(object).where(
    enriched_new_movies.notna(), None
)

save_all_to_db(
    cinemas_df=cinemas_df,
    movies_df=movies_for_db,
    screenings_df=screenings_df,
)