### Pipeline, step by step

In [1]:
# Setup cell: make the project importable, pull in the pipeline helpers,
# enable live module reloading and turn on INFO-level logging.
import sys
import os
# Put the current working directory on the import path so the `data_pipelines`
# package resolves. NOTE(review): this depends on where the kernel was started —
# confirm the notebook is always launched from the directory containing data_pipelines.
sys.path.append(os.getcwd())

import logging
import pandas as pd
from data_pipelines.scrapers.filmladder import FilmladderScraper
from data_pipelines.daily_pipeline import extract_unique_movies
from data_pipelines.daily_pipeline import process_cinemas, process_screenings, process_enriched_movies
from data_pipelines.save_to_db import save_movies, save_screenings, save_cinemas
from data_pipelines.save_to_db import get_existing_movies

# autoreload mode 2: re-import edited modules before each cell executes,
# so changes to the project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Show INFO-level messages from the root logger and from library loggers.
logging.basicConfig(level=logging.INFO)

In [2]:
# Announce the start of the run on the root logger.
logging.info("Starting daily data pipeline...")

# 1️⃣ Scrape Filmladder (returns two DataFrames)
# The result of run() is unpacked as (screenings, cinemas), in that order.
filmladder = FilmladderScraper()
screenings_df, cinemas_df = filmladder.run()

INFO:root:Starting daily data pipeline...
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Driver [/Users/ardsnijders/.wdm/drivers/chromedriver/mac64/133.0.6943.141/chromedriver-mac-x64/chromedriver] found in cache


In [3]:
# 2️⃣ Remove duplicates (imax, 2d, etc), add unique ids
# NOTE(review): both names are rebound to the processed frames, so re-running this
# cell feeds already-processed data back into the helpers — confirm they tolerate that.
screenings_df = process_screenings(screenings_df)
cinemas_df = process_cinemas(cinemas_df)

/Users/ardsnijders/Documents/cineville_scraper/backend


In [4]:
# Derive the movie table from the processed screenings.
# The rendered frame has movie_id, title, year and movie_link columns.
scraped_movies = extract_unique_movies(screenings_df)
# Bare last expression: displayed as the cell's output.
scraped_movies

Unnamed: 0,movie_id,title,year,movie_link
0,378909531a,favoriten,2024,https://www.filmladder.nl/film/favoriten-2024/...
1,430904b120,merckx,2025,https://www.filmladder.nl/film/merckx-2025/pop...
2,e7bc07ba2a,a complete unknown,2024,https://www.filmladder.nl/film/a-complete-unkn...
3,ff1d83f7ce,alpha,2024,https://www.filmladder.nl/film/alpha-2024/popu...
4,172e369640,anora,2024,https://www.filmladder.nl/film/anora-2024/popu...
...,...,...,...,...
150,d6ba2e07ad,from here to eternity,1953,https://www.filmladder.nl/film/from-here-to-et...
151,3927581269,memoir of a snail,2024,https://www.filmladder.nl/film/memoir-of-a-sna...
152,767834fde2,paths of glory,1957,https://www.filmladder.nl/film/paths-of-glory-...
153,42e553e73d,platoon,1986,https://www.filmladder.nl/film/platoon-1986/po...


In [5]:
from data_pipelines.daily_pipeline import get_new_movies
# Set pandas' display width to 1000 characters for the frames shown below.
pd.set_option('display.width', 1000)
# Select the subset of scraped movies to carry forward.
# presumably this filters out movies already stored in the database — in the recorded
# run it reported "No existing Movies table found"; verify against get_new_movies.
new_movies = get_new_movies(scraped_movies=scraped_movies)
new_movies

No existing Movies table found


Unnamed: 0,movie_id,title,year,movie_link
0,378909531a,favoriten,2024,https://www.filmladder.nl/film/favoriten-2024/...
1,430904b120,merckx,2025,https://www.filmladder.nl/film/merckx-2025/pop...
2,e7bc07ba2a,a complete unknown,2024,https://www.filmladder.nl/film/a-complete-unkn...
3,ff1d83f7ce,alpha,2024,https://www.filmladder.nl/film/alpha-2024/popu...
4,172e369640,anora,2024,https://www.filmladder.nl/film/anora-2024/popu...
...,...,...,...,...
150,d6ba2e07ad,from here to eternity,1953,https://www.filmladder.nl/film/from-here-to-et...
151,3927581269,memoir of a snail,2024,https://www.filmladder.nl/film/memoir-of-a-sna...
152,767834fde2,paths of glory,1957,https://www.filmladder.nl/film/paths-of-glory-...
153,42e553e73d,platoon,1986,https://www.filmladder.nl/film/platoon-1986/po...


In [6]:
from data_pipelines.scrapers.imdb import IMDBFetcher
# First IMDb stage: the frame displayed by this cell carries an added imdb_link column.
scraper = IMDBFetcher(headless=True)
# NOTE(review): new_movies is overwritten with the fetcher's output, so this cell is
# not idempotent — re-running it passes the already-augmented frame back in.
new_movies = scraper.run(new_movies)
new_movies

INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Driver [/Users/ardsnijders/.wdm/drivers/chromedriver/mac64/133.0.6943.141/chromedriver-mac-x64/chromedriver] found in cache
Scraping Progress: 100%|██████████| 155/155 [04:54<00:00,  1.90s/it]


Unnamed: 0,movie_id,title,year,movie_link,imdb_link
0,378909531a,favoriten,2024,https://www.filmladder.nl/film/favoriten-2024/...,https://www.imdb.com/title/tt31015543
1,430904b120,merckx,2025,https://www.filmladder.nl/film/merckx-2025/pop...,https://www.imdb.com/title/tt35488509
2,e7bc07ba2a,a complete unknown,2024,https://www.filmladder.nl/film/a-complete-unkn...,https://www.imdb.com/title/tt11563598
3,ff1d83f7ce,alpha,2024,https://www.filmladder.nl/film/alpha-2024/popu...,https://www.imdb.com/title/tt28797114
4,172e369640,anora,2024,https://www.filmladder.nl/film/anora-2024/popu...,https://www.imdb.com/title/tt28607951
...,...,...,...,...,...
152,d6ba2e07ad,from here to eternity,1953,https://www.filmladder.nl/film/from-here-to-et...,https://www.imdb.com/title/tt0045793
153,3927581269,memoir of a snail,2024,https://www.filmladder.nl/film/memoir-of-a-sna...,https://www.imdb.com/title/tt23770030
154,767834fde2,paths of glory,1957,https://www.filmladder.nl/film/paths-of-glory-...,https://www.imdb.com/title/tt0050825
155,42e553e73d,platoon,1986,https://www.filmladder.nl/film/platoon-1986/po...,https://www.imdb.com/title/tt0091763


In [8]:
from data_pipelines.scrapers.imdb import IMDBScraper
# Second IMDb stage.
# NOTE(review): `scraper` referred to the IMDBFetcher until now; from here on it is
# the IMDBScraper instance.
scraper = IMDBScraper(headless=True)
# The following cell reads scraper.metadata_results and rebuilds enriched_new_movies
# from it, so the value bound here is replaced there.
enriched_new_movies = scraper.run(new_movies)

INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Driver [/Users/ardsnijders/.wdm/drivers/chromedriver/mac64/133.0.6943.141/chromedriver-mac-x64/chromedriver] found in cache
Scraping IMDb Metadata: 100%|██████████| 147/147 [14:35<00:00,  5.95s/it]


In [9]:
# Same names as imported in the setup cell; repeated here so the cell also works on its own.
from data_pipelines.daily_pipeline import process_cinemas, process_screenings, process_enriched_movies
# Build a frame from the metadata collected by the IMDBScraper instance.
metadata_df = pd.DataFrame(scraper.metadata_results)
# Left join: every row of new_movies is kept; movies without matching metadata get
# missing values in the metadata columns.
# NOTE(review): pandas matches null join keys to each other and duplicates rows when the
# right side repeats a key — confirm metadata_df has unique, non-null imdb_link values.
enriched_new_movies = new_movies.merge(metadata_df, on="imdb_link", how="left")
# Final clean-up of the merged frame; the name is rebound to the processed result.
enriched_new_movies = process_enriched_movies(enriched_new_movies)

In [49]:
# Optional checkpoint (disabled): uncomment to pickle the three frames so the slow
# scraping cells do not have to be re-run.
# NOTE(review): the paths are absolute and machine-specific — adjust before enabling.
# cinemas_df.to_pickle('/Users/ardsnijders/Documents/cineville_scraper/backend/data_pipelines/temp_data/cinemas.pkl')
# enriched_new_movies.to_pickle('/Users/ardsnijders/Documents/cineville_scraper/backend/data_pipelines/temp_data/enriched_new_movies.pkl')
# screenings_df.to_pickle('/Users/ardsnijders/Documents/cineville_scraper/backend/data_pipelines/temp_data/screenings_df.pkl')

In [3]:
# Optional restore (disabled): uncomment to reload the three frames from their pickle
# files instead of re-running the scraping cells.
# Only unpickle files you created yourself — loading a pickle can execute arbitrary code.
# import pandas as pd

# cinemas_df = pd.read_pickle('/Users/ardsnijders/Documents/cineville_scraper/backend/data_pipelines/temp_data/cinemas.pkl')
# enriched_new_movies = pd.read_pickle('/Users/ardsnijders/Documents/cineville_scraper/backend/data_pipelines/temp_data/enriched_new_movies.pkl')
# screenings_df = pd.read_pickle('/Users/ardsnijders/Documents/cineville_scraper/backend/data_pipelines/temp_data/screenings_df.pkl')

Error saving data: (builtins.ValueError) cannot convert float NaN to integer
[SQL: INSERT INTO movies (movie_id, title, year, movie_link, imdb_link, imdb_year, rating, genres, content_rating, duration, director, writers, actors, rating_count, plot, release_date, keywords, poster_url, trailer_url) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)]
[parameters: [{'keywords': ['deception,christian muslim conflict,dance,musical scene,village'], 'rating_count': 13782.0, 'release_date': datetime.date(2012, 6, 22), ... (665 characters truncated) ... ttps://www.imdb.com/title/tt1772424', 'writers': ['Nadine Labaki', 'Jihad Hojeily', 'Rodney El Haddad'], 'genres': ['Comedy', 'Drama', 'Back to top']}, {'keywords': ['grandmother,father,mother,child,children'], 'rating_count': 12216.0, 'release_date': NaT, 'rating': 8.0, 'imdb_year': '', 'actors': ['P ... (786 characters truncated) ... s://www.imdb.com/title/tt31392609', 'writers': ['Pat Boonnitipat', 'Thodsapon Thiptinnakorn'], 'ge

In [10]:
from data_pipelines.save_to_db import save_all_to_db, save_movies

# Persist cinemas, movies and screenings.
#
# A recorded run of this save failed with
#   "(builtins.ValueError) cannot convert float NaN to integer"
# on the INSERT into `movies`: the enriched frame holds pandas missing markers
# (NaN / NaT) for movies whose IMDb metadata is incomplete.
#
# Replace every missing value with None so it can be bound as SQL NULL.
# The cast to object comes first because `.where(..., None)` on a float column
# can otherwise be coerced straight back to NaN.
# The cleaned frame gets its own name so enriched_new_movies is left untouched
# and this cell can be re-run safely.
# NOTE(review): assumes save_all_to_db accepts None for nullable columns — confirm.
movies_for_db = enriched_new_movies.astype(object).where(enriched_new_movies.notna(), None)

save_all_to_db(cinemas_df=cinemas_df,
               movies_df=movies_for_db,
               screenings_df=screenings_df)