### Pipeline, step by step

In [1]:
import sys
import os
sys.path.append(os.getcwd())

import logging
import pandas as pd
from data_pipelines.scrapers.filmladder import FilmladderScraper
from data_pipelines.daily_pipeline import assign_ids_cinemas, assign_ids_screenings, extract_unique_movies
from data_pipelines.daily_pipeline import add_cineville_tag, add_imdb_links, clean_screenings, get_new_movies
from data_pipelines.save_to_db import save_movies, save_screenings, save_cinemas
from data_pipelines.save_to_db import get_existing_movies

%load_ext autoreload
%autoreload 2

logging.basicConfig(level=logging.INFO)

In [2]:
logging.info("Starting daily data pipeline...")

# 1️⃣ Scrape Filmladder (returns two DataFrames)
filmladder = FilmladderScraper()
screenings_df, cinemas_df = filmladder.run()
cinemas_df = add_cineville_tag(assign_ids_cinemas(cinemas_df))

INFO:root:Starting daily data pipeline...
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Driver [/Users/ardsnijders/.wdm/drivers/chromedriver/mac64/133.0.6943.141/chromedriver-mac-x64/chromedriver] found in cache


/Users/ardsnijders/Documents/cineville_scraper/backend


In [3]:
# 2️⃣ Assign IDs
screenings_df = assign_ids_screenings(screenings_df)
screenings_df = clean_screenings(screenings_df)
screenings_df

Unnamed: 0,cinema_name,title,year,show_datetime,ticket_url,rating,movie_link,poster_url,movie_id,cinema_id
0,bijlmerbios,favoriten,2024,2025-02-27 21:00:00+01:00,https://www.filmladder.nl/kaartjes/1208540575,7.7★,https://www.filmladder.nl/film/favoriten-2024/...,https://assets.filmladder.nl/uploads/depot_ima...,378909531a,cd51b85716
1,bijlmerbios,favoriten,2024,2025-02-28 19:45:00+01:00,https://www.filmladder.nl/kaartjes/1208629906,7.7★,https://www.filmladder.nl/film/favoriten-2024/...,https://assets.filmladder.nl/uploads/depot_ima...,378909531a,cd51b85716
2,bijlmerbios,favoriten,2024,2025-03-01 21:00:00+01:00,https://www.filmladder.nl/kaartjes/1208633393,7.7★,https://www.filmladder.nl/film/favoriten-2024/...,https://assets.filmladder.nl/uploads/depot_ima...,378909531a,cd51b85716
3,bijlmerbios,merckx,2025,2025-02-27 19:00:00+01:00,https://www.filmladder.nl/kaartjes/1208540437,7.4★,https://www.filmladder.nl/film/merckx-2025/pop...,https://assets.filmladder.nl/uploads/depot_ima...,430904b120,cd51b85716
4,cinecenter,a complete unknown,2024,2025-02-27 11:00:00+01:00,https://www.filmladder.nl/kaartjes/1208769301,7.6★,https://www.filmladder.nl/film/a-complete-unkn...,https://assets.filmladder.nl/uploads/depot_ima...,e7bc07ba2a,0f979e20b9
...,...,...,...,...,...,...,...,...,...,...
2949,the movies,the brutalist,2024,2025-03-03 14:15:00+01:00,https://www.filmladder.nl/kaartjes/1208783746,7.7★,https://www.filmladder.nl/film/the-brutalist-2...,https://assets.filmladder.nl/uploads/depot_ima...,6e064a5a9d,f1713e87bf
2950,the movies,the brutalist,2024,2025-03-04 14:15:00+01:00,https://www.filmladder.nl/kaartjes/1208783747,7.7★,https://www.filmladder.nl/film/the-brutalist-2...,https://assets.filmladder.nl/uploads/depot_ima...,6e064a5a9d,f1713e87bf
2951,the movies,the brutalist,2024,2025-03-05 14:15:00+01:00,https://www.filmladder.nl/kaartjes/1208783748,7.7★,https://www.filmladder.nl/film/the-brutalist-2...,https://assets.filmladder.nl/uploads/depot_ima...,6e064a5a9d,f1713e87bf
2952,the movies,the seed of the sacred fig,2024,2025-03-02 12:15:00+01:00,https://www.filmladder.nl/kaartjes/1208783779,7.6★,https://www.filmladder.nl/film/the-seed-of-the...,https://assets.filmladder.nl/uploads/depot_ima...,20a1327ff2,f1713e87bf


In [4]:
scraped_movies = extract_unique_movies(screenings_df)
scraped_movies

Unnamed: 0,movie_id,title,year,movie_link
0,378909531a,favoriten,2024,https://www.filmladder.nl/film/favoriten-2024/...
1,430904b120,merckx,2025,https://www.filmladder.nl/film/merckx-2025/pop...
2,e7bc07ba2a,a complete unknown,2024,https://www.filmladder.nl/film/a-complete-unkn...
3,ff1d83f7ce,alpha,2024,https://www.filmladder.nl/film/alpha-2024/popu...
4,172e369640,anora,2024,https://www.filmladder.nl/film/anora-2024/popu...
...,...,...,...,...
159,d6ba2e07ad,from here to eternity,1953,https://www.filmladder.nl/film/from-here-to-et...
160,3927581269,memoir of a snail,2024,https://www.filmladder.nl/film/memoir-of-a-sna...
161,767834fde2,paths of glory,1957,https://www.filmladder.nl/film/paths-of-glory-...
162,42e553e73d,platoon,1986,https://www.filmladder.nl/film/platoon-1986/po...


In [5]:
from data_pipelines.daily_pipeline import get_new_movies
new_movies = get_new_movies(scraped_movies=scraped_movies)
# new_movies = new_movies.sample(7)
new_movies

Unnamed: 0,movie_id,title,year,movie_link
0,378909531a,favoriten,2024,https://www.filmladder.nl/film/favoriten-2024/...
1,430904b120,merckx,2025,https://www.filmladder.nl/film/merckx-2025/pop...
2,e7bc07ba2a,a complete unknown,2024,https://www.filmladder.nl/film/a-complete-unkn...
3,ff1d83f7ce,alpha,2024,https://www.filmladder.nl/film/alpha-2024/popu...
5,3046272957,bridget jones: mad about the boy,2025,https://www.filmladder.nl/film/bridget-jones-m...
...,...,...,...,...
159,d6ba2e07ad,from here to eternity,1953,https://www.filmladder.nl/film/from-here-to-et...
160,3927581269,memoir of a snail,2024,https://www.filmladder.nl/film/memoir-of-a-sna...
161,767834fde2,paths of glory,1957,https://www.filmladder.nl/film/paths-of-glory-...
162,42e553e73d,platoon,1986,https://www.filmladder.nl/film/platoon-1986/po...


In [6]:
from data_pipelines.scrapers.imdb import IMDBFetcher
scraper = IMDBFetcher(headless=True)
new_movies = scraper.run(new_movies)
new_movies

INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Driver [/Users/ardsnijders/.wdm/drivers/chromedriver/mac64/133.0.6943.141/chromedriver-mac-x64/chromedriver] found in cache
Scraping Progress: 100%|██████████| 152/152 [04:59<00:00,  1.97s/it]


Unnamed: 0,movie_id,title,year,movie_link,imdb_link
0,378909531a,favoriten,2024,https://www.filmladder.nl/film/favoriten-2024/...,https://www.imdb.com/title/tt31015543
1,430904b120,merckx,2025,https://www.filmladder.nl/film/merckx-2025/pop...,https://www.imdb.com/title/tt35488509
2,e7bc07ba2a,a complete unknown,2024,https://www.filmladder.nl/film/a-complete-unkn...,https://www.imdb.com/title/tt11563598
3,ff1d83f7ce,alpha,2024,https://www.filmladder.nl/film/alpha-2024/popu...,https://www.imdb.com/title/tt28797114
4,3046272957,bridget jones: mad about the boy,2025,https://www.filmladder.nl/film/bridget-jones-m...,https://www.imdb.com/title/tt32063050
...,...,...,...,...,...
165,d6ba2e07ad,from here to eternity,1953,https://www.filmladder.nl/film/from-here-to-et...,https://www.imdb.com/title/tt0045793
166,3927581269,memoir of a snail,2024,https://www.filmladder.nl/film/memoir-of-a-sna...,https://www.imdb.com/title/tt23770030
167,767834fde2,paths of glory,1957,https://www.filmladder.nl/film/paths-of-glory-...,https://www.imdb.com/title/tt0050825
168,42e553e73d,platoon,1986,https://www.filmladder.nl/film/platoon-1986/po...,https://www.imdb.com/title/tt0091763


In [7]:
from data_pipelines.scrapers.imdb import IMDBScraper
scraper = IMDBScraper(headless=True)
enriched_new_movies = scraper.run(new_movies)
# page_source = scraper.fetch_data('https://www.imdb.com/title/tt0066491/')

INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Driver [/Users/ardsnijders/.wdm/drivers/chromedriver/mac64/133.0.6943.141/chromedriver-mac-x64/chromedriver] found in cache
Scraping IMDb Metadata: 100%|██████████| 139/139 [14:25<00:00,  6.22s/it]


In [19]:
# Merge on imdb_link instead of title
metadata_df = pd.DataFrame(scraper.metadata_results)
enriched_new_movies = new_movies.merge(metadata_df, on="imdb_link", how="left")
enriched_new_movies["release_date"] = pd.to_datetime(enriched_new_movies["release_date"]).dt.date
enriched_new_movies['year'] = enriched_new_movies['imdb_year']
enriched_new_movies = enriched_new_movies.where(pd.notna(enriched_new_movies), None)

In [49]:
# cinemas_df.to_pickle('/Users/ardsnijders/Documents/cineville_scraper/backend/data_pipelines/temp_data/cinemas.pkl')
# enriched_new_movies.to_pickle('/Users/ardsnijders/Documents/cineville_scraper/backend/data_pipelines/temp_data/enriched_new_movies.pkl')
# screenings_df.to_pickle('/Users/ardsnijders/Documents/cineville_scraper/backend/data_pipelines/temp_data/screenings_df.pkl')

In [3]:
# import pandas as pd

# cinemas_df = pd.read_pickle('/Users/ardsnijders/Documents/cineville_scraper/backend/data_pipelines/temp_data/cinemas.pkl')
# enriched_new_movies = pd.read_pickle('/Users/ardsnijders/Documents/cineville_scraper/backend/data_pipelines/temp_data/enriched_new_movies.pkl')
# screenings_df = pd.read_pickle('/Users/ardsnijders/Documents/cineville_scraper/backend/data_pipelines/temp_data/screenings_df.pkl')

Error saving data: (builtins.ValueError) cannot convert float NaN to integer
[SQL: INSERT INTO movies (movie_id, title, year, movie_link, imdb_link, imdb_year, rating, genres, content_rating, duration, director, writers, actors, rating_count, plot, release_date, keywords, poster_url, trailer_url) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)]
[parameters: [{'keywords': ['deception,christian muslim conflict,dance,musical scene,village'], 'rating_count': 13782.0, 'release_date': datetime.date(2012, 6, 22), ... (665 characters truncated) ... ttps://www.imdb.com/title/tt1772424', 'writers': ['Nadine Labaki', 'Jihad Hojeily', 'Rodney El Haddad'], 'genres': ['Comedy', 'Drama', 'Back to top']}, {'keywords': ['grandmother,father,mother,child,children'], 'rating_count': 12216.0, 'release_date': NaT, 'rating': 8.0, 'imdb_year': '', 'actors': ['P ... (786 characters truncated) ... s://www.imdb.com/title/tt31392609', 'writers': ['Pat Boonnitipat', 'Thodsapon Thiptinnakorn'], 'ge

In [None]:
from data_pipelines.save_to_db import save_all_to_db
# enriched_new_movies = enriched_new_movies.where(pd.notna(new_movies), None)
save_all_to_db(cinemas_df=cinemas_df,
               movies_df=enriched_new_movies,
               screenings_df=screenings_df)