In [1]:
import json, requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import notebook

# Scrape configuration: which list to read, the base name of the JSON file to
# write, and how many pages of the list to fetch.
params = {
    'list_url': "https://letterboxd.com/jack/list/official-top-250-films-with-the-most-fans/",
    'name_of_save_file': "top_250_fans_films",
    'pages': "A", # "A" (Natural/ all pages) or number
}

headers = {'content-type': 'application/json'}

# Fetch the first page of the list. The timeout keeps the cell from hanging
# forever, and raise_for_status() stops us from parsing an HTTP error page.
r = requests.get(params['list_url'], headers=headers, timeout=30)
r.raise_for_status()
# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed (and to avoid BeautifulSoup's parser warning).
soup = BeautifulSoup(r.text, "html.parser")

if params['pages'] == "A":
  # The last <li> of the pagination block links to the final page number.
  try:
    params['pages'] = int(soup.find("div", class_="pagination").find_all("li")[-1].find("a").text)
  except (AttributeError, IndexError):
    # No pagination block (or an empty one): treat the list as a single page.
    params['pages'] = 1
else:
  # Accept a page count given either as an int or as a numeric string.
  params['pages'] = int(params['pages'])

In [None]:
def get_page_films(url):
  """Scrape a single page of the list and return the films found on it.

  Args:
    url: Address of one page of the list.

  Returns:
    A list of dicts, one per entry on the page, each holding an integer
    'ID' (the entry's data-film-id attribute) and a 'Name' (the alt text
    of the entry's image).

  Raises:
    requests.HTTPError: If the server answers with an error status.
    requests.Timeout: If the server does not answer within the timeout.
    ValueError: If the page has no element with class "js-list-entries".
  """
  r = requests.get(url, headers=headers, timeout=30)
  # Fail loudly instead of trying to parse an error page as a film list.
  r.raise_for_status()
  # Explicit parser: consistent results regardless of installed parsers.
  soup = BeautifulSoup(r.text, "html.parser")

  list_entries = soup.find(class_="js-list-entries")
  if list_entries is None:
    raise ValueError(f"No film list (class 'js-list-entries') found on page: {url}")

  films_on_page = list_entries.find_all("li")

  film_infos = []
  for film_entry in notebook.tqdm(films_on_page, position=1, leave=False):
    film_html = film_entry.find("div")
    # Named film_id (not id) so the builtin id() is not shadowed.
    film_id = film_html['data-film-id']
    name = film_html.find("img")['alt']
    # link = "https://letterboxd.com" + film_html['data-target-link']
    film_infos.append({
        'ID': int(film_id),
        'Name': name,
    })
  return film_infos

In [7]:
from multiprocessing import Pool
from tqdm import tqdm

# Fetching pages is network-bound, so threads parallelise it just as well as
# processes. Unlike a process Pool, a thread pool does not need to pickle
# get_page_films, which is defined in this notebook and therefore cannot be
# imported by worker processes under the "spawn" start method.
from multiprocessing.pool import ThreadPool

base_url = params['list_url']
# Tolerate a list URL given with or without a trailing slash.
page_prefix = base_url.rstrip("/") + "/page/"

# Page 1 lives at the list URL itself; later pages are at "<list_url>/page/<n>".
urls = [
  base_url if page == 1 else page_prefix + str(page)
  for page in range(1, int(params['pages']) + 1)
]

list_of_film_info = []
with ThreadPool(4) as pool:
  # imap yields results in page order, so the films keep their list ranking.
  for result in tqdm(pool.imap(func=get_page_films, iterable=urls), total=len(urls)):
    list_of_film_info += result

len(list_of_film_info)

100%|██████████| 3/3 [00:00<00:00,  6.29it/s]


250

In [8]:
# Report how many films were collected, then serialise them to
# "<name_of_save_file>.json" in the current working directory.
print(f"Gathered {len(list_of_film_info)} films")
print(f"Outputting file: {params['name_of_save_file']}.json")
output_path = params['name_of_save_file'] + ".json"
with open(output_path, "w") as json_file:
  json_file.write(json.dumps(list_of_film_info))

Gathered 250 films
Outputting file: top_250_fans_films.json
