# Scraping de Letterboxd

Pasos a seguir:
1. extraer todos los usuarios que tengan listas armadas y armar un csv de usuarios: id_usuario, link_usuario
2. para cada usuario, extraer los links a sus listas y armar un csv de listas: id_usuario, id_lista, link_lista
3. para cada lista, exportar a un csv con las interacciones: id_lista, id_pelicula, posicion, link_pelicula
4. para cada link_pelicula, obtener los datos de las peliculas

Al final hay que tener 3 archivos:
- interacciones (listas-películas)
- películas
- listas

In [150]:
import requests
import lxml
from lxml import etree
from lxml import html
import xml.etree.ElementTree as ET
import pandas as pd
import os
from pathlib import Path
import csv
import random
import json


 Paso 1: extraer todos los usuarios que tengan listas armadas y armar un csv de usuarios: id_usuario, link_usuario


In [2]:
# Step 1: scrape the "popular this week" member pages into a users CSV.
path = "data/users.csv"
file = "users.csv"  # legacy name, kept defined; every read/write below uses `path`

# CSV header; the dataframe built below uses the same column order
header = ["user_link", "user_lists_link", "user_watched_num", "user_lists_num", "user_likes_num"]

# make sure the output folder exists before the first append
os.makedirs(os.path.dirname(path), exist_ok=True)

for number in range(1,1000):
    r = requests.get(f"https://letterboxd.com/members/popular/this/week/page/{number}/")
    tree = lxml.html.fromstring(r.text)
    table = tree.xpath('//*[@id="content"]/div/div/section/table/tbody')

    # stop at the first page that has no table at all, or a table without rows
    if not table or not table[0].findall("tr"):
        break

    tbody = table[0]
    # the "lists" anchor provides both the link and the number of lists
    list_anchors = tbody.findall(".//*[@class='has-icon icon-16 icon-list']")

    # collect one value per user row
    user_link = [element.attrib["href"] for element in tbody.findall(".//*[@class='title-3']/a")]
    user_lists_link = [element.attrib["href"] for element in list_anchors]
    user_watched_num = [int(element.text.replace(",", "")) for element in tbody.findall(".//*[@class='has-icon icon-16 icon-watched']")]
    user_lists_num = [int(element.text.replace(",", "")) for element in list_anchors]
    user_likes_num = [int(element.text.replace(",", "")) for element in tbody.findall(".//*[@class='has-icon icon-16 icon-liked']")]

    # build the dataframe for this page
    df_to_append = pd.DataFrame(
        {'user_link': user_link,
        'user_lists_link': user_lists_link,
        'user_watched_num': user_watched_num,
        'user_lists_num': user_lists_num,
        'user_likes_num': user_likes_num
        },
        columns=header)

    # append to the same file whose existence is checked, so the header is
    # written exactly once (when the file is created)
    df_to_append.to_csv(
        path,
        mode='a',
        header=not os.path.isfile(path),
        index=False,
        encoding="utf-8",
    )




KeyboardInterrupt: 

2. para cada usuario, extraer los links a sus listas y armar un csv de listas: id_usuario, id_lista, link_lista

In [None]:
# import the users data
# The CSV is looked up in data/ first (where the other CSVs of this notebook
# live) and then in the working directory.
users_csv = "data/users.csv" if os.path.isfile("data/users.csv") else "users.csv"
users = pd.read_csv(users_csv)

# paginated lists URL per user; the page number is appended at request time
users_list_links = "https://letterboxd.com" + users.user_lists_link + "page/"

In [134]:
# Step 2: for every user, walk the paginated lists pages and append one row
# per list to the lists CSV.
path = "data/lists.csv"
file = "lists.csv"  # not used below; kept so the name stays defined

# CSV header; the dataframe built below uses the same column order
header = ["user_id", "list_id", "list_link", "list_extension"]

# make sure the output folder exists before the first append
os.makedirs(os.path.dirname(path), exist_ok=True)

# for every user
for user in users_list_links:
    # visit all of their pages
    for number in range(1, 10000):
        r = requests.get(f"{user}{number}")
        tree = lxml.html.fromstring(r.text)
        table = tree.xpath('//*[@id="content"]/div/div/section/section')

        # list summaries on this page; empty when the section itself is missing
        summaries = table[0].findall(".//*[@class='list -overlapped -summary ']") if table else []
        if not summaries:
            # no more lists for this user
            break

        list_link = [element.attrib["href"] for element in table[0].findall(".//*[@class='list-link']")]
        list_id = [element.attrib['data-film-list-id'] for element in summaries]
        user_id = [element.attrib['data-person'] for element in summaries]
        list_extension = [element.text for element in table[0].findall(".//*[@class='value']")]

        # build the dataframe for this page
        df_to_append = pd.DataFrame(
            {'user_id': user_id,
            'list_id': list_id,
            'list_link': list_link,
            'list_extension': list_extension
            },
            columns=header)

        # write the header only when the file is being created, then append
        df_to_append.to_csv(
            path,
            mode='a',
            header=not os.path.isfile(path),
            index=False,
            encoding="utf-8",
        )


KeyboardInterrupt: 

3. para cada lista, exportar a un csv con las interacciones: id_lista, id_pelicula, posicion, link_pelicula

In [43]:
# get the poster image URL from a film page

# https://stackoverflow.com/questions/73803684/trying-to-scrape-posters-from-letterboxd-python


from bs4 import BeautifulSoup as bs

url = 'https://letterboxd.com/film/jojo-rabbit/'

r = requests.get(url)
# Name the parser explicitly: without it bs4 picks the "best" installed one
# (lxml here) and emits a warning, and the choice can differ between machines.
soup = bs(r.text, "lxml")

# the film metadata is embedded as JSON-LD wrapped in a CDATA comment
script_w_data = soup.select_one('script[type="application/ld+json"]')
if script_w_data is None:
    raise ValueError(f"no ld+json metadata found in {url}")

# keep only the JSON between the opening ' */' and the closing '/* ]]>' markers
json_obj = json.loads(script_w_data.text.split(' */')[1].split('/* ]]>')[0])
print(json_obj['image'])

https://a.ltrbxd.com/resized/film-poster/4/4/4/6/0/0/444600-jojo-rabbit-0-230-0-345-crop.jpg?v=a5ad083635


<script type="application/ld+json">
/* <![CDATA[ */
{"image":"https://a.ltrbxd.com/resized/film-poster/4/4/4/6/0/0/444600-jojo-rabbit-0-230-0-345-crop.jpg?v=a5ad083635","@type":"Movie","director":[{"@type":"Person","name":"Taika Waititi","sameAs":"/director/taika-waititi/"}],"dateModified":"2023-09-09","productionCompany":[{"@type":"Organization","name":"Fox Searchlight Pictures","sameAs":"/studio/fox-searchlight-pictures/"},{"@type":"Organization","name":"Defender Films","sameAs":"/studio/defender-films/"},{"@type":"Organization","name":"Piki Films","sameAs":"/studio/piki-films/"},{"@type":"Organization","name":"TSG Entertainment","sameAs":"/studio/tsg-entertainment/"},{"@type":"Organization","name":"Czech Anglo Productions","sameAs":"/studio/czech-anglo-productions/"}],"releasedEvent":[{"@type":"PublicationEvent","startDate":"2019"}],"@context":"http://schema.org","url":"https://letterboxd.com/film/jojo-rabbit/","actors":[{"@type":"Person","name":"Roman Griffin Davis","sameAs":"/acto

In [53]:
# Inspect how the ld+json script text splits on ' */': the first piece is the
# CDATA opener and the second one starts with the JSON payload.
script_w_data = soup.select_one('script[type="application/ld+json"]')
script_w_data.text.split(' */')

['\n/* <![CDATA[',
 '\n{"image":"https://a.ltrbxd.com/resized/film-poster/4/4/4/6/0/0/444600-jojo-rabbit-0-230-0-345-crop.jpg?v=a5ad083635","@type":"Movie","director":[{"@type":"Person","name":"Taika Waititi","sameAs":"/director/taika-waititi/"}],"dateModified":"2023-09-09","productionCompany":[{"@type":"Organization","name":"Fox Searchlight Pictures","sameAs":"/studio/fox-searchlight-pictures/"},{"@type":"Organization","name":"Defender Films","sameAs":"/studio/defender-films/"},{"@type":"Organization","name":"Piki Films","sameAs":"/studio/piki-films/"},{"@type":"Organization","name":"TSG Entertainment","sameAs":"/studio/tsg-entertainment/"},{"@type":"Organization","name":"Czech Anglo Productions","sameAs":"/studio/czech-anglo-productions/"}],"releasedEvent":[{"@type":"PublicationEvent","startDate":"2019"}],"@context":"http://schema.org","url":"https://letterboxd.com/film/jojo-rabbit/","actors":[{"@type":"Person","name":"Roman Griffin Davis","sameAs":"/actor/roman-griffin-davis/"},{"@t

In [21]:
# path of the lists CSV
lists_links = "data/lists_complete.csv"

# load the lists data
lists = pd.read_csv(lists_links)

# absolute, paginated URL for every list link (page number is appended later)
users_list_links = "https://letterboxd.com" + lists["list_link"] + "page/"

In [151]:
# Pool of browser User-Agent strings; one is picked at random for the headers below.
user_agent_list = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
]

# Request headers, with the User-Agent drawn from the pool above.
# NOTE(review): 'method' is not an HTTP header and the Accept value looks like
# a GitHub media type — confirm whether these entries are needed.
headers = {
    'User-Agent': random.choice(user_agent_list),
    'Content-Type': 'application/json',
    'method': 'GET',
    'Accept': 'application/vnd.github.cloak-preview',
}


In [171]:
# Fetch one page of a single list (with retries) and locate its film <ul>.
import time  # used for the pause between retries

number = 1
#list_link = "https://letterboxd.com/thedude27/list/shahrukh-khan-filmography-ranked/detail/page/"

# list that failed
list_link = "https://letterboxd.com/themoviejunkiez/list/our-film-collection-updated-8-19-2023/detail/page/"

for attempt in range(3):
    try:
        # Redirects are followed. allow_redirects must be a real bool: a string
        # such as "False" is truthy and would follow redirects anyway.
        r = requests.get(f"{list_link}{number}", allow_redirects=True, headers=headers)
        break

    except requests.exceptions.ChunkedEncodingError:
        # transient transfer error: wait a moment and try again
        time.sleep(1)
else:
    # every attempt failed: stop here instead of parsing a missing or stale response
    raise RuntimeError("Failed to retrieve url")


tree = lxml.html.fromstring(r.text)
table = tree.xpath('//*[@id="content"]/div/div/section/ul')
table[0]

<Element ul at 0x13fb20050>

In [172]:
# film-detail-content blocks of the current page (presumably one per film)
details = table[0].findall(".//*[@class='film-detail-content']")

stars = []
for calif in details:
    try:
        # when the second child is a <p>, its first child holds the star text
        label = calif[1][0].text if calif[1].tag == "p" else "-"
    except IndexError:
        # fewer children than expected: record the film as unrated
        label = "-"
    stars.append(label)

print(stars)

[]


In [97]:
# Maps the scraped star label to a numeric rating. "-" (no rating) maps to -1;
# every other key is the star string wrapped in single spaces, from half a
# star up to five stars in half-star steps.
stars_interpreter = {"-": -1}
for _half_steps in range(1, 11):
    _whole, _has_half = divmod(_half_steps, 2)
    _label = " " + "★" * _whole + ("½" if _has_half else "") + " "
    # whole ratings stay ints, half ratings are floats
    stars_interpreter[_label] = _whole + 0.5 if _has_half else _whole


In [6]:
#get list likes

# id of the list whose like count is requested
list_id = 33470905

# the metadata endpoint answers with JSON; the count sits in the first "likeables" entry
likes_url = f"https://letterboxd.com/ajax/letterboxd-metadata/?likeables=filmlist%3A{list_id}&likeCounts=filmlist%3A{list_id}"
r = requests.get(likes_url)
likes_payload = json.loads(r.text)
list_likes = likes_payload["likeables"][0]["count"]
list_likes

4

In [173]:
#films data
list_id_arr = [list_id] * len(table[0].findall("li"))

#movie_id = set([element.attrib["data-film-id"] for element in table[0].findall(".//*[@data-film-id]")])
#print(len(movie_id))

movie_link = [element.attrib["href"] for element in table[0].findall(".//*[@class='headline-2 prettify']/a")]
print(len(movie_link))

#movie_stars = [element.text for element in table[0].findall(".//*[@class='film-detail-meta rating-green']/span")]
#print((movie_stars))
details = [element for element in table[0].findall(".//*[@class='film-detail-content']")]

movie_stars = []

for calif in details:
    
    try:
        if calif[1].tag == "p":
            movie_stars.append((calif[1][0].text))
        else:
            movie_stars.append("-")

    except:
        movie_stars.append("-")

print(len(movie_stars))

movie_calification = [stars_interpreter[x] for x in movie_stars]
print(len(movie_calification))


movie_position = [element.text for element in table[0].findall(".//*[@class='list-number']")] 

if len(movie_position) == 0:
    movie_position = [0] * len(movie_link)
print(len(movie_position))


#armo el dataframe
df_to_append = pd.DataFrame()
df_to_append = pd.DataFrame(
{'list_id': list_id_arr,
#'movie_id': movie_id,
'movie_link': movie_link,
'movie_stars': movie_stars,
'movie_calification': movie_calification,
'movie_position': movie_position

})

0
0
0
0


ValueError: All arrays must be of the same length

In [142]:
# Count the data-film-id values matched by this path on the current page.
len([element.attrib["data-film-id"] for element in table[0].findall(".//*/div/[@data-film-id]")])

43

In [99]:
# Print the attributes of the <div> children of the 7th list item, to see
# which data-* fields they carry.
seventh_item_divs = tree.xpath('//*[@id="content"]/div/div/section/ul/li[7]/div')
for lista in seventh_item_divs:
    print(lista.attrib)

{'class': 'really-lazy-load poster film-poster film-poster-50936 linked-film-poster', 'data-image-width': '125', 'data-image-height': '187', 'data-film-id': '50936', 'data-film-slug': 'cape-fear-1991', 'data-poster-url': '/film/cape-fear-1991/image-150/', 'data-linked': 'linked', 'data-target-link': '/film/cape-fear-1991/', 'data-target-link-target': '', 'data-cache-busting-key': '873e12ae', 'data-show-menu': 'true'}


In [175]:
#argumentos para el .py

path_movies = "data/movies_lists.csv"
path_lists = "data/lists_data.csv"
listas = pd.read_csv("data/lists copy.csv")

#dictionary for stars
stars_interpreter = {
    "-" : -1,
    " ½ ": 0.5,
    " ★ " : 1,
    " ★½ " : 1.5,
    " ★★ " : 2,
    " ★★½ " : 2.5,
    " ★★★ " : 3,
    " ★★★½ " : 3.5, 
    " ★★★★ " : 4,
    " ★★★★½ " : 4.5,
     " ★★★★★ ": 5
}



#para cada lista del archivo de listas
for i in range(0, len(listas)):
    list_link = "https://letterboxd.com" + listas["list_link"][i] + "detail/page/"
    list_id = listas["list_id"][i]

    #tengo que ver todas sus páginas
    for number in range(1, 10000):
        r = requests.get(f"{list_link}{number}")
        tree = lxml.html.fromstring(r.text)
        table = tree.xpath('//*[@id="content"]/div/div/section/ul')

        #si es la página 1, guardo data de la lista
        if number == 1:
            #list data
            list_title = [title.text for title in tree.xpath('//*[@id="content"]/div/div/section/div[2]/h1')]
            list_description = ' '.join([str(element.text) for element in tree.xpath('//*[@id="content"]/div/div/section/div[2]/div/p')])
            #get list likes
            r = requests.get(f"https://letterboxd.com/ajax/letterboxd-metadata/?likeables=filmlist%3A{list_id}&likeCounts=filmlist%3A{list_id}")
            list_likes = json.loads(r.text)["likeables"][0]["count"]

            #armo el dataframe
            df_to_append = pd.DataFrame()
            df_to_append = pd.DataFrame(
                {'list_id': list_id,
                'list_title': list_title,
                'list_description': list_description,
                'list_likes': list_likes
                })

            #header del csv
            header = ["list_id","list_title", "list_description", "list_likes"]

            # si no existe el csv, se crea con el header
            if os.path.isfile(path_lists)==False:
                with open(path_lists, 'a') as f:
                    writer = csv.writer(f)
                    writer.writerow(header)
                    df_to_append.to_csv(f, header=False, index=False)
                    f.flush()

            # si existe el csv, se appendea el dataframe
            else:
                with open(path_lists, 'a') as f:
                    df_to_append.to_csv(f, header=False, index=False)
                    f.flush()


                
        if len(table[0].findall("li")) != 0:
            
           #films data
            list_id_arr = [list_id] * len(table[0].findall("li"))
                
            movie_id = [element.attrib["data-film-id"] for element in table[0].findall(".//*[@data-film-id]")]

            movie_link = [element.attrib["href"] for element in table[0].findall(".//*[@class='headline-2 prettify']/a")]
   
            #movie_stars = [element.text for element in table[0].findall(".//*[@class='film-detail-meta rating-green']/span")]
            #print((movie_stars))
            details = [element for element in table[0].findall(".//*[@class='film-detail-content']")]

            movie_stars = []
            for calif in details:
                    
                    try:
                        if calif[1].tag == "p":
                            movie_stars.append((calif[1][0].text))
                        else:
                            movie_stars.append("-")

                    except:
                        movie_stars.append("-")


            movie_calification = [stars_interpreter[x] for x in movie_stars]

            movie_position = [element.text for element in table[0].findall(".//*[@class='list-number']")] 

            if len(movie_position) == 0:
                movie_position = [0] * len(movie_id)

            print(len(movie_position))


            #armo el dataframe
            df_to_append = pd.DataFrame()
            df_to_append = pd.DataFrame(
                {'list_id': list_id_arr,
                'movie_id': movie_id,
                'movie_link': movie_link,
                'movie_stars': movie_stars,
                'movie_calification': movie_calification,
                'movie_position': movie_position

                })

            #header del csv
            header = ["list_id","movie_id", "movie_link", "movie_stars", "movie_calification", "movie_position"]

            # si no existe el csv, se crea con el header
            if os.path.isfile(path_movies)==False:
                with open(path_movies, 'a') as f:
                    writer = csv.writer(f)
                    writer.writerow(header)
                    df_to_append.to_csv(f, header=False, index=False)
                    f.flush()

            # si existe el csv, se appendea el dataframe
            else:
                with open(path_movies, 'a') as f:
                    df_to_append.to_csv(f, header=False, index=False)
                    f.flush()

        else:
            break 

68
10
34
23
10
6
33
16
6
5
61
5
8
31
20
23
59
12
9
9


KeyboardInterrupt: 