In [3]:
from datetime import datetime
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from langdetect import detect, LangDetectException

Because Letterboxd does not yet have an official API, the data was scraped with Beautiful Soup. Reviews on Letterboxd can be sorted in 5 different orders: newest first, earliest first, most popular first, highest rating first, and lowest rating first. For every sort order, at most 256 pages can be viewed. To get the maximum amount of data, all 256 pages were scraped for every sort order.

The preprocessing involved keeping only English reviews, filtering out reviews hidden behind the 'This review may contain spoilers' placeholder, converting dates to the YYYY-MM-DD format, and removing duplicates.

In [2]:
# https://letterboxd.com/film/in-the-mood-for-love/reviews/  - newest
# https://letterboxd.com/film/in-the-mood-for-love/reviews/by/added-earliest/ - earliest
# https://letterboxd.com/film/in-the-mood-for-love/reviews/by/activity/ - most popular
# https://letterboxd.com/film/in-the-mood-for-love/reviews/by/entry-rating/ - highest rating
# https://letterboxd.com/film/in-the-mood-for-love/reviews/by/entry-rating-lowest/ - lowest rating

In [52]:
def convert_date(date_str):
    """Convert a date such as '13 Jun 2024' to ISO 'YYYY-MM-DD'.

    Some scraped dates carry a Buddhist-calendar year, which runs 543 years
    ahead of the Gregorian one. Any year above 2500 is treated as such and
    shifted back by 543 *before* the calendar date is validated, so that leap
    days like '29 Feb 2567' (29 Feb 2024) convert instead of being rejected.

    Args:
        date_str: Date text in 'day abbreviated-month 4-digit-year' form.

    Returns:
        The date formatted as 'YYYY-MM-DD', or ``date_str`` unchanged when it
        cannot be parsed (including ``None`` and the empty string).
    """
    try:
        day, month, year_text = date_str.split()
        # Keep the original strictness: only plain 4-digit years are accepted.
        if len(year_text) != 4 or not year_text.isdigit():
            return date_str
        year = int(year_text)
        if year > 2500:
            # Buddhist-era year: 543 years ahead of the Gregorian calendar.
            year -= 543
        date_obj = datetime.strptime(f"{day} {month} {year:04d}", "%d %b %Y")
    except (AttributeError, TypeError, ValueError):
        # Not a string, wrong number of parts, or not a real calendar date.
        return date_str
    return date_obj.strftime("%Y-%m-%d")

In [50]:
def is_english(text):
    """Return True when langdetect classifies ``text`` as English ('en').

    Text for which detection raises LangDetectException is reported as
    not English.
    """
    try:
        language_code = detect(text)
    except LangDetectException:
        return False
    return language_code == 'en'

In [49]:
def _extract_rating(detail):
    """Return the numeric suffix of a review's ``rated-N`` CSS class, or "" if none is found."""
    rating_element = detail.find(class_=lambda x: x and x.startswith('rating -green rated-'))
    if rating_element is not None:
        for cls in rating_element['class']:
            if cls.startswith('rated-'):
                return cls.split('-')[-1]
    return ""


def _parse_review(detail):
    """Build a {'Date', 'Review', 'Rating'} record from one ``film-detail`` element.

    Returns None when the element has no review text or the text is not
    detected as English.
    """
    review_element = detail.find(class_="body-text -prose collapsible-text")
    paragraph = review_element.find("p") if review_element is not None else None
    # Only the first <p> of the review body is kept.
    review_text = paragraph.text if paragraph is not None else ""
    if not is_english(review_text):
        return None

    date_element = detail.find(class_="_nobr")
    date_text = date_element.text if date_element is not None else ""
    return {
        'Date': convert_date(date_text),
        'Review': review_text,
        'Rating': _extract_rating(detail),
    }


def scrape_reviews(movie_title, limits=(1, 257), output_dir="data", timeout=30):
    """Scrape English Letterboxd reviews of one film and save them to a CSV file.

    Every sort order (newest, earliest, most popular, highest rating, lowest
    rating) is walked page by page. Each review is stored as one record, so the
    Date, Review and Rating columns always stay aligned even if a single review
    fails to parse.

    Args:
        movie_title: Film slug as used in the Letterboxd URL, e.g. "fight-club".
        limits: Two-item sequence ``(first_page, stop_page)``; pages
            ``first_page`` .. ``stop_page - 1`` are requested. At most 256
            pages are available per sort order.
        output_dir: Directory the CSV is written to; created if missing.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        None. Writes ``<output_dir>/<movie_title>_reviews.csv`` with the
        columns Date, Review and Rating.
    """
    endpoints = ["", "by/added-earliest/", "by/activity/", "by/entry-rating/", "by/entry-rating-lowest/"]
    base_url = f"https://letterboxd.com/film/{movie_title}/reviews/"

    # Create the target directory up front so a long scrape cannot be lost
    # to a missing folder at save time.
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    records = []
    with requests.Session() as session:  # reuse one connection for all pages
        for endpoint in endpoints:
            # At most 256 pages are available per sort order.
            for i in range(limits[0], limits[1]):
                url = base_url + endpoint
                if i != 1:
                    url += f"page/{i}/"
                try:
                    response = session.get(url, timeout=timeout)
                    response.raise_for_status()
                except requests.RequestException as e:
                    print(f"Error on page {i} with endpoint '{endpoint}': {e}")
                    continue

                soup = BeautifulSoup(response.content, 'lxml')
                film_details = soup.find_all(class_="film-detail")
                if not film_details:
                    # An empty page is taken as the end of the listing for
                    # this sort order, so the remaining pages are skipped.
                    break

                for detail in film_details:
                    try:
                        record = _parse_review(detail)
                    except Exception as e:
                        # Best effort: one malformed review must not drop the
                        # rest of the page.
                        print(f"Error parsing a review on page {i} with endpoint '{endpoint}': {e}")
                        continue
                    if record is not None:
                        records.append(record)

    # Create DataFrame and save to CSV
    reviews_data = pd.DataFrame(records, columns=['Date', 'Review', 'Rating'])

    csv_filename = output_path / f"{movie_title}_reviews.csv"
    reviews_data.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"Reviews, dates, and ratings have been saved to {csv_filename}")

In [None]:
# Scrape reviews for the film slug "american-psycho" using the default page limits.
scrape_reviews("american-psycho")

In [None]:
# Scrape reviews for the film slug "fight-club" using the default page limits.
scrape_reviews("fight-club")

In [None]:
# Scrape reviews for the film slug "blade-runner-2049" using the default page limits.
scrape_reviews("blade-runner-2049")

In [None]:
# Scrape reviews for the film slug "drive-2011" using the default page limits.
scrape_reviews("drive-2011")

In [4]:
def preprocessing(title):
    """Clean the scraped reviews of one film and write the result to disk.

    Reads ``data/<title>_reviews.csv``, drops rows whose review is only the
    spoiler placeholder text, removes rows duplicated on Date, Review and
    Rating, and saves the result as ``data/<title>_fixed.csv``. Row counts
    before and after cleaning are printed.

    Args:
        title: Film slug used in the CSV file names.

    Returns:
        Tuple ``(cleaned_frame, original_row_count)``.
    """
    spoiler_placeholder = "This review may contain spoilers. I can handle the truth."
    key_columns = ['Date', 'Review', 'Rating']

    raw = pd.read_csv(f"data/{title}_reviews.csv")
    og_shape = raw.shape[0]
    print(f"Liczba wierszy dla {title}:", og_shape)

    is_visible = raw['Review'].ne(spoiler_placeholder)
    cleaned = raw.loc[is_visible].drop_duplicates(subset=key_columns)
    print(f"Liczba wierszy w {title} po usunięciu duplikatów i schowanych recenzji:", cleaned.shape[0])

    cleaned.to_csv(f"data/{title}_fixed.csv", index=False)
    return cleaned, og_shape


In [5]:
# Film slugs whose scraped CSV files are cleaned by preprocessing().
titles = ["blade-runner-2049", "drive-2011", "fight-club", "american-psycho"]
dfs = []  # cleaned DataFrame per film, in the same order as `titles`
before = []  # row count of each CSV as loaded, before cleaning
after = []  # row count of each DataFrame after cleaning
for name in titles:
    # preprocessing() returns the cleaned frame and the original row count.
    df, og_shape = preprocessing(name)
    dfs.append(df)
    before.append(og_shape)
    after.append(df.shape[0])

Liczba wierszy dla blade-runner-2049: 10737
Liczba wierszy w blade-runner-2049 po usunięciu duplikatów i schowanych recenzji: 9296
Liczba wierszy dla drive-2011: 10626
Liczba wierszy w drive-2011 po usunięciu duplikatów i schowanych recenzji: 9558
Liczba wierszy dla fight-club: 10728
Liczba wierszy w fight-club po usunięciu duplikatów i schowanych recenzji: 9048
Liczba wierszy dla american-psycho: 10452
Liczba wierszy w american-psycho po usunięciu duplikatów i schowanych recenzji: 9279


In [44]:
# Summarise how many rows the cleaning step removed across all films.
total_before = sum(before)
total_after = sum(after)
removed_percent = (total_before - total_after) / total_before * 100

print("titles:", titles)
print("before:", before, total_before)
print("after:", after, total_after)
print(f"removed rows: {removed_percent}%")

titles: ['blade-runner-2049', 'drive-2011', 'fight-club', 'american-psycho']
before: [10737, 10626, 10728, 10452] 42543
after: [9296, 9558, 9048, 9279] 37181
usunięto 12.60371859060245% wierszy
