In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions
import time

# Browser options handed to the Selenium Edge driver that the listing scrape creates.
edge_options = Options()
# Headless mode is currently switched off; uncomment the next line to run without a visible window.
#edge_options.add_argument('--headless')
edge_options.add_argument("--blink-settings=imagesEnabled=false")  # Disable images

import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent header for plain HTTP requests.
# NOTE(review): no cell visible in this notebook passes `headers` to requests.get — confirm whether it should be used.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

import os
# Project root that contains the `scraped_data` folder used by every load/save below.
# The YTS_PROJECT_DIR environment variable overrides the author's local default so the
# notebook can run on another machine without editing this cell.
path=os.environ.get("YTS_PROJECT_DIR", r"C:\Users\berid\python\yts mx project")

import pandas as pd

### Scrape URLs

# Walk Letterboxd's "popular films" listing page by page and collect each
# poster link's text content and href into all_letterboxd_urls.
driver = webdriver.Edge(options=edge_options)

all_letterboxd_urls=[]

try:
    driver.maximize_window()

    for page in range(1,501):
        page_url=f"https://letterboxd.com/films/popular/page/{page}/"
        driver.get(page_url)
        time.sleep(5)

        try:
            url_elements=WebDriverWait(driver,10).until(expected_conditions.visibility_of_all_elements_located((By.CSS_SELECTOR,'ul[class="poster-list -p70 -grid"] li[class="listitem poster-container"] a[class="frame"]')))
        except Exception:
            # The posters did not become visible within the wait: skip this page.
            # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still stops the run.
            continue

        for url_element in url_elements:
            try:
                title=url_element.get_attribute('textContent')
                film_url=url_element.get_attribute('href')
            except Exception:
                # Reading the attributes failed for this element; drop this one entry only.
                continue

            all_letterboxd_urls.append({'Title':title,'URL':film_url})

        print(f'Page : {page}, Scraped : {len(all_letterboxd_urls)}',end='\r')
finally:
    # Always close the browser, even when the loop is interrupted or a page load raises.
    driver.quit()

In [2]:
import pickle
# Cached result of the listing scrape. The dump line is kept commented out so a
# re-run does not overwrite the cache; uncomment it after a fresh scrape.
#pickle.dump(all_letterboxd_urls,open(os.path.join(path,'scraped_data','all_letterboxd_urls.pickle'),'wb'))

# NOTE: pickle.load executes arbitrary code from the file; only load caches produced by this notebook.
# The context manager closes the file handle, which the bare open() call left dangling.
with open(os.path.join(path,'scraped_data','all_letterboxd_urls.pickle'),'rb') as pickle_file:
    all_letterboxd_urls=pickle.load(pickle_file)

In [3]:
# Collapse duplicate entries by URL: the last record seen for a URL wins,
# while the position of its first appearance is kept.
latest_entry_by_url = {}
for entry in all_letterboxd_urls:
    latest_entry_by_url[entry["URL"]] = entry

all_letterboxd_urls = list(latest_entry_by_url.values())
print(f'Total letterboxd URLs : {len(all_letterboxd_urls)}')

Total letterboxd URLs : 35990


In [4]:
# Distinct, whitespace-stripped movie titles from the cleaned YTS export.
yts_movies_csv=os.path.join(path,'scraped_data','cleaned_yts_movies_df.csv')
yts_title_column=pd.read_csv(yts_movies_csv)['TITLE']
yts_movie_titles=yts_title_column.str.strip().unique().tolist()

In [5]:
# Keep only Letterboxd entries whose title (the text before the first " (") also
# appears in the YTS title list. A set makes each membership test constant-time
# instead of scanning the whole title list for every Letterboxd entry.
yts_title_lookup=set(yts_movie_titles)
useful_letterboxd_urls=[d['URL'] for d in all_letterboxd_urls if d['Title'].split(' (')[0].strip() in yts_title_lookup]
#print(len(useful_letterboxd_urls)/len(yts_movie_titles))

### Scrape Details

In [6]:
def _extract_or_none(extractor):
    """Call ``extractor()`` and return its result, or None when the page lacks the element.

    Only the errors a missing tag/attribute/list position produces are absorbed
    (AttributeError/TypeError on a None tag, KeyError on a missing attribute,
    IndexError on a missing list item); anything else propagates.
    """
    try:
        return extractor()
    except (AttributeError, IndexError, KeyError, TypeError):
        return None


def return_movie_details(url, request_headers=None, timeout=30):
    """Download one film page and pull out its detail fields.

    Parameters
    ----------
    url : str
        Address of the film page to fetch.
    request_headers : dict, optional
        Extra HTTP headers forwarded to ``requests.get``. Defaults to None,
        which sends the same default headers as before.
    timeout : float, optional
        Seconds to wait for the server before giving up, so one stalled
        connection cannot hang the whole scrape.

    Returns
    -------
    dict
        Keys, in this order: 'URL', 'Title', 'Year', 'Original Name', 'IMDB',
        'Director', 'Plot', 'Genres', 'Tags'. Any field whose element is not
        found on the page is None.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts, or a 4xx/5xx response. Raising on an
        error status keeps an error page from being parsed into an all-None row.
    """
    response=requests.get(url,headers=request_headers,timeout=timeout)
    response.raise_for_status()

    # NOTE: no parser is named, so bs4 picks one from whatever is installed.
    soup=BeautifulSoup(response.content)

    slug_selector='div[id="tab-genres"] div[class="text-sluglist capitalize"]'

    def flat_text(selector):
        # Text of the first match with newlines removed.
        return soup.select_one(selector).text.replace('\n','')

    def slug_list(position):
        # Pipe-separated text of the <p> inside the position-th slug list of the genres tab.
        return soup.select(slug_selector)[position].p.get_text(separator='|').replace('\n','')

    return {
        'URL':url,
        'Title':_extract_or_none(lambda:flat_text('div[class="details"] h1[class="headline-1 filmtitle"]')),
        'Year':_extract_or_none(lambda:flat_text('div[class="details"] div[class="releaseyear"]')),
        'Original Name':_extract_or_none(lambda:flat_text('div[class="details"] h2[class="originalname"]')),
        'IMDB':_extract_or_none(lambda:soup.select_one('p[class="text-link text-footer"] a[data-track-action="IMDb"]')['href']),
        'Director':_extract_or_none(lambda:flat_text('div[class="details"] span[class="directorlist"]')),
        # Plot keeps its newlines; the preprocessing step relies on them to find the synopsis.
        'Plot':_extract_or_none(lambda:soup.select_one('div[class="review body-text -prose -hero prettify"]').text),
        'Genres':_extract_or_none(lambda:slug_list(0)),
        'Tags':_extract_or_none(lambda:slug_list(1)),
    }


In [7]:
import sqlite3

# Open (or create) the SQLite file that stores the scraped film details.
db_path=os.path.join(path,'scraped_data','letterboxd.db')
conn=sqlite3.connect(db_path)
cursor=conn.cursor()

In [8]:
# One row per scraped film page; every column is stored as TEXT.
# IF NOT EXISTS makes this cell safe to re-run against an existing database.
create_movie_details_sql="""
CREATE TABLE IF NOT EXISTS movie_details (
    MOVIE_URL TEXT,
    TITLE TEXT,
    YEAR TEXT,
    ORIGINAL_NAME TEXT,
    IMDB_LINK TEXT,
    DIRECTOR TEXT,
    PLOT TEXT,
    GENRES TEXT,
    TAGS TEXT
)
"""
cursor.execute(create_movie_details_sql)

conn.commit()

In [9]:
# URLs already present in the database, so a restarted scrape can skip them.
distinct_url_rows = cursor.execute("SELECT DISTINCT MOVIE_URL FROM movie_details")
scraped_urls = [movie_url for (movie_url,) in distinct_url_rows]

print(f'URLs scraped: {len(scraped_urls)}')

URLs scraped: 11669


In [10]:
# Remaining work = wanted URLs minus those already stored.
# A set turns each "already scraped?" check into a constant-time lookup instead
# of a scan over the full list of stored URLs.
already_scraped=set(scraped_urls)
urls_to_scrape=[url for url in useful_letterboxd_urls if url not in already_scraped]
print(f'URLs to scrape: {len(urls_to_scrape)}')

URLs to scrape: 0


In [11]:
# Fetch each outstanding film page and store it immediately, committing per row
# so an interrupted run keeps everything scraped so far.
insert_movie_sql='INSERT INTO movie_details (MOVIE_URL, TITLE, YEAR, ORIGINAL_NAME, IMDB_LINK, DIRECTOR, PLOT, GENRES, TAGS) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)'
failed_urls=[]

for i,movie_url in enumerate(urls_to_scrape,start=1):

    try:
        movie_dict=return_movie_details(movie_url)
    except Exception:
        # `Exception` rather than a bare except, so Ctrl-C / kernel interrupt still
        # stops the loop. The URL is not inserted, so a later run retries it.
        failed_urls.append(movie_url)
        continue

    # Every field is stored as text; a missing field therefore becomes the string 'None'.
    values=[str(v) for v in movie_dict.values()]

    cursor.execute(insert_movie_sql, values)
    conn.commit()

    print(f'Progress: {i}/{len(urls_to_scrape)}',end='\r')

if failed_urls:
    print(f'\nFailed to scrape {len(failed_urls)} URLs (see failed_urls)')


### Preprocessing

In [12]:
import pandas as pd

# Pull every stored film row out of SQLite into a DataFrame for cleaning.
movie_details_query='SELECT * FROM movie_details'
df=pd.read_sql_query(movie_details_query,con=conn)

In [13]:
# Missing fields were written to the database as the literal string 'None';
# turn them back into real nulls. The dict form states the mapping explicitly,
# whereas replace('None', None) is read by some pandas versions as
# "no replacement value given" and pad-fills instead.
df=df.replace({'None':None})

# Share of missing values per column.
df.isna().mean()

MOVIE_URL        0.000000
TITLE            0.000000
YEAR             0.001114
ORIGINAL_NAME    0.839575
IMDB_LINK        0.000171
DIRECTOR         0.000428
PLOT             0.000000
GENRES           0.000600
TAGS             0.083126
dtype: float64

In [14]:
# PLOT: keep only the text after the "Synopsis" heading and flatten it to one line.
df['PLOT']=df['PLOT'].apply(lambda x:x.split('\nSynopsis\n')[-1].replace('\n','') if isinstance(x,str) else None)

# GENRES / TAGS: collapse the blank or empty items left between '|' separators.
df['GENRES']=df['GENRES'].apply(lambda x:x.replace('| |','|') if isinstance(x,str) else None)

df['TAGS']=df['TAGS'].apply(lambda x:x.replace('||','|').replace('| |','|') if isinstance(x,str) else None)

# IMDB_LINK: drop the trailing "maindetails" part of the link.
df['IMDB_LINK']=df['IMDB_LINK'].str.split('maindetails').str[0]
# Upgrade the scheme only when the link starts with plain http://. The anchored
# pattern leaves https:// links alone, so re-running this cell no longer yields "httpss://".
df['IMDB_LINK']=df['IMDB_LINK'].str.replace(r'^http://','https://',regex=True)


##### Bigrams

In [15]:
# Keep only films whose IMDb link also appears in the cleaned YTS export.
yts_urls=pd.read_csv(os.path.join(path,'scraped_data','cleaned_yts_movies_df.csv'))['IMDB_LINK'].to_list()
# .copy() makes the filtered frame independent of the original, so the column
# assignments in later cells do not raise chained-assignment warnings.
df=df[df['IMDB_LINK'].isin(yts_urls)].copy()
df.shape

(7112, 9)

In [16]:
import nltk
from nltk.util import ngrams
from collections import Counter
import contractions

# NLTK sentiment analyzer instance.
# NOTE(review): `sia` is not referenced by any cell visible in this notebook — confirm it is still needed.
from nltk.sentiment import SentimentIntensityAnalyzer
sia=SentimentIntensityAnalyzer()

# WordNet lemmatizer instance.
# NOTE(review): `lemmatizer` is likewise unused in the visible cells.
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

# NOTE: the second line rebinds `stopwords` from the imported corpus reader to the
# English stop-word collection, so the order of these statements matters.
from nltk.corpus import stopwords
stopwords=stopwords.words('english')
# English stop words plus extra filler words and punctuation tokens to drop before building bigrams.
stopwords_extended = stopwords+['a', 'the', 'but', 'and', 's', '.', ',', "'s", 'i', 'in', 'to',':',';',"'",'(',')','–']


In [17]:
def get_bigrams(text):
    """Return the list of word bigrams found in a plot text.

    The text has its contractions expanded, is tokenized, lower-cased and
    stripped, filtered against ``stopwords_extended``, and the surviving
    tokens are paired up in order.

    Parameters
    ----------
    text : str or None
        Plot text. Anything that is not a non-blank string (None, NaN, '')
        counts as "no plot".

    Returns
    -------
    list of tuple
        Consecutive token pairs. An empty list is returned when there is no
        plot; returning a placeholder string here would make callers that
        extend or iterate the result process it character by character.

    Errors raised by the tokenizer itself (for example a missing NLTK resource)
    are not hidden, so a broken setup is noticed rather than silently producing
    empty bigrams for every film.
    """
    if not isinstance(text, str) or not text.strip():
        return []

    expanded = contractions.fix(text)
    tokens = [token.lower().strip() for token in nltk.word_tokenize(expanded)]
    kept_tokens = [token for token in tokens if token not in stopwords_extended]
    return list(ngrams(kept_tokens, 2))

# Extract the bigrams of every plot once and keep them per row; tokenizing is
# the expensive step, so the same column also feeds the overall pool.
df['BIGRAMS']=df['PLOT'].apply(get_bigrams)
all_bigrams = [bigram for plot_bigrams in df['BIGRAMS'] for bigram in plot_bigrams]

# Count the most common bigrams and keep just the bigrams (not their counts)
bigram_counts = Counter(all_bigrams)
most_common_bigrams = [bigram for bigram, _count in bigram_counts.most_common(10000)]

In [18]:
# For every film keep only the bigrams that are among the most common ones.
# A set gives constant-time membership tests instead of scanning the whole
# most_common_bigrams list for each bigram of each film.
common_bigram_lookup=set(most_common_bigrams)
total_rows=len(df['BIGRAMS'])

bigrams_series=[]
for i,bigrams_list in enumerate(df['BIGRAMS'],start=1):
    new_bigrams_list=[bigram for bigram in bigrams_list if bigram in common_bigram_lookup]
    bigrams_series.append(new_bigrams_list)

    # The total is computed once above; this also avoids nesting the same quote
    # character inside the f-string, which is a SyntaxError before Python 3.12.
    print(f'{i}/{total_rows}',end='\r')

7112/7112

In [19]:
# Overwrite each row's bigram list with the filtered lists collected in bigrams_series.
df['BIGRAMS']=bigrams_series

In [20]:
# Turn each film's bigram list into one '|'-separated string, with the words of
# a bigram joined by a space, then drop the list column.
bigram_texts_series=[
    '|'.join(' '.join(word for word in bigram) for bigram in bigrams_list)
    for bigrams_list in df['BIGRAMS']
]

df['BIGRAMS_TAGS']=bigram_texts_series
df=df.drop(columns='BIGRAMS')

In [21]:
# Write the cleaned, bigram-tagged film table next to the other scraped data.
cleaned_csv_path=os.path.join(path,'scraped_data','cleaned_letterboxd_movies_df.csv')
df.to_csv(cleaned_csv_path,index=False)