# Import Libraries

In [142]:
# General
import requests
from tqdm.notebook import tqdm

# Web scraping
from bs4 import BeautifulSoup
from time import sleep
from random import randint
import csv

# IMDb Web Scraping

In [2]:
# Test IMDB: one search URL for all feature films released up to 2021-06-30,
# filtered with user_rating=6.0, and num_votes=1000, (open-ended ranges,
# presumably lower bounds - confirm against IMDb's advanced search).
url_imdb = (
    "https://www.imdb.com/search/title/"
    "?title_type=feature"
    "&release_date=,2021-06-30"
    "&user_rating=6.0,"
    "&num_votes=1000,"
)

# Search includes 21,993 movie titles

In [3]:
# Download HTML and check response code.
# The timeout (seconds) stops the cell from hanging forever when the server
# never answers; requests applies no timeout unless one is passed.
response_imdb = requests.get(url_imdb, timeout=30)
response_imdb.status_code

200

In [4]:
# IMDb's URL structure changes for lists including more than 10,000 titles:
# 'start=10001' does not work, the parameter changes to an opaque string,
# e.g. 'after=WzI5MDQ5LCJ0dDAwNjcwNDAiLDEwNzUxXQ%3D%3D'.
# Avoid the problem by splitting the query into three searches by release date.

# Number of titles requested per results page (the `count` query parameter)
PAGE_SIZE = 250

# Search URL shared by all three searches; only the release-date window
# differs. It ends in 'start=' so a page's start offset can be appended.
SEARCH_URL = (
    "https://www.imdb.com/search/title/?title_type=feature"
    "&release_date={release_date}"
    "&user_rating=6.0,&num_votes=1000,"
    "&count={page_size}&start="
)

# Define iterations per search: the start offsets 1, 251, 501, ... of every
# results page, derived from the number of titles each search returns.
# Search 1 (movies released up until 1989-12-31 - includes 7,454 titles)
url_1 = SEARCH_URL.format(release_date=",1989-12-31", page_size=PAGE_SIZE)
iterations_1 = range(1, 7454 + 1, PAGE_SIZE)

# Search 2 (movies released between 1990-01-01 and 2009-12-31 - includes 7,380 titles)
url_2 = SEARCH_URL.format(release_date="1990-01-01,2009-12-31", page_size=PAGE_SIZE)
iterations_2 = range(1, 7380 + 1, PAGE_SIZE)

# Search 3 (movies released between 2010-01-01 and 2021-06-30 - includes 7,159 titles)
url_3 = SEARCH_URL.format(release_date="2010-01-01,2021-06-30", page_size=PAGE_SIZE)
iterations_3 = range(1, 7159 + 1, PAGE_SIZE)

In [5]:
# Test creating URL string for each page search, populated with iterations.
# One loop covers searches 1, 2 and 3 in order; each page URL is the search
# URL with the page's start offset appended.
for base_url, page_starts in (
    (url_1, iterations_1),
    (url_2, iterations_2),
    (url_3, iterations_3),
):
    for page_start in page_starts:
        print(f"{base_url}{page_start}")

https://www.imdb.com/search/title/?title_type=feature&release_date=,1989-12-31&user_rating=6.0,&num_votes=1000,&count=250&start=1
https://www.imdb.com/search/title/?title_type=feature&release_date=,1989-12-31&user_rating=6.0,&num_votes=1000,&count=250&start=251
https://www.imdb.com/search/title/?title_type=feature&release_date=,1989-12-31&user_rating=6.0,&num_votes=1000,&count=250&start=501
https://www.imdb.com/search/title/?title_type=feature&release_date=,1989-12-31&user_rating=6.0,&num_votes=1000,&count=250&start=751
https://www.imdb.com/search/title/?title_type=feature&release_date=,1989-12-31&user_rating=6.0,&num_votes=1000,&count=250&start=1001
https://www.imdb.com/search/title/?title_type=feature&release_date=,1989-12-31&user_rating=6.0,&num_votes=1000,&count=250&start=1251
https://www.imdb.com/search/title/?title_type=feature&release_date=,1989-12-31&user_rating=6.0,&num_votes=1000,&count=250&start=1501
https://www.imdb.com/search/title/?title_type=feature&release_date=,1989-12

## Assemble URLs and download pages per search

In [6]:
# Function to scrape each page
def scrape(iterations, url, timeout=30):
    """Download every results page of one search, pausing between requests.

    Args:
        iterations: Iterable of start offsets; each one is converted to text
            and appended to ``url`` to form the address of one page.
        url: Search URL to which the start offset is appended verbatim.
        timeout: Seconds to wait for the server before giving up on a
            request. Without it a stalled connection would block forever.

    Returns:
        list: One response object per offset, in request order. Responses
        are stored whatever their status code; the code is only printed.

    Raises:
        requests.exceptions.RequestException: If a request fails or times
            out (raised by ``requests.get``).
    """
    pages = []
    for start_at in tqdm(iterations):
        # Download HTML with get request
        response = requests.get(f"{url}{start_at}", timeout=timeout)
        # Monitor the status code for each page
        print(f"Status: {response.status_code}")
        # Store pages into a list
        pages.append(response)
        # Respectful nap time: random 2-5 second pause between requests
        wait_time = randint(2, 5)
        print(f"Wait time: {wait_time} seconds")
        print("")
        sleep(wait_time)
    return pages

In [7]:
# Search 1: download every results page (movies released up until 1989-12-31)
pages_1 = scrape(iterations_1, url_1)

  0%|          | 0/30 [00:00<?, ?it/s]

Status: 200
Wait time: 4 seconds

Status: 200
Wait time: 4 seconds

Status: 200
Wait time: 2 seconds

Status: 200
Wait time: 4 seconds

Status: 200
Wait time: 5 seconds

Status: 200
Wait time: 5 seconds

Status: 200
Wait time: 3 seconds

Status: 200
Wait time: 5 seconds

Status: 200
Wait time: 5 seconds

Status: 200
Wait time: 2 seconds

Status: 200
Wait time: 2 seconds

Status: 200
Wait time: 3 seconds

Status: 200
Wait time: 3 seconds

Status: 200
Wait time: 4 seconds

Status: 200
Wait time: 5 seconds

Status: 200
Wait time: 4 seconds

Status: 200
Wait time: 2 seconds

Status: 200
Wait time: 3 seconds

Status: 200
Wait time: 3 seconds

Status: 200
Wait time: 4 seconds

Status: 200
Wait time: 3 seconds

Status: 200
Wait time: 3 seconds

Status: 200
Wait time: 4 seconds

Status: 200
Wait time: 5 seconds

Status: 200
Wait time: 4 seconds

Status: 200
Wait time: 3 seconds

Status: 200
Wait time: 4 seconds

Status: 200
Wait time: 3 seconds

Status: 200
Wait time: 3 seconds

Status: 200
Wa

In [8]:
# Search 2: download every results page (movies released 1990-01-01 to 2009-12-31)
pages_2 = scrape(iterations_2, url_2)

  0%|          | 0/30 [00:00<?, ?it/s]

Status: 200
Wait time: 3 seconds

Status: 200
Wait time: 2 seconds

Status: 200
Wait time: 4 seconds

Status: 200
Wait time: 3 seconds

Status: 200
Wait time: 4 seconds

Status: 200
Wait time: 3 seconds

Status: 200
Wait time: 2 seconds

Status: 200
Wait time: 4 seconds

Status: 200
Wait time: 2 seconds

Status: 200
Wait time: 2 seconds

Status: 200
Wait time: 5 seconds

Status: 200
Wait time: 3 seconds

Status: 200
Wait time: 3 seconds

Status: 200
Wait time: 3 seconds

Status: 200
Wait time: 3 seconds

Status: 200
Wait time: 4 seconds

Status: 200
Wait time: 5 seconds

Status: 200
Wait time: 5 seconds

Status: 200
Wait time: 4 seconds

Status: 200
Wait time: 2 seconds

Status: 200
Wait time: 3 seconds

Status: 200
Wait time: 5 seconds

Status: 200
Wait time: 3 seconds

Status: 200
Wait time: 4 seconds

Status: 200
Wait time: 2 seconds

Status: 200
Wait time: 3 seconds

Status: 200
Wait time: 5 seconds

Status: 200
Wait time: 2 seconds

Status: 200
Wait time: 3 seconds

Status: 200
Wa

In [9]:
# Search 3: download every results page (movies released 2010-01-01 to 2021-06-30)
pages_3 = scrape(iterations_3, url_3)

  0%|          | 0/29 [00:00<?, ?it/s]

Status: 200
Wait time: 5 seconds

Status: 200
Wait time: 4 seconds

Status: 200
Wait time: 3 seconds

Status: 200
Wait time: 3 seconds

Status: 200
Wait time: 5 seconds

Status: 200
Wait time: 4 seconds

Status: 200
Wait time: 4 seconds

Status: 200
Wait time: 4 seconds

Status: 200
Wait time: 3 seconds

Status: 200
Wait time: 2 seconds

Status: 200
Wait time: 5 seconds

Status: 200
Wait time: 2 seconds

Status: 200
Wait time: 2 seconds

Status: 200
Wait time: 4 seconds

Status: 200
Wait time: 5 seconds

Status: 200
Wait time: 5 seconds

Status: 200
Wait time: 2 seconds

Status: 200
Wait time: 5 seconds

Status: 200
Wait time: 5 seconds

Status: 200
Wait time: 2 seconds

Status: 200
Wait time: 2 seconds

Status: 200
Wait time: 5 seconds

Status: 200
Wait time: 3 seconds

Status: 200
Wait time: 2 seconds

Status: 200
Wait time: 5 seconds

Status: 200
Wait time: 4 seconds

Status: 200
Wait time: 2 seconds

Status: 200
Wait time: 2 seconds

Status: 200
Wait time: 3 seconds



In [11]:
# Function to store movie IDs
def get_ids(pages):
    """Collect the movie ids found on the downloaded search pages.

    Each page is parsed with BeautifulSoup and the ``data-tconst`` attribute
    of its ``<img>`` tags is read as the movie id.

    Args:
        pages: Iterable of response objects exposing the page HTML as
            ``.content``.

    Returns:
        list: The ids in page order, then in order of appearance on the
        page. Images without a ``data-tconst`` value are skipped, so the
        result never contains ``None``. Duplicates are not removed.
    """
    ids = []
    # Get HTML for each page
    for page in tqdm(pages):
        soup = BeautifulSoup(page.content, "html.parser")
        # Store id for each movie title per page
        for img in soup.find_all("img"):
            tconst = img.get("data-tconst")
            # Images with no data-tconst attribute carry no id; skip them
            if tconst:
                ids.append(tconst)
    return ids

In [12]:
# Search 1: extract the movie ids from the downloaded pages and show how many were found
ids_1 = get_ids(pages_1)
len(ids_1)

  0%|          | 0/30 [00:00<?, ?it/s]

7454

In [13]:
# Search 2: extract the movie ids from the downloaded pages and show how many were found
ids_2 = get_ids(pages_2)
len(ids_2)

  0%|          | 0/30 [00:00<?, ?it/s]

7380

In [14]:
# Search 3: extract the movie ids from the downloaded pages and show how many were found
ids_3 = get_ids(pages_3)
len(ids_3)

  0%|          | 0/29 [00:00<?, ?it/s]

7159

In [18]:
# Combine ids into one list: search 1 first, then search 2, then search 3
ids = [*ids_1, *ids_2, *ids_3]

In [19]:
# Remove potential duplicates while keeping the first-seen order.
# A set would also deduplicate, but its iteration order is arbitrary and can
# differ between kernel sessions, making the exported file non-reproducible.
ids = list(dict.fromkeys(ids))
len(ids)

21993

In [315]:
# Export list to CSV, one movie id per row
with open("imdb_ids.csv", "w", newline="") as file:
    writer = csv.writer(file)
    # writerows() treats every item as a row of fields. A bare string is
    # itself iterable, so it would be written as one column per character;
    # wrapping each id in a list yields a single-column row.
    writer.writerows([movie_id] for movie_id in ids)