# Data Collection

## Table of Contents:
1. [List of Movies](#names)

In [1]:
from bs4 import BeautifulSoup
import requests
import time

## List of Movies <a id='names'></a>

First, we'll need to grab our list of movie names from IMDB. We use the "Horror" genre because I think the words mentioned in these reviews should prove useful features for classification.

Let's define a functiont to collect this data for us.

In [2]:
#get all titles from the container and put into list all_titles
def titles(movies_container):
    """Returns list of movie titles from IMDB search results."""
    return [h3.find('a').get_text() for h3 in movies_container.findAll('h3')]

def collect_imdb_data(imdb_seach_url, total_results):
    """Returns list of titles for given number of results.
    Expect this function to take (total_results/50)/2 seconds.
    Total results should be less than 10,000."""
    #initialize all lists
    all_titles = []
    #create soup for first page
    html_page = requests.get(imdb_seach_url)
    soup = BeautifulSoup(html_page.content, 'html.parser')
    #create containers for first page
    movies_container = soup.find('div', class_="lister-list")
    imdb_movie_list = movies_container.findAll('div', class_="lister-item-content")
    #collect first page data
    for title in titles(movies_container):
        all_titles.append(title)
    #check if total_results is greater than 10,000 since the IMDB URL changes after that many results
    if total_results > 10_000:
        print("The amount of results is too large, this function can only support up to 10,000. Collecting data for top 10,000 results only.")
        total_results = 10_001
    #iterate through the rest of the results to collect data
    for i in range(51,total_results,50):
        #create soup for current page
        url = imdb_seach_url+"&start={i}&ref_=adv_nxt"
        html_page = requests.get(url)
        soup = BeautifulSoup(html_page.content, 'html.parser')
        #create containers for current page
        movies_container = soup.find('div', class_="lister-list")
        imdb_movie_list = movies_container.findAll('div', class_="lister-item-content")
        #collect current page data
        for title in titles(movies_container):
            all_titles.append(title)
        #buffer for half a second so as to not DDOS IMDB
        time.sleep(0.5)
    return all_titles

In [3]:
genre = "horror"
url = f"https://www.imdb.com/search/title/?title_type=feature&genres={genre}&explore=genres"
titles = collect_imdb_data(url, 1_000)

In [4]:
titles

['The Lodge',
 'The Platform',
 'Fantasy Island',
 'Midsommar',
 'The Wretched',
 'The Invisible Man',
 'The Lighthouse',
 'Underwater',
 'Gretel & Hansel',
 'The Hunt',
 'It Chapter Two',
 'Doctor Sleep',
 'Hereditary',
 'It',
 'The Killing of a Sacred Deer',
 'Sea Fever',
 'Antrum: The Deadliest Film Ever Made',
 'Alien',
 'What We Do in the Shadows',
 'The Shining',
 'Zombieland: Double Tap',
 'Ready or Not',
 'Salò, or the 120 Days of Sodom',
 'A Quiet Place Part II',
 'Get Out',
 'I See You',
 'A Quiet Place',
 'Halloween Kills',
 'Annihilation',
 'The Conjuring: The Devil Made Me Do It',
 'The Green Knight',
 'Us',
 'The VVitch: A New-England Folktale',
 'The Cabin in the Woods',
 'Vivarium',
 'Color Out of Space',
 'Psycho',
 'Sinister',
 'Train to Busan',
 'A Serbian Film',
 'The Meg',
 'The Turning',
 'Alien: Covenant',
 'Mother!',
 'The Babysitter',
 'A Nightmare on Elm Street',
 'The Conjuring',
 'Bird Box',
 'Suspiria',
 'Brightburn',
 'The Lodge',
 'The Platform',
 'Fantas