# Selenium web scraper to get data from the IMDB website
- I have limited the movies to those that have more than 25000 votes

In [2]:
import re
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import WebDriverException

In [3]:
# XPath selectors used by the scraper, grouped by the part of the site they
# target. Trailing "[n]" notes record which match index is meant to be used
# when the selector is evaluated with a find-all lookup.

# --- search-results (listing) page ---
title_button_xpath = '//h3[@class="lister-item-header"]/a'
next_page_of_titles_xpath = '//a[@class = "lister-page-next next-page"]'  # [0]

# --- movie page: headline area ---
title_xpath = '//h1[@class=""]'
summary_xpath = '//div[@class="summary_text"]'
rated_xpath = '//div[@class="subtext"]'
score_xpath = '//span[@itemprop="ratingValue"]'
votes_xpath = '//span[@itemprop="ratingCount"]'
metascore_xpath = '//div[@class="titleReviewBarItem"]/a/div'

# --- movie page: runtime (primary selector, then a fallback) ---
runtime_xpath = '//*[@id="titleDetails"]//time'
runtime_backup_xpath = '//div[@class="subtext"]/time'

# --- movie page: people ---
# Director and stars share one selector; they are told apart by match index.
director_xpath = '//div[@class="credit_summary_item"]/a'  # [0]
# NOTE(review): the name is presumably a typo for "stars"; kept as-is because
# other cells may reference it. Indices per the original note -- unverified.
starts_xpath = '//div[@class="credit_summary_item"]/a'  # [4,5,6]
actors_xpath = '//table[@class="cast_list"]/tbody/tr'

# --- movie page: storyline and details ---
storyline_xpath = '//*[@id="titleStoryLine"]/div/p/span'
genres_xpath = '//div[@class="see-more inline canwrap"]'  # [1]
movie_info_xpath = '//*[@id="titleDetails"]'

In [9]:
# ---------------------------------------------------------------------------
# Pure text-parsing helpers (no browser needed).
# ---------------------------------------------------------------------------

def to_number(text, cast, default):
    """Return ``cast(text)``, or ``default`` when the text is not a valid number."""
    try:
        return cast(text)
    except ValueError:
        return default


def parse_runtime_minutes(text):
    """Return a runtime in whole minutes parsed from display text.

    Understands an hours/minutes form such as "2h 22min", a minutes-only form
    such as "142 min", and a bare leading number. Returns None when no number
    can be extracted, so the caller can try another source.
    """
    hours = re.search(r'(\d+)\s*h', text)
    minutes = re.search(r'(\d+)\s*min', text)
    if hours or minutes:
        total = 0
        if hours:
            total += 60 * int(hours.group(1))
        if minutes:
            total += int(minutes.group(1))
        return total
    leading = re.match(r'\s*(\d+)', text)
    return int(leading.group(1)) if leading else None


def parse_actor_names(cast_rows):
    """Return the actor name from each cast-row text ("Actor ... Character")."""
    return [row.split(' ...')[0] for row in cast_rows]


def parse_rated(subtext):
    """Return the leading rating token of the subtext line, or 'Not rated'."""
    # NOTE(review): the pattern accepts any leading run of digits, letters and
    # dashes, so if the line starts with something other than a rating (e.g. a
    # runtime) that token is returned instead -- confirm against live pages.
    match = re.match(r'^[\d\-A-z]+', subtext)
    return match.group(0) if match else 'Not rated'


def parse_genres(text):
    """Split a genres line into a list after dropping its 8-character label."""
    return text[8:].split(' | ')


def parse_release_date(movie_info):
    """Return the first "<day> <month> <year> (<country>)" span, or ''."""
    match = re.search(r'[\d]{1,2} [A-z]+ [\d]{1,4} \([A-z ]+\)', movie_info)
    return match.group(0) if match else ''


def parse_budget(movie_info):
    """Return the dollar budget as an int, or -1 when none is listed."""
    match = re.search(r'Budget:\$([\d,]+)', movie_info)
    if match is None:
        return -1
    digits = match.group(1).replace(',', '')
    return int(digits) if digits else -1


def parse_production_companies(movie_info):
    """Return the list of production companies, or [] when none is found."""
    match = re.search(r'Production Co: [A-z, ]+ S', movie_info)
    if match is None:
        return []
    # Drop the 15-character "Production Co: " label and the trailing " S"
    # that the pattern uses as its end marker.
    return match.group(0)[15:-2].split(', ')


# ---------------------------------------------------------------------------
# Browser helpers.
# ---------------------------------------------------------------------------

def element_text(driver, xpath, default=''):
    """Return the text of the first element matching ``xpath``, else ``default``."""
    try:
        return driver.find_element_by_xpath(xpath).text
    except WebDriverException:
        return default


def nth_element_text(driver, xpath, index, default=''):
    """Return the text of the ``index``-th match of ``xpath``, else ``default``."""
    try:
        return driver.find_elements_by_xpath(xpath)[index].text
    except (IndexError, WebDriverException):
        return default


def scrape_movie(driver):
    """Collect one movie's fields from the page the driver is currently on.

    Every field is best-effort: a missing or unparsable value is replaced by
    a sentinel ('' for text, -1 / -1.0 for numbers, [] for lists) so a single
    odd page never aborts the whole scrape.

    Returns:
        dict mapping CSV column name to the scraped value.
    """
    # Runtime: try the primary element first, then the backup; both are
    # normalised to minutes so the column has one type.
    runtime = parse_runtime_minutes(element_text(driver, runtime_xpath))
    if runtime is None:
        runtime = parse_runtime_minutes(element_text(driver, runtime_backup_xpath))
    if runtime is None:
        runtime = -1

    votes_text = element_text(driver, votes_xpath).replace(',', '')

    try:
        cast_rows = driver.find_elements_by_xpath(actors_xpath)
        # The first row is skipped, as in the original scrape.
        actors = parse_actor_names([row.text for row in cast_rows[1:]])
    except WebDriverException:
        actors = []

    genres_text = nth_element_text(driver, genres_xpath, 1, default=None)
    genres = parse_genres(genres_text) if genres_text is not None else []

    # The release date, budget and production companies are all pulled out of
    # the text of the details section with regular expressions.
    movie_info = element_text(driver, movie_info_xpath)

    return {
        'Title': element_text(driver, title_xpath),
        'Summary': element_text(driver, summary_xpath),
        'Storyline': element_text(driver, storyline_xpath),
        'Runtime': runtime,
        'Director': nth_element_text(driver, director_xpath, 0),
        'Actors': actors,
        'IMDB_Score': to_number(element_text(driver, score_xpath), float, -1.0),
        'Votes': to_number(votes_text, int, -1),
        'Rated': parse_rated(element_text(driver, rated_xpath)),
        'Genres': genres,
        'Metascore': to_number(element_text(driver, metascore_xpath), int, -1),
        'Release_Date': parse_release_date(movie_info),
        'Budget($)': parse_budget(movie_info),
        'Production_Company': parse_production_companies(movie_info),
    }


def scrape_listing_pages(driver, rows):
    """Visit every title on every results page, appending one dict per movie.

    ``rows`` is filled in place so that whatever was collected survives if the
    scrape is interrupted part-way through.
    """
    while True:
        title_count = len(driver.find_elements_by_xpath(title_button_xpath))
        for title_index in range(title_count):
            # The links must be looked up again on every pass: references
            # obtained before navigating away are no longer usable.
            title_buttons = driver.find_elements_by_xpath(title_button_xpath)
            if title_index >= len(title_buttons):
                break
            title_buttons[title_index].click()
            rows.append(scrape_movie(driver))
            driver.back()

        # No "next" link (or it cannot be clicked) means the last page is done.
        next_buttons = driver.find_elements_by_xpath(next_page_of_titles_xpath)
        if not next_buttons:
            return
        try:
            next_buttons[0].click()
        except WebDriverException:
            return


# Run the scrape only when executed directly (a notebook cell or a script),
# not when this file is imported.
if __name__ == "__main__":
    url = "https://www.imdb.com/search/title/?title_type=feature,tv_movie&num_votes=25000,&adult=include&sort=num_votes,asc"

    # list of dictionaries, each representing a movie
    rows_list = []

    # start Chrome; nothing below can work without a driver, so report and stop
    try:
        driver = webdriver.Chrome('/Users/benmurphy/Downloads/chromedriver')
    except Exception:
        print("error with driver")
        raise

    try:
        driver.get(url)
        scrape_listing_pages(driver, rows_list)
    finally:
        # quit() ends the whole browser session, even if the scrape failed
        driver.quit()

    df = pd.DataFrame(rows_list)
    df.to_csv('IMDB_Sept.csv')