# Data Collection

## Table of Contents:
1. [List of Movies](#names)
2. [List of Reviews](#reviews)

In [1]:
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd

## List of Movies <a id='names'></a>

First, we'll need to grab our list of movie names from IMDB. We use the "Horror" genre because I think the words mentioned in these reviews should prove useful features for classification.

In [2]:
def titles(movies_container):
    """Returns list of movie titles from IMDB search results."""
    return [h3.find('a').get_text() for h3 in movies_container.findAll('h3')]

def years(movies_container):
    """Returns list of movie release years from IMDB search results."""
    return [h3.find('span', class_='lister-item-year text-muted unbold').get_text().strip('(I) ') for h3 in movies_container.findAll('h3')]

def collect_imdb_data(imdb_seach_url, total_results):
    """Returns list of titles and year of release for given number of results.
    Expect this function to take (total_results/50)/2 seconds.
    Total results should be less than 10,000."""
    #initialize all lists
    all_titles = []
    all_years = []
    #create soup for first page
    html_page = requests.get(imdb_seach_url)
    html_tree = BeautifulSoup(html_page.content, 'html.parser')
    #create containers for first page
    movies_container = html_tree.find('div', class_="lister-list")
    imdb_movie_list = movies_container.findAll('div', class_="lister-item-content")
    #collect first page data
    for title in titles(movies_container):
        all_titles.append(title)
    #check if total_results is greater than 10,000 since the IMDB URL changes after that many results
    if total_results > 10_000:
        print("The amount of results is too large, this function can only support up to 10,000. Collecting data for top 10,000 results only.")
        total_results = 10_001
    #iterate through the rest of the results to collect data
    for i in range(51,total_results+50,50):
        #create soup for current page
        url = imdb_seach_url+f"&start={i}&ref_=adv_nxt"
        html_page = requests.get(url)
        html_tree = BeautifulSoup(html_page.content, 'html.parser')
        #create containers for current page
        movies_container = html_tree.find('div', class_="lister-list")
        imdb_movie_list = movies_container.findAll('div', class_="lister-item-content")
        #collect current page data
        for title in titles(movies_container):
            all_titles.append(title)
        for year in years(movies_container):
            all_years.append(year)
        #buffer for half a second so as to not DDOS IMDB
        time.sleep(0.5)
    #combine and return page data
    return list(zip(all_titles, all_years))

## List of Reviews <a id='reviews'></a>

Now that we have a list of 1,000 movies and their respective release year, we can use this information to grab all of the reviews for these horror movies from Rotten Tomatoes.

In [4]:
def texts(review_list):
    """Returns list of reviews from Rotten Tomato search results."""
    return [review.find('div', class_='the_review').get_text() for review in review_list.findAll('div', class_='row review_table_row')]

def scores(review_list):
    """Returns list of scores from Rotten Tomato search results."""
    return [review.find('div', class_='col-xs-16 review_container').findChildren('div')[0].get('class')[-1] for review in review_list.findAll('div', class_='row review_table_row')]

def collect_rt_reviews(name, year):
    """Returns list of text and scores of reviews for given RT search results."""
    rt_search_url = f"https://www.rottentomatoes.com/m/{name}_{year}/reviews?type=top_critics"
    #initialize all lists
    all_text = []
    all_scores = []
    #create soup for first page
    html_page = requests.get(rt_search_url)
    html_tree = BeautifulSoup(html_page.content, 'html.parser')
    reviews_container = html_tree.find('div', class_="content")
    #some RT urls do not contain the date, this if checks for that case
    if reviews_container is None:
#         print(f"{name} not found on RT, trying without year.") #debug
        rt_search_url = f"https://www.rottentomatoes.com/m/{name}/reviews?type=top_critics"
        html_page = requests.get(rt_search_url)
        html_tree = BeautifulSoup(html_page.content, 'html.parser')
        reviews_container = html_tree.find('div', class_="content")
    #if the page is still not found, return empty lists
    if reviews_container is None:
#         print(f"{name} not found on RT, returning empty list.") #debug
        return list(zip(all_text, all_scores))
    review_list = reviews_container.find('div', class_='review_table')
    for text in texts(review_list):
        all_text.append(text)
    for score in scores(review_list):
        all_scores.append(score)
    #look for page information
    page_info = reviews_container.findAll('span', class_='pageInfo')
    #if there is more than one page
    if page_info:
        #grab number of pages
        num_pages = page_info[0].get_text()[-1]
        #iterate through the rest of the results to collect data
        for i in range(2,int(num_pages)+1):
            #create soup for current page
            url = rt_search_url+f"&sort=&page={i}"
            html_page = requests.get(url)
            html_tree = BeautifulSoup(html_page.content, 'html.parser')
            #create containers for current page
            reviews_container = html_tree.find('div', class_="content")
            review_list = reviews_container.find('div', class_='review_table') 
            #collect current page data
            for text in texts(review_list):
                all_text.append(text)
            for score in scores(review_list):
                all_scores.append(score)
            #buffer for half a second so as to not DDOS RT
            time.sleep(0.5)
    #combine and return page data
    return list(zip(all_text, all_scores))

## Reviews for a Genre

In [6]:
def create_review_df(genre, num_movies):
    """
    Return DataFrame with reviews and their scores collected from RT 
    using list of movies of given genre from IMDB.
    """
    imdb_url = f"https://www.imdb.com/search/title/?title_type=feature&genres={genre}&explore=genres"
    movie_data = collect_imdb_data(imdb_url, num_movies)
    all_reviews = []
    count = 0 #debug
    for movie in movie_data:
        print(f"Currently collecting {movie} reviews.") #debug
        name = movie[0].lower().replace(" ","_")
        year = movie[1]
        for review in collect_rt_reviews(name, year):
            all_reviews.append(review)
        print(f"{name} reviews collected, {1000-count} left.") #debug
        count += 1
    return pd.DataFrame(all_reviews, columns = ['Review', 'Score'])

In [8]:
df = create_review_df("horror", 1_000)

Currently collecting ('The Lodge', '1982') reviews.
the_lodge reviews collected, 1000 left.
Currently collecting ('The Platform', '2013') reviews.
the_platform reviews collected, 999 left.
Currently collecting ('Fantasy Island', '2001') reviews.
fantasy_island reviews collected, 998 left.
Currently collecting ('Midsommar', '2015') reviews.
midsommar reviews collected, 997 left.
Currently collecting ('The Wretched', '2018') reviews.
the_wretched reviews collected, 996 left.
Currently collecting ('The Invisible Man', '1996') reviews.
the_invisible_man reviews collected, 995 left.
Currently collecting ('The Lighthouse', '2021') reviews.
the_lighthouse reviews collected, 994 left.
Currently collecting ('Underwater', '2019') reviews.
underwater reviews collected, 993 left.
Currently collecting ('Gretel & Hansel', '2020') reviews.
gretel_&_hansel reviews collected, 992 left.
Currently collecting ('The Hunt', '2020') reviews.
the_hunt reviews collected, 991 left.
Currently collecting ('It Cha

scary_stories_to_tell_in_the_dark reviews collected, 918 left.
Currently collecting ('Climax', '2020') reviews.
climax reviews collected, 917 left.
Currently collecting ('Tremors', '2015') reviews.
tremors reviews collected, 916 left.
Currently collecting ("Gerald's Game", '1981') reviews.
gerald's_game reviews collected, 915 left.
Currently collecting ('1BR', '2018') reviews.
1br reviews collected, 914 left.
Currently collecting ('The Nun', '2021') reviews.
the_nun reviews collected, 913 left.
Currently collecting ('Z', '1996') reviews.
z reviews collected, 912 left.
Currently collecting ("Don't Breathe", '2016') reviews.
don't_breathe reviews collected, 911 left.
Currently collecting ('Carrie', '2020') reviews.
carrie reviews collected, 910 left.
Currently collecting ('Near Dark', '2009') reviews.
near_dark reviews collected, 909 left.
Currently collecting ('Frankenstein', '2010') reviews.
frankenstein reviews collected, 908 left.
Currently collecting ('Crawl', '2002') reviews.
crawl

countdown reviews collected, 834 left.
Currently collecting ('Candyman', '2014') reviews.
candyman reviews collected, 833 left.
Currently collecting ('Hush', '2020') reviews.
hush reviews collected, 832 left.
Currently collecting ('Haunt', '2001') reviews.
haunt reviews collected, 831 left.
Currently collecting ('The Purge', '2019') reviews.
the_purge reviews collected, 830 left.
Currently collecting ('Freaks', '2013') reviews.
freaks reviews collected, 829 left.
Currently collecting ('We Summon the Darkness', '2013') reviews.
we_summon_the_darkness reviews collected, 828 left.
Currently collecting ('The Wailing', '1981') reviews.
the_wailing reviews collected, 827 left.
Currently collecting ('The Curse of La Llorona', '2019') reviews.
the_curse_of_la_llorona reviews collected, 826 left.
Currently collecting ('Hostel', '1977') reviews.
hostel reviews collected, 825 left.
Currently collecting ('Splice', '2017') reviews.
splice reviews collected, 824 left.
Currently collecting ('Dawn of 

the_cell reviews collected, 753 left.
Currently collecting ('Velvet Buzzsaw', '2007') reviews.
velvet_buzzsaw reviews collected, 752 left.
Currently collecting ('The Blair Witch Project', '2007') reviews.
the_blair_witch_project reviews collected, 751 left.
Currently collecting ('Wounds', '2018') reviews.
wounds reviews collected, 750 left.
Currently collecting ('Army of the Dead', '2003') reviews.
army_of_the_dead reviews collected, 749 left.
Currently collecting ('Halloween', '2019') reviews.
halloween reviews collected, 748 left.
Currently collecting ('Let Me In', '2017') reviews.
let_me_in reviews collected, 747 left.
Currently collecting ('Jeepers Creepers', '1997') reviews.
jeepers_creepers reviews collected, 746 left.
Currently collecting ('A Girl Walks Home Alone at Night', '2012') reviews.
a_girl_walks_home_alone_at_night reviews collected, 745 left.
Currently collecting ('Jaws 2', '2014') reviews.
jaws_2 reviews collected, 744 left.
Currently collecting ('Exorcism at 60,000 F

the_poughkeepsie_tapes reviews collected, 671 left.
Currently collecting ('Dracula Untold', '1983') reviews.
dracula_untold reviews collected, 670 left.
Currently collecting ('As Above, So Below', '1982') reviews.
as_above,_so_below reviews collected, 669 left.
Currently collecting ('The First Purge', '') reviews.
the_first_purge reviews collected, 668 left.
Currently collecting ('Human Zoo', '2017') reviews.
human_zoo reviews collected, 667 left.
Currently collecting ('Blade II', '2018') reviews.
blade_ii reviews collected, 666 left.
Currently collecting ('Raw', '1973') reviews.
raw reviews collected, 665 left.
Currently collecting ('The Covenant', '1998') reviews.
the_covenant reviews collected, 664 left.
Currently collecting ('Eden Lake', '2004') reviews.
eden_lake reviews collected, 663 left.
Currently collecting ('Jaws 3-D', '1989') reviews.
jaws_3-d reviews collected, 662 left.
Currently collecting ('The Texas Chainsaw Massacre', '2019') reviews.
the_texas_chainsaw_massacre revie

existenz reviews collected, 589 left.
Currently collecting ('Scary Movie 3', '1975') reviews.
scary_movie_3 reviews collected, 588 left.
Currently collecting ('Invasion of the Body Snatchers', '2007') reviews.
invasion_of_the_body_snatchers reviews collected, 587 left.
Currently collecting ('Scary Movie 5', '1987') reviews.
scary_movie_5 reviews collected, 586 left.
Currently collecting ('Braindead', '2011') reviews.
braindead reviews collected, 585 left.
Currently collecting ('Possession', '2016') reviews.
possession reviews collected, 584 left.
Currently collecting ('Lords of Chaos', '1990') reviews.
lords_of_chaos reviews collected, 583 left.
Currently collecting ('Before I Go to Sleep', '2017') reviews.
before_i_go_to_sleep reviews collected, 582 left.
Currently collecting ('Evil Dead II', '2003') reviews.
evil_dead_ii reviews collected, 581 left.
Currently collecting ('Villains', '2014') reviews.
villains reviews collected, 580 left.
Currently collecting ('Beyond Skyline', '1979')

the_open_house reviews collected, 509 left.
Currently collecting ('Bloodline', '2014') reviews.
bloodline reviews collected, 508 left.
Currently collecting ('Lake Placid', '2017') reviews.
lake_placid reviews collected, 507 left.
Currently collecting ('Behind the Trees', '1990') reviews.
behind_the_trees reviews collected, 506 left.
Currently collecting ('Little Shop of Horrors', '2018') reviews.
little_shop_of_horrors reviews collected, 505 left.
Currently collecting ('Hellboy II: The Golden Army', '2008') reviews.
hellboy_ii:_the_golden_army reviews collected, 504 left.
Currently collecting ('Queen of the Damned', '2012') reviews.
queen_of_the_damned reviews collected, 503 left.
Currently collecting ('The Final Destination', '2015') reviews.
the_final_destination reviews collected, 502 left.
Currently collecting ('The Love Witch', '1999') reviews.
the_love_witch reviews collected, 501 left.
Currently collecting ('The Wraith', '2019') reviews.
the_wraith reviews collected, 500 left.
C

from_beyond reviews collected, 431 left.
Currently collecting ('Perfect Stranger', '2007') reviews.
perfect_stranger reviews collected, 430 left.
Currently collecting ('Scary Movie 4', '1983') reviews.
scary_movie_4 reviews collected, 429 left.
Currently collecting ('House', '1979') reviews.
house reviews collected, 428 left.
Currently collecting ('Friday the 13th Part VIII: Jason Takes Manhattan', '1982') reviews.
friday_the_13th_part_viii:_jason_takes_manhattan reviews collected, 427 left.
Currently collecting ('Repulsion', '2019') reviews.
repulsion reviews collected, 426 left.
Currently collecting ('I Am the Pretty Thing That Lives in the House', '2018') reviews.
i_am_the_pretty_thing_that_lives_in_the_house reviews collected, 425 left.
Currently collecting ('The Source of Shadows', '1984') reviews.
the_source_of_shadows reviews collected, 424 left.
Currently collecting ('Ginger Snaps', '1972') reviews.
ginger_snaps reviews collected, 423 left.
Currently collecting ('Koroshiya 1', 

the_possession_of_hannah_grace reviews collected, 351 left.
Currently collecting ('Yoga Hosers', '1986') reviews.
yoga_hosers reviews collected, 350 left.
Currently collecting ('Red Lights', '2015') reviews.
red_lights reviews collected, 349 left.
Currently collecting ('Ravenous', '1982') reviews.
ravenous reviews collected, 348 left.
Currently collecting ('The Bye Bye Man', '1993') reviews.
the_bye_bye_man reviews collected, 347 left.
Currently collecting ('Coma', '2018') reviews.
coma reviews collected, 346 left.
Currently collecting ('Howl', '1982') reviews.
howl reviews collected, 345 left.
Currently collecting ('V/H/S', '1988') reviews.
v/h/s reviews collected, 344 left.
Currently collecting ('An American Crime', 'V) (2019') reviews.
an_american_crime reviews collected, 343 left.
Currently collecting ('Rattlesnake', '2017') reviews.
rattlesnake reviews collected, 342 left.
Currently collecting ('Anacondas: The Hunt for the Blood Orchid', '1990') reviews.
anacondas:_the_hunt_for_th

he's_out_there reviews collected, 270 left.
Currently collecting ('Once Bitten', '2002') reviews.
once_bitten reviews collected, 269 left.
Currently collecting ('The Stuff', '1986') reviews.
the_stuff reviews collected, 268 left.
Currently collecting ('Creep 2', '1991') reviews.
creep_2 reviews collected, 267 left.
Currently collecting ('Freaks of Nature', '2019') reviews.
freaks_of_nature reviews collected, 266 left.
Currently collecting ('The Empty Man', '2006') reviews.
the_empty_man reviews collected, 265 left.
Currently collecting ('Compliance', '2009') reviews.
compliance reviews collected, 264 left.
Currently collecting ('Nightmare Cinema', '1976') reviews.
nightmare_cinema reviews collected, 263 left.
Currently collecting ('The Monster', '2013') reviews.
the_monster reviews collected, 262 left.
Currently collecting ('Wolf', '1990') reviews.
wolf reviews collected, 261 left.
Currently collecting ('Run', '2009') reviews.
run reviews collected, 260 left.
Currently collecting ('The

spawn reviews collected, 189 left.
Currently collecting ('Swamp Thing', '2004') reviews.
swamp_thing reviews collected, 188 left.
Currently collecting ('Girl House', '2013') reviews.
girl_house reviews collected, 187 left.
Currently collecting ('Seed of Chucky', '2006') reviews.
seed_of_chucky reviews collected, 186 left.
Currently collecting ('Clown', '2010') reviews.
clown reviews collected, 185 left.
Currently collecting ('Tamara', '2015') reviews.
tamara reviews collected, 184 left.
Currently collecting ('Thirteen Women', '2007') reviews.
thirteen_women reviews collected, 183 left.
Currently collecting ('Humanoids from the Deep', '2020') reviews.
humanoids_from_the_deep reviews collected, 182 left.
Currently collecting ('Silver Bullet', '1982') reviews.
silver_bullet reviews collected, 181 left.
Currently collecting ('The Car', '1977') reviews.
the_car reviews collected, 180 left.
Currently collecting ('Paranormal Activity 3', '') reviews.
paranormal_activity_3 reviews collected, 1

i_still_know_what_you_did_last_summer reviews collected, 108 left.
Currently collecting ('V/H/S/2', '2018') reviews.
v/h/s/2 reviews collected, 107 left.
Currently collecting ('Delirium', '1976') reviews.
delirium reviews collected, 106 left.
Currently collecting ('Shivers', '2006') reviews.
shivers reviews collected, 105 left.
Currently collecting ('Invasion of the Body Snatchers', '1975') reviews.
invasion_of_the_body_snatchers reviews collected, 104 left.
Currently collecting ('Splinter', '2017') reviews.
splinter reviews collected, 103 left.
Currently collecting ('Poltergeist II: The Other Side', '2017') reviews.
poltergeist_ii:_the_other_side reviews collected, 102 left.
Currently collecting ('Abigail Haunting', '2007') reviews.
abigail_haunting reviews collected, 101 left.
Currently collecting ('Plan 9 from Outer Space', '2010') reviews.
plan_9_from_outer_space reviews collected, 100 left.
Currently collecting ('Bait', '2002') reviews.
bait reviews collected, 99 left.
Currently c

bug reviews collected, 26 left.
Currently collecting ('A Dark Song', '1985') reviews.
a_dark_song reviews collected, 25 left.
Currently collecting ('The Row', '2007') reviews.
the_row reviews collected, 24 left.
Currently collecting ('Tales from the Darkside', '1922') reviews.
tales_from_the_darkside reviews collected, 23 left.
Currently collecting ('The Crimson Rivers', '2019') reviews.
the_crimson_rivers reviews collected, 22 left.
Currently collecting ('The X Files: I Want to Believe', '2012') reviews.
the_x_files:_i_want_to_believe reviews collected, 21 left.
Currently collecting ('Open Water', '2015') reviews.
open_water reviews collected, 20 left.
Currently collecting ('Troll', '1979') reviews.
troll reviews collected, 19 left.
Currently collecting ('Hatchet', '2019') reviews.
hatchet reviews collected, 18 left.
Currently collecting ('Return of the Living Dead III', '2005') reviews.
return_of_the_living_dead_iii reviews collected, 17 left.
Currently collecting ('The Lords of Sale

In [9]:
df.to_pickle("reviews.pkl")