# Data Collection

## Table of Contents:
1. [List of Movies](#names)
2. [List of Reviews](#reviews)
3. [Reviews for a Genre](#genre)

In [1]:
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

## List of Movies <a id='names'></a>

First, we need to grab our list of movie names from IMDB. We use the "Horror" genre because I think the words mentioned in these reviews should prove to be useful features for classification.

In [2]:
def titles(movies_container):
    """Collect the movie titles shown on one IMDB search-results page.

    Every ``<h3>`` heading inside ``movies_container`` is visited in
    document order, and the text of the first ``<a>`` link found in it is
    taken as the title.

    Returns a list of strings, one per heading.
    """
    found = []
    for heading in movies_container.findAll('h3'):
        link = heading.find('a')
        found.append(link.get_text())
    return found

def years(movies_container):
    """Returns list of movie release years from IMDB search results.

    One entry is produced for every ``<h3>`` heading in
    ``movies_container``, so the result stays index-aligned with
    ``titles(movies_container)``.

    The year is the first run of four digits in the heading's
    ``lister-item-year`` span. This copes with any disambiguation marker
    placed before the year (``"(I) (2017)"``, ``"(IV) (2020)"``, ...);
    stripping a fixed character set only handled markers made of ``I``.
    When the span is missing or contains no four-digit year, an empty
    string is returned for that heading.
    """
    year_pattern = re.compile(r'\d{4}')
    release_years = []
    for h3 in movies_container.findAll('h3'):
        span = h3.find('span', class_='lister-item-year text-muted unbold')
        match = year_pattern.search(span.get_text()) if span is not None else None
        # Always append something so titles and years never drift apart.
        release_years.append(match.group() if match else '')
    return release_years

def collect_imdb_data(imdb_seach_url, total_results):
    """Returns list of (title, year) tuples for the given number of results.

    Pages of the IMDB search are requested one after another (the first
    page from ``imdb_seach_url`` itself, later pages by appending a
    ``&start=`` offset) until ``total_results`` rows have been covered.
    The page size of 50 is the step this scraper has always paged with.

    Titles and years are collected from every page, including the first,
    so each title is paired with its own year. At most ``total_results``
    tuples are returned.

    Expect this function to take roughly (total_results/50)/2 seconds of
    deliberate waiting, plus network time.
    Total results should be at most 10,000; larger values are capped.
    """
    max_results = 10_000
    page_size = 50
    #the IMDB URL changes after 10,000 results, so cap the request there
    if total_results > max_results:
        print("The amount of results is too large, this function can only support up to 10,000. Collecting data for top 10,000 results only.")
        total_results = max_results
    all_titles = []
    all_years = []
    #start offsets are 1, 51, 101, ... and never run past total_results
    for start in range(1, total_results + 1, page_size):
        if start == 1:
            url = imdb_seach_url
        else:
            #buffer for half a second so as to not DDOS IMDB
            time.sleep(0.5)
            url = imdb_seach_url+f"&start={start}&ref_=adv_nxt"
        html_page = requests.get(url)
        html_tree = BeautifulSoup(html_page.content, 'html.parser')
        movies_container = html_tree.find('div', class_="lister-list")
        #no result list on this page: nothing more to collect
        if movies_container is None:
            break
        all_titles.extend(titles(movies_container))
        all_years.extend(years(movies_container))
    #combine page data and trim the last page down to the requested amount
    return list(zip(all_titles, all_years))[:total_results]

## List of Reviews <a id='reviews'></a>

Now that we can collect a list of movies (1,000 in this notebook) and their respective release years, we can use this information to grab the top-critic reviews for these horror movies from Rotten Tomatoes.

In [3]:
def texts(review_list):
    """Pull the review text out of every review row on a Rotten Tomatoes page.

    Each ``div`` with class ``row review_table_row`` inside ``review_list``
    is visited in document order, and the text of its ``the_review`` div is
    collected.

    Returns a list of strings, one per review row.
    """
    collected = []
    for row in review_list.findAll('div', class_='row review_table_row'):
        body = row.find('div', class_='the_review')
        collected.append(body.get_text())
    return collected

def scores(review_list):
    """Read the score marker of every review row on a Rotten Tomatoes page.

    For each ``div`` with class ``row review_table_row`` the function looks
    up the ``col-xs-16 review_container`` div, takes its first descendant
    ``div`` and returns the last entry of that element's ``class``
    attribute.
    NOTE(review): that last class name is presumably the fresh/rotten
    marker; confirm against the live page markup.

    Returns a list of strings, one per review row.
    """
    collected = []
    for row in review_list.findAll('div', class_='row review_table_row'):
        container = row.find('div', class_='col-xs-16 review_container')
        first_div = container.findChildren('div')[0]
        css_classes = first_div.get('class')
        collected.append(css_classes[-1])
    return collected

def _rt_reviews_container(url):
    """Fetch an RT reviews page and return its 'content' div (None if absent)."""
    html_page = requests.get(url)
    html_tree = BeautifulSoup(html_page.content, 'html.parser')
    return html_tree.find('div', class_="content")


def collect_rt_reviews(name, year):
    """Returns list of (text, score) tuples of top-critic reviews for a movie.

    ``name`` is the movie's URL slug and ``year`` its release year. The
    page ``/m/{name}_{year}/reviews`` is tried first; if it has no
    'content' div, ``/m/{name}/reviews`` is tried instead, because some RT
    urls do not contain the year. If neither page is found, or the page
    has no review table, an empty list is returned.

    When the page carries paging information, the remaining pages are
    fetched as well, with a half-second pause before each request.
    """
    rt_search_url = f"https://www.rottentomatoes.com/m/{name}_{year}/reviews?type=top_critics"
    reviews_container = _rt_reviews_container(rt_search_url)
    #some RT urls do not contain the date, this if checks for that case
    if reviews_container is None:
        rt_search_url = f"https://www.rottentomatoes.com/m/{name}/reviews?type=top_critics"
        reviews_container = _rt_reviews_container(rt_search_url)
    #if the page is still not found, return an empty list
    if reviews_container is None:
        return []
    review_list = reviews_container.find('div', class_='review_table')
    #a page without a review table has nothing to collect
    if review_list is None:
        return []
    all_text = list(texts(review_list))
    all_scores = list(scores(review_list))
    #look for page information
    page_info = reviews_container.findAll('span', class_='pageInfo')
    #if there is more than one page
    if page_info:
        #the page count is read as the last whitespace-separated token,
        #assuming text of the form "Page 1 of N"; reading only the final
        #character would turn 12 pages into 2
        tokens = page_info[0].get_text().split()
        num_pages = int(tokens[-1]) if tokens and tokens[-1].isdigit() else 1
        #iterate through the rest of the results to collect data
        for i in range(2, num_pages + 1):
            #buffer for half a second so as to not DDOS RT
            time.sleep(0.5)
            reviews_container = _rt_reviews_container(rt_search_url+f"&sort=&page={i}")
            if reviews_container is None:
                break
            review_list = reviews_container.find('div', class_='review_table')
            if review_list is None:
                break
            #collect current page data
            all_text.extend(texts(review_list))
            all_scores.extend(scores(review_list))
    #combine and return page data
    return list(zip(all_text, all_scores))

## Reviews for a Genre <a id='genre'></a>

Lastly, we can combine these functions to get a list of reviews and their scores for movies under a specific genre.

In [4]:
def create_review_df(genre, num_movies):
    """
    Build a DataFrame of Rotten Tomatoes reviews for movies of one genre.

    The movie list (title and release year) comes from an IMDB feature-film
    search for ``genre`` covering ``num_movies`` results. Each title is
    lower-cased and its spaces replaced by underscores before the reviews
    are looked up on RT.

    Returns a DataFrame with the columns 'Review' and 'Score', one row per
    collected review.
    """
    imdb_url = f"https://www.imdb.com/search/title/?title_type=feature&genres={genre}&explore=genres"
    all_reviews = []
    for title, year in collect_imdb_data(imdb_url, num_movies):
        # NOTE(review): punctuation in the title is passed through
        # unchanged - confirm that RT slugs tolerate it.
        slug = title.lower().replace(" ","_")
        all_reviews.extend(collect_rt_reviews(slug, year))
    return pd.DataFrame(all_reviews, columns = ['Review', 'Score'])

In [5]:
# Scrape the reviews for 1,000 horror movies; this performs many web requests.
df = create_review_df(genre="horror", num_movies=1_000)

In [6]:
# Save the collected reviews as a pickle file; the path is relative to the
# notebook's working directory.
df.to_pickle(path="../Data/reviews.pkl")