## WEB SCRAPING

***Disclaimer: Some of these websites have since changed the permissions in their robots.txt, so these scrapers may no longer work the way they did when I completed the project.***

#### 1. Gather urls of movie content reviews

First we will gather the urls of movie reviews from www.moviesguide.org.  This website reviews movies based on their content, such as language, violence, and nudity.   

I used Scrapy because the website uses 'infinite scrolling' to display the movies that are reviewed.  The spider scrapes the urls of movies displayed on the page, then it requests an update to the website via the infinite scrolling mechanic, scrapes the next batch of urls, and continues until there are no new urls to scrape. 

Some of the urls do not link to movie reviews (they link to pages with content such as interviews and news articles) so the spider ignores these urls.

1. Use Scrapy to get urls from www.moviesguide.org  
    a. Navigate to this directory in the terminal and input the following commands:  
`>> cd get_urls`  
`>> scrapy crawl get_urls -o urls.json`  
2. This creates a json file with urls of all of the movie reviews on the website.



#### 2. Scrape the movie content data

Now we will use Beautiful Soup to scrape the movie content data from the urls we just gathered.  There were a lot of errors in the urls scraped from the website.  For example, sometimes there was a typo in the url and it did not link to the correct website.  Beautiful Soup allowed me to iteratively gather urls, store the data in csv files, and then correct the url in the json file when I found a faulty url.

In [4]:
from bs4 import BeautifulSoup

In [5]:
def get_rating(row):
    '''
    get_rating(row)

    Look up the content rating encoded in one row of the table of content ratings.

    Parameters
    ----------
    row : list
        A list of the html tags in the row.  The position of a tag in the
        list selects the label: 'None', 'Light', 'Medium', 'Heavy'.

    Returns
    ----------
    out : string
        The label at the position of the first tag whose div has a leading
        css class other than 'movieguide_circle'.  When every tag carries
        that default class nothing is returned explicitly (i.e. None).
    '''

    labels = ('None', 'Light', 'Medium', 'Heavy')

    for position, cell in enumerate(row):
        css_classes = cell.div.attrs['class']
        # 'movieguide_circle' is the default class, so move on to the next tag
        if css_classes[0] == 'movieguide_circle':
            continue
        return labels[position]

In [None]:
def get_genre(soup):
    '''
    get_genre(soup)

    Pull the genre and the rating out of the paragraphs of the review box.

    Parameters
    ----------
    soup : Beautiful Soup object
        The current website to be scraped.

    Returns
    ----------
    genre: string
        The ':'-separated pieces of the third paragraph after the first one,
        stripped and joined with spaces, cut off at the first '/'.
    rating: string
        The words of the fifth paragraph without the first word, joined
        with single spaces.

    '''

    paragraphs = soup.find(class_='cb-review-box').find_all('p')

    # extract() hands back the genre paragraph and also detaches it from the soup
    genre_text = paragraphs[2].extract().text
    genre_parts = [piece.strip() for piece in genre_text.split(':')[1:]]
    genre = ' '.join(genre_parts).split('/')[0]

    # split() without arguments already produces whitespace-free words
    rating = ' '.join(paragraphs[4].text.split()[1:])

    return genre, rating

In [6]:
def get_content(soup):
    '''
    get_content(soup)

    Gathers the data from the get_rating and get_genre functions and returns the results as a dictionary.

    Parameters
    ----------
    soup : Beautiful Soup object
        The current website to be scraped.

    Returns
    ----------
    content_dict : dict
        Has the keys 'Language', 'Violence', 'Sex', 'Nudity', 'Title',
        'Genre' and 'Rating'.  When the page cannot be parsed (for example
        when given a bad url) "Error here" is printed and None is returned.
    '''

    content = soup.find(class_='movieguide_content_summary')

    # Cells of the summary table that hold the four rating circles of each
    # category.  Cells 0, 5, 10 and 15 are skipped (presumably the row
    # labels -- confirm against the page markup).
    sections = {
        'Language': slice(1, 5),
        'Violence': slice(6, 10),
        'Sex': slice(11, 15),
        'Nudity': slice(16, 20),
    }

    # Check for exception when given a bad url.  Only Exception is caught so
    # that KeyboardInterrupt / SystemExit can still stop a long scraping run.
    try:
        content_table = content.find_all('td')
        content_dict = {name: get_rating(content_table[cells])
                        for name, cells in sections.items()}
        content_dict['Title'] = (soup.find(class_="entry-title cb-entry-title cb-single-title")
                                    .text.strip()
                                    .title())

        genre, rating = get_genre(soup)
        content_dict['Genre'] = genre
        content_dict['Rating'] = rating
    except Exception:
        print("Error here")
        return None

    return content_dict

In [None]:
def scrape(url, timeout=30):
    '''
    scrape(url, timeout=30)

    This function requests the html from the website, creates a Beautiful Soup object,
    passes it to the function that scrapes the data and then returns the data.

    Parameters
    ----------
    url : string
        The current website to be scraped.
    timeout : float, optional
        Seconds passed to requests.get as its timeout.  Without a timeout
        the request can wait indefinitely on an unresponsive server; with
        one, requests raises an exception instead (see the requests
        documentation on timeouts).

    Returns
    ----------
    data : dict
        Whatever get_content returns for the page, which is None when the
        page could not be parsed.

    '''

    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, "lxml")

    return get_content(soup)

Now it is time to actually scrape the data.

In [None]:
# Read the list of url records back in from the json file.
with open('urls.json', 'r') as url_file:
    all_urls = json.loads(url_file.read())

In [None]:
# Store the dictionary from each scraped url in a list.
# When 100 movies have been scraped, store them as a csv file and restart.
data_list = []
n = 0 # Counter for files

for url in all_urls:
    print(len(data_list))  # progress within the current batch
    data = scrape(url['url'])
    if data is None:
        # Nothing usable came back for this url, so skip it
        continue
    data_list.append(data)

    if len(data_list) == 100:
        pd.DataFrame(data_list).to_csv(f'data/scraped/scraped_{n}.csv')
        print(f'scraped_{n}.csv saved')
        data_list = []
        n += 1

# Save the last, partially filled batch.  Skipped when it is empty so that
# no content-free csv file is written.
if data_list:
    pd.DataFrame(data_list).to_csv(f'data/scraped/scraped_{n}.csv')
    print(f'scraped_{n}.csv saved')

#### 3. Scrape the movie budgets and box office returns

In [None]:
def scrape_numbers(url, timeout=30):
    '''
    scrape_numbers(url, timeout=30)

    This reads the data in the table from the html into a DataFrame.
    The DataFrame columns are set and the data is formatted.

    Parameters
    ----------
    url : string
        The current website to be scraped.
    timeout : float, optional
        Seconds passed to requests.get as its timeout, so that an
        unresponsive server cannot stall the run indefinitely.

    Returns
    ----------
    df : DataFrame object
        Columns 'Release_Date', 'Title', 'Production_Budget',
        'Domestic_Gross' and 'Worldwide_Gross'.  The three money columns
        are converted to 64-bit integers.

    '''
    from io import StringIO

    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, "lxml")

    table = soup.find('table')
    # read_html wants a file-like object; passing the html text directly is
    # deprecated in recent pandas releases.
    df = pd.read_html(StringIO(str(table)))[0]
    df.dropna(inplace=True)
    df.columns = (['Index', 'Release_Date', 'Title', 'Production_Budget',
                   'Domestic_Gross', 'Worldwide_Gross'])
    df.drop(columns='Index', inplace=True)

    for column in ('Production_Budget', 'Domestic_Gross', 'Worldwide_Gross'):
        # Strip '$' and ',' in one pass.  regex is given explicitly because
        # its default differs between pandas versions, and '$' is only a
        # literal inside a character class.
        cleaned = df[column].str.replace(r'[$,]', '', regex=True)
        # int64 rather than int: the platform int can be 32 bits wide, which
        # is too small for grosses above roughly 2.1 billion.
        df[column] = cleaned.astype('int64')

    return df

In [7]:
# The number at the end of each url steps from 1 to 5501 in increments of 100.
urls = [
    'https://www.the-numbers.com/movie/budgets/all/{}'.format(first_row)
    for first_row in range(1, 5601, 100)
]

In [None]:
# DataFrames are created 10 urls at a time and stored in a csv file.

frames = []      # DataFrames scraped since the last save
file_index = 0   # Counter for files

for position, url in enumerate(urls, start=1):
    data = scrape_numbers(url)

    if data is not None:
        frames.append(data)

    # Write a file after every 10th url.  The frames are combined with
    # pd.concat because DataFrame.append was removed in pandas 2.0.
    if position % 10 == 0 and frames:
        pd.concat(frames, ignore_index=True).to_csv(f'data/budgets/budget_{file_index}.csv')
        print(f'data/budgets/budget_{file_index}.csv file saved')
        frames = []
        file_index += 1

# Save whatever is left over from the last, incomplete group of urls.
if frames:
    pd.concat(frames, ignore_index=True).to_csv(f'data/budgets/budget_{file_index}.csv')
    print(f'data/budgets/budget_{file_index}.csv file saved')