# PROJECT - 2  WEB SCRAPING 

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import dateutil.parser

### Getting  information from a movie page

In [2]:
url = 'https://www.boxofficemojo.com/title/tt2488496/?ref_=bo_cso_table_1'
response = requests.get(url)
page = response.text
soup = BeautifulSoup(page, 'lxml')
# soup.prettify()

#### Movie Title

In [3]:
soup.find('title').text
title_string = soup.find('title').text
title_string

'Star Wars: Episode VII - The Force Awakens - Box Office Mojo'

In [4]:
title = title_string[:-18]
title

'Star Wars: Episode VII - The Force Awakens'

#### Domestic Gross

In [5]:
soup.find(class_='mojo-performance-summary-table').find_all('span',class_='money')

[<span class="money">$936,662,225</span>,
 <span class="money">$1,131,791,908</span>,
 <span class="money">$2,068,454,133</span>]

In [6]:
dom_gross = soup.find(class_='mojo-performance-summary-table').find_all('span',class_='money')[0].text
dom_gross

'$936,662,225'

#### International Gross

In [7]:
intl_gross = soup.find(class_='mojo-performance-summary-table').find_all('span',class_='money')[1].text
intl_gross

'$1,131,791,908'

#### Worldwide Gross

In [8]:
world_gross = soup.find(class_='mojo-performance-summary-table').find_all('span',class_='money')[2].text
world_gross

'$2,068,454,133'

#### Domestic Opening Gross

In [9]:
dom_opening_gross = soup.find(class_='mojo-summary-values').find_all('span', class_ = 'money')[0]
dom_opening_gross

<span class="money">$247,966,675</span>

#### Budget

In [10]:
prod_budget = soup.find(class_='mojo-summary-values').find_all('span', class_ = 'money')[1]
prod_budget

<span class="money">$245,000,000</span>

#### Domestic Distributor

In [11]:
soup.find(class_='mojo-summary-values').find_all('span', class_ = 'money')

[<span class="money">$247,966,675</span>,
 <span class="money">$245,000,000</span>]

In [12]:
distributor_reggex = re.compile('Distributor')
soup.find(text=distributor_reggex)

'Domestic Distributor'

In [13]:
distr_string = find(text=distributor_reggex).findNext('span').text

NameError: name 'find' is not defined

In [14]:
dist_string = soup.find(text=distributor_reggex).findNext('span').text[:18]
dom_dist = dist_string
dom_dist

'Walt Disney Studio'

### Get Movie (findNext) Values

In [15]:
def get_movie_value(soup, field_name):
    """
    Look up a labelled value on a parsed movie page.

    Finds the first text node in ``soup`` that matches ``field_name`` and
    returns the text of the element that comes right after it in the
    document (the next element, which is not necessarily a sibling).

    Parameters:
        soup: parsed page (or tag) exposing BeautifulSoup's ``find``.
        field_name: label to look for. It is compiled as a regular
            expression, so a fragment of the label is enough
            (e.g. 'Distributor' also matches 'Domestic Distributor').

    Returns:
        The text of the element following the label, or None when the
        label is not on the page or nothing follows it.
    """
    # `string=` and `find_next()` are the current spellings of the
    # deprecated `text=` argument and `findNext()` method.
    label = soup.find(string=re.compile(field_name))
    if not label:
        return None

    value_element = label.find_next()
    if value_element is None:
        return None
    return value_element.text
    

#### Domestic Opening Gross

In [16]:
dom_opening_gross = soup.find(class_='mojo-summary-values').find_all('span', class_ = 'money')[0].text
dom_opening_gross

'$247,966,675'

In [17]:
production_budget = soup.find(class_='mojo-summary-values').find_all('span', class_ = 'money')[1].text
production_budget

'$245,000,000'

#### Runtime

In [18]:
runtime = get_movie_value(soup, 'Running')
print(runtime)

2 hr 18 min


#### Rating

In [19]:
rating = get_movie_value(soup, 'MPAA')
print(rating)

PG-13


#### Release Date

In [20]:
rel_date = get_movie_value(soup, 'Release Date').split('\n')[0]
print(rel_date)

December 16, 2015


In [21]:
date = dateutil.parser.parse(rel_date)
date.month

12

In [22]:
date = dateutil.parser.parse(rel_date)
date.year


2015

In [23]:
import datetime

a = '2010-01-31'

datee = datetime.datetime.strptime(a, "%Y-%m-%d")


datee.month
Out[9]: 1

datee.year
Out[10]: 2010

datee.day
Out[11]: 31

#### Genres

In [24]:
genres = get_movie_value(soup, 'Genres').split()
genres

['Action', 'Adventure', 'Sci-Fi']

#### Distributor

In [25]:
distributor = get_movie_value(soup, 'Distributor')[:35]
distributor

'Walt Disney Studios Motion Pictures'

### Function to convert money and time string into integer values

In [26]:
def money_to_int(moneystring):
    """
    Convert a formatted dollar amount such as '$936,662,225' into an int.

    Every '$' and ',' character is dropped and whatever remains is handed
    to int(), so text that is not a whole number raises ValueError.
    """
    # One pass that deletes both the currency sign and the thousands separators.
    digits_only = moneystring.translate(str.maketrans('', '', '$,'))
    return int(digits_only)

def runtime_to_minutes(runtimestring):
    """
    Convert a running time such as '2 hr 18 min' into a number of minutes.

    Either component may be missing: '2 hr' gives 120 and '45 min' gives 45.

    Parameters:
        runtimestring: text holding an '<n> hr' part, an '<n> min' part,
            or both. None and '' are accepted.

    Returns:
        The total number of minutes as an int, or None when the input is
        empty or contains neither component.
    """
    # get_movie_value() returns None for a page without a running time;
    # treat that (and '') as "unknown" instead of raising.
    if not runtimestring:
        return None

    hours = re.search(r'(\d+)\s*hr', runtimestring)
    minutes = re.search(r'(\d+)\s*min', runtimestring)
    if hours is None and minutes is None:
        return None

    total = 0
    if hours is not None:
        total += int(hours.group(1)) * 60
    if minutes is not None:
        total += int(minutes.group(1))
    return total
    
def to_date(datestring):
    """
    Parse a date string such as 'December 16, 2015' with dateutil.

    Returns the object produced by dateutil.parser.parse (a datetime).
    """
    return dateutil.parser.parse(datestring)

In [27]:
raw_dom_gross = dom_gross
domestic_gross = money_to_int(raw_dom_gross)

raw_intl_gross = intl_gross
international_gross = money_to_int(raw_intl_gross)

raw_world_gross = world_gross
worldwide_gross = money_to_int(raw_world_gross)

raw_opening_gross = dom_opening_gross
domestic_opening_gross = money_to_int(raw_opening_gross)

raw_budget = production_budget
budget = money_to_int(raw_budget)

raw_release_date = get_movie_value(soup,'Release Date').split('\n')[0]
release_date = to_date(raw_release_date)
release_date

datetime.datetime(2015, 12, 16, 0, 0)

In [28]:
raw_runtime = get_movie_value(soup, 'Running')
runtime_minutes = runtime_to_minutes(raw_runtime)
runtime_minutes

138

#### Release date

In [29]:
raw_release_date = get_movie_value(soup,'Release Date').split('\n')[0]
release_month = to_date(raw_release_date).month
release_month

12

In [30]:
release_month = rel_date.split()[0]
release_month

'December'

#### Actors

In [31]:
url = 'https://www.boxofficemojo.com/title/tt2488496/credits/?ref_=bo_tt_tab'
response = requests.get(url)
castcrew_page = response.text
castcrew_soup = BeautifulSoup(castcrew_page, 'lxml')

In [32]:
# Collect the principal cast from the credits page: every link with class
# 'a-link-normal' inside the table whose id is 'principalCast' is one entry.
casts = castcrew_soup.find('table', id = 'principalCast').find_all('a', attrs={'class':'a-link-normal'})
actors = []
for cast in casts:
    # Keep only the part of the link text that precedes the first newline.
    cast = cast.text.split('\n')[0]
    actors += [cast]
actors

['Daisy Ridley', 'John Boyega', 'Oscar Isaac', 'Domhnall Gleeson']

#### Director

In [33]:
crews = castcrew_soup.find('table', id = 'principalCrew').find_all('a', attrs={'class':'a-link-normal'})[0]

In [34]:
crews.text.split('\n')[0]

'J.J. Abrams'

### Getting the domestic lifetime ranking table

In [35]:
url = 'https://www.boxofficemojo.com/chart/top_lifetime_gross/?ref_=bo_cso_ac'#change pages
response = requests.get(url)
page = response.text
soup = BeautifulSoup(page, 'lxml')
# print(soup.prettify())

In [36]:
# finding the table
table = soup.find('table')
# table

In [37]:
rows = [row for row in table.find_all('tr')]
rows[1].find_all('td')

[<td class="a-text-right mojo-header-column mojo-truncate mojo-field-type-rank">1</td>,
 <td class="a-text-left mojo-field-type-title"><a class="a-link-normal" href="/title/tt2488496/?ref_=bo_cso_table_1">Star Wars: Episode VII - The Force Awakens</a></td>,
 <td class="a-text-right mojo-field-type-money">$936,662,225</td>,
 <td class="a-text-left mojo-field-type-year"><a class="a-link-normal" href="/year/2015/?ref_=bo_cso_table_1">2015</a></td>]

In [38]:
rows[1].find_all('td')[1].find('a')['href']

'/title/tt2488496/?ref_=bo_cso_table_1'

#### Previewing Lifetime Movie Data

In [39]:
# Map each movie title to [link stub, then the text of every <td> in its row].
# NOTE(review): the dict is keyed by title, so a later row with the same title
# silently replaces an earlier one (remakes / re-releases) -- confirm whether
# the chart contains duplicate titles and key by link stub if it does.
movies = {}

# rows[0] is skipped (presumably the header row -- rows[1] is rank 1);
# the slice takes at most the next 200 rows.
for row in rows[1:201]:
    items = row.find_all('td')
    # The second cell holds the link to the movie's own page.
    link = items[1].find('a')
    title = link.text
    url = link['href']
    movies[title] = [url]+ [i.text for i in items]

movies

{'Star Wars: Episode VII - The Force Awakens': ['/title/tt2488496/?ref_=bo_cso_table_1',
  '1',
  'Star Wars: Episode VII - The Force Awakens',
  '$936,662,225',
  '2015'],
 'Avengers: Endgame': ['/title/tt4154796/?ref_=bo_cso_table_2',
  '2',
  'Avengers: Endgame',
  '$858,373,000',
  '2019'],
 'Avatar': ['/title/tt0499549/?ref_=bo_cso_table_3',
  '3',
  'Avatar',
  '$760,507,625',
  '2009'],
 'Black Panther': ['/title/tt1825683/?ref_=bo_cso_table_4',
  '4',
  'Black Panther',
  '$700,426,566',
  '2018'],
 'Avengers: Infinity War': ['/title/tt4154756/?ref_=bo_cso_table_5',
  '5',
  'Avengers: Infinity War',
  '$678,815,482',
  '2018'],
 'Titanic': ['/title/tt0120338/?ref_=bo_cso_table_6',
  '6',
  'Titanic',
  '$659,363,944',
  '1997'],
 'Jurassic World': ['/title/tt0369610/?ref_=bo_cso_table_7',
  '7',
  'Jurassic World',
  '$652,340,625',
  '2015'],
 'The Avengers': ['/title/tt0848228/?ref_=bo_cso_table_8',
  '8',
  'The Avengers',
  '$623,357,910',
  '2012'],
 'Star Wars: Episode V

In [40]:
lifetime_movies = pd.DataFrame(movies).T
lifetime_movies.columns = ['link_stub', 'rank', 'title', 'domestic_gross','release_year']
lifetime_movies

Unnamed: 0,link_stub,rank,title,domestic_gross,release_year
Star Wars: Episode VII - The Force Awakens,/title/tt2488496/?ref_=bo_cso_table_1,1,Star Wars: Episode VII - The Force Awakens,"$936,662,225",2015
Avengers: Endgame,/title/tt4154796/?ref_=bo_cso_table_2,2,Avengers: Endgame,"$858,373,000",2019
Avatar,/title/tt0499549/?ref_=bo_cso_table_3,3,Avatar,"$760,507,625",2009
Black Panther,/title/tt1825683/?ref_=bo_cso_table_4,4,Black Panther,"$700,426,566",2018
Avengers: Infinity War,/title/tt4154756/?ref_=bo_cso_table_5,5,Avengers: Infinity War,"$678,815,482",2018
...,...,...,...,...,...
Fast Five,/title/tt1596343/?ref_=bo_cso_table_196,196,Fast Five,"$209,837,675",2011
Mission: Impossible - Ghost Protocol,/title/tt1229238/?ref_=bo_cso_table_197,197,Mission: Impossible - Ghost Protocol,"$209,397,903",2011
Wedding Crashers,/title/tt0396269/?ref_=bo_cso_table_198,198,Wedding Crashers,"$209,255,921",2005
Sherlock Holmes,/title/tt0988045/?ref_=bo_cso_table_199,199,Sherlock Holmes,"$209,028,679",2009


#### Storing movie page data into a dictionary

In [42]:
def _money_spans(soup, css_class):
    """Return the <span class="money"> tags inside the first element with css_class."""
    container = soup.find(class_=css_class)
    if container is None:
        return []
    return container.find_all('span', class_='money')


def _money_at(spans, index):
    """Return spans[index] as an integer amount, or 0 if it is absent or unparseable."""
    try:
        return money_to_int(spans[index].text)
    except (IndexError, ValueError):
        return 0


def get_movie_dict(link, timeout=30):
    """
    Scrape one BoxOfficeMojo movie page into a dictionary.

    The link stub is appended to the site's base url, the page is requested
    and parsed with BeautifulSoup, and the following keys are collected:
        - title
        - domestic_opening_gross
        - budget
        - domestic_gross
        - international_gross
        - worldwide_gross
        - runtime_minutes
        - rating
        - release_month
        - release_date
        - genres

    Parameters:
        link: link stub of the movie page, e.g. '/title/tt2488496/'.
        timeout: seconds passed to requests.get so a stalled connection
            cannot hang the whole scrape.

    Returns:
        dict with the keys listed above. Money amounts that are missing or
        unparseable are 0, runtime / rating / release values that are
        missing are None, and missing genres are an empty list.

    Raises:
        requests.HTTPError: the server answered with an error status.
            Without this check an error page would be turned into a
            record full of default values.
    """
    base_url = 'https://www.boxofficemojo.com'

    # Request HTML and parse
    response = requests.get(base_url + link, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    # Get title: strip the site suffix only when it is really there,
    # instead of blindly chopping the last 18 characters.
    title_suffix = ' - Box Office Mojo'
    title_tag = soup.find('title')
    title = title_tag.text if title_tag is not None else ''
    if title.endswith(title_suffix):
        title = title[:-len(title_suffix)]

    # Get domestic / international / worldwide gross.
    # NOTE(review): amounts are read by position; if a page omits one of
    # them the remaining values may shift into the wrong field -- confirm
    # against a title without an international release.
    performance = _money_spans(soup, 'mojo-performance-summary-table')
    domestic_gross = _money_at(performance, 0)
    international_gross = _money_at(performance, 1)
    worldwide_gross = _money_at(performance, 2)

    # Get domestic opening gross and budget (same positional caveat).
    summary = _money_spans(soup, 'mojo-summary-values')
    domestic_opening_gross = _money_at(summary, 0)
    budget = _money_at(summary, 1)

    # NOTE(review): the distributor used to be extracted here (truncated to
    # 18 characters) but was never part of the returned record, so the dead
    # lookup was removed. Add a 'distributor' key if it is wanted.

    # Get runtime
    raw_runtime = get_movie_value(soup, 'Running')
    runtime_minutes = runtime_to_minutes(raw_runtime) if raw_runtime else None

    # Get rating
    rating = get_movie_value(soup, 'MPAA')

    # Get genres
    raw_genres = get_movie_value(soup, 'Genres')
    genres = raw_genres.split() if raw_genres else []

    # Get release date and month from a single lookup; only the first line
    # of the value holds the date itself.
    release_date = None
    release_month = None
    raw_release = get_movie_value(soup, 'Release Date')
    if raw_release:
        try:
            release_date = to_date(raw_release.split('\n')[0])
        except (ValueError, OverflowError):
            release_date = None
        else:
            release_month = release_date.month

    # Create movie dictionary and return (key order is the column order
    # of the resulting DataFrame).
    return {
        'title': title,
        'domestic_opening_gross': domestic_opening_gross,
        'budget': budget,
        'domestic_gross': domestic_gross,
        'international_gross': international_gross,
        'worldwide_gross': worldwide_gross,
        'runtime_minutes': runtime_minutes,
        'rating': rating,
        'release_month': release_month,
        'release_date': release_date,
        'genres': genres,
    }

In [None]:
# Scrape every movie page listed in the ranking table.
# get_movie_dict() issues one HTTP request per link stub, so this loop makes
# as many requests as there are rows in lifetime_movies.
lifetime_movie_data = []
for link in lifetime_movies.link_stub:
    lifetime_movie_data.append(get_movie_dict(link))

In [None]:
lifetime_movie_data

In [None]:
lifetime_movies_info = pd.DataFrame(lifetime_movie_data)
lifetime_movies_info.set_index('title', inplace = True)

In [None]:
lifetime_movies_info.info()

In [None]:
# Uncomment the following lines and change the numeric suffix to match the chart page being scraped
#lifetime_movies_info_5 = lifetime_movies_info
#lifetime_movies_5 = lifetime_movies
#lifetime_movies_5 = lifetime_movies_5.merge(lifetime_movies_info_5, left_index=True, right_index = True)
#lifetime_movies_5.head()

In [None]:
lifetime_movies = lifetime_movies.merge(lifetime_movies_info, left_index=True, right_index=True)

lifetime_movies.head()

In [None]:
lifetime_movies.info()

In [None]:
# Uncomment the following line and change the file name for each chart page scraped
#lifetime_movies.to_csv('lifetime_movies.csv')

In [None]:
#df1 = pd.read_csv('lifetime_movies.csv')
#df2 = pd.read_csv('lifetime_movies_2.csv')
#df3 = pd.read_csv('lifetime_movies_3.csv')
#df4 = pd.read_csv('lifetime_movies_4.csv')
#df5 = pd.read_csv('lifetime_movies_5.csv')
#frames = [df1, df2, df3, df4, df5]
#mojo_data = pd.concat(frames)
#mojo_data.to_csv('mojo_data.csv')

#df = pd.read_csv('movie_list.csv')
#df = pd.read_csv('mojo_data.csv', index_col = [0])
#df.info()

In [None]:
#mojo_data.info()
