# PROJECT - 2

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

### Getting more information from a movie page

In [2]:
url = 'https://www.boxofficemojo.com/title/tt2488496/?ref_=bo_cso_table_1'
response = requests.get(url)
page = response.text
soup = BeautifulSoup(page, 'lxml')
# soup.prettify()

#### Movie Title

In [3]:
soup.find('title').text
title_string = soup.find('title').text
title_string

'Star Wars: Episode VII - The Force Awakens - Box Office Mojo'

In [4]:
title = title_string[:-18]
title

'Star Wars: Episode VII - The Force Awakens'

#### Domestic Gross

In [5]:
soup.find(class_='mojo-performance-summary-table').find_all('span',class_='money')

[<span class="money">$936,662,225</span>,
 <span class="money">$1,131,791,908</span>,
 <span class="money">$2,068,454,133</span>]

In [6]:
domestic_gross = soup.find(class_='mojo-performance-summary-table').find_all('span',class_='money')[0].text
domestic_gross

'$936,662,225'

#### International Gross

In [7]:
international_gross = soup.find(class_='mojo-performance-summary-table').find_all('span',class_='money')[1].text
international_gross

'$1,131,791,908'

#### Worldwide Gross

In [8]:
worldwide_gross = soup.find(class_='mojo-performance-summary-table').find_all('span',class_='money')[2].text
worldwide_gross

'$2,068,454,133'

#### Domestic Distributor

In [9]:
soup.find(class_='mojo-summary-values').find_all('span', class_ = 'money')

[<span class="money">$247,966,675</span>,
 <span class="money">$245,000,000</span>]

In [10]:
distributor_reggex = re.compile('Distributor')
soup.find(text=distributor_reggex)

'Domestic Distributor'

In [11]:
dist_string = soup.find(text=distributor_reggex).findNext('span').text[:35]
dom_dist = dist_string
dom_dist

'Walt Disney Studios Motion Pictures'

#### Domestic Opening Gross

In [12]:
domopening_reggex = re.compile('Domestic Open')
soup.find(text=domopening_reggex)

'Domestic Opening'

In [13]:
domestic_opening_gross = soup.find(text=domopening_reggex).findNext(class_='money').text
domestic_opening_gross

'$247,966,675'

#### Budget

####  Release Month

In [14]:
release_reggex = re.compile('Release Date')
soup.find(text=release_reggex).findNext().text.split()[0]
release_month = soup.find(text=release_reggex).findNext().text.split()[0]
release_month

'December'

#### MPAA Rating

In [15]:
rating_reggex = re.compile('MPAA')
rating = soup.find(text=rating_reggex).findNext('span').text
rating

'PG-13'

#### Running Time

In [16]:
runtime_reggex = re.compile('Run')
rt = soup.find(text=runtime_reggex).findNext('span').text.split()
runtime_minutes = int(rt[0])*60 + int(rt[2])
runtime_minutes

138

#### Genres

In [17]:
genres_reggex = re.compile('Genres')
genres = soup.find(text=genres_reggex).findNext('span').text.split()
genres

['Action', 'Adventure', 'Sci-Fi']

In [18]:
headers = ['movie title', 'domestic total gross', 'international total gross','domestic opening gross','runtime (mins)',
           'rating', 'release date', 'genre']

movie_data = []
movie_dict = dict(zip(headers,[title,
                               domestic_gross,
                               international_gross,
                               domestic_opening_gross,
                               runtime_minutes,
                               rating,
                               release_month,
                               genres]))
movie_data.append(movie_dict)
movie_data

[{'movie title': 'Star Wars: Episode VII - The Force Awakens',
  'domestic total gross': '$936,662,225',
  'international total gross': '$1,131,791,908',
  'domestic opening gross': '$247,966,675',
  'runtime (mins)': 138,
  'rating': 'PG-13',
  'release date': 'December',
  'genre': ['Action', 'Adventure', 'Sci-Fi']}]

### Getting the lifetime ranking table

In [19]:
url = 'https://www.boxofficemojo.com/chart/top_lifetime_gross/'
response = requests.get(url)
page = response.text
soup = BeautifulSoup(page, 'lxml')
# print(soup.prettify())

In [20]:
# finding the table
table = soup.find('table')
# table

In [21]:
rows = [row for row in table.find_all('tr')]
rows[1].find_all('td')

[<td class="a-text-right mojo-header-column mojo-truncate mojo-field-type-rank">1</td>,
 <td class="a-text-left mojo-field-type-title"><a class="a-link-normal" href="/title/tt2488496/?ref_=bo_cso_table_1">Star Wars: Episode VII - The Force Awakens</a></td>,
 <td class="a-text-right mojo-field-type-money">$936,662,225</td>,
 <td class="a-text-left mojo-field-type-year"><a class="a-link-normal" href="/year/2015/?ref_=bo_cso_table_1">2015</a></td>]

In [22]:
rows[1].find_all('td')[1].find('a')['href']

'/title/tt2488496/?ref_=bo_cso_table_1'

In [23]:
movies = {}

for row in rows[1:10]:
    items = row.find_all('td')
    link = items[1].find('a')
    title = link.text
    url = link['href']
    movies[title] = [url]+ [i.text for i in items]

movies

{'Star Wars: Episode VII - The Force Awakens': ['/title/tt2488496/?ref_=bo_cso_table_1',
  '1',
  'Star Wars: Episode VII - The Force Awakens',
  '$936,662,225',
  '2015'],
 'Avengers: Endgame': ['/title/tt4154796/?ref_=bo_cso_table_2',
  '2',
  'Avengers: Endgame',
  '$858,373,000',
  '2019'],
 'Avatar': ['/title/tt0499549/?ref_=bo_cso_table_3',
  '3',
  'Avatar',
  '$760,507,625',
  '2009'],
 'Black Panther': ['/title/tt1825683/?ref_=bo_cso_table_4',
  '4',
  'Black Panther',
  '$700,426,566',
  '2018'],
 'Avengers: Infinity War': ['/title/tt4154756/?ref_=bo_cso_table_5',
  '5',
  'Avengers: Infinity War',
  '$678,815,482',
  '2018'],
 'Titanic': ['/title/tt0120338/?ref_=bo_cso_table_6',
  '6',
  'Titanic',
  '$659,363,944',
  '1997'],
 'Jurassic World': ['/title/tt0369610/?ref_=bo_cso_table_7',
  '7',
  'Jurassic World',
  '$652,295,625',
  '2015'],
 'The Avengers': ['/title/tt0848228/?ref_=bo_cso_table_8',
  '8',
  'The Avengers',
  '$623,357,910',
  '2012'],
 'Star Wars: Episode V

In [24]:
lifetime_movies = pd.DataFrame(movies).T
lifetime_movies.columns = ['link_stub', 'rank', 'title', 'lifetime_gross', 'release_year']
lifetime_movies.head()

Unnamed: 0,link_stub,rank,title,lifetime_gross,release_year
Star Wars: Episode VII - The Force Awakens,/title/tt2488496/?ref_=bo_cso_table_1,1,Star Wars: Episode VII - The Force Awakens,"$936,662,225",2015
Avengers: Endgame,/title/tt4154796/?ref_=bo_cso_table_2,2,Avengers: Endgame,"$858,373,000",2019
Avatar,/title/tt0499549/?ref_=bo_cso_table_3,3,Avatar,"$760,507,625",2009
Black Panther,/title/tt1825683/?ref_=bo_cso_table_4,4,Black Panther,"$700,426,566",2018
Avengers: Infinity War,/title/tt4154756/?ref_=bo_cso_table_5,5,Avengers: Infinity War,"$678,815,482",2018


In [25]:
def get_movie_dict(link):
    """
    From BoxOfficeMojo link stub, request movie html, parse with BeautifulSoup, and
    collect 
        - title 
        - domestic gross
        - runtime 
        - MPAA rating
        - full release date
    Return information as a dictionary
    
    """
    
    base_url = 'https://www.boxofficemojo.com'
    
    #Create full url to scrape
    url = base_url + link
    
    #Request HTML and parse
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page,"lxml")
    
    headers = ['title', 'domestic_gross', 'international_gross','worldwide_gross', 'domestic_opening_gross',
               'runtime_minutes', 'rating', 'release_month', 'genres']
    
    #Get title
    title_string = soup.find('title').text
    title = title_string[:-18]

    #Get domestic gross
    domestic_gross = soup.find(class_='mojo-performance-summary-table').find_all('span',class_='money')[0].text
    #dom_gross = money_to_int(dom_gross)
    
    #Get international_gross
    international_gross = soup.find(class_='mojo-performance-summary-table').find_all('span',class_='money')[1].text
    
    #Get worldwide gross
    worldwide_gross = soup.find(class_='mojo-performance-summary-table').find_all('span',class_='money')[2].text
    
    #Get domestic opening gross
    domestic_opening_gross = soup.find(text=domopening_reggex).findNext(class_='money').text
    
    
    #Get runtime
    runtime_reggex = re.compile('Run')
    rt = soup.find(text=runtime_reggex).findNext('span').text.split()
    runtime_minutes = int(rt[0])*60 + int(rt[2])
    
    #Get rating
    rating_reggex = re.compile('MPAA')
    rating = soup.find(text=rating_reggex).findNext('span').text
    
    #Get genres
    genres_reggex = re.compile('Genres')
    genres = soup.find(text=genres_reggex).findNext('span').text.split()
    
    #Get release month
    release_reggex = re.compile('Release Date')
    release_month = soup.find(text=release_reggex).findNext().text.split()[0]
    
    
    #Create movie dictionary and return
    movie_dict = dict(zip(headers, [title,
                                    domestic_gross,
                                    international_gross,
                                    worldwide_gross,
                                    domestic_opening_gross,
                                    runtime_minutes,
                                    rating,
                                    release_month,
                                    genres]))

    return movie_dict

In [26]:
lifetime_page_data = []

for link in lifetime_movies.link_stub:
    lifetime_page_data.append(get_movie_dict(link))

In [27]:
lifetime_page_data

[{'title': 'Star Wars: Episode VII - The Force Awakens',
  'domestic_gross': '$936,662,225',
  'international_gross': '$1,131,791,908',
  'worldwide_gross': '$2,068,454,133',
  'domestic_opening_gross': '$247,966,675',
  'runtime_minutes': 138,
  'rating': 'PG-13',
  'release_month': 'December',
  'genres': ['Action', 'Adventure', 'Sci-Fi']},
 {'title': 'Avengers: Endgame',
  'domestic_gross': '$858,373,000',
  'international_gross': '$1,939,427,564',
  'worldwide_gross': '$2,797,800,564',
  'domestic_opening_gross': '$357,115,007',
  'runtime_minutes': 181,
  'rating': 'PG-13',
  'release_month': 'April',
  'genres': ['Action', 'Adventure', 'Drama', 'Sci-Fi']},
 {'title': 'Avatar',
  'domestic_gross': '$760,507,625',
  'international_gross': '$2,029,931,467',
  'worldwide_gross': '$2,790,439,092',
  'domestic_opening_gross': '$77,025,481',
  'runtime_minutes': 162,
  'rating': 'PG-13',
  'release_month': 'December',
  'genres': ['Action', 'Adventure', 'Fantasy', 'Sci-Fi']},
 {'title':

In [28]:
lifetime_page_data = pd.DataFrame(lifetime_page_data)  
lifetime_page_data.set_index('title', inplace=True)

lifetime_page_data

Unnamed: 0_level_0,domestic_gross,international_gross,worldwide_gross,domestic_opening_gross,runtime_minutes,rating,release_month,genres
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Star Wars: Episode VII - The Force Awakens,"$936,662,225","$1,131,791,908","$2,068,454,133","$247,966,675",138,PG-13,December,"[Action, Adventure, Sci-Fi]"
Avengers: Endgame,"$858,373,000","$1,939,427,564","$2,797,800,564","$357,115,007",181,PG-13,April,"[Action, Adventure, Drama, Sci-Fi]"
Avatar,"$760,507,625","$2,029,931,467","$2,790,439,092","$77,025,481",162,PG-13,December,"[Action, Adventure, Fantasy, Sci-Fi]"
Black Panther,"$700,426,566","$647,171,407","$1,347,597,973","$202,003,951",134,PG-13,February,"[Action, Adventure, Sci-Fi]"
Avengers: Infinity War,"$678,815,482","$1,369,544,272","$2,048,359,754","$257,698,183",149,PG-13,April,"[Action, Adventure, Sci-Fi]"
Titanic,"$659,363,944","$1,812,387,978","$2,471,751,922","$28,638,131",194,PG-13,December,"[Drama, Romance]"
Jurassic World,"$652,295,625","$1,018,130,819","$1,670,426,444","$208,806,270",124,PG-13,June,"[Action, Adventure, Sci-Fi]"
The Avengers,"$623,357,910","$895,457,605","$1,518,815,515","$207,438,708",143,PG-13,April,"[Action, Adventure, Sci-Fi]"
Star Wars: Episode VIII - The Last Jedi,"$620,181,382","$712,516,117","$1,332,697,499","$220,009,584",152,PG-13,December,"[Action, Adventure, Fantasy, Sci-Fi]"


In [33]:
lifetime_movies = lifetime_movies.merge(lifetime_page_data, left_index=True, right_index=True)

lifetime_movies

Unnamed: 0,link_stub,rank,title,lifetime_gross,release_year,domestic_gross,international_gross,worldwide_gross,domestic_opening_gross,runtime_minutes,rating,release_month,genres
Star Wars: Episode VII - The Force Awakens,/title/tt2488496/?ref_=bo_cso_table_1,1,Star Wars: Episode VII - The Force Awakens,"$936,662,225",2015,"$936,662,225","$1,131,791,908","$2,068,454,133","$247,966,675",138,PG-13,December,"[Action, Adventure, Sci-Fi]"
Avengers: Endgame,/title/tt4154796/?ref_=bo_cso_table_2,2,Avengers: Endgame,"$858,373,000",2019,"$858,373,000","$1,939,427,564","$2,797,800,564","$357,115,007",181,PG-13,April,"[Action, Adventure, Drama, Sci-Fi]"
Avatar,/title/tt0499549/?ref_=bo_cso_table_3,3,Avatar,"$760,507,625",2009,"$760,507,625","$2,029,931,467","$2,790,439,092","$77,025,481",162,PG-13,December,"[Action, Adventure, Fantasy, Sci-Fi]"
Black Panther,/title/tt1825683/?ref_=bo_cso_table_4,4,Black Panther,"$700,426,566",2018,"$700,426,566","$647,171,407","$1,347,597,973","$202,003,951",134,PG-13,February,"[Action, Adventure, Sci-Fi]"
Avengers: Infinity War,/title/tt4154756/?ref_=bo_cso_table_5,5,Avengers: Infinity War,"$678,815,482",2018,"$678,815,482","$1,369,544,272","$2,048,359,754","$257,698,183",149,PG-13,April,"[Action, Adventure, Sci-Fi]"
Titanic,/title/tt0120338/?ref_=bo_cso_table_6,6,Titanic,"$659,363,944",1997,"$659,363,944","$1,812,387,978","$2,471,751,922","$28,638,131",194,PG-13,December,"[Drama, Romance]"
Jurassic World,/title/tt0369610/?ref_=bo_cso_table_7,7,Jurassic World,"$652,295,625",2015,"$652,295,625","$1,018,130,819","$1,670,426,444","$208,806,270",124,PG-13,June,"[Action, Adventure, Sci-Fi]"
The Avengers,/title/tt0848228/?ref_=bo_cso_table_8,8,The Avengers,"$623,357,910",2012,"$623,357,910","$895,457,605","$1,518,815,515","$207,438,708",143,PG-13,April,"[Action, Adventure, Sci-Fi]"
Star Wars: Episode VIII - The Last Jedi,/title/tt2527336/?ref_=bo_cso_table_9,9,Star Wars: Episode VIII - The Last Jedi,"$620,181,382",2017,"$620,181,382","$712,516,117","$1,332,697,499","$220,009,584",152,PG-13,December,"[Action, Adventure, Fantasy, Sci-Fi]"


#### Director & Actors

In [30]:
url = 'https://www.boxofficemojo.com/title/tt2488496/credits/?ref_=bo_tt_tab'
response = requests.get(url)
page = response.text
soup = BeautifulSoup(page, 'lxml')

In [31]:
casts = soup.find('table', id = 'principalCast').find_all('a', attrs={'class':'a-link-normal'})
actors = []
for cast in casts:
    cast = cast.text.split('\n')[0]
    actors += [cast]
actors

['Daisy Ridley', 'John Boyega', 'Oscar Isaac', 'Domhnall Gleeson']

In [32]:
movies = pd.DataFrame(movies).T  #transpose
movies.columns = ['link_stub', 'title', 'rank_g_movies', 
                    'lifetime_gross', 'rank_overall', 'year']

g_movies.head()

ValueError: Length mismatch: Expected axis has 5 elements, new values have 6 elements

In [None]:
g_movies_page_info_list = []

for link in g_movies.link_stub:
    g_movies_page_info_list.append(get_movie_dict(link))

### IMDB Top 100 Actors 2018

In [None]:
url = 'https://www.imdb.com/list/ls023242359/'
response = requests.get(url)
page = response.text
soup = BeautifulSoup(page, 'lxml')

In [None]:
soup.find_all('span', class_='lister-item-index')[0].text

In [None]:
soup.find('div', class_='lister-item-content').find_all('a')

In [None]:
soup.find('div', class_='lister-item-content').find('a').text.split('\n')[0]

In [None]:
top_list = soup.find('div', class_='lister-list').find_all('a')
top_list

In [None]:
soup.find('div', class_='lister-list').find_all('a')[1].text.split('\n')[0]

In [None]:

top_actor = []
for top in top_list:
    top = top.text.split('\n')[0]
    top_actor += [top]
  
top_actor

In [None]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# chromedriver = "/Applications/chromedriver" # path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver
driver = webdriver.Chrome(chromedriver)
driver.get('https://www.imdb.com/list/ls023242359/')

assert "imdb" in driver.title

continue_link = driver.find_element_by_tag_name('a')
elem = driver.find_elements_by_xpath("//*[@href]/name*")
#x = str(continue_link)
#print(continue_link)
print(elem)

In [None]:
soup.find('div', class_='lister-list').find_all('a').get('href', ref_='nmls_hd')