# PROJECT - 2

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import dateutil.parser

### Getting  information from a movie page

In [2]:
url = 'https://www.boxofficemojo.com/title/tt2488496/?ref_=bo_cso_table_1'
response = requests.get(url)
page = response.text
soup = BeautifulSoup(page, 'lxml')
# soup.prettify()

#### Movie Title

In [3]:
soup.find('title').text
title_string = soup.find('title').text
title_string

'Star Wars: Episode VII - The Force Awakens - Box Office Mojo'

In [4]:
title = title_string[:-18]
title

'Star Wars: Episode VII - The Force Awakens'

#### Domestic Gross

In [5]:
soup.find(class_='mojo-performance-summary-table').find_all('span',class_='money')

[<span class="money">$936,662,225</span>,
 <span class="money">$1,131,791,908</span>,
 <span class="money">$2,068,454,133</span>]

In [6]:
dom_gross = soup.find(class_='mojo-performance-summary-table').find_all('span',class_='money')[0].text
dom_gross

'$936,662,225'

#### International Gross

In [7]:
intl_gross = soup.find(class_='mojo-performance-summary-table').find_all('span',class_='money')[1].text
intl_gross

'$1,131,791,908'

#### Worldwide Gross

In [8]:
world_gross = soup.find(class_='mojo-performance-summary-table').find_all('span',class_='money')[2].text
world_gross

'$2,068,454,133'

#### Domestic Distributor

In [9]:
soup.find(class_='mojo-summary-values').find_all('span', class_ = 'money')

[<span class="money">$247,966,675</span>,
 <span class="money">$245,000,000</span>]

In [10]:
distributor_reggex = re.compile('Distributor')
soup.find(text=distributor_reggex)

'Domestic Distributor'

In [11]:
distr_string = find(text=distributor_reggex).findNext('span').text

NameError: name 'find' is not defined

In [12]:
dist_string = soup.find(text=distributor_reggex).findNext('span').text[:18]
dom_dist = dist_string
dom_dist

'Walt Disney Studio'

#### Domestic Opening Gross

In [13]:
dom_open_gross = soup.find(class_='mojo-summary-values').find_all('span', class_ = 'money')
domopening_reggex = re.compile('Domestic Open')
soup.find(text=domopening_reggex)

'Domestic Opening'

In [14]:
dom_opening_gross = soup.find(text=domopening_reggex).findNext(class_='money').text
dom_opening_gross

'$247,966,675'

#### Budget

####  Release Month

In [15]:
release_reggex = re.compile('Release Date')
soup.find(text=release_reggex).findNext().text.split()[0]
release_month = soup.find(text=release_reggex).findNext().text.split()[0]
release_month

'December'

#### Get Movie Values

#### Getting findNext object value  

In [16]:
def get_movie_value(soup, field_name):
    
    """
    Get a value fro the movie page and take the string attribute of a movie on the page and return
    the value of the next sibling object or None if nothing is found.
    
    """
    obj = soup.find(text=re.compile(field_name))
    if not obj:
        return None
    next_element = obj.findNext()
    
    if next_element:
        return next_element.text
    else:
        return None
    

In [41]:
dom_open_gross = get_movie_value(soup,'Domestic Open')
print(dom_open_gross)

$247,966,675


In [18]:
runtime = get_movie_value(soup, 'Running')
print(runtime)

2 hr 18 min


In [19]:
rating = get_movie_value(soup, 'MPAA')
print(rating)

PG-13


In [20]:
rel_date = get_movie_value(soup, 'Release Date')
print(rel_date)

December 16, 2015
            (APAC, EMEA)


In [21]:
genres = get_movie_value(soup, 'Genres').split()
genres

['Action', 'Adventure', 'Sci-Fi']

In [22]:
distributor = get_movie_value(soup, 'Distributor')[:35]
distributor

'Walt Disney Studios Motion Pictures'

In [23]:
def money_to_int(moneystring):
    
    """
    replaces the $ sign and removing the comma separator from the money string
    returns the integer value
    """
    
    moneystring = moneystring.replace('$', '').replace(',', '')
    return int(moneystring)

def runtime_to_minutes(runtimestring):
    
    """
    takes runtimestring in _hr_min format and returns it's value in number of minutes
    """
    
    runtime = runtimestring.split()
    try:
        minutes = int(runtime[0]) * 60 + int(runtime[2])
        return minutes
    except:
        return None

def to_date(datestring):
    
    """
    """
    
    date = dateutil.parser.parse(datestring)
    return date

In [24]:
raw_dom_gross = dom_gross
domestic_gross = money_to_int(raw_dom_gross)

raw_intl_gross = intl_gross
international_gross = money_to_int(raw_intl_gross)

raw_world_gross = world_gross
worldwide_gross = money_to_int(raw_world_gross)

raw_opening_gross = dom_opening_gross
domestic_opening_gross = money_to_int(raw_opening_gross)


In [25]:
raw_runtime = get_movie_value(soup, 'Running')
runtime_minutes = runtime_to_minutes(raw_runtime)
runtime_minutes

138

In [26]:
release_month = rel_date.split()[0]
release_month

'December'

#### Movie Page Dictionary

In [30]:
headers = ['movie title', 'domestic total gross', 'international total gross','domestic opening gross','runtime (mins)',
           'rating', 'release month', 'genre']

movie_data = []
movie_dict = dict(zip(headers,[title,
                               domestic_gross,
                               international_gross,
                               domestic_opening_gross,
                               runtime_minutes,
                               rating,
                               release_month,
                               genres]))
movie_data.append(movie_dict)
movie_data

[{'movie title': 'Star Wars: Episode VII - The Force Awakens',
  'domestic total gross': 936662225,
  'international total gross': 1131791908,
  'domestic opening gross': 247966675,
  'runtime (mins)': 138,
  'rating': 'PG-13',
  'release month': 'December',
  'genre': ['Action', 'Adventure', 'Sci-Fi']}]

### Getting the lifetime ranking table

In [31]:
url = 'https://www.boxofficemojo.com/chart/ww_top_lifetime_gross/?area=XWW'
response = requests.get(url)
page = response.text
soup = BeautifulSoup(page, 'lxml')
# print(soup.prettify())

In [32]:
# finding the table
table = soup.find('table')
# table

In [33]:
rows = [row for row in table.find_all('tr')]
rows[1].find_all('td')

[<td class="a-text-right mojo-header-column mojo-truncate mojo-field-type-rank">1</td>,
 <td class="a-text-left mojo-field-type-title"><a class="a-link-normal" href="/title/tt4154796/?ref_=bo_cso_table_1">Avengers: Endgame</a></td>,
 <td class="a-text-right mojo-field-type-money">$2,797,800,564</td>,
 <td class="a-text-right mojo-field-type-money">$858,373,000</td>,
 <td class="a-text-right mojo-field-type-percent">30.7%</td>,
 <td class="a-text-right mojo-field-type-money">$1,939,427,564</td>,
 <td class="a-text-right mojo-field-type-percent">69.3%</td>,
 <td class="a-text-left mojo-field-type-year"><a class="a-link-normal" href="/year/world/2019/?ref_=bo_cso_table_1">2019</a></td>]

In [34]:
rows[1].find_all('td')[1].find('a')['href']

'/title/tt4154796/?ref_=bo_cso_table_1'

In [35]:
movies = {}

for row in rows[1:200]:
    items = row.find_all('td')
    link = items[1].find('a')
    title = link.text
    url = link['href']
    movies[title] = [url]+ [i.text for i in items]

movies

{'Avengers: Endgame': ['/title/tt4154796/?ref_=bo_cso_table_1',
  '1',
  'Avengers: Endgame',
  '$2,797,800,564',
  '$858,373,000',
  '30.7%',
  '$1,939,427,564',
  '69.3%',
  '2019'],
 'Avatar': ['/title/tt0499549/?ref_=bo_cso_table_2',
  '2',
  'Avatar',
  '$2,790,439,092',
  '$760,507,625',
  '27.2%',
  '$2,029,931,467',
  '72.8%',
  '2009'],
 'Titanic': ['/title/tt0120338/?ref_=bo_cso_table_3',
  '3',
  'Titanic',
  '$2,471,751,922',
  '$659,363,944',
  '26.7%',
  '$1,812,387,978',
  '73.3%',
  '1997'],
 'Star Wars: Episode VII - The Force Awakens': ['/title/tt2488496/?ref_=bo_cso_table_4',
  '4',
  'Star Wars: Episode VII - The Force Awakens',
  '$2,068,454,133',
  '$936,662,225',
  '45.3%',
  '$1,131,791,908',
  '54.7%',
  '2015'],
 'Avengers: Infinity War': ['/title/tt4154756/?ref_=bo_cso_table_5',
  '5',
  'Avengers: Infinity War',
  '$2,048,359,754',
  '$678,815,482',
  '33.1%',
  '$1,369,544,272',
  '66.9%',
  '2018'],
 'Jurassic World': ['/title/tt0369610/?ref_=bo_cso_table_

In [36]:
lifetime_movies = pd.DataFrame(movies).T
lifetime_movies.columns = ['link_stub', 'rank', 'title', 'WW_gross', 'domestic_gross','dom_%', 'foreign_gross','for_%','release_year']
lifetime_movies.head()

Unnamed: 0,link_stub,rank,title,WW_gross,domestic_gross,dom_%,foreign_gross,for_%,release_year
Avengers: Endgame,/title/tt4154796/?ref_=bo_cso_table_1,1,Avengers: Endgame,"$2,797,800,564","$858,373,000",30.7%,"$1,939,427,564",69.3%,2019
Avatar,/title/tt0499549/?ref_=bo_cso_table_2,2,Avatar,"$2,790,439,092","$760,507,625",27.2%,"$2,029,931,467",72.8%,2009
Titanic,/title/tt0120338/?ref_=bo_cso_table_3,3,Titanic,"$2,471,751,922","$659,363,944",26.7%,"$1,812,387,978",73.3%,1997
Star Wars: Episode VII - The Force Awakens,/title/tt2488496/?ref_=bo_cso_table_4,4,Star Wars: Episode VII - The Force Awakens,"$2,068,454,133","$936,662,225",45.3%,"$1,131,791,908",54.7%,2015
Avengers: Infinity War,/title/tt4154756/?ref_=bo_cso_table_5,5,Avengers: Infinity War,"$2,048,359,754","$678,815,482",33.1%,"$1,369,544,272",66.9%,2018


In [37]:
def get_movie_dict(link):
    """
    From BoxOfficeMojo link stub, request movie html, parse with BeautifulSoup, and
    collect 
        - title 
        - domestic gross
        - runtime 
        - MPAA rating
        - full release date
    Return information as a dictionary
    
    """
    
    base_url = 'https://www.boxofficemojo.com'
    
    #Create full url to scrape
    url = base_url + link
    
    #Request HTML and parse
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page,"lxml")
    
    headers = ['title', 'domestic_gross', 'domestic_opening_gross', 'international_gross', 'worldwide_gross',
               'runtime_minutes', 'rating', 'release_month', 'genres']
    
    #Get title
    title_string = soup.find('title').text
    title = title_string[:-18]

    #Get domestic gross
    raw_dom_gross = soup.find(class_='mojo-performance-summary-table').find_all('span',class_='money')[0].text
    domestic_gross = money_to_int(raw_dom_gross)
    
    #dom_gross = money_to_int(dom_gross)
    
    #Get international_gross
    raw_intl_gross = soup.find(class_='mojo-performance-summary-table').find_all('span',class_='money')[1].text
    international_gross = money_to_int(raw_intl_gross)
    
    #Get worldwide gross
    raw_world_gross = soup.find(class_='mojo-performance-summary-table').find_all('span',class_='money')[2].text
    worldwide_gross = money_to_int(raw_world_gross)
    
    #Get domestic opening gross
    raw_opening_gross = dom_opening_gross
    domestic_opening_gross = money_to_int(raw_opening_gross)
    
    #Get distributor
    distributor = get_movie_value(soup, 'Distributor')[:18]
    
    #Get runtime
    raw_runtime = get_movie_value(soup, 'Running')
    runtime_minutes = runtime_to_minutes(raw_runtime)
    
    #Get rating
    rating = get_movie_value(soup, 'MPAA')
    
    #Get genres
    genres = get_movie_value(soup, 'Genres').split()
    
    #Get release month
    rel_date = get_movie_value(soup, 'Release Date')
    release_month = rel_date.split()[0]
    
    
    #Create movie dictionary and return
    movie_dict = dict(zip(headers, [title,
                                    domestic_gross,
                                    domestic_opening_gross,
                                    international_gross,
                                    worldwide_gross,
                                    runtime_minutes,
                                    rating,
                                    release_month,
                                    genres]))

    return movie_dict

In [38]:
lifetime_movie_data = []
for link in lifetime_movies.link_stub:
    lifetime_movie_data.append(get_movie_dict(link))

In [40]:
lifetime_movie_data

[{'title': 'Avengers: Endgame',
  'domestic_gross': 858373000,
  'domestic_opening_gross': 247966675,
  'international_gross': 1939427564,
  'worldwide_gross': 2797800564,
  'runtime_minutes': 181,
  'rating': 'PG-13',
  'release_month': 'April',
  'genres': ['Action', 'Adventure', 'Drama', 'Sci-Fi']},
 {'title': 'Avatar',
  'domestic_gross': 760507625,
  'domestic_opening_gross': 247966675,
  'international_gross': 2029931467,
  'worldwide_gross': 2790439092,
  'runtime_minutes': 162,
  'rating': 'PG-13',
  'release_month': 'December',
  'genres': ['Action', 'Adventure', 'Fantasy', 'Sci-Fi']},
 {'title': 'Titanic',
  'domestic_gross': 659363944,
  'domestic_opening_gross': 247966675,
  'international_gross': 1812387978,
  'worldwide_gross': 2471751922,
  'runtime_minutes': 194,
  'rating': 'PG-13',
  'release_month': 'December',
  'genres': ['Drama', 'Romance']},
 {'title': 'Star Wars: Episode VII - The Force Awakens',
  'domestic_gross': 936662225,
  'domestic_opening_gross': 2479666

In [None]:
lifetime_movies_info = pd.DataFrame(lifetime_movie_data)
lifetime_movies_info.set_index('title', inplace = True)

In [None]:
lifetime_movies_info.head()

In [None]:
lifetime_movies = lifetime_movies.merge(lifetime_movies_info, left_index=True, right_index=True)

lifetime_movies.head()

#### Director & Actors

In [39]:
url = 'https://www.boxofficemojo.com/title/tt2488496/credits/?ref_=bo_tt_tab'
response = requests.get(url)
page = response.text
soup = BeautifulSoup(page, 'lxml')

In [None]:
casts = soup.find('table', id = 'principalCast').find_all('a', attrs={'class':'a-link-normal'})
actors = []
for cast in casts:
    cast = cast.text.split('\n')[0]
    actors += [cast]
actors

In [None]:
movies = pd.DataFrame(movies).T  #transpose
movies.columns = ['link_stub', 'title', 'rank_g_movies', 
                    'lifetime_gross', 'rank_overall', 'year']

g_movies.head()

In [None]:
g_movies_page_info_list = []

for link in g_movies.link_stub:
    g_movies_page_info_list.append(get_movie_dict(link))

### IMDB Top 100 Actors 2018

In [None]:
url = 'https://www.imdb.com/list/ls023242359/'
response = requests.get(url)
page = response.text
soup = BeautifulSoup(page, 'lxml')

In [None]:
soup.find_all('span', class_='lister-item-index')[0].text

In [None]:
soup.find('div', class_='lister-item-content').find_all('a')

In [None]:
soup.find('div', class_='lister-item-content').find('a').text.split('\n')[0]

In [None]:
top_list = soup.find('div', class_='lister-list').find_all('a')
top_list

In [None]:
soup.find('div', class_='lister-list').find_all('a')[1].text.split('\n')[0]

In [None]:

top_actor = []
for top in top_list:
    top = top.text.split('\n')[0]
    top_actor += [top]
  
top_actor

In [None]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# chromedriver = "/Applications/chromedriver" # path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver
driver = webdriver.Chrome(chromedriver)
driver.get('https://www.imdb.com/list/ls023242359/')

assert "imdb" in driver.title

continue_link = driver.find_element_by_tag_name('a')
elem = driver.find_elements_by_xpath("//*[@href]/name*")
#x = str(continue_link)
#print(continue_link)
print(elem)

In [None]:
soup.find('div', class_='lister-list').find_all('a').get('href', ref_='nmls_hd')