# PROJECT - 2

In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

### Getting more information from a movie page

In [None]:
# Download one Box Office Mojo title page and parse it for the cells below.
url = 'https://www.boxofficemojo.com/title/tt2488496/?ref_=bo_cso_table_1'
# A timeout keeps the notebook from hanging on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
page = response.text
soup = BeautifulSoup(page, 'lxml')
# soup.prettify()

#### Movie Title

In [None]:
# Text of the page's <title> element (the movie name plus a site-name suffix
# that a later cell strips). The lookup is done once; only the final
# expression of a cell is displayed.
title_string = soup.find('title').text
title_string

In [None]:
# Strip the trailing site name by value rather than by a blind 18-character
# slice, so a title without the suffix is left intact instead of truncated.
# NOTE(review): the suffix text is assumed from the original 18-character slice - confirm against the page.
site_suffix = ' - Box Office Mojo'
title = title_string[:-len(site_suffix)] if title_string.endswith(site_suffix) else title_string
title

#### Domestic Gross

In [None]:
# Preview every <span class="money"> inside the first element carrying the
# performance-summary-table class; the next cells pick values out by position.
soup.find(class_='mojo-performance-summary-table').find_all('span',class_='money')

In [None]:
# First money figure in the performance summary table (taken as the domestic total).
domestic_gross = (
    soup.find(class_='mojo-performance-summary-table')
    .find_all('span', class_='money')[0]
    .get_text()
)
domestic_gross

#### International Gross

In [None]:
# Second money figure in the performance summary table (taken as the international total).
international_gross = (
    soup.find(class_='mojo-performance-summary-table')
    .find_all('span', class_='money')[1]
    .get_text()
)
international_gross

#### Worldwide Gross

In [None]:
# Third money figure in the performance summary table (taken as the worldwide total).
worldwide_gross = (
    soup.find(class_='mojo-performance-summary-table')
    .find_all('span', class_='money')[2]
    .get_text()
)
worldwide_gross

#### Domestic Distributor

In [None]:
# Preview the money spans inside the first element with the summary-values class.
soup.find(class_='mojo-summary-values').find_all('span', class_ = 'money')

In [None]:
# Locate the first text node containing "Distributor"; later cells read the
# value from the element that follows this label.
distributor_reggex = re.compile('Distributor')
soup.find(text=distributor_reggex)

In [None]:
# Take the first non-blank text fragment of the span after the label instead
# of a fixed 35-character slice, which only fits this one distributor's name.
# NOTE(review): assumes the distributor name is the span's first text fragment
# and any trailing link text comes after it - confirm against the page markup.
dist_string = next(soup.find(text=distributor_reggex).findNext('span').stripped_strings, '')
dom_dist = dist_string
dom_dist

#### Domestic Opening Gross

In [None]:
# Locate the first text node containing "Domestic Open"; the next cell reads
# the money value that follows this label.
domopening_reggex = re.compile('Domestic Open')
soup.find(text=domopening_reggex)

In [None]:
# Text of the first money-class element after the "Domestic Open" label.
domestic_opening_gross = (
    soup.find(text=domopening_reggex)
    .findNext(class_='money')
    .get_text()
)
domestic_opening_gross

#### Budget

#### Release Month

In [None]:
release_reggex = re.compile('Release Date')
# First whitespace-separated token of the element that follows the
# "Release Date" label (presumably the month name). The lookup is done once;
# the duplicate expression was discarded without being displayed.
release_month = soup.find(text=release_reggex).findNext().text.split()[0]
release_month

#### MPAA Rating

In [None]:
# Text of the first <span> following the label that contains "MPAA".
rating_reggex = re.compile('MPAA')
rating = (
    soup.find(text=rating_reggex)
    .findNext('span')
    .get_text()
)
rating

#### Running Time

In [None]:
runtime_reggex = re.compile('Run')
# Tokens of the runtime label, e.g. ['2', 'hr', '18', 'min'].
rt = soup.find(text=runtime_reggex).findNext('span').text.split()
# Pair each number with its unit, so "2 hr" and "45 min" work as well as
# "2 hr 18 min"; indexing rt[0] and rt[2] directly raised IndexError on those.
runtime_minutes = sum(int(value) * (60 if unit.startswith('hr') else 1)
                      for value, unit in zip(rt[::2], rt[1::2]))
runtime_minutes

#### Genres

In [None]:
# Whitespace-separated tokens of the first <span> after the "Genres" label.
genres_reggex = re.compile('Genres')
genres = (
    soup.find(text=genres_reggex)
    .findNext('span')
    .get_text()
    .split()
)
genres

In [None]:
# Gather the values scraped above into one record, keyed by display label.
movie_dict = {
    'movie title': title,
    'domestic total gross': domestic_gross,
    'international total gross': international_gross,
    'domestic opening gross': domestic_opening_gross,
    'runtime (mins)': runtime_minutes,
    'rating': rating,
    'release date': release_month,
    'genre': genres,
}
# Column labels, in the order the record was built.
headers = list(movie_dict)

movie_data = [movie_dict]
movie_data

### Getting the lifetime ranking table

In [None]:
# Download and parse the lifetime-gross chart page.
url = 'https://www.boxofficemojo.com/chart/top_lifetime_gross/'
# A timeout keeps the notebook from hanging on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
page = response.text
soup = BeautifulSoup(page, 'lxml')
# print(soup.prettify())

In [None]:
# finding the table
table = soup.find('table')
# table

In [None]:
# All <tr> elements of the table as a plain list, then a peek at the cells of
# the row at index 1.
rows = list(table.find_all('tr'))
rows[1].find_all('td')

In [None]:
# The href of the link in the second cell of that row: the per-movie link stub.
rows[1].find_all('td')[1].find('a')['href']

In [None]:
# Map each movie title to [link stub, <text of every cell in its row>].
movies = {}

# Loop names are deliberately distinct from the notebook-level `title` and
# `url` set in earlier cells, so running this cell no longer overwrites them.
# NOTE(review): rows[1:10] skips rows[0] (presumably the header row) and keeps
# nine rows - confirm that nine rather than ten is intended.
for chart_row in rows[1:10]:
    cells = chart_row.find_all('td')
    title_link = cells[1].find('a')
    movies[title_link.text] = [title_link['href']] + [cell.text for cell in cells]

movies

In [None]:
# One row per movie (titles become the index); the column labels follow the
# order in which each per-movie list was assembled.
lifetime_movies = pd.DataFrame(movies).transpose()
lifetime_movies.columns = [
    'link_stub',
    'rank',
    'title',
    'lifetime_gross',
    'release_year',
]
lifetime_movies.head()

In [None]:
def _strip_site_suffix(title_string, suffix=' - Box Office Mojo'):
    """Return *title_string* without the trailing site name, if it is present."""
    # NOTE(review): the suffix text is assumed from the original 18-character slice.
    if title_string.endswith(suffix):
        return title_string[:-len(suffix)]
    return title_string


def _runtime_to_minutes(runtime_text):
    """Convert a runtime label such as '2 hr 18 min' to whole minutes."""
    tokens = runtime_text.split()
    # Tokens alternate number/unit, so pairing them also covers '2 hr' and '45 min'.
    return sum(int(value) * (60 if unit.startswith('hr') else 1)
               for value, unit in zip(tokens[::2], tokens[1::2]))


def _text_after_label(soup, label, *args, **kwargs):
    """Return the text of the element following the first text node matching *label*.

    Extra arguments are passed to ``find_next`` to choose that element.
    Returns None when the label or the following element is absent.
    """
    label_node = soup.find(string=re.compile(label))
    if label_node is None:
        return None
    value_node = label_node.find_next(*args, **kwargs)
    return None if value_node is None else value_node.text


def get_movie_dict(link):
    """
    From a BoxOfficeMojo link stub, request the movie page, parse it with
    BeautifulSoup, and collect
        - title
        - domestic_gross, international_gross, worldwide_gross
        - domestic_opening_gross
        - runtime_minutes
        - rating (MPAA)
        - release_month
        - genres (list of strings)
    Return the information as a dictionary keyed by those names.

    Labelled fields (opening gross, runtime, rating, release month, genres)
    are None when their label is not found on the page.
    Raises requests.HTTPError when the server answers with an error status.
    """
    base_url = 'https://www.boxofficemojo.com'

    # Create full url to scrape
    url = base_url + link

    # Request HTML and parse; fail fast on a stalled connection or error page
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    # Get title
    title = _strip_site_suffix(soup.find('title').text)

    # Get the three gross figures with a single table lookup.
    # NOTE(review): values are taken by position (domestic, international,
    # worldwide); a page lacking one of them would shift or break this - confirm.
    money_spans = soup.find(class_='mojo-performance-summary-table').find_all('span', class_='money')
    domestic_gross = money_spans[0].text
    international_gross = money_spans[1].text
    worldwide_gross = money_spans[2].text

    # Get domestic opening gross (pattern is local, so the function no longer
    # depends on a notebook-level regex defined in another cell)
    domestic_opening_gross = _text_after_label(soup, 'Domestic Open', class_='money')

    # Get runtime
    runtime_text = _text_after_label(soup, 'Run', 'span')
    runtime_minutes = None if runtime_text is None else _runtime_to_minutes(runtime_text)

    # Get rating
    rating = _text_after_label(soup, 'MPAA', 'span')

    # Get genres
    genres_text = _text_after_label(soup, 'Genres', 'span')
    genres = None if genres_text is None else genres_text.split()

    # Get release month: first token of the element after the label
    release_tokens = (_text_after_label(soup, 'Release Date') or '').split()
    release_month = release_tokens[0] if release_tokens else None

    # Create movie dictionary and return
    return {
        'title': title,
        'domestic_gross': domestic_gross,
        'international_gross': international_gross,
        'worldwide_gross': worldwide_gross,
        'domestic_opening_gross': domestic_opening_gross,
        'runtime_minutes': runtime_minutes,
        'rating': rating,
        'release_month': release_month,
        'genres': genres,
    }

In [None]:
# Scrape the detail page behind every link stub in the chart table.
lifetime_page_data = [
    get_movie_dict(link_stub) for link_stub in lifetime_movies.link_stub
]

In [None]:
# Display the list of per-movie dictionaries collected above.
lifetime_page_data

In [None]:
# Turn the list of dictionaries into a table indexed by movie title.
lifetime_page_data = pd.DataFrame(lifetime_page_data).set_index('title')

lifetime_page_data

In [None]:
# lifetime_movies = lifetime_movies.merge(lifetime_page_data, left_index=True, right_index=True)

# lifetime_movies

#### Director & Actors

In [None]:
# Download and parse the credits page of the same title.
url = 'https://www.boxofficemojo.com/title/tt2488496/credits/?ref_=bo_tt_tab'
# A timeout keeps the notebook from hanging on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
page = response.text
soup = BeautifulSoup(page, 'lxml')

In [None]:
# Links of class "a-link-normal" inside the table with id "principalCast".
casts = soup.find('table', id='principalCast').find_all('a', attrs={'class': 'a-link-normal'})
# Keep only the first line of each link's text.
actors = [cast_link.text.split('\n')[0] for cast_link in casts]
actors

In [None]:
# Build the `g_movies` table that this cell and the next one read; it was
# never assigned before, so `g_movies.head()` raised NameError.
# Each list in `movies` holds five values (link stub plus four cell texts), so
# five column labels are assigned; six labels raised a length-mismatch error.
# NOTE(review): labels mirror the lifetime-chart table built earlier - confirm
# this is the table intended here.
g_movies = pd.DataFrame(movies).T  # transpose
g_movies.columns = ['link_stub', 'rank', 'title',
                    'lifetime_gross', 'release_year']

g_movies.head()

In [None]:
# Scrape the detail page behind every link stub in g_movies.
g_movies_page_info_list = [
    get_movie_dict(link_stub) for link_stub in g_movies.link_stub
]

### IMDB Top 100 Actors 2018

In [None]:
# Download and parse the IMDb list page.
url = 'https://www.imdb.com/list/ls023242359/'
# A timeout keeps the notebook from hanging on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
page = response.text
soup = BeautifulSoup(page, 'lxml')

In [None]:
# Text of the first span with class "lister-item-index".
soup.find_all('span', class_='lister-item-index')[0].text

In [None]:
# Every link inside the first "lister-item-content" div.
soup.find('div', class_='lister-item-content').find_all('a')

In [None]:
# First line of the text of the first link in that div.
soup.find('div', class_='lister-item-content').find('a').text.split('\n')[0]

In [None]:
# Every link inside the first "lister-list" div; this includes all anchors in
# the container, not only the name links.
top_list = soup.find('div', class_='lister-list').find_all('a')
top_list

In [None]:
# First line of the text of the link at index 1 in the list container.
soup.find('div', class_='lister-list').find_all('a')[1].text.split('\n')[0]

In [None]:

# First line of text from every link collected in top_list.
top_actor = [anchor.text.split('\n')[0] for anchor in top_list]

top_actor

In [None]:
import os

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# Open the IMDb list in a Chrome session driven by Selenium.
# `os` was not imported and `chromedriver` was commented out, so the cell
# stopped with NameError before the browser started.
chromedriver = "/Applications/chromedriver"  # path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver
# NOTE(review): passing the driver path positionally matches the Selenium
# version this notebook was written for; newer releases expect a Service object.
driver = webdriver.Chrome(chromedriver)
driver.get('https://www.imdb.com/list/ls023242359/')

# Compare case-insensitively so the check does not depend on how the site
# capitalises its name in the page title.
assert "imdb" in driver.title.lower()

# find_element / find_elements with a By locator replace the
# find_element_by_* helpers, which are removed in newer Selenium releases.
continue_link = driver.find_element(By.TAG_NAME, 'a')
# The original XPath "//*[@href]/name*" is not valid XPath.
# NOTE(review): assumed intent is "links that point at a /name/ page" - confirm.
elem = driver.find_elements(By.XPATH, "//a[contains(@href, '/name/')]")
print(elem)

In [None]:
# find_all() returns a list of tags, which has no .get(); read the href of
# each link instead. The regex keeps only links whose href contains
# "ref_=nmls_hd".
# NOTE(review): assumed to be what the original `ref_='nmls_hd'` argument meant - confirm.
[a['href'] for a in soup.find('div', class_='lister-list').find_all('a', href=re.compile('ref_=nmls_hd'))]