Scraping from IMDB

In [1]:
from bs4 import BeautifulSoup
import requests
import time, os
import re
import pandas as pd
import time
import pickle

from currency_converter import CurrencyConverter
from datetime import date

Loading previous pickled dataframe

In [2]:
earliestrelease_notdomestic_movies = pd.read_pickle('earliestrelease_notdomestic_movies.pickle')

In [3]:
earliestrelease_notdomestic_movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1659 entries, Hero to News from Planet Mars
Data columns (total 20 columns):
 #   Column                                    Non-Null Count  Dtype         
---  ------                                    --------------  -----         
 0   link_stub                                 1659 non-null   object        
 1   gross_rank                                1659 non-null   float64       
 2   lifetime_gross                            1659 non-null   float64       
 3   max_theaters                              1651 non-null   float64       
 4   domestic_opening                          1591 non-null   float64       
 5   num_opening_theaters                      1593 non-null   float64       
 6   release_date                              1659 non-null   datetime64[ns]
 7   domestic_distributor                      1609 non-null   object        
 8   domestic_total_gross                      1659 non-null   float64       
 9   international_t

In [8]:
earliestrelease_notdomestic_movies.link_stub.head()

Hero               /title/tt0299977/?ref_=bo_ge_table_3
Parasite           /title/tt6751668/?ref_=bo_ge_table_4
Pan's Labyrinth    /title/tt0457430/?ref_=bo_ge_table_6
Amélie             /title/tt0211915/?ref_=bo_ge_table_7
Fearless           /title/tt0446059/?ref_=bo_ge_table_8
Name: link_stub, dtype: object

In [12]:
new_link_stubs = []
for link in earliestrelease_notdomestic_movies.link_stub:
    new_link_stubs.append(link.split('?')[0])

new_link_stubs

['/title/tt0299977/',
 '/title/tt6751668/',
 '/title/tt0457430/',
 '/title/tt0211915/',
 '/title/tt0446059/',
 '/title/tt4849438/',
 '/title/tt0373074/',
 '/title/tt0318462/',
 '/title/tt0265343/',
 '/title/tt0245574/',
 '/title/tt0441909/',
 '/title/tt0427954/',
 '/title/tt5935704/',
 '/title/tt0405094/',
 '/title/tt0237534/',
 '/title/tt0385004/',
 '/title/tt2338151/',
 '/title/tt0450188/',
 '/title/tt1675434/',
 '/title/tt1132620/',
 '/title/tt0245429/',
 '/title/tt0287467/',
 '/title/tt4643580/',
 '/title/tt1833673/',
 '/title/tt6452574/',
 '/title/tt1668200/',
 '/title/tt1216487/',
 '/title/tt0317248/',
 '/title/tt0464141/',
 '/title/tt1832382/',
 '/title/tt1602620/',
 '/title/tt1462900/',
 '/title/tt0473444/',
 '/title/tt0344510/',
 '/title/tt1305806/',
 '/title/tt4832640/',
 '/title/tt0161860/',
 '/title/tt0362225/',
 '/title/tt1035736/',
 '/title/tt7605074/',
 '/title/tt0313196/',
 '/title/tt0416044/',
 '/title/tt1566501/',
 '/title/tt5956100/',
 '/title/tt0363163/',
 '/title/t

In [13]:
def get_movie_value(soup, field_name):
    
    '''Grab a value from Box Office Mojo HTML
    
    Takes a string attribute of a movie on the page and returns the string in
    the next sibling object (the value for that attribute) or None if nothing is found.
    '''
    
    obj = soup.find(text=re.compile(field_name))
    
    if not obj: 
        return None
    
    # this works for most of the values
    next_element = obj.findNext()
    
    if next_element:
        return next_element.text 
    else:
        return None

In [14]:
def get_movie_dict(link):
    base_url = 'https://www.imdb.com'
    
    #Create full url to scrape
    url = base_url + link
    
    #Request HTML and parse
    for i in range(5): 
        try:
            response = requests.get(url)
        except:
            time.sleep(10) 
            continue
        else:
            break
    page = response.text
    soup = BeautifulSoup(page,"lxml")
    
    headers = ['link_stub', 'language', 'budget']
    
    try:
        language = get_movie_value(soup, 'Language:')
    except:
        language = None 
    
    try:
        first = soup.find(text=re.compile('Budget:'))
        next_element = first.findNext()
        raw_budget = next_element.previous
        budget = raw_budget.split('\n')[0]
    except: 
        budget = None 
        
    #Create movie dictionary and return
    movie_dict = dict(zip(headers, [link, language, budget]))

    return movie_dict

In [15]:
fl_movies_imdb_list = [] #saving the info to a dictionary OUTSIDE the function

for link in new_link_stubs:
    fl_movies_imdb_list.append(get_movie_dict(link))
    
fl_movies_imdb = pd.DataFrame(fl_movies_imdb_list)  #converting list of dict to df
fl_movies_imdb.set_index('link_stub', inplace=True) 

import pickle #immediately pickling dataframe
fl_movies_imdb.to_pickle('fl_movies_imdb.pickle')

In [16]:
fl_movies_imdb['budget'].isna().sum()

1081

In [22]:
fl_movies_imdb.shape

(1659, 2)

In [19]:
fl_movies_imdb.tail(10)

Unnamed: 0_level_0,language,budget
link_stub,Unnamed: 1_level_1,Unnamed: 2_level_1
/title/tt4948452/,French,"EUR5,831,000"
/title/tt2167715/,Dutch,
/title/tt1289783/,French,
/title/tt2361110/,French,
/title/tt4428762/,French,
/title/tt1503777/,French,
/title/tt0312358/,German,
/title/tt5091538/,French,
/title/tt0821462/,Korean,"$2,000,000"
/title/tt5038358/,French,"EUR6,300,000"


In [2]:
earliestrelease_notdomestic_movies = pd.read_pickle('earliestrelease_notdomestic_movies.pickle')

In [3]:
fl_movies_imdb = pd.read_pickle('fl_movies_imdb.pickle')

In [4]:
earliestrelease_notdomestic_movies.shape

(1659, 20)

In [5]:
earliestrelease_notdomestic_movies['link_stub'] = earliestrelease_notdomestic_movies['link_stub'].apply(lambda x: x.split('?')[0])  
earliestrelease_notdomestic_movies.head(2)

Unnamed: 0,link_stub,gross_rank,lifetime_gross,max_theaters,domestic_opening,num_opening_theaters,release_date,domestic_distributor,domestic_total_gross,international_total_gross,budget,earliest_release_location,rating,runtime,genres,original_release_markets_num,crew,cast,earliest_release_location_opening_gross,earliest_release_location_original_gross
Hero,/title/tt0299977/,3.0,53710019.0,2175.0,17800000.0,2031.0,2004-08-27,Miramax,53710019.0,123685538.0,31000000.0,"APAC, China",PG-13,,Action Adventure History,,"Yimou Zhang Director, Feng Li Writer, Yimou Zh...","Jet Li, Tony Chiu-Wai Leung, Maggie Cheung, Zi...",17800000.0,53710019.0
Parasite,/title/tt6751668/,4.0,53369749.0,2001.0,393216.0,3.0,2019-10-11,Neon,53369749.0,205484917.0,11400000.0,South Korea,R,132.0,Comedy Drama Thriller,34.0,"Bong Joon Ho Director, Bong Joon Ho Writer, Bo...","Kang-ho Song, Sun-kyun Lee, Yeo-jeong Cho, Woo...",393216.0,53369749.0


In [6]:
earliestrelease_notdomestic_movies.reset_index(inplace=True)

In [7]:
earliestrelease_notdomestic_movies.rename(columns={"index": "title"}, inplace=True)

In [8]:
fl_movies_wbudget = earliestrelease_notdomestic_movies.merge(fl_movies_imdb, how="outer", on='link_stub')
fl_movies_wbudget.head()

Unnamed: 0,title,link_stub,gross_rank,lifetime_gross,max_theaters,domestic_opening,num_opening_theaters,release_date,domestic_distributor,domestic_total_gross,...,rating,runtime,genres,original_release_markets_num,crew,cast,earliest_release_location_opening_gross,earliest_release_location_original_gross,language,budget_y
0,Hero,/title/tt0299977/,3.0,53710019.0,2175.0,17800000.0,2031.0,2004-08-27,Miramax,53710019.0,...,PG-13,,Action Adventure History,,"Yimou Zhang Director, Feng Li Writer, Yimou Zh...","Jet Li, Tony Chiu-Wai Leung, Maggie Cheung, Zi...",17800000.0,53710019.0,Mandarin,"$31,000,000"
1,Parasite,/title/tt6751668/,4.0,53369749.0,2001.0,393216.0,3.0,2019-10-11,Neon,53369749.0,...,R,132.0,Comedy Drama Thriller,34.0,"Bong Joon Ho Director, Bong Joon Ho Writer, Bo...","Kang-ho Song, Sun-kyun Lee, Yeo-jeong Cho, Woo...",393216.0,53369749.0,Korean,"$11,400,000"
2,Pan's Labyrinth,/title/tt0457430/,6.0,37634615.0,1143.0,568641.0,17.0,2006-12-29,Picturehouse,37634615.0,...,R,118.0,Drama Fantasy War,,"Guillermo del Toro Director, Guillermo del Tor...","Ivana Baquero, Ariadna Gil, Sergi López, Marib...",568641.0,37634615.0,Spanish,"$19,000,000"
3,Amélie,/title/tt0211915/,7.0,33225499.0,303.0,136470.0,3.0,2001-11-02,Miramax,33225499.0,...,R,122.0,Comedy Romance,20.0,"Jean-Pierre Jeunet Director, Guillaume Laurant...","Audrey Tautou, Mathieu Kassovitz, Rufus, Lorel...",136470.0,33225499.0,French,"$10,000,000"
4,Fearless,/title/tt0446059/,8.0,24633730.0,1810.0,10564000.0,1806.0,2006-09-22,Rogue Pictures,24633730.0,...,PG-13,104.0,Action Biography Drama Sport,,"Ronny Yu Director, Chris Chow Writer, Richard ...","Jet Li, Li Sun, Yong Dong, Yun Qu",10564000.0,24633730.0,Mandarin,


In [9]:
fl_movies_wbudget['budget_y'].isna().sum()

1081

In [10]:
fl_movies_wbudget.drop(columns=["budget_x", "rating", "original_release_markets_num"], inplace=True)

In [11]:
fl_movies_wbudget.drop(columns=["link_stub", "gross_rank"], inplace=True)

In [12]:
fl_movies_wbudget.domestic_distributor.fillna('Missing', inplace=True)

In [13]:
fl_movies_wbudget.crew.fillna('Missing', inplace=True)
fl_movies_wbudget.cast.fillna('Missing', inplace=True)

In [14]:
fl_movies_wbudget.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1659 entries, 0 to 1658
Data columns (total 18 columns):
 #   Column                                    Non-Null Count  Dtype         
---  ------                                    --------------  -----         
 0   title                                     1659 non-null   object        
 1   lifetime_gross                            1659 non-null   float64       
 2   max_theaters                              1651 non-null   float64       
 3   domestic_opening                          1591 non-null   float64       
 4   num_opening_theaters                      1593 non-null   float64       
 5   release_date                              1659 non-null   datetime64[ns]
 6   domestic_distributor                      1659 non-null   object        
 7   domestic_total_gross                      1659 non-null   float64       
 8   international_total_gross                 1659 non-null   float64       
 9   earliest_release_location     

In [15]:
fl_movies_wbudget.rename(columns={"budget_y": "budget"}, inplace=True)

In [16]:
fl_movies_wbudget['budget'][0]

'$31,000,000'

In [17]:
currency = []
for budget in fl_movies_wbudget['budget']:
    if budget == None:
        currency.append(None)
    elif budget[0:1] == '$':
        currency.append('USD')
    else:
        currency.append(budget[0:3])
currency      

['USD',
 'USD',
 'USD',
 'USD',
 None,
 'INR',
 'USD',
 None,
 'INR',
 'USD',
 'EUR',
 'THB',
 None,
 'USD',
 'USD',
 'CNY',
 None,
 'USD',
 'EUR',
 'USD',
 'USD',
 None,
 'USD',
 None,
 'USD',
 'EUR',
 'EUR',
 'BRL',
 'EUR',
 'USD',
 'USD',
 'USD',
 'USD',
 'USD',
 'USD',
 'USD',
 'EUR',
 'EUR',
 'USD',
 'USD',
 'USD',
 'USD',
 'USD',
 'USD',
 'EUR',
 'EUR',
 None,
 None,
 'USD',
 'EUR',
 'INR',
 None,
 'JPY',
 'USD',
 'EUR',
 'USD',
 'USD',
 'EUR',
 None,
 None,
 'USD',
 'USD',
 None,
 'USD',
 'INR',
 None,
 None,
 'INR',
 'USD',
 'EUR',
 'USD',
 'EUR',
 None,
 None,
 None,
 None,
 None,
 None,
 'EUR',
 None,
 'EUR',
 None,
 None,
 'INR',
 None,
 'INR',
 'USD',
 'FRF',
 'EUR',
 None,
 None,
 'USD',
 'FRF',
 'INR',
 'USD',
 'EUR',
 None,
 None,
 'USD',
 'USD',
 'INR',
 'USD',
 'EUR',
 'EUR',
 None,
 None,
 'INR',
 None,
 None,
 'INR',
 'USD',
 None,
 'USD',
 None,
 'EUR',
 'USD',
 None,
 'USD',
 None,
 None,
 None,
 'INR',
 None,
 None,
 None,
 None,
 None,
 None,
 'USD',
 None,
 'EUR

In [18]:
fl_movies_wbudget['currency'] = currency 

In [19]:
budget_new = []
for budget in fl_movies_wbudget['budget']:
    if budget == None:
        budget_new.append(None)
    elif budget[0:1] == '$':
        budget_new.append(budget[1:])
    else:
        budget_new.append(budget[3:])

In [20]:
budget_new

['31,000,000',
 '11,400,000',
 '19,000,000',
 '10,000,000',
 None,
 '2,500,000,000',
 '20,000,000',
 None,
 '7,000,000',
 '2,000,000',
 '7,450,000',
 '200,000,000',
 None,
 '2,000,000',
 '29,000,000',
 '100,000,000',
 None,
 '25,000,000',
 '9,500,000',
 '13,000,000',
 '19,000,000',
 None,
 '5,300,000',
 None,
 '14,000,000',
 '10,000,000',
 '4,000,000',
 '3,300,000',
 '3,400,000',
 '500,000',
 '8,900,000',
 '38,600,000',
 '45,000,000',
 '56,600,000',
 '2,000,000',
 '13,430,000',
 '7,000,000',
 '11,700,000',
 '23,000,000',
 '48,000,000',
 '1,800,000',
 '18,000,000',
 '12,000,000',
 '20,470,000',
 '13,500,000',
 '4,200,000',
 None,
 None,
 '5,000,000',
 '4,000,000',
 '1,390,000,000',
 None,
 '370,000,000',
 '18,000,000',
 '3,600,000',
 '21,000,000',
 '15,000,000',
 '4,300,000',
 None,
 None,
 '1,000,000',
 '21,000,000',
 None,
 '1,000,000',
 '450,000,000',
 None,
 None,
 '5,420,000,000',
 '1,100,000',
 '4,800,000',
 '12,000,000',
 '4,000,000',
 None,
 None,
 None,
 None,
 None,
 None,
 '2

In [21]:
len(budget_new)

1659

In [22]:
fl_movies_wbudget['budget_new'] = budget_new

In [23]:
fl_movies_wbudget['budget_new'] = fl_movies_wbudget['budget_new'].str.replace(",","")

In [24]:
fl_movies_wbudget.loc[fl_movies_wbudget['budget_new'] == '', 'budget_new'] = None 

In [25]:
fl_movies_wbudget['budget_new'] = fl_movies_wbudget['budget_new'].astype(float)

In [29]:
fl_movies_wbudget.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1659 entries, 0 to 1658
Data columns (total 20 columns):
 #   Column                                    Non-Null Count  Dtype         
---  ------                                    --------------  -----         
 0   title                                     1659 non-null   object        
 1   lifetime_gross                            1659 non-null   float64       
 2   max_theaters                              1651 non-null   float64       
 3   domestic_opening                          1591 non-null   float64       
 4   num_opening_theaters                      1593 non-null   float64       
 5   release_date                              1659 non-null   datetime64[ns]
 6   domestic_distributor                      1659 non-null   object        
 7   domestic_total_gross                      1659 non-null   float64       
 8   international_total_gross                 1659 non-null   float64       
 9   earliest_release_location     

In [27]:
import pickle #immediately pickling dataframe
fl_movies_wbudget.to_pickle('fl_movies_wbudget.pickle')

In [4]:
fl_movies_wbudget = pd.read_pickle('fl_movies_wbudget.pickle')

Attempting to convert currencies 

https://stackoverflow.com/questions/45660529/forex-python-in-custom-function-for-pandas-dataframe
https://stackoverflow.com/questions/50114132/python-pandas-creating-a-new-column-using-currency-converter

In [6]:
'''from forex_python.converter import CurrencyRates
c = CurrencyRates()
import numpy as np
from datetime import datetime, timedelta
from time import sleep
    
def try_convert(amount, currency, date1):
    try:
        res = c.convert(base_cur=currency,dest_cur='USD',amount=amount, date_obj=date1)
    except:
        res = 'Could Not Convert'
    return res

def convert_rates(amount,currency,date1):
    if currency == None:
        return None 
    elif currency != 'USD':
        return try_convert(amount, currency, date1)
    else:
        return amount '''

In [8]:
'''from forex_python.converter import CurrencyRates
c = CurrencyRates()
import numpy as np
from datetime import datetime, timedelta
from time import sleep
    

def convert_rates(amount,currency,date1):
    if currency == None:
        return None 
    elif currency != 'USD':
        return c.convert(base_cur=currency,dest_cur='USD',amount=amount, date_obj=date1)
    else:
        return amount'''

In [14]:
#fl_movies_wbudget['budget_usd'] = np.vectorize(convert_rates)(amount=fl_movies_wbudget['budget_new'],currency=fl_movies_wbudget['currency'],date1=fl_movies_wbudget['release_date'])

In [None]:
#c = CurrencyConverter() 

#def currency_convertor(row, new_currency):
    #amount = row['budget']
    #curr = row['currency']
    #date_r = row['release_date']
    #new_curr = c.convert(amount,curr,new_currency,date=date(date_r))
    #return new_curr

#fl_movies_wbudget['new_budget'] = fl_movies_wbudget.apply(lambda x: currency_convertor(x,new_currency="USD"), axis=1)

In [11]:
!pip install tqdm
from tqdm import tqdm

def converter(base_currency, amount_):
    c = CurrencyRates()
    converted_values = []
    bad_rows = []
    for i in tqdm(range(len(base_currency))):
        if base_currency[i] == 'USD':
            converted_values.append(amount_[i])
        else:
            try:
                value = c.convert(base_cur = base_currency[i], amount = amount_[i], dest_cur = 'USD')
                converted_values.append(value)
            except:
                bad_rows.append(i)
                converted_values.append('Failed to convert.')
    return converted_values

x = fl_movies_wbudget['currency'].values
y = fl_movies_wbudget['budget_new'].values

usd_budget = converter(x,y)

fl_movies_wbudget['budget_usd'] = usd_budget



100%|██████████| 1659/1659 [02:45<00:00, 10.02it/s]


In [17]:
import pickle #immediately pickling dataframe
fl_movies_wbudget.to_pickle('fl_movies_wbudget.pickle')

In [12]:
fl_movies_wbudget.head()

Unnamed: 0,title,lifetime_gross,max_theaters,domestic_opening,num_opening_theaters,release_date,domestic_distributor,domestic_total_gross,international_total_gross,earliest_release_location,...,genres,crew,cast,earliest_release_location_opening_gross,earliest_release_location_original_gross,language,budget,currency,budget_new,budget_usd
0,Hero,53710019.0,2175.0,17800000.0,2031.0,2004-08-27,Miramax,53710019.0,123685538.0,"APAC, China",...,Action Adventure History,"Yimou Zhang Director, Feng Li Writer, Yimou Zh...","Jet Li, Tony Chiu-Wai Leung, Maggie Cheung, Zi...",17800000.0,53710019.0,Mandarin,"$31,000,000",USD,31000000.0,31000000.0
1,Parasite,53369749.0,2001.0,393216.0,3.0,2019-10-11,Neon,53369749.0,205484917.0,South Korea,...,Comedy Drama Thriller,"Bong Joon Ho Director, Bong Joon Ho Writer, Bo...","Kang-ho Song, Sun-kyun Lee, Yeo-jeong Cho, Woo...",393216.0,53369749.0,Korean,"$11,400,000",USD,11400000.0,11400000.0
2,Pan's Labyrinth,37634615.0,1143.0,568641.0,17.0,2006-12-29,Picturehouse,37634615.0,46215652.0,Spain,...,Drama Fantasy War,"Guillermo del Toro Director, Guillermo del Tor...","Ivana Baquero, Ariadna Gil, Sergi López, Marib...",568641.0,37634615.0,Spanish,"$19,000,000",USD,19000000.0,19000000.0
3,Amélie,33225499.0,303.0,136470.0,3.0,2001-11-02,Miramax,33225499.0,140699243.0,France,...,Comedy Romance,"Jean-Pierre Jeunet Director, Guillaume Laurant...","Audrey Tautou, Mathieu Kassovitz, Rufus, Lorel...",136470.0,33225499.0,French,"$10,000,000",USD,10000000.0,10000000.0
4,Fearless,24633730.0,1810.0,10564000.0,1806.0,2006-09-22,Rogue Pictures,24633730.0,43439118.0,"APAC, China",...,Action Biography Drama Sport,"Ronny Yu Director, Chris Chow Writer, Richard ...","Jet Li, Li Sun, Yong Dong, Yun Qu",10564000.0,24633730.0,Mandarin,,,,


In [None]:
import pandas as pd

my_dict = {key:value,key:value,key:value,...}
df = pd.DataFrame(list(my_dict.items()),columns = ['column1','column2'])

In [16]:
fl_movies_wbudget[fl_movies_wbudget['budget'].isna()].title[19:40]

77     Portrait of a Lady on Fire
79                 Mission Mangal
81                   Om Shanti Om
82               A Man Called Ove
84                    Shoplifters
89                        Phoenix
90         I've Loved You So Long
96               The Band's Visit
97                         Bharat
104              Monsieur Ibrahim
105             My Brother's Wife
107          In the Mood for Love
108                      RamLeela
111                 Walk on Water
113              Bharath Ane Nenu
116                 Kapoor & Sons
118                     Bang Bang
119              Man on the Train
120          Do It Like An Hombre
122                          Kick
123                  Love Aaj Kal
Name: title, dtype: object

In [None]:
# Create DataFrame
other_budgets_df = pd.DataFrame(data)

In [None]:
other_budgets = {'title': ['Padmaavat','PK','Dhoom 3','Biutiful',
                           'Pain and Glory','Ong-Bak: The Thai Warrior','Ae Dil Hai Mushkil','With a Friend Like Harry...',
                           'Yeh Jawaani Hai Deewani','Ida','The Class','Portrait of a Lady on Fire',
                          'Om Shanti Om','A Man Called Ove', 'Shoplifters', 'Phoenix']
                           
                'budget_USD':['30000000','12000000','30000000','35000000',
                              '10800000','1100000','15000000','3000000',
                              '8300000','2600000','3700000', '5400000',
                             '7000000', '300000', '110000000', '']}

  
# Create DataFrame
other_budgets_df = pd.DataFrame(other_budgets)