In [164]:
# imports
import requests
import pandas as pd
import urllib
import re
import locale
from bs4 import BeautifulSoup as bs
from time import sleep
pd.set_option('display.float_format', lambda x: '%.2f' % x)
from tqdm.notebook import tqdm

In [165]:
# Constants
# Path of the CSV that lists the top-level movies (a relative file path, despite the name).
URL = "./../mdata/thenum/top_level_movies.csv"

# Normalised labels that get_movie_details copies from the first table of a movie page.
MOVIE_ATTRIBUTES = {
    "domestic_box_office",
    "international_box_office",
    "worldwide_box_office",
    "opening_weekend",
    "legs",
    "production_budget",
    "theater_counts",
    "mpaa_rating",
    "running_time",
    "franchise",
    "genre",
    "production_method",
    "creative_type",
    "production_companies",
}

In [194]:
# Ordered (search, replacement) pairs used by get_movie_details to clean up the
# movie name and the synopsis. The pairs are applied one after another with
# str.replace, so the order of the entries matters.
replace_chars = [
    # Escaped byte sequences as they appear in the text form of a bytes object
    # (literal backslash + "x" + hex digits). These must run before the rule
    # below that removes every backslash.
    ("\\xc3\\x83\\xc2\\xa9", "é"),
    ("\\xc3\\xa2\\xc2\\x80\\xc2\\xa6", "..."),
    ("\\xc3\\xa2\\xc2\\x80\\xc2\\x9c", "'"),
    ("\\xc3\\xa2\\xc2\\x80\\xc2\\x9d", "'"),
    # Literal backslash followed by "n" (an escaped newline), not a real newline.
    ("\\n", ""),
    # Removes every remaining backslash; later entries therefore see none.
    ("\\", ""),
    # NOTE(review): this search string still contains a backslash, so once the
    # previous rule has run it can no longer match — confirm the intended pattern.
    ("\xc2\\x94", "--"),
    # Already-decoded characters (no escaping involved).
    ('â\x80\x99', "'"),
    # Escaped sequences as they look after the backslashes have been removed.
    ("xc3xa2xc2x80xc2x99", "'"),
    ("xc3xa2xc2x80xc2x94", "—")

]

def preprocess_label(label):
    """
    Normalise a table label into a snake_case key.

    The label is lower-cased, every character that is not an ASCII letter,
    a digit or whitespace is removed, surrounding whitespace is stripped and
    each remaining run of whitespace becomes a single '_'.

    Example: "  Domestic Box Office: " -> "domestic_box_office".
    """

    label = label.lower()
    # Raw strings keep "\s" a regex class instead of an invalid string escape.
    label = re.sub(r"[^A-Za-z0-9\s]", "", label)
    label = re.sub(r"\s{2,}", " ", label)
    label = label.strip()
    label = re.sub(r"\s", "_", label)
    return label

In [198]:
def get_movie_details(links, MOVIE_ATTRIBUTES, verbose=False):
    """
    Download every movie page in `links` and collect one record per movie.

    Parameters
    ----------
    links : iterable of str
        URLs of the movie pages to fetch.
    MOVIE_ATTRIBUTES : collection of str
        Normalised labels (as produced by `preprocess_label`) that are copied
        from the first table of the page.
    verbose : bool, optional
        When true, print each record right after it has been built.

    Returns
    -------
    list of dict
        One dict per page that was scraped without error. Each dict holds
        "name", "year", "link", the attributes found in the page tables and,
        when present, "synopsis". All values are strings.

    Notes
    -----
    Scraping is best effort: any exception raised while handling a page is
    printed together with its link and that page is left out of the result.
    Tables are addressed by position (0, 2 and 4), so a page with a different
    layout is skipped through that same path.
    """
    all_movies_info = []
    for link in tqdm(links):
        try:
            # The timeout keeps one stalled connection from blocking the whole run;
            # raise_for_status reports HTTP errors instead of parsing an error page.
            response = requests.get(link, timeout=30)
            response.raise_for_status()
            # NOTE(review): no parser is named, so bs4 uses whichever one is
            # installed; pin one explicitly once the expected parser is confirmed.
            soup = bs(response.text)

            movie = {}

            all_tables = soup.find_all("table")

            # Getting title and year
            # The heading reads "<title> (<year>)". Only the trailing ")" is removed
            # and the split is done on the last " (", so titles that start with "("
            # or contain parentheses keep their full text and the right year.
            name_area = soup.find("h1", itemprop="name")
            name_year = name_area.string.strip().rstrip(")").rsplit(" (", 1)
            name = name_year[0]
            for old, new in replace_chars:
                name = name.replace(old, new)
            movie["name"] = name
            movie["year"] = name_year[1]

            # All box office
            # Each label cell is followed by the cell holding its value.
            stats_cells = all_tables[0].find_all("td")
            for index, cell in enumerate(stats_cells):
                if not cell.string:
                    continue
                label = preprocess_label(cell.string)
                if label in MOVIE_ATTRIBUTES:
                    movie[label] = stats_cells[index + 1].string.replace("$", "").replace(",", "")

            # Opening weekend and budget
            metric_cells = all_tables[2].find_all("td")
            for index, cell in enumerate(metric_cells):
                if not cell.string:
                    continue
                label = preprocess_label(cell.string)
                if label in {"opening_weekend", "production_budget"}:
                    # Keep only the leading amount, without "$" and thousands separators.
                    amount = metric_cells[index + 1].string.split(" ")[0]
                    movie[label] = amount.replace("$", "").replace(",", "")
                elif label == "legs":
                    movie[label] = metric_cells[index + 1].string.split(" ")[0]
                elif label == "theater_counts":
                    # Keeps the text after the first ", " up to the following " w".
                    # NOTE(review): the saved values (e.g. 9.20) look like a weeks
                    # figure rather than a theater count — confirm the intent.
                    pre_count = metric_cells[index + 1].string.split(", ")[1]
                    movie[label] = pre_count.split(" w")[0]

            # Runtime, distributor, mpaa, production method, creative type, production companies, and genre
            details_cells = all_tables[4].find_all("td")
            for index, cell in enumerate(details_cells):
                if not cell.string:
                    continue
                label = preprocess_label(cell.string)
                if label in {"running_time", "genre"}:
                    # First word only (e.g. the number in front of the unit).
                    movie[label] = details_cells[index + 1].string.split(" ")[0]
                elif label in {"creative_type", "production_method"}:
                    movie[label] = details_cells[index + 1].string
                elif label in {"mpaa_rating", "production_companies"}:
                    # The value is the text of the first link inside the cell.
                    movie[label] = details_cells[index + 1].find("a").string

            # Synopsis
            synopsis_section = soup.find("div", {"id": "summary"})
            synopsis = synopsis_section.find("p")
            if synopsis:
                # str() of the encoded bytes looks like "b'<p>...": the slice drops
                # those first five characters (assumes a bare <p> tag) and the split
                # keeps the text before the next "<". replace_chars then repairs the
                # escaped byte sequences this representation leaves behind.
                synopsis_u = str(synopsis.encode(encoding="UTF-8"))
                synopsis_u = synopsis_u[5:].split("<")[0]
                for old, new in replace_chars:
                    synopsis_u = synopsis_u.replace(old, new)
                movie["synopsis"] = synopsis_u
            movie['link'] = link
            all_movies_info.append(movie)

            if verbose:
                print(movie)

        except Exception as ex:
            # Deliberately broad: one bad page must not stop the run, but say
            # which link failed so it can be retried or inspected.
            print(f"Failed to scrape {link}: {ex}")

    return all_movies_info

In [199]:
# Read the top-level listing, then scrape the detail page behind every link.
top_level_data = pd.read_csv(URL)
titles, links = top_level_data["title"], top_level_data["link"]
all_movies = get_movie_details(links, MOVIE_ATTRIBUTES, verbose=False)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




In [201]:
pd.options.display.max_seq_items = 2000

# Columns that were scraped as text but hold numeric values.
cols_to_num = [
    'legs',
    'theater_counts',
    'opening_weekend',
    'domestic_box_office',
    'international_box_office',
    'worldwide_box_office',
    'production_budget',
    'year',
]

# Build the frame from the scraped records and convert those columns in one pass.
df = pd.DataFrame(all_movies)
df[cols_to_num] = df[cols_to_num].apply(pd.to_numeric)

df

Unnamed: 0,name,year,domestic_box_office,international_box_office,worldwide_box_office,opening_weekend,legs,production_budget,theater_counts,mpaa_rating,running_time,genre,production_method,creative_type,production_companies,synopsis,link
0,Star Wars Ep. VII: The Force Awakens,2015,936662225,1131561399,2068223624,247966675,3.78,306000000,9.20,PG-13,136,Adventure,Animation/Live Action,Science Fiction,Lucasfilm,"Rey, a scavenger from the planet Jakku, and re...",https://www.the-numbers.com/movie/Star-Wars-Ep...
1,Avengers: Endgame,2019,858373000,1939427564,2797800564,357115007,2.40,400000000,8.10,PG-13,181,Action,Animation/Live Action,Super Hero,Marvel Studios,The grave course of events set in motion by Th...,https://www.the-numbers.com/movie/Avengers-End...
2,Avatar,2009,760507625,2028193712,2788701337,77025481,9.87,237000000,13.20,PG-13,162,Action,Animation/Live Action,Science Fiction,Dune Entertainment,"Jake Sully is a wounded ex-marine, thrust into...",https://www.the-numbers.com/movie/Avatar#tab=s...
3,Black Panther,2018,700059566,646853595,1346913161,202003951,3.47,200000000,9.90,PG-13,120,Action,Live Action,Super Hero,Marvel Studios,"King T'Challa returns home to the reclusive, t...",https://www.the-numbers.com/movie/Black-Panthe...
4,Avengers: Infinity War,2018,678815482,1369544272,2048359754,257698183,2.63,300000000,7.70,PG-13,156,Action,Animation/Live Action,Super Hero,Marvel Studios,As the Avengers and their allies have continue...,https://www.the-numbers.com/movie/Avengers-Inf...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Harry Potter and the Order of the Phoenix,2007,292137260,650939197,943076457,77108414,3.79,150000000,6.10,PG-13,138,Adventure,Animation/Live Action,Fantasy,Heyday Films,,https://www.the-numbers.com/movie/Harry-Potter...
96,"The Chronicles of Narnia: The Lion, the Witch ...",2005,291710957,428828615,720539572,65556312,4.45,180000000,8.70,PG,140,Adventure,Animation/Live Action,Fantasy,Walt Disney Pictures,Four young adventurers playing hide and-seek i...,https://www.the-numbers.com/movie/Chronicles-o...
97,Man of Steel,2013,291045518,376954000,667999518,116619362,2.50,225000000,5.00,PG-13,142,Action,Live Action,Super Hero,Warner Bros.,A young boy learns that he has extraordinary p...,https://www.the-numbers.com/movie/Man-of-Steel...
98,Star Wars Ep. V: The Empire Strikes Back,1980,290271960,257697044,547969004,4910483,13.21,23000000,13.10,PG,124,Adventure,Live Action,Science Fiction,Lucasfilm,,https://www.the-numbers.com/movie/Star-Wars-Ep...


In [202]:
# Write next to the input CSV; a relative path (same base as URL) keeps the
# notebook runnable on machines other than the author's.
df.to_csv('./../mdata/thenum/movie_details.csv', index=False)