In [32]:
# Import section
import requests
import pandas as pd
import urllib
import re
import locale
from bs4 import BeautifulSoup as bs
from time import sleep
pd.set_option('display.float_format', lambda x: '%.2f' % x)
from tqdm import tqdm

In [2]:
# Constants
URL = "./../thenum/top_level_movies.csv"
MOVIE_ATTRIBUTES = {
                    "domestic_box_office", "international_box_office", "worldwide_box_office",
                    "opening_weekend", "legs", "production_budget", "theater_counts", "mpaa_rating",
                    "running_time", "franchise", "genre", "production_method", "creative_type",
                    "production_companies"
                    }

In [3]:
def preprocess_label(label):
    """
    This function will convert label into lower case, combine separate words with an '_' and remove all
    white spaces and other charaters from both the ends.
    """
    
    label = label.lower()
    label = re.sub("[^A-Za-z0-9\s]", "", label)
    label = re.sub("\s{2,}", " ", label)
    label = label.strip()
    label = re.sub("\s", "_", label)
    return label

In [33]:
def get_movie_details(links, MOVIE_ATTRIBUTES, verbose=False):
    all_movies_info = []
    for link in tqdm(links):
        try:
            r = requests.get(link)
            soup = bs(r.text)
            
            movie = {}

            all_tables = soup.find_all("table")

            # Getting title and year 
            name_area = soup.find("h1", itemprop="name")
            name_s = name_area.string
            name_year = name_s.strip("()").split('(')
            movie["name"] = name_year[0]
            movie["year"] = name_year[1]
            
            # All box office
            stats_table = all_tables[0]
            stats_table_data = stats_table.findAll("td")
            for index, row in enumerate(stats_table_data):
                data = row.string
                if data:
                    label = preprocess_label(data)
                    if label in MOVIE_ATTRIBUTES:
                        movie[label] = stats_table_data[index+1].string.replace("$","").replace(",","")

            # Opening weekend and budget
            metric_table = all_tables[2]
            metric_table_data = metric_table.findAll("td")
            for index, row in enumerate(metric_table_data):
                data = row.string
                if data:
                    label = preprocess_label(data)
                    if label in {"opening_weekend", "production_budget"}:
                        movie[label] = metric_table_data[index + 1].string.split(" ")[0].replace("$","").replace(",","")

                    if label in {"legs"}:
                        movie[label] = metric_table_data[index + 1].string.split(" ")[0]

                    if label in {"theater_counts"}:
                        pre_count = metric_table_data[index + 1].string.split(", ")[1]
                        movie[label] = pre_count.split(" w")[0]
            
            # Runtime, distributor,mpaa, production method, creative type, production companies, and genre
            details_table = all_tables[4]
            details_table_data = details_table.findAll("td")
            for index, row in enumerate(details_table_data):
                data = row.string
                if data:
                    label = preprocess_label(data)
                    if label in {"running_time"}:
                        movie[label] = details_table_data[index + 1].string.split(" ")[0]
            
            # Synopsis
            synopsis_section = soup.find("div", {"id": "summary"})
            synopsis_p = synopsis_section.find("p")
            synopsis_u = (synopsis_p.string).encode(encoding = "UTF-8")
            synopsis_u = str(synopsis_u)
            synopsis = synopsis_u.replace("\\xc3\\xa2\\xc2\\x80\\xc2\\x99", "'").replace("\\xc3\\xa2\\xc2\\x80\\xc2\\x94", "--").replace("\\xc3\\xa2\\xc2\\x80\\xc2\\xa6", "...").replace("\\xc3\\xa2\\xc2\\x80\\xc2\\x9c", "'").replace("\\xc3\\xa2\\xc2\\x80\\xc2\\x9d", "'").replace("\\xc3\\x83\\xc2\\xa9", "é").replace("\\n", "").replace("\\", "").strip("\\n")
            movie['synopsis'] = synopsis

            movie['link'] = link
            all_movies_info.append(movie)
            
            if verbose:
                print(movie)
                
        except Exception as ex:
            print(ex)
            
    return all_movies_info

In [35]:
top_level_data = pd.read_csv(URL)
titles = top_level_data["title"]
links = top_level_data["link"]
all_movies = get_movie_details(links[:7], MOVIE_ATTRIBUTES, False)

100%|██████████| 7/7 [00:11<00:00,  1.69s/it]


In [31]:
pd.options.display.max_seq_items = 2000
pd.set_option('display.max_colwidth', -1)
pd.DataFrame(all_movies).T

Unnamed: 0,0,1,2
domestic_box_office,936662225,858373000,760507625
international_box_office,1131561399,1939427564,2028193712
legs,3.78,2.40,9.87
link,https://www.the-numbers.com/movie/Star-Wars-Ep-VII-The-Force-Awakens#tab=summary,https://www.the-numbers.com/movie/Avengers-Endgame-(2019)#tab=summary,https://www.the-numbers.com/movie/Avatar#tab=summary
name,Star Wars Ep. VII: The Force Awakens,Avengers: Endgame,Avatar
opening_weekend,247966675,357115007,77025481
production_budget,306000000,400000000,237000000
running_time,136,181,162
synopsis,"b'Rey, a scavenger from the planet Jakku, and renegade stormtrooper FN-2187 (nickname 'Finn') embark with resistance pilot Poe Dameron on a quest to destroy Starkiller Base and search for Luke Skywalker, the last remaining Jedi.'","b'The grave course of events set in motion by Thanos that wiped out half the universe and fractured the Avengers ranks compels the remaining Avengers to take one final stand in Marvel Studios' grand conclusion to twenty-two films, 'Avengers: Endgame.''","b'Jake Sully is a wounded ex-marine, thrust into an effort to settle and exploit Pandora, an exotic moon rich in bio-diversity and inhabited by the Na'vi, a ten-foot-tall humanoid species. After Neytiri, a female Na'vi, rescues Jake after he becomes separated from his team, he learns more about the planet and eventually crosses over to lead the indigenous race in a battle for survival.'"
theater_counts,9.2,8.1,13.2


In [None]:
all_movies.to_csv('C:/Users/adave/src/bbox/mdata/thenum/4-7-20/movie_details.csv', index=False)

In [86]:
cols_to_num = ['legs', 'average_run', 'opening_weekend', 'domestic_box_office', 'international_box_office',
               'worldwide_box_office'
              ]
for c in cols_to_num:
    all_movies[c] = pd.to_numeric(all_movies[c])

In [152]:
cols = ['MPAA Rating','Running Time']
movie_data = {}
for row in details_table_data:
    if val of td1 in cols:
        # movie_data[td1 value] = td2 value

SyntaxError: invalid syntax (<ipython-input-152-58b7f48d972f>, line 4)