Andy Snitgen  
Professor Karen Jin  
Comp 574 - Applied Computing II  
October 22, 2021  

In [1]:
import requests, json, time, pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup

# Set up the empty results table; one row per film is added to it later on.
column_names = [
    'Title',
    'Year',
    'Actor',
    'US Box Office',
    'International Revenue',
    'Adjusted Revenue',
    'Average Rating',
    'IMDb Rating',
    'Metacritic',
]
df = pd.DataFrame(columns=column_names)

In [None]:
# Create a function that grabs the relevant data and returns the new row
def data(movie_title, api_key="k_nh5thpq8", timeout=30):
    """Look up one film on imdb-api.com and return its row for the main data frame.

    Two requests are made: a SearchMovie call to resolve ``movie_title`` to a
    title id (the first search hit is used), then a Title call for the details.

    Args:
        movie_title: Search string for the film.
        api_key: imdb-api.com key placed in the request URLs.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A dict with the keys 'Title', 'Year', 'Actor', 'US Box Office',
        'International Revenue', 'IMDb Rating' and 'Metacritic'. The
        'Adjusted Revenue' and 'Average Rating' columns are not set here.

    Raises:
        requests.HTTPError: If either request returns an error status.
        LookupError: If the search returns no results.
    """
    # NOTE(review): the default api_key is a credential committed in source;
    # consider passing it in from an environment variable instead.

    # Resolve the search string to a title id (first hit wins)
    search_url = "https://imdb-api.com/en/API/SearchMovie/" + api_key + "/" + movie_title
    response = requests.get(search_url, timeout=timeout)
    response.raise_for_status()
    payload = response.json()
    results = payload.get('results')
    if not results:
        # 'errorMessage' is read defensively; presumably the API fills it in
        # on failures such as an exhausted quota -- TODO confirm.
        raise LookupError(
            "No search results for {!r}: {}".format(movie_title, payload.get('errorMessage'))
        )
    id_num = results[0]['id']

    # Use the id number to get the required info
    title_url = "https://imdb-api.com/en/API/Title/" + api_key + "/" + id_num
    response = requests.get(title_url, timeout=timeout)
    response.raise_for_status()
    payload = response.json()

    # Extract relevant data and put it into variables
    title = payload['title']
    year = payload['year']
    actor = payload['starList'][0]['name']  # first listed star only
    us_box = payload['boxOffice']['grossUSA']
    itl_box = payload['boxOffice']['cumulativeWorldwideGross']
    critic = payload['metacriticRating'] + '%'
    rating = payload['imDbRating']
    if rating == '':
        # No IMDb rating: estimate one by scaling the Metacritic score to /10
        estimate = float(critic.replace('%', '')) / 10
        rating = str(estimate) + ' / 10 (est.)'
    else:
        rating = rating + ' / 10'

    # No Time to Die has to be put in manually as API doesn't have data yet
    if title == 'No Time to Die':
        us_box = '$120,357,453'
        itl_box = '$525,482,453'

    # A worldwide gross within 1% of the US gross is treated as missing
    # international data. Written as a product rather than a quotient so a
    # zero US gross cannot raise ZeroDivisionError.
    us_box_float = floater(us_box)
    itl_box_float = floater(itl_box)
    if abs(us_box_float - itl_box_float) <= 0.01 * abs(us_box_float):
        itl_box = "No Data"

    # Package the values as a row for the main data frame
    new_row = {
        'Title': title,
        'Year': year,
        'Actor': actor,
        'US Box Office': us_box,
        "International Revenue": itl_box,
        'IMDb Rating': rating,
        'Metacritic': critic,
    }
    return new_row
     

In [5]:
# Create a function that converts $ strings into floats
def floater(input_str):
    """Return the float value of a dollar string such as '$1,234 (est.)'.

    Leading/trailing '$' characters, thousands separators and a ' (est.)'
    marker are removed before the conversion.
    """
    cleaned = input_str.strip('$').replace(',', '').replace(' (est.)', '')
    return float(cleaned)

In [3]:
# Create a function that turns an IMDb rating string into a float
def replace_rating(input_str):
    """Convert an IMDb rating string ('7.5 / 10', optionally ' (est.)') to a 0-100 float."""
    score_text = input_str.replace(' / 10', '').replace(' (est.)', '')
    # Scale the out-of-ten score so it is comparable with a percentage
    return float(score_text) * 10

In [4]:
# Create a function that turns a Metacritic score into a float
def replace_critic(input_str):
    """Convert a Metacritic score string such as '78%' to a float."""
    return float(input_str.replace('%', ''))

In [None]:
# Create the list of movies to collect data on
# Search strings passed to the IMDb lookup, one per film
movie_list = [
    'dr no',
    'from russia with love',
    'goldfinger',
    'thunderball',
    'you only live twice',
    'on her majestys secret service',
    'diamonds are forever',
    'live and let die',
    'the man with the golden gun',
    'the spy who loved me',
    'moonraker',
    'for your eyes only',
    'octopussy',
    'view to a kill',
    'the living daylights',
    'licence to kill',
    'goldeneye',
    'tomorrow never dies',
    'the world is not enough',
    'die another day',
    'casino royale',
    'quantum of solace',
    'skyfall',
    'spectre',
    'no time to die',
]

In [None]:
# These commented out lines are used for debugging
#movie_list = movie_list[20:25] # slices small sections of the list 
#print(movie_list)


# Iterate through the list, collect one row per movie, then add them to the
# data frame in a single step. DataFrame.append was removed in pandas 2.0, and
# building the frame once also avoids copying it on every iteration.
rows = []
for movie_title in movie_list:
    print('working on ' + movie_title)
    time.sleep(0.1)  # Short pause between requests to reduce hang-ups on home PC
    new_row = data(movie_title)
    rows.append(new_row)

# Columns a row does not supply are left empty. Cast to object so the
# string values written into those columns later do not clash with a float dtype.
new_rows = pd.DataFrame(rows, columns=df.columns).astype(object)
df = new_rows if df.empty else pd.concat([df, new_rows], ignore_index=True)

In [None]:
# Calculate the average ratio of International to Domestic gross over the
# movies that actually have international data
has_data = (df['International Revenue'] != 'No Data').to_numpy()

total = 0
for index in df.index[has_data]:
    usbo = floater(str(df.at[index, 'US Box Office']))
    itlbo = floater(str(df.at[index, 'International Revenue']))
    total = total + (itlbo / usbo)

# Count the rows with data directly. Indexing value_counts() with [0] picks
# the most frequent class by position, which is wrong whenever 'No Data'
# rows are the majority.
counter = int(has_data.sum())
if counter == 0:
    raise ValueError('No movie has international revenue data to average over')
change = total / counter

# Apply the average change by multiplying US Box Office by 'change' variable to replace 'No Data'
for index in df.index[~has_data]:
    usbo = floater(str(df.at[index, 'US Box Office']))
    itlbo = "${:,.0f} (est.)".format(usbo * change)
    # Single-step .at assignment; chained df[col][index] = ... is not
    # guaranteed to write through to df.
    df.at[index, 'International Revenue'] = itlbo

In [None]:
# Scrape inflation data from the internet
page = requests.get(
    'https://www.minneapolisfed.org/about-us/monetary-policy/inflation-calculator/consumer-price-index-1913-',
    timeout=30,
)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# Named table_body rather than data, so the data() function defined earlier
# is not overwritten and the collection cell can still be re-run.
table_body = soup.find('tbody')
if table_body is None:
    raise ValueError('No <tbody> element found on the CPI page; the layout may have changed')
tables = table_body.find_all('td')

# Take the first piece of content from the first child element of every cell
info_list = [elem.findChildren()[0].contents[0] for elem in tables]

# Put the data into lists and combine the lists into a dictionary.
# Every third cell starting at 0 is used as the year and every third cell
# starting at 1 as the CPI value (presumably three cells per table row --
# verify against the page).
year = [value.replace('*', '').replace(u'\xa0', '') for value in info_list[::3]]
cpi = list(info_list[1::3])
inf_dict = dict(zip(year, cpi))

In [None]:
# Calculate inflation for each movie and place it into a new column 'Adjusted Revenue'.
# The CPI table is indexed directly so a missing year raises a KeyError that
# names the year, instead of failing inside floater() on None.
curr_cpi = floater(inf_dict['2021'])
for index in df.index:
    key = str(df.at[index, 'Year'])
    old_cpi = floater(inf_dict[key])
    base = floater(df.at[index, 'International Revenue'])
    adjusted = base * (curr_cpi / old_cpi)
    adjusted = "${:,.0f}".format(adjusted)
    # Single-step .at assignment; chained df[col][index] = ... is not
    # guaranteed to write through to df.
    df.at[index, 'Adjusted Revenue'] = adjusted

In [None]:
# Calculate the average of IMDb and Metacritic ratings and put it into new column Average Rating
for index in df.index:
    rate = replace_rating(df.at[index, 'IMDb Rating'])  # IMDb score scaled to 0-100
    crit = replace_critic(df.at[index, 'Metacritic'])
    average = (rate + crit) / 2
    average = str(average) + '%'
    # Single-step .at assignment; chained df[col][index] = ... is not
    # guaranteed to write through to df.
    df.at[index, 'Average Rating'] = average
df

In [None]:
# Write the data frame to a csv file
import csv  # imported here because csv is not among the imports at the top of the notebook

csv_columns = ['Title', 'Year', 'Actor', 'US Box Office', 'International Revenue',
               'Adjusted Revenue', 'Average Rating', 'IMDb Rating', 'Metacritic']

# The with-block closes the file even if a row fails to write
with open('project007.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(csv_columns)
    for index, row in df.iterrows():
        # Take every value from the current row, so each cell holds one
        # value rather than a whole column
        writer.writerow([row[column] for column in csv_columns])

In [9]:
# function to test if 'No Time to Die' has data yet
def checker():
    """Print whether imdb-api.com has US box office data for 'No Time to Die'.

    Runs the same two requests as the main lookup (search for the title id,
    then fetch the title details), prints the US box office and IMDb rating
    it finds, and reports whether the box office field is populated.

    Raises:
        requests.HTTPError: If either request returns an error status.
    """
    movie_title = 'no time to die'
    timeout = 30  # seconds; stops a stalled connection from hanging the notebook

    # Set initial API string for IMDB
    api_key = "k_nh5thpq8"
    search_url = "https://imdb-api.com/en/API/SearchMovie/" + api_key + "/" + movie_title

    # Get the title ID (first search hit)
    response = requests.get(search_url, timeout=timeout)
    response.raise_for_status()
    id_num = response.json()['results'][0]['id']

    # Use the id number to get the required info
    title_url = "https://imdb-api.com/en/API/Title/" + api_key + "/" + id_num
    response = requests.get(title_url, timeout=timeout)
    response.raise_for_status()
    payload = response.json()

    # Extract relevant data and put it into variables
    us_box = payload['boxOffice']['grossUSA']
    rating = payload['imDbRating']

    # Print the relevant data and decide if No Time To Die is up to date
    print(f'No Time to Die US Box Office: {us_box}')
    print(f'No Time to Die IMDb Rating: {rating}')
    # Truthiness test so that both '' and None count as "no data yet"
    if us_box:
        print('No Time to Die is updated and will use API data')
    else:
        print('No Time to Die is out of date and will use hard-coded Box Office Data')

  if us_box is not '':
