In [2]:
# Dependencies
import requests
import json
from config import tmdb_key, omdb_api_key
from pprint import pprint
import pandas as pd

In [3]:
# Set the number of results per page and the total number of desired results
total_result_per_page = 20
total_wanted_results = 100
# Calculate the total number of pages needed to retrieve the desired results
total_pages = int(total_wanted_results / total_result_per_page) #50
current_page = 1

# Function to fetch data from a specific page using the API
def fetch_page(page_number):
        # Construct the URL with the page number to fetch data from
    url = f"https://api.themoviedb.org/3/discover/movie?&include_video=false&language=en-US&sort_by=popularity.desc&page={page_number}&region=US&with_origin_country=US"
     # Set the required headers for the API request
    headers = {
        "accept": "application/json",
        "Authorization": tmdb_key
    }
    # Make a GET request to the API and retrieve the data in JSON format
    response = requests.get(url, headers=headers).json()
    # Return the results from the response
    return response["results"]

# Initialize an empty list to store all retrieved movies
all_movies = []
#Loop through the range of page numbers to fetch data from each page
for page_number in range(1, total_pages + 1):
    all_movies += fetch_page(page_number)

#Print the total number of movies retrieved
print(len(all_movies))    

#Please uncomment the following if you'd like to check the json outcome
# print(len(response["results"]))
# print(json.dumps(response, indent=4, sort_keys=True))

100


In [4]:
# #collect titles for omdb fetch
# imdb_titles = []
# for imdb_id in all_movies:
#     imdb_titles.append(imdb_id['title'])

# imdb_titles   

In [5]:
# Function to fetch details of a specific movie using the API, from this details we wanted the finantial information
def fetch_movie_details(movie_id):
    # Construct the URL with the movie ID to fetch details
    url = f"https://api.themoviedb.org/3/movie/{movie_id}"
    headers = {
        "accept": "application/json",
        "Authorization": tmdb_key
    }
    # Make a GET request to the API and retrieve the movie details in JSON format
    response = requests.get(url, headers=headers).json()
    # Return the movie details from the response
    return response

In [6]:
# Function to fetch omdb data 
def fetch_movie_omdb(movie_title):
    url = "http://www.omdbapi.com/?apikey=" + omdb_api_key + "&t="
    # Make a request to the OMDb API for each movie in the list.
    omdb_data = requests.get(url + movie_title).json()
    # Return the omdb data from the response
    return omdb_data

In [7]:
# Iterate through each movie in the list of all movies
for movie in all_movies:
    # Fetch details of the current movie using its ID
    movie_details = fetch_movie_details(movie["id"])
    ## Fetch additional movie details(ratings) from the OMDb API using the movie title
    movie_omdb = fetch_movie_omdb(movie['title'])
    # Update the movie dictionary with the fetched movie details
    movie.update(movie_details)
    movie.update(movie_omdb)

    #Create a key,value pair to determine if movie is financially successful based on revenue and budget comparison
    movie["financial_success"] = movie["revenue"] > movie["budget"]

In [22]:
# extract rating data one layer up
imdb_rating_name = "Internet Movie Database"
rotten_tomatoes_rating_name = "Rotten Tomatoes"
metacritic_rating_name =  'Metacritic'
    # search for imdb ratings 
for movie in all_movies:
    for rating in movie["Ratings"]:
        if imdb_rating_name == rating["Source"]:
            movie["imdb_rating"] = float(rating["Value"].split("/")[0])
        elif rotten_tomatoes_rating_name == rating["Source"]:
            movie["rotten_tomatoes_rating"] = float(rating["Value"].split("%")[0])
        elif metacritic_rating_name == rating["Source"]:
            movie["metacritic_rating"] = float(rating["Value"].split("/")[0])

In [23]:
#Pretty printing our first outcome 
pprint(all_movies[0])

{'Actors': 'Ryan Reynolds, Morena Baccarin, T.J. Miller',
 'Awards': '29 wins & 78 nominations',
 'BoxOffice': '$363,070,709',
 'Country': 'United States',
 'DVD': 'N/A',
 'Director': 'Tim Miller',
 'Genre': 'Action, Comedy',
 'Language': 'English',
 'Metascore': '65',
 'Plot': 'A wisecracking mercenary gets experimented on and becomes immortal '
         'yet hideously scarred, and sets out to track down the man who ruined '
         'his looks.',
 'Poster': 'https://m.media-amazon.com/images/M/MV5BYzE5MjY1ZDgtMTkyNC00MTMyLThhMjAtZGI5OTE1NzFlZGJjXkEyXkFqcGdeQXVyNjU0OTQ0OTY@._V1_SX300.jpg',
 'Production': 'N/A',
 'Rated': 'R',
 'Ratings': [{'Source': 'Internet Movie Database', 'Value': '8.0/10'},
             {'Source': 'Rotten Tomatoes', 'Value': '85%'},
             {'Source': 'Metacritic', 'Value': '65/100'}],
 'Released': '12 Feb 2016',
 'Response': 'True',
 'Runtime': '108 min',
 'Title': 'Deadpool',
 'Type': 'movie',
 'Website': 'N/A',
 'Writer': 'Rhett Reese, Paul Wernick',
 'Ye

In [47]:
#Creating a dataframe
all_movies_df = pd.DataFrame(all_movies)
all_movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 56 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   adult                   100 non-null    bool   
 1   backdrop_path           100 non-null    object 
 2   genre_ids               100 non-null    object 
 3   id                      100 non-null    int64  
 4   original_language       100 non-null    object 
 5   original_title          100 non-null    object 
 6   overview                100 non-null    object 
 7   popularity              100 non-null    float64
 8   poster_path             100 non-null    object 
 9   release_date            100 non-null    object 
 10  title                   100 non-null    object 
 11  video                   100 non-null    bool   
 12  vote_average            100 non-null    float64
 13  vote_count              100 non-null    int64  
 14  belongs_to_collection   52 non-null     obj

In [48]:
#Displaying all columns 
all_movies_df.columns

Index(['adult', 'backdrop_path', 'genre_ids', 'id', 'original_language',
       'original_title', 'overview', 'popularity', 'poster_path',
       'release_date', 'title', 'video', 'vote_average', 'vote_count',
       'belongs_to_collection', 'budget', 'genres', 'homepage', 'imdb_id',
       'origin_country', 'production_companies', 'production_countries',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'Title',
       'Year', 'Rated', 'Released', 'Runtime', 'Genre', 'Director', 'Writer',
       'Actors', 'Plot', 'Language', 'Country', 'Awards', 'Poster', 'Ratings',
       'Metascore', 'imdbRating', 'imdbVotes', 'imdbID', 'Type', 'DVD',
       'BoxOffice', 'Production', 'Website', 'Response', 'financial_success',
       'imdb_rating', 'rotten_tomatoes_rating', 'metacritic_rating'],
      dtype='object')

In [31]:
# Selecting specific columns from the DataFrame
all_movies_df = all_movies_df[["imdb_id", "title","Genre","popularity","vote_average", "vote_count","budget","revenue","financial_success","Rated","Actors","imdbRating","imdbVotes","imdb_rating", "rotten_tomatoes_rating","metacritic_rating"]]
#Displaying DataFrame
all_movies_df

Unnamed: 0,imdb_id,title,Genre,popularity,vote_average,vote_count,budget,revenue,financial_success,Rated,Actors,imdbRating,imdbVotes,imdb_rating,rotten_tomatoes_rating,metacritic_rating
0,tt6263850,Deadpool & Wolverine,"Action, Comedy",3094.232,7.756,2654,200000000,1262160117,True,R,"Ryan Reynolds, Morena Baccarin, T.J. Miller",8.0,1159415,8.0,85.0,65.0
1,tt22022452,Inside Out 2,"Animation, Adventure, Comedy",1513.584,7.678,3338,200000000,1667589398,True,PG,"Amy Poehler, Maya Hawke, Kensington Tallman",7.8,111985,7.8,91.0,
2,tt4978420,Borderlands,"Action, Adventure, Comedy",1463.361,5.727,342,115000000,30863794,False,PG-13,"Cate Blanchett, Kevin Hart, Edgar Ramírez",4.4,6140,4.4,8.0,
3,tt7510222,Despicable Me 4,"Animation, Adventure, Comedy",1346.005,7.222,1439,100000000,915971490,True,PG,"Steve Carell, Kristen Wiig, Pierre Coffin",6.2,26168,6.2,56.0,
4,tt26753003,Trap,"Crime, Horror, Mystery",1166.020,6.545,810,30000000,76500703,True,PG-13,"Josh Hartnett, Ariel Donoghue, Saleka Shyamalan",6.2,25924,6.2,50.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,tt9044418,MR-9: Do or Die,"Action, Drama, Thriller",149.219,6.500,139,6700000,7100000,True,,"Frank Grillo, Michael Jai White, Kelly Greyson",3.7,1109,3.7,,
96,tt13622970,Moana 2,"Animation, Adventure, Comedy",148.164,0.000,0,0,0,False,,"Dwayne Johnson, Auli'i Cravalho, Alan Tudyk",,,,,
97,tt9214772,Monkey Man,"Action, Crime, Thriller",147.932,7.005,789,10000000,34578510,True,R,"Dev Patel, Sharlto Copley, Pitobash",6.9,68063,6.9,89.0,
98,tt23137904,Rebel Moon - Part Two: The Scargiver,"Action, Adventure, Drama",160.593,6.127,1050,83000000,0,False,PG-13,"Sofia Boutella, Djimon Hounsou, Ed Skrein",5.3,50842,5.3,,


In [32]:
# Drop rows with any missing values
all_movies_df.dropna(inplace=True)
# Display the length of the DataFrame after dropping rows with missing values
print(len(all_movies_df))
# Display the first few rows of the cleaned DataFrame
all_movies_df.head()

37


Unnamed: 0,imdb_id,title,Genre,popularity,vote_average,vote_count,budget,revenue,financial_success,Rated,Actors,imdbRating,imdbVotes,imdb_rating,rotten_tomatoes_rating,metacritic_rating
0,tt6263850,Deadpool & Wolverine,"Action, Comedy",3094.232,7.756,2654,200000000,1262160117,True,R,"Ryan Reynolds, Morena Baccarin, T.J. Miller",8.0,1159415,8.0,85.0,65.0
19,tt27682129,Prey,"Action, Adventure, Horror",645.6,6.433,210,0,0,False,R,"Amber Midthunder, Dakota Beavers, Dane DiLiegro",7.1,233550,7.1,94.0,71.0
20,tt0094721,Beetlejuice,"Comedy, Fantasy",636.789,7.385,6762,15000000,74849333,True,PG,"Alec Baldwin, Geena Davis, Michael Keaton",7.5,348874,7.5,86.0,70.0
28,tt0295701,xXx,"Action, Adventure, Thriller",406.596,5.937,4172,70000000,277448382,True,PG-13,"Vin Diesel, Asia Argento, Marton Csokas",5.8,187525,5.8,48.0,48.0
33,tt23778968,Thelma,"Drama, Fantasy, Horror",294.782,7.1,93,5000000,9818454,True,Not Rated,"Eili Harboe, Kaya Wilkins, Henrik Rafaelsen",7.0,37145,7.0,92.0,74.0


In [38]:
# outcome
# rules: financial_success == true && imdb_rating > 7 && rotten_tomatoes_rating > 60 && metacritic_rating > 50

# Create a mask for each condition
mask_financial_success = all_movies_df['financial_success'] == True
mask_imdb_rating = all_movies_df['imdb_rating'] > 7.0
mask_rotten_tomatoes_rating = all_movies_df['rotten_tomatoes_rating'] > 60
mask_metacritic_rating = all_movies_df['metacritic_rating'] > 50

In [42]:
# Combine the masks using the bitwise AND operator &
all_movies_df['outcome'] = mask_financial_success & mask_imdb_rating & mask_rotten_tomatoes_rating & mask_metacritic_rating
all_movies_df['outcome'] = all_movies_df['outcome'].astype(int)

In [43]:
#display dataframe
all_movies_df.head()

Unnamed: 0,imdb_id,title,Genre,popularity,vote_average,vote_count,budget,revenue,financial_success,Rated,Actors,imdbRating,imdbVotes,imdb_rating,rotten_tomatoes_rating,metacritic_rating,outcome
0,tt6263850,Deadpool & Wolverine,"Action, Comedy",3094.232,7.756,2654,200000000,1262160117,True,R,"Ryan Reynolds, Morena Baccarin, T.J. Miller",8.0,1159415,8.0,85.0,65.0,1
19,tt27682129,Prey,"Action, Adventure, Horror",645.6,6.433,210,0,0,False,R,"Amber Midthunder, Dakota Beavers, Dane DiLiegro",7.1,233550,7.1,94.0,71.0,0
20,tt0094721,Beetlejuice,"Comedy, Fantasy",636.789,7.385,6762,15000000,74849333,True,PG,"Alec Baldwin, Geena Davis, Michael Keaton",7.5,348874,7.5,86.0,70.0,1
28,tt0295701,xXx,"Action, Adventure, Thriller",406.596,5.937,4172,70000000,277448382,True,PG-13,"Vin Diesel, Asia Argento, Marton Csokas",5.8,187525,5.8,48.0,48.0,0
33,tt23778968,Thelma,"Drama, Fantasy, Horror",294.782,7.1,93,5000000,9818454,True,Not Rated,"Eili Harboe, Kaya Wilkins, Henrik Rafaelsen",7.0,37145,7.0,92.0,74.0,0


In [45]:
all_movies_df.to_csv("movies.csv")