In [1]:
# dependencies
import requests
import json
import pandas as pd
from pprint import pprint
from api_keys import api_key

In [2]:
# parameters
base_url = "https://api.themoviedb.org/3/discover/movie"
region = 'US'
years = range(2000,2024,1)
pages = range(1,100,1)

# filtering / early-exit thresholds (previously inline magic numbers)
min_popularity = 20      # keep movies with popularity strictly above this
min_vote_count = 100     # ...and vote_count strictly above this
max_bad_results = 10     # stop reading a page after this many non-qualifying results
max_bad_pages = 2        # stop reading a year after this many consecutive pages without a qualifying movie
request_timeout = 10     # seconds; without a timeout a stalled connection blocks the cell forever

# empty list for movie ID, name, popularity, avg rating, # of votes, release year
movie_ids = []
movie_names = []
movie_pops = []
movie_vote_avgs = []
movie_vote_counts = []
movie_release_yrs = []

for year in years:
    # consecutive pages (for this year) that yielded no qualifying movie
    bad_page = 0
    for page in pages:
        # finish setting up parameters
        params = {
            'api_key':api_key,
            'include_adult':False,
            'region':region,
            'primary_release_year':year,
            'sort_by':'popularity.desc',
            'page':page
        }

        # set variable to see if there's useful data on the page
        movie_found = False

        try:
            # the request lives inside the try so that a network failure or a
            # non-JSON body is reported for this page instead of aborting the run
            response = requests.get(base_url, params=params, timeout=request_timeout).json()

            # counter for movies that don't fit the requirements so it can break earlier
            bad_result = 0
            for result in response['results']:
                if result['popularity'] > min_popularity and result['vote_count'] > min_vote_count:
                    # read every field BEFORE appending: if one key is missing,
                    # nothing is appended and the six lists stay the same length
                    movie_id = result['id']
                    movie_name = result['title']
                    movie_pop = result['popularity']
                    movie_vote_avg = result['vote_average']
                    movie_vote_count = result['vote_count']

                    movie_found = True
                    movie_ids.append(movie_id)
                    movie_names.append(movie_name)
                    movie_pops.append(movie_pop)
                    movie_vote_avgs.append(movie_vote_avg)
                    movie_vote_counts.append(movie_vote_count)
                    movie_release_yrs.append(year)
                else:
                    # if 10 results (half) on page don't fit requirements, move on
                    bad_result += 1
                    if bad_result >= max_bad_results:
                        break
        except (requests.RequestException, ValueError, KeyError, TypeError):
            # RequestException: network/timeout; ValueError: body was not JSON;
            # KeyError/TypeError: payload did not have the expected shape
            # (e.g. an API error message without a 'results' key)
            print(f'Had an issue getting data from {year}, page: {page}.')

        # print if no data was found on the page
        if movie_found:
            bad_page = 0
            print(f'Got data from year: {year}, page: {page}.')
        else:
            bad_page += 1
            print(f'Not enough useful data on year: {year}, page: {page}.')
        # skip to next year if multiple pages in a row with NO useful info
        if bad_page >= max_bad_pages:
            print(f'Multiple pages w/o useful data, skipping the rest of {year}.')
            break

Got data from year: 2000, page: 1.
Got data from year: 2000, page: 2.
Got data from year: 2000, page: 3.
Got data from year: 2000, page: 4.
Not enough useful data on year: 2000, page: 5.
Not enough useful data on year: 2000, page: 6.
Multiple pages w/o useful data, skipping the rest of 2000.
Got data from year: 2001, page: 1.
Got data from year: 2001, page: 2.
Got data from year: 2001, page: 3.
Got data from year: 2001, page: 4.
Not enough useful data on year: 2001, page: 5.
Not enough useful data on year: 2001, page: 6.
Multiple pages w/o useful data, skipping the rest of 2001.
Got data from year: 2002, page: 1.
Got data from year: 2002, page: 2.
Got data from year: 2002, page: 3.
Got data from year: 2002, page: 4.
Not enough useful data on year: 2002, page: 5.
Not enough useful data on year: 2002, page: 6.
Multiple pages w/o useful data, skipping the rest of 2002.
Got data from year: 2003, page: 1.
Got data from year: 2003, page: 2.
Got data from year: 2003, page: 3.
Got data from ye

Got data from year: 2021, page: 5.
Got data from year: 2021, page: 6.
Got data from year: 2021, page: 7.
Got data from year: 2021, page: 8.
Got data from year: 2021, page: 9.
Got data from year: 2021, page: 10.
Got data from year: 2021, page: 11.
Got data from year: 2021, page: 12.
Got data from year: 2021, page: 13.
Got data from year: 2021, page: 14.
Not enough useful data on year: 2021, page: 15.
Not enough useful data on year: 2021, page: 16.
Multiple pages w/o useful data, skipping the rest of 2021.
Got data from year: 2022, page: 1.
Got data from year: 2022, page: 2.
Got data from year: 2022, page: 3.
Got data from year: 2022, page: 4.
Got data from year: 2022, page: 5.
Got data from year: 2022, page: 6.
Got data from year: 2022, page: 7.
Got data from year: 2022, page: 8.
Got data from year: 2022, page: 9.
Got data from year: 2022, page: 10.
Got data from year: 2022, page: 11.
Got data from year: 2022, page: 12.
Got data from year: 2022, page: 13.
Got data from year: 2022, page:

In [3]:
# assemble df: pair each column label with the list collected for it,
# then use the movie ID as the row index
movie_df = pd.DataFrame(
    dict(
        zip(
            ['Movie ID', 'Title', 'Release Year',
             'Popularity Score', 'Average Rating', 'Number of Ratings'],
            [movie_ids, movie_names, movie_release_yrs,
             movie_pops, movie_vote_avgs, movie_vote_counts],
        )
    )
).set_index('Movie ID')

# export (the 'Movie ID' index is written as the first CSV column)
movie_df.to_csv('../Raw Data/2000s_US_IDs.csv')

movie_df

Unnamed: 0_level_0,Title,Release Year,Popularity Score,Average Rating,Number of Ratings
Movie ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
9600,Big Momma's House,2000,95.366,5.8,2095
11688,The Emperor's New Groove,2000,95.121,7.5,6003
4247,Scary Movie,2000,90.579,6.3,6240
98,Gladiator,2000,74.966,8.2,16701
10867,Malena,2000,74.228,7.4,1961
...,...,...,...,...,...
881164,Boston Strangler,2023,27.471,6.7,434
722149,Luther: The Fallen Sun,2023,27.272,6.8,672
844417,Marlowe,2023,23.636,6.2,160
1068141,Mighty Morphin Power Rangers: Once & Always,2023,21.831,6.6,196


In [None]:
# code by Peter Solis

In [None]:
# code by Azi Farooquee

In [108]:
#this section pulls in movie details for each movie in the movie_df and stores those details in respective list

movid = []
budget = []
revenue = []
genre = []
releasedate = []
runtime = []

# the query parameters are the same for every movie, so build them once
params = {'api_key':api_key}

# movie_df is indexed by 'Movie ID', so iterating the index yields each ID
for mymovie in movie_df.index:
    detail_url  = f"https://api.themoviedb.org/3/movie/{mymovie}"
    try:
        # timeout keeps one stalled connection from hanging the whole cell
        detailresponse = requests.get(detail_url, params=params, timeout=10).json()

        # read every field BEFORE appending anything: if a key is missing the
        # movie is skipped as a whole and the lists stay the same length
        budgetr = detailresponse["budget"]
        revenuer = detailresponse["revenue"]
        releasedater = detailresponse["release_date"]
        runtimer = detailresponse["runtime"]
        genrer = detailresponse["genres"] #this returns a list of dictionaries
        movidr = detailresponse["id"]
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # RequestException: network/timeout; ValueError: body was not JSON;
        # KeyError/TypeError: payload did not have the expected fields
        print(f"Data error with movie {mymovie}")
        continue

    budget.append(budgetr)
    revenue.append(revenuer)
    releasedate.append(releasedater)
    runtime.append(runtimer)
    genre.append(genrer)
    movid.append(movidr)


# sanity check: all of these should be equal
print(len(budget))
print(len(revenue))
print(len(genre))
print(len(releasedate))
print(len(runtime))


3124
3124
3124
3124
3124


In [110]:
#this section combines the movie detail lists into a single dataframe,
#one column per list (column order follows the label order below)
moviedetails_df = pd.DataFrame(
    dict(
        zip(
            ("Movie ID", "Budget", "Revenue", "Genre", "Release Date", "Run Time"),
            (movid, budget, revenue, genre, releasedate, runtime),
        )
    )
)

In [111]:
#show the movie details DF (a bare last expression is rendered as the cell's output)
moviedetails_df

Unnamed: 0,Movie ID,Budget,Revenue,Genre,Release Date,Run Time
0,9600,30000000,173959438,"[{'id': 80, 'name': 'Crime'}, {'id': 35, 'name...",2000-05-31,98
1,11688,100000000,169327687,"[{'id': 12, 'name': 'Adventure'}, {'id': 16, '...",2000-12-15,78
2,4247,19000000,278019771,"[{'id': 35, 'name': 'Comedy'}]",2000-07-07,88
3,98,103000000,465361176,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",2000-05-04,155
4,10867,0,14493284,"[{'id': 18, 'name': 'Drama'}]",2000-03-16,109
...,...,...,...,...,...,...
3119,881164,0,0,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",2023-03-17,112
3120,722149,0,0,"[{'id': 80, 'name': 'Crime'}]",2023-02-24,129
3121,844417,20000000,6262663,"[{'id': 9648, 'name': 'Mystery'}, {'id': 53, '...",2023-02-15,109
3122,1068141,0,0,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",2023-04-19,59


In [112]:
#export moviedetails_df to csv file
#(to_csv writes the row index by default, so the file gets a leading unnamed index column)
moviedetails_df.to_csv("../Raw Data/movie_details.csv")

In [120]:
#This section creates respective lists to store first 3 actors per movie and directors

creditmovie = []
actor1_list = []
actor2_list = []
actor3_list = []
directorslist = []

# the query parameters are the same for every movie, so build them once
params = {'api_key':api_key}

# movie_df is indexed by 'Movie ID', so iterating the index yields each ID
for mymovie2 in movie_df.index:
    credit_url  = f"https://api.themoviedb.org/3/movie/{mymovie2}/credits"
    try:
        # timeout keeps one stalled connection from hanging the whole cell
        response3 = requests.get(credit_url, params=params, timeout=10).json()

        # Collect every value for this movie BEFORE appending anything, so a
        # failure part-way through cannot leave the five lists with different
        # lengths (which would break the DataFrame built from them).
        crmovier = response3["id"]
        cast = response3["cast"]
        actor1 = cast[0]["name"]   # IndexError here -> movie is skipped below
        actor2 = cast[1]["name"]
        # a third billed actor is optional; "null" marks its absence
        actor3 = cast[2]["name"] if len(cast) > 2 else "null"
        # first crew member whose job is "director"; "null" when there is none,
        # so every movie contributes exactly one entry to directorslist
        director = next(
            (member["name"] for member in response3["crew"]
             if member["job"].lower() == "director"),
            "null",
        )
    except IndexError:
        # fewer than two cast members; report the movie being processed
        # (the old message printed a stale variable from the details cell)
        print(f"Index error with movie {mymovie2}")
        continue
    except (requests.RequestException, ValueError, KeyError, TypeError, AttributeError):
        # network/timeout, non-JSON body, or a payload without the expected fields
        print(f"Data error with movie {mymovie2}")
        continue

    creditmovie.append(crmovier)
    actor1_list.append(actor1)
    actor2_list.append(actor2)
    actor3_list.append(actor3)
    directorslist.append(director)


# sanity check: all of these should be equal
print(len(actor1_list))
print(len(actor2_list))
print(len(actor3_list))
print(len(directorslist))

3124
3124
3124
3124


In [121]:
# number of movie IDs collected by the credits loop; it has to equal the
# actor/director list lengths for the credits DataFrame to be constructible
print(len(creditmovie))

3124


In [123]:
#this section combines the movie credit lists into a single dataframe,
#one column per list (column order follows the label order below)
moviecredits_df = pd.DataFrame(
    dict(
        zip(
            ("Movie ID", "Actor 1", "Actor 2", "Actor 3", "Director"),
            (creditmovie, actor1_list, actor2_list, actor3_list, directorslist),
        )
    )
)

In [124]:
#Preview the first five rows of the movie credits df
moviecredits_df.head(5)

Unnamed: 0,Movie ID,Actor 1,Actor 2,Actor 3,Director
0,9600,Martin Lawrence,Nia Long,Paul Giamatti,Raja Gosnell
1,11688,David Spade,John Goodman,Eartha Kitt,Mark Dindal
2,4247,Anna Faris,Jon Abrahams,Marlon Wayans,Keenen Ivory Wayans
3,98,Russell Crowe,Joaquin Phoenix,Connie Nielsen,Ridley Scott
4,10867,Monica Bellucci,Giuseppe Sulfaro,Luciano Federico,Giuseppe Tornatore


In [125]:
#export moviecredits_df to csv file
#(to_csv writes the row index by default, so the file gets a leading unnamed index column)
moviecredits_df.to_csv("../Raw Data/movie_credits.csv")