In [None]:
# dependencies
import requests
import json
import pandas as pd
from pprint import pprint
from api_keys import api_key

In [1]:
# parameters
base_url = "https://api.themoviedb.org/3/discover/movie"
region = 'US'
years = range(2000, 2024)
pages = range(1, 100)

# a result is kept only if it is strictly above both of these thresholds
min_popularity = 20
min_vote_count = 100
# stop reading a page once this many results (half a page) miss the thresholds
max_bad_results = 10
# stop reading a year after this many consecutive pages with no kept movie
max_bad_pages = 2
# seconds to wait for the API before treating the page as failed
request_timeout = 30

# empty list for movie ID, name, popularity, avg rating, # of votes, release year
# (parallel lists: entry k of every list describes the same movie)
movie_ids = []
movie_names = []
movie_pops = []
movie_vote_avgs = []
movie_vote_counts = []
movie_release_yrs = []

for year in years:
    # set variable to keep track of consecutive pages without results
    bad_page = 0
    for page in pages:
        # finish setting up parameters
        params = {
            'api_key': api_key,
            'include_adult': False,
            'region': region,
            'primary_release_year': year,
            'sort_by': 'popularity.desc',
            'page': page
        }

        # set variable to see if there's useful data on the page
        movie_found = False

        # The request itself is inside the try block so that a network failure
        # or a non-JSON reply is handled like any other bad page instead of
        # aborting the whole run.
        try:
            # get data for given year
            response = requests.get(
                base_url, params=params, timeout=request_timeout
            ).json()

            # counter for movies that don't fit the requirements so it can break earlier
            bad_result = 0
            # loop through data
            for result in response['results']:
                # Read every field before appending anything: a missing key then
                # skips the movie as a whole and the parallel lists stay aligned.
                movie_id = result['id']
                title = result['title']
                popularity = result['popularity']
                vote_average = result['vote_average']
                vote_count = result['vote_count']

                if popularity > min_popularity and vote_count > min_vote_count:
                    movie_found = True
                    movie_ids.append(movie_id)
                    movie_names.append(title)
                    movie_pops.append(popularity)
                    movie_vote_avgs.append(vote_average)
                    movie_vote_counts.append(vote_count)
                    movie_release_yrs.append(year)
                else:
                    # if too many results on the page don't fit requirements, move on
                    bad_result += 1
                    if bad_result >= max_bad_results:
                        break
        except (requests.exceptions.RequestException, ValueError, KeyError, TypeError):
            # Only data/transport problems are caught; KeyboardInterrupt and
            # programming errors still propagate.
            print(f'Had an issue getting data from {year}, page: {page}.')

        # print if no data was found on the page
        if movie_found:
            bad_page = 0
            print(f'Got data from year: {year}, page: {page}.')
        else:
            bad_page += 1
            print(f'Not enough useful data on year: {year}, page: {page}.')
        # skip to next year if multiple pages in a row with NO useful info
        if bad_page >= max_bad_pages:
            print(f'Multiple pages w/o useful data, skipping the rest of {year}.')
            break

Got data from year: 2000, page: 1.
Got data from year: 2000, page: 2.
Got data from year: 2000, page: 3.
Got data from year: 2000, page: 4.
Not enough useful data on year: 2000, page: 5.
Not enough useful data on year: 2000, page: 6.
Multiple pages w/o useful data, skipping the rest of 2000.
Got data from year: 2001, page: 1.
Got data from year: 2001, page: 2.
Got data from year: 2001, page: 3.
Got data from year: 2001, page: 4.
Not enough useful data on year: 2001, page: 5.
Not enough useful data on year: 2001, page: 6.
Multiple pages w/o useful data, skipping the rest of 2001.
Got data from year: 2002, page: 1.
Got data from year: 2002, page: 2.
Got data from year: 2002, page: 3.
Got data from year: 2002, page: 4.
Not enough useful data on year: 2002, page: 5.
Not enough useful data on year: 2002, page: 6.
Multiple pages w/o useful data, skipping the rest of 2002.
Got data from year: 2003, page: 1.
Got data from year: 2003, page: 2.
Got data from year: 2003, page: 3.
Got data from ye

Got data from year: 2021, page: 7.
Got data from year: 2021, page: 8.
Got data from year: 2021, page: 9.
Got data from year: 2021, page: 10.
Got data from year: 2021, page: 11.
Got data from year: 2021, page: 12.
Got data from year: 2021, page: 13.
Got data from year: 2021, page: 14.
Not enough useful data on year: 2021, page: 15.
Not enough useful data on year: 2021, page: 16.
Multiple pages w/o useful data, skipping the rest of 2021.
Got data from year: 2022, page: 1.
Got data from year: 2022, page: 2.
Got data from year: 2022, page: 3.
Got data from year: 2022, page: 4.
Got data from year: 2022, page: 5.
Got data from year: 2022, page: 6.
Got data from year: 2022, page: 7.
Got data from year: 2022, page: 8.
Got data from year: 2022, page: 9.
Got data from year: 2022, page: 10.
Got data from year: 2022, page: 11.
Got data from year: 2022, page: 12.
Got data from year: 2022, page: 13.
Got data from year: 2022, page: 14.
Got data from year: 2022, page: 15.
Got data from year: 2022, pag

In [2]:
# assemble df: one column per collected list, indexed by the movie's ID
movie_columns = {
    'Movie ID': movie_ids,
    'Title': movie_names,
    'Release Year': movie_release_yrs,
    'Popularity Score': movie_pops,
    'Average Rating': movie_vote_avgs,
    'Number of Ratings': movie_vote_counts,
}
movie_df = pd.DataFrame(movie_columns).set_index('Movie ID')

# export
movie_df.to_csv('../Raw Data/2000s_US_IDs.csv')

# bare last expression so the notebook displays the frame
movie_df

Unnamed: 0_level_0,Title,Release Year,Popularity Score,Average Rating,Number of Ratings
Movie ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
11688,The Emperor's New Groove,2000,87.669,7.5,6005
4247,Scary Movie,2000,85.388,6.3,6244
9600,Big Momma's House,2000,84.006,5.8,2096
98,Gladiator,2000,72.746,8.2,16703
3134,Baise-moi,2000,67.329,4.9,373
...,...,...,...,...,...
932430,Prom Pact,2023,29.444,7.2,103
818648,Love at First Kiss,2023,28.077,6.2,152
844417,Marlowe,2023,25.164,6.2,160
858408,Dog Gone,2023,21.577,6.3,135


In [None]:
# code by Peter Solis

In [None]:
# code by Azi Farooquee

In [4]:
# This section pulls in movie details for each movie in movie_df and stores
# those details in the respective lists. The lists are parallel: entry k of
# every list belongs to the same movie.

movid = []
budget = []
revenue = []
genre1 = []
genre2 = []
genre3 = []
genre4 = []
genre5 = []
releasedate = []
runtime = []

detail_count = 0
fifty_count = 0

for mymovie in movie_df.index:
    try:
        params = {'api_key': api_key}
        detail_url = f"https://api.themoviedb.org/3/movie/{mymovie}"

        # timeout (seconds) so one stalled request cannot hang the whole loop
        detailresponse = requests.get(detail_url, params=params, timeout=30).json()

        # Read every field BEFORE appending anything. If a key is missing the
        # movie is skipped as a whole, so the lists can never end up with
        # different lengths (which would break the DataFrame built from them).
        movidr = detailresponse["id"]
        budgetr = detailresponse["budget"]
        revenuer = detailresponse["revenue"]
        releasedater = detailresponse["release_date"]
        runtimer = detailresponse["runtime"]
        genrer = detailresponse["genres"]  # this returns a list of dictionaries
        # edit by Peter to format to just the genre names
        genre_parsed = [genre['name'] for genre in genrer]
        # pad to five entries so genre1..genre5 always get a value
        while len(genre_parsed) < 5:
            genre_parsed.append('N/A')
    except (requests.exceptions.RequestException, ValueError, KeyError, TypeError):
        # Only transport/data errors are handled; KeyboardInterrupt and
        # programming errors still propagate.
        print(f"Data error with movie {mymovie}")
        continue

    movid.append(movidr)
    budget.append(budgetr)
    revenue.append(revenuer)
    releasedate.append(releasedater)
    runtime.append(runtimer)
    # only the first five genre names are kept
    genre1.append(genre_parsed[0])
    genre2.append(genre_parsed[1])
    genre3.append(genre_parsed[2])
    genre4.append(genre_parsed[3])
    genre5.append(genre_parsed[4])

    # edit by Peter - just a bit to give some output so I know it's working lol
    detail_count += 1
    if detail_count == 50:
        fifty_count += 1
        detail_count = 0
        print(f'50 * {fifty_count} movies done.')


print(len(budget))
print(len(revenue))
print(len(genre1))
print(len(genre2))
print(len(genre3))
print(len(genre4))
print(len(genre5))
print(len(releasedate))
print(len(runtime))


50 * 1 movies done.
50 * 2 movies done.
50 * 3 movies done.
50 * 4 movies done.
50 * 5 movies done.
50 * 6 movies done.
50 * 7 movies done.
50 * 8 movies done.
50 * 9 movies done.
50 * 10 movies done.
50 * 11 movies done.
50 * 12 movies done.
50 * 13 movies done.
50 * 14 movies done.
50 * 15 movies done.
50 * 16 movies done.
50 * 17 movies done.
50 * 18 movies done.
50 * 19 movies done.
50 * 20 movies done.
50 * 21 movies done.
50 * 22 movies done.
50 * 23 movies done.
50 * 24 movies done.
50 * 25 movies done.
50 * 26 movies done.
50 * 27 movies done.
50 * 28 movies done.
50 * 29 movies done.
50 * 30 movies done.
50 * 31 movies done.
50 * 32 movies done.
50 * 33 movies done.
50 * 34 movies done.
50 * 35 movies done.
50 * 36 movies done.
50 * 37 movies done.
50 * 38 movies done.
50 * 39 movies done.
50 * 40 movies done.
50 * 41 movies done.
50 * 42 movies done.
50 * 43 movies done.
50 * 44 movies done.
50 * 45 movies done.
50 * 46 movies done.
50 * 47 movies done.
50 * 48 movies done.
5

In [5]:
# This section pairs each column name with its movie-details list and builds
# the DataFrame from that mapping (column order follows the order given here).
detail_columns = [
    "Movie ID", "Budget", "Revenue",
    "Genre 1", "Genre 2", "Genre 3", "Genre 4", "Genre 5",
    "Release Date", "Run Time",
]
detail_values = [
    movid, budget, revenue,
    genre1, genre2, genre3, genre4, genre5,
    releasedate, runtime,
]
moviedetails_df = pd.DataFrame(dict(zip(detail_columns, detail_values)))

In [6]:
# Show the movie details DataFrame (bare last expression, so the notebook
# renders it as the cell output).
moviedetails_df

Unnamed: 0,Movie ID,Budget,Revenue,Genre 1,Genre 2,Genre 3,Genre 4,Genre 5,Release Date,Run Time
0,11688,100000000,169327687,Adventure,Animation,Comedy,Family,Fantasy,2000-12-15,78
1,4247,19000000,278019771,Comedy,,,,,2000-07-07,88
2,9600,30000000,173959438,Crime,Comedy,,,,2000-05-31,98
3,98,103000000,465361176,Action,Drama,Adventure,,,2000-05-04,155
4,3134,0,940944,Crime,Drama,Romance,Thriller,,2000-06-28,77
...,...,...,...,...,...,...,...,...,...,...
3163,932430,0,0,Romance,Comedy,,,,2023-03-30,98
3164,818648,0,0,Romance,Comedy,,,,2023-03-03,96
3165,844417,20000000,6262663,Mystery,Thriller,Crime,,,2023-02-15,109
3166,858408,0,0,Family,Adventure,Drama,,,2023-01-13,95


In [7]:
# Export moviedetails_df to a CSV file in "../Raw Data/" (relative path).
# to_csv is called with its defaults, so the row index is written as well.
moviedetails_df.to_csv("../Raw Data/movie_details.csv")

In [22]:
# This section creates the lists that store the first 3 actors and the
# director of each movie in movie_df. The lists are parallel: entry k of every
# list belongs to the same movie.

creditmovie = []
actor1_list = []
actor2_list = []
actor3_list = []
directorslist = []

detail_count = 0
fifty_count = 0

for row_idx, mymovie2 in enumerate(movie_df.index):
    try:
        params = {'api_key': api_key}
        credit_url = f"https://api.themoviedb.org/3/movie/{mymovie2}/credits"

        # timeout (seconds) so one stalled request cannot hang the whole loop
        response3 = requests.get(credit_url, params=params, timeout=30).json()
        crmovier = response3["id"]

        # edit here by Peter to deal with empty / missing cast:
        # take up to three names and pad with 'N/A' so there are always three
        cast = response3.get('cast') or []
        actor = [member.get('name', 'N/A') for member in cast[:3]]
        actor += ['N/A'] * (3 - len(actor))

        # First crew member whose job is "director". Falls back to 'N/A' so
        # that directorslist gets exactly one entry per movie; without the
        # fallback a movie with no director would shift every later director
        # onto the wrong movie.
        director = next(
            (
                member["name"]
                for member in response3["crew"]
                if (member.get("job") or "").lower() == "director"
            ),
            'N/A',
        )
    except (requests.exceptions.RequestException, ValueError, KeyError,
            IndexError, TypeError, AttributeError):
        # Transport/data problems skip this movie as a whole (nothing has been
        # appended yet); KeyboardInterrupt and programming errors propagate.
        print(f"Index error with movie id - {mymovie2} (index - {row_idx})")
        continue

    # all values were read successfully, so append them together
    creditmovie.append(crmovier)
    actor1_list.append(actor[0])
    actor2_list.append(actor[1])
    actor3_list.append(actor[2])
    directorslist.append(director)

    # edit by Peter - just a bit to give some output so I know it's working lol
    detail_count += 1
    if detail_count == 50:
        fifty_count += 1
        detail_count = 0
        print(f'50 * {fifty_count} movies done.')


print(len(actor1_list))
print(len(actor2_list))
print(len(actor3_list))
print(len(directorslist))
print(len(creditmovie))

50 * 1 movies done.
50 * 2 movies done.
50 * 3 movies done.
50 * 4 movies done.
50 * 5 movies done.
50 * 6 movies done.
50 * 7 movies done.
50 * 8 movies done.
50 * 9 movies done.
50 * 10 movies done.
50 * 11 movies done.
50 * 12 movies done.
50 * 13 movies done.
50 * 14 movies done.
50 * 15 movies done.
50 * 16 movies done.
50 * 17 movies done.
50 * 18 movies done.
50 * 19 movies done.
50 * 20 movies done.
50 * 21 movies done.
50 * 22 movies done.
50 * 23 movies done.
50 * 24 movies done.
50 * 25 movies done.
50 * 26 movies done.
50 * 27 movies done.
50 * 28 movies done.
50 * 29 movies done.
50 * 30 movies done.
50 * 31 movies done.
50 * 32 movies done.
50 * 33 movies done.
50 * 34 movies done.
50 * 35 movies done.
50 * 36 movies done.
50 * 37 movies done.
50 * 38 movies done.
50 * 39 movies done.
50 * 40 movies done.
50 * 41 movies done.
50 * 42 movies done.
50 * 43 movies done.
50 * 44 movies done.
50 * 45 movies done.
50 * 46 movies done.
50 * 47 movies done.
50 * 48 movies done.
5

In [23]:
# This section pairs each column name with its movie-credits list and builds
# the DataFrame from that mapping (column order follows the order given here).
credit_columns = ["Movie ID", "Actor 1", "Actor 2", "Actor 3", "Director"]
credit_values = [creditmovie, actor1_list, actor2_list, actor3_list, directorslist]
moviecredits_df = pd.DataFrame(dict(zip(credit_columns, credit_values)))

In [24]:
# Preview the first 5 rows of the movie credits DataFrame.
moviecredits_df.head(5)

Unnamed: 0,Movie ID,Actor 1,Actor 2,Actor 3,Director
0,11688,David Spade,John Goodman,Eartha Kitt,Mark Dindal
1,4247,Anna Faris,Jon Abrahams,Marlon Wayans,Keenen Ivory Wayans
2,9600,Martin Lawrence,Nia Long,Paul Giamatti,Raja Gosnell
3,98,Russell Crowe,Joaquin Phoenix,Connie Nielsen,Ridley Scott
4,3134,Karen Lancaume,Raffaëla Anderson,Ouassini Embarek,Virginie Despentes


In [25]:
# Export moviecredits_df to a CSV file in "../Raw Data/" (relative path).
# to_csv is called with its defaults, so the row index is written as well.
moviecredits_df.to_csv("../Raw Data/movie_credits.csv")