In [None]:
# code done by Azi Farooquee and Peter Solis
# dependencies
import requests
import json
import pandas as pd
from pprint import pprint
from api_keys import api_key

In [None]:
# Collect basic information (id, title, popularity, rating, votes, release year)
# for popular US releases by paging through the TMDB "discover movie" endpoint
# one release year at a time.

# parameters
base_url = "https://api.themoviedb.org/3/discover/movie"
region = 'US'
years = range(2000, 2024)   # release years 2000 through 2023
pages = range(1, 100)       # result pages 1 through 99 for each year

# thresholds used by the filtering and early-exit logic below
min_popularity = 15     # a movie is kept only if its popularity is above this...
min_vote_count = 100    # ...and its vote count is above this
max_bad_results = 10    # stop scanning a page after this many rejected results
max_bad_pages = 2       # skip the rest of a year after this many useless pages in a row
request_timeout = 10    # seconds; without a timeout a stalled connection blocks forever

# one list per output column; position n in every list describes the same movie
movie_ids = []
movie_names = []
movie_pops = []
movie_vote_avgs = []
movie_vote_counts = []
movie_release_yrs = []

for year in years:
    # number of consecutive pages that produced no accepted movie
    bad_page = 0
    for page in pages:
        params = {
            'api_key': api_key,
            'include_adult': False,
            'region': region,
            'primary_release_year': year,
            'sort_by': 'popularity.desc',
            'page': page
        }

        # becomes True as soon as one result on this page passes the filter
        movie_found = False

        try:
            # The request is made inside the try block so that a network
            # failure or a non-JSON body is reported for this page only,
            # instead of aborting the whole multi-year run.
            response = requests.get(
                base_url, params=params, timeout=request_timeout
            ).json()

            # counter for results that don't fit the requirements, so the
            # page can be abandoned early
            bad_result = 0
            for result in response['results']:
                if result['popularity'] > min_popularity and result['vote_count'] > min_vote_count:
                    # Read every remaining field BEFORE appending anything: if
                    # a key is missing, no list has grown yet, so the column
                    # lists can never end up with different lengths.
                    movie_id = result['id']
                    title = result['title']
                    vote_average = result['vote_average']

                    movie_found = True
                    movie_ids.append(movie_id)
                    movie_names.append(title)
                    movie_pops.append(result['popularity'])
                    movie_vote_avgs.append(vote_average)
                    movie_vote_counts.append(result['vote_count'])
                    movie_release_yrs.append(year)
                else:
                    # Results are requested in descending popularity, so once
                    # enough of them have been rejected the page is abandoned.
                    bad_result += 1
                    if bad_result >= max_bad_results:
                        break
        except (requests.RequestException, ValueError, KeyError, TypeError):
            # RequestException: connection problem or timeout
            # ValueError:       body was not valid JSON
            # KeyError:         payload without 'results' or an expected field
            # TypeError:        a field had an unexpected type (e.g. None)
            print(f'Had an issue getting data from {year}, page: {page}.')

        # report the outcome for this page and track consecutive misses
        if movie_found:
            bad_page = 0
            print(f'Got data from year: {year}, page: {page}.')
        else:
            bad_page += 1
            print(f'Not enough useful data on year: {year}, page: {page}.')

        # skip to next year if multiple pages in a row had NO useful info
        if bad_page >= max_bad_pages:
            print(f'Multiple pages w/o useful data, skipping the rest of {year}.')
            break

In [None]:
# Build the movie table from the collected column lists and key it by the
# movie id returned by the API.
movie_df = pd.DataFrame(
    {
        'Movie ID': movie_ids,
        'Title': movie_names,
        'Release Year': movie_release_yrs,
        'Popularity Score': movie_pops,
        'Average Rating': movie_vote_avgs,
        'Number of Ratings': movie_vote_counts,
    }
).set_index('Movie ID')

# Write the table to CSV.
movie_df.to_csv(path_or_buf='../Raw Data/2000s_US_IDs.csv')

# Last expression of the cell, so the notebook renders the frame.
movie_df

In [None]:
# The cells above this point were written by Peter Solis.

In [None]:
# The cells below this point were written by Azi Farooquee.

In [None]:
# This section pulls the detail record of every movie in movie_df and stores
# the fields of interest in one list per output column. Position n in every
# list describes the same movie.

movid = []
budget = []
revenue = []
genre1 = []
genre2 = []
genre3 = []
genre4 = []
genre5 = []
releasedate = []
runtime = []

# the query parameters are identical for every request, so build them once
params = {'api_key':api_key}

for mymovie in movie_df.index:
    try:
        detail_url = f"https://api.themoviedb.org/3/movie/{mymovie}"
        # timeout (seconds) so a stalled connection cannot hang the loop
        detailresponse = requests.get(detail_url, params=params, timeout=10).json()

        # Read EVERY field before appending anything. If a key is missing the
        # whole movie is skipped and no list has grown, so all lists keep the
        # same length and the DataFrame built from them stays aligned.
        movidr = detailresponse["id"]
        budgetr = detailresponse["budget"]
        revenuer = detailresponse["revenue"]
        releasedater = detailresponse["release_date"]
        runtimer = detailresponse["runtime"]
        genrer = detailresponse["genres"]  # this is a list of dictionaries

        # keep only the genre names, padded with 'N/A' / cut to exactly 5 entries
        genre_parsed = ([genre['name'] for genre in genrer] + ['N/A'] * 5)[:5]
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # RequestException: connection problem or timeout
        # ValueError:       body was not valid JSON
        # KeyError:         payload without one of the expected fields
        # TypeError:        a field had an unexpected type (e.g. genres is None)
        print(f"Data error with movie {mymovie}")
        continue

    # the row is complete: commit it to all column lists at once
    movid.append(movidr)
    budget.append(budgetr)
    revenue.append(revenuer)
    releasedate.append(releasedater)
    runtime.append(runtimer)
    genre1.append(genre_parsed[0])
    genre2.append(genre_parsed[1])
    genre3.append(genre_parsed[2])
    genre4.append(genre_parsed[3])
    genre5.append(genre_parsed[4])

    # progress output after every 50 successfully stored movies
    if len(movid) % 50 == 0:
        print(f'50 * {len(movid) // 50} movies done.')

# sanity check: every column list should report the same length
for column in (budget, revenue, genre1, genre2, genre3, genre4, genre5,
               releasedate, runtime):
    print(len(column))


In [None]:
# Combine the movie detail lists into a single DataFrame, one column per list.
moviedetails_df = pd.DataFrame(
    data={
        "Movie ID": movid,
        "Budget": budget,
        "Revenue": revenue,
        "Genre 1": genre1,
        "Genre 2": genre2,
        "Genre 3": genre3,
        "Genre 4": genre4,
        "Genre 5": genre5,
        "Release Date": releasedate,
        "Run Time": runtime,
    }
)

In [None]:
# Display the movie details DataFrame as the output of this cell.
moviedetails_df

In [None]:
# Write the movie details DataFrame to a CSV file in the raw data folder.
moviedetails_df.to_csv(path_or_buf="../Raw Data/movie_details.csv")

In [None]:
# This section pulls the credits of every movie in movie_df and stores the
# first 5 entries of the cast list plus the first crew member whose job is
# "director" in one list per output column. Position n in every list
# describes the same movie.

creditmovie = []
actor1_list = []
actor2_list = []
actor3_list = []
actor4_list = []
actor5_list = []
directorslist = []

# the query parameters are identical for every request, so build them once
params = {'api_key':api_key}

for i, mymovie2 in enumerate(movie_df.index):
    try:
        credit_url = f"https://api.themoviedb.org/3/movie/{mymovie2}/credits"
        # timeout (seconds) so a stalled connection cannot hang the loop
        response3 = requests.get(credit_url, params=params, timeout=10).json()

        # Gather the complete row first and append afterwards, so a failure
        # part-way through never leaves the column lists at different lengths.
        crmovier = response3["id"]

        # first five cast names; any missing slot is filled with 'N/A'
        cast = response3.get('cast') or []
        actor = []
        for slot in range(5):   # not named `i`, which is the movie's index
            try:
                actor.append(cast[slot]['name'])
            except (IndexError, KeyError, TypeError):
                actor.append('N/A')

        # A movie without a director entry still needs a value here, otherwise
        # directorslist ends up shorter than the other columns and the
        # DataFrame built from them cannot be constructed.
        director = 'N/A'
        for dirname in response3.get("crew") or []:
            if dirname["job"].lower() == "director":
                director = dirname["name"]
                break
    except (requests.RequestException, ValueError, KeyError, TypeError,
            AttributeError, IndexError):
        # RequestException:         connection problem or timeout
        # ValueError:               body was not valid JSON
        # KeyError:                 payload without "id" or a crew field
        # TypeError/AttributeError: a field had an unexpected type
        print(f"Data error with movie id - {mymovie2} (index - {i})")
        continue

    # the row is complete: commit it to all column lists at once
    creditmovie.append(crmovier)
    actor1_list.append(actor[0])
    actor2_list.append(actor[1])
    actor3_list.append(actor[2])
    actor4_list.append(actor[3])
    actor5_list.append(actor[4])
    directorslist.append(director)

    # progress output after every 50 successfully stored movies
    if len(creditmovie) % 50 == 0:
        print(f'50 * {len(creditmovie) // 50} movies done.')

# sanity check: every column list should report the same length
for column in (actor1_list, actor2_list, actor3_list, actor4_list,
               actor5_list, directorslist, creditmovie):
    print(len(column))

In [None]:
# Combine the movie credit lists into a single DataFrame, one column per list.
moviecredits_df = pd.DataFrame(
    data={
        "Movie ID": creditmovie,
        "Actor 1": actor1_list,
        "Actor 2": actor2_list,
        "Actor 3": actor3_list,
        "Actor 4": actor4_list,
        "Actor 5": actor5_list,
        "Director": directorslist,
    }
)

In [None]:
# Display the movie credits DataFrame as the output of this cell.
moviecredits_df

In [None]:
# Write the movie credits DataFrame to a CSV file in the raw data folder.
moviecredits_df.to_csv(path_or_buf="../Raw Data/movie_credits.csv")