Data is from The Movie DB API https://www.themoviedb.org/movie/top-rated

In [48]:
import os

import numpy as np
import pandas as pd
import requests
import seaborn as sns
from matplotlib import pyplot as plt
from scipy import stats


# with open("api_key.txt","r") as api_key_file:
#     api_key = api_key_file.readline()

## Create DataFrame for the Top Rated Movies

Only the movie IDs from this listing are needed; they are used in the next section to request the full details for each movie.

In [None]:
# Download the first 25 pages of TMDB's top-rated listing and collect every
# entry of each page's 'results' array into one DataFrame.
#
# The bearer token is read from the TMDB_API_TOKEN environment variable so it
# is never stored in the notebook; a missing variable raises KeyError before
# any request is sent.  The token that used to be hardcoded here has been
# exposed and should be revoked and replaced.
headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {os.environ['TMDB_API_TOKEN']}",
}
url = "https://api.themoviedb.org/3/movie/top_rated"

list_of_top_movies = []

for page_num in range(1, 26):
    # timeout keeps a stalled connection from hanging the cell indefinitely.
    response = requests.get(url, params={"page": page_num}, headers=headers, timeout=30)
    # Surface HTTP errors (bad token, rate limit, ...) here instead of as a
    # confusing KeyError on 'results' below.
    response.raise_for_status()
    json_response = response.json()

    list_of_top_movies.extend(json_response['results'])

top_movies = pd.DataFrame(list_of_top_movies)
top_movies

## Create DataFrame for the top rated movies with all detailed info

In [None]:
# Request the full details record for every id in top_movies and build one
# DataFrame row per movie from the JSON responses.
#
# The bearer token is read from the TMDB_API_TOKEN environment variable so it
# is never stored in the notebook; a missing variable raises KeyError before
# any request is sent.  The token that used to be hardcoded here has been
# exposed and should be revoked and replaced.
headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {os.environ['TMDB_API_TOKEN']}",
}

list_of_detailed_movies = []

# A single Session reuses the underlying connection across the per-movie
# requests instead of opening a new one each time.
with requests.Session() as session:
    session.headers.update(headers)

    for movie_id in top_movies['id']:
        url = f"https://api.themoviedb.org/3/movie/{movie_id}"

        # timeout keeps a stalled connection from hanging the cell indefinitely.
        response = session.get(url, timeout=30)
        # Without this check an error payload would be appended as if it were
        # a movie and end up as a bogus row in the DataFrame.
        response.raise_for_status()
        list_of_detailed_movies.append(response.json())

movies_detailed = pd.DataFrame(list_of_detailed_movies)

movies_detailed

Keep only the columns needed for the analysis, add the release year and month, and drop movies with no reported revenue or a runtime under 60 minutes.

In [None]:
# Build the analysis frame `movies` from the detailed records: select the
# columns of interest, derive release year/month, filter rows, fix dtypes.
columns_of_interest = [
    'budget', 'id', 'origin_country', 'original_language', 'popularity',
    'release_date', 'revenue', 'runtime', 'title', 'vote_average', 'vote_count',
]

# .copy() makes `movies` an independent frame, so the column assignments below
# are not writes into a slice of movies_detailed (the pattern that triggers
# SettingWithCopyWarning on pandas versions without copy-on-write).
movies = movies_detailed[columns_of_interest].copy()

movies['release_date'] = pd.to_datetime(movies['release_date'])

movies['release_year'] = movies['release_date'].dt.year
movies['release_month'] = movies['release_date'].dt.month

# Keep only movies with positive revenue and a runtime of at least 60.
has_revenue = movies['revenue'] > 0
long_enough = movies['runtime'] >= 60
# Copy again for the same reason: the dtype casts below assign columns on the
# filtered result.
movies = movies[has_revenue & long_enough].copy()

movies['runtime'] = movies['runtime'].astype(int)
movies['vote_average'] = movies['vote_average'].astype(float)
display(movies)

## Runtime vs Rating

In [None]:
# Plot runtime against average vote with a fitted regression line (no
# confidence band), then report summary statistics for the same data.
# No filtering happens in this cell; `movies` is used as it stands.

sns.set_theme()
ax = sns.regplot(data=movies, x='runtime', y='vote_average', ci=None)
ax.set_title('Runtime vs Vote Average')

runtimes = movies['runtime']
ratings = movies['vote_average']

print(f"length: {len(movies)}")
print(f"std: {np.std(runtimes)}")
print(f"average runtime: {runtimes.mean()}")
# Pearson correlation between runtime and average vote.
display(stats.pearsonr(runtimes, ratings))
display(ax)


## Revenue vs Rating

In [None]:
# Normalize revenue by budget so movies of different scale are comparable:
#     percent_of_revenue = ((revenue - budget) / budget) * 100

# Rows above this percent_of_revenue are treated as outliers that sway the
# correlation's p-value; they are displayed and then excluded.
max_percent_of_revenue = 6000

# Cast budget on a derived frame rather than assigning into `movies`, which is
# a filtered frame owned by an earlier cell.
movies_revenue = movies.assign(budget=movies['budget'].astype(int))

# A zero budget would divide by zero below; zero revenue carries no signal.
has_budget_and_revenue = (movies_revenue['budget'] != 0) & (movies_revenue['revenue'] != 0)
movies_revenue = movies_revenue[has_budget_and_revenue].copy()

movies_revenue['percent_of_revenue'] = ((movies_revenue['revenue'] - movies_revenue['budget']) / movies_revenue['budget']) * 100

# Show exactly the rows that get removed.  The kept set is `<=` so a row
# sitting precisely on the threshold is no longer dropped without being shown.
display(movies_revenue[movies_revenue['percent_of_revenue'] > max_percent_of_revenue])
movies_revenue = movies_revenue[movies_revenue['percent_of_revenue'] <= max_percent_of_revenue]

ax = sns.regplot(x='vote_average', y='percent_of_revenue', data=movies_revenue, ci=None)
ax.set_title("Revenue vs Rating")
# Pearson correlation between percent_of_revenue and average vote.
display(stats.pearsonr(movies_revenue.percent_of_revenue, movies_revenue.vote_average))
display(ax)

## Number of Released Movies by Month and Mean Revenue by Month Released
Plot the number of top rated movies that were released in any given month and the average revenue of those movies by month

In [None]:
# Per release month: total revenue, mean revenue, and number of movies.
# Named aggregation yields the flat column names directly, so no manual
# renaming of the columns is needed afterwards.
data = (
    movies
    .groupby(['release_month'])
    .agg(
        revenue_sum=('revenue', 'sum'),
        revenue_mean=('revenue', 'mean'),
        count=('release_month', 'count'),
    )
    .reset_index()
)
display(data)

# Two side-by-side panels: movie count per month, mean revenue per month.
# revenue_sum is computed above but not plotted.
fig, axes = plt.subplots(1, 2, figsize=(14, 7))
count_ax, revenue_ax = axes

sns.barplot(data=data, x='release_month', y='count', ax=count_ax)
count_ax.set(title='Number of Top Rated Movies Released by Month')

sns.barplot(data=data, x='release_month', y='revenue_mean', color='green', ax=revenue_ax)
revenue_ax.set(title='Average Revenue of Top Rated Movies by Month')


## Popular Movie Average Ratings
Histogram of the voted average rating of current popular movies

Movies with 10 or fewer votes are dropped so that ratings based on only a handful of votes do not skew the distribution.

In [None]:
# Download the first 25 pages of TMDB's popular listing, collect every entry
# of each page's 'results' array into one DataFrame, and keep only movies
# with more than 10 votes.
#
# The bearer token is read from the TMDB_API_TOKEN environment variable so it
# is never stored in the notebook; a missing variable raises KeyError before
# any request is sent.  The token that used to be hardcoded here has been
# exposed and should be revoked and replaced.
headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {os.environ['TMDB_API_TOKEN']}",
}
url = "https://api.themoviedb.org/3/movie/popular"

list_of_popular_movies = []

for page_num in range(1, 26):
    # timeout keeps a stalled connection from hanging the cell indefinitely.
    response = requests.get(url, params={"page": page_num}, headers=headers, timeout=30)
    # Surface HTTP errors (bad token, rate limit, ...) here instead of as a
    # confusing KeyError on 'results' below.
    response.raise_for_status()
    json_response = response.json()

    list_of_popular_movies.extend(json_response['results'])

popular_movies = pd.DataFrame(list_of_popular_movies)
# Drop movies whose average rating rests on 10 or fewer votes.
popular_movies = popular_movies[popular_movies['vote_count'] > 10]
popular_movies

In [None]:
# Summary statistics and histogram of vote_average for the popular movies.
popular_ratings = popular_movies['vote_average']

print(f"average rating: {popular_ratings.mean()}")
print(f"std: {np.std(popular_ratings)}")

rating_grid = sns.displot(data=popular_movies, x="vote_average")
rating_grid.set(title='Distribution of Popular Movie Ratings')