In [1]:
import requests
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from scipy import stats


# Read the API key from a local file so the credential is not hardcoded in the notebook.
with open("api_key.txt", "r") as api_key_file:
    # readline() keeps the trailing newline; strip it so the key can be placed
    # in a request URL without an embedded "\n" or stray whitespace.
    api_key = api_key_file.readline().strip()


## Create DataFrame for the Top Rated Movies

From this listing we are only really interested in the ID of each movie; the IDs are used below to fetch each movie's detailed record.

In [None]:
# Collect pages 1-20 of the "top rated" movie listing into one list of records,
# then turn that list into a DataFrame.
list_of_top_movies = []

url = "https://api.themoviedb.org/3/movie/top_rated"
headers = {"accept": "application/json"}  # identical for every page, so built once

for page_num in range(1, 21):
    # Passing the key and page through `params` lets requests build and escape
    # the query string; `timeout` stops a stalled connection from hanging the kernel.
    response = requests.get(
        url,
        headers=headers,
        params={"api_key": api_key, "page": page_num},
        timeout=30,
    )
    # Surface HTTP failures (bad key, rate limit, ...) here rather than as a
    # confusing KeyError on 'results' further down.
    response.raise_for_status()
    json_response = response.json()

    list_of_top_movies.extend(json_response['results'])

top_movies = pd.DataFrame(list_of_top_movies)
top_movies

## Create DataFrame for the top rated movies with all detailed info

In [None]:
# Fetch the detailed record for every movie ID in `top_movies` and build a
# DataFrame with one row per movie.
list_of_detailed_movies = []

headers = {"accept": "application/json"}  # identical for every request, so built once

# Iterate over the 'id' column directly: no other column is needed, and using
# `movie_id` avoids shadowing the builtin `id` for the rest of the session.
for movie_id in top_movies['id']:
    url = f"https://api.themoviedb.org/3/movie/{movie_id}"

    # `timeout` stops a stalled connection from hanging the kernel.
    response = requests.get(
        url,
        headers=headers,
        params={"api_key": api_key},
        timeout=30,
    )
    # Stop on an HTTP error instead of appending the error payload as if it
    # were a movie record, which would add junk rows to the DataFrame.
    response.raise_for_status()
    list_of_detailed_movies.append(response.json())

movies_detailed = pd.DataFrame(list_of_detailed_movies)

movies_detailed

## Runtime vs Rating

In [None]:
# Keep only movies whose runtime is at least 60. The astype() call returns a new
# frame, so `movies_detailed` itself is left untouched by the dtype casts.
long_enough = movies_detailed['runtime'] >= 60
movies_filtered = movies_detailed[long_enough].astype({'runtime': int, 'vote_average': float})

# Scatter plot of runtime against rating with a fitted regression line (no confidence band).
sns.set_theme()
ax = sns.regplot(data=movies_filtered, x='runtime', y='vote_average', ci=None)
ax.set_title('Runtime vs Vote Average')

# Summary numbers for the filtered sample, followed by the Pearson correlation result.
runtimes = movies_filtered['runtime']
print(f'length: {len(movies_filtered)}')
print(f'std: {np.std(runtimes)}')
print(f'average runtime: {runtimes.mean()}')
display(stats.pearsonr(runtimes, movies_filtered['vote_average']))
display(ax)


## Revenue vs Rating

In [None]:
# Normalise revenue against budget by adding a new attribute:
#   percent_of_revenue = ((revenue - budget) / budget) * 100
movies_filtered['budget'] = movies_filtered['budget'].astype(int)

# Only rows with a non-zero budget and a non-zero revenue are kept; the budget
# is the divisor in the formula above.
has_financials = (movies_filtered['budget'] != 0) & (movies_filtered['revenue'] != 0)
movies_revenue = movies_filtered[has_financials].copy()

profit = movies_revenue['revenue'] - movies_revenue['budget']
movies_revenue['percent_of_revenue'] = (profit / movies_revenue['budget']) * 100

# Remove values of 6000 and over, which were swaying the p-value.
below_cutoff = movies_revenue['percent_of_revenue'] < 6000
movies_revenue = movies_revenue[below_cutoff]

ax = sns.regplot(data=movies_revenue, x='vote_average', y='percent_of_revenue', ci=None)
ax.set_title("Revenue vs Rating")
display(stats.pearsonr(movies_revenue['percent_of_revenue'], movies_revenue['vote_average']))
display(ax)

Keep only the columns of interest from the detailed movie DataFrame and drop the rest.

In [None]:
# Columns kept from the detailed movie DataFrame for the release-date analysis.
movies = movies_detailed[
    [
        'budget', 'id', 'origin_country', 'original_language', 'popularity',
        'release_date', 'revenue', 'runtime', 'title', 'vote_average', 'vote_count',
    ]
].copy()  # explicit copy: the column assignments below must not target a slice of movies_detailed

movies['release_date'] = pd.to_datetime(movies['release_date'])

# Split the release date into year and month columns.
movies['release_year'] = movies['release_date'].dt.year
movies['release_month'] = movies['release_date'].dt.month

# Keep only movies with a positive revenue.
movies = movies[movies['revenue'] > 0]

display(movies)

## Number of Released Movies by Month and Mean Revenue by Release Month

In [None]:
# Per release month: total revenue, mean revenue, and the number of movies.
data = (
    movies
    .groupby(['release_month'])
    .agg(
        revenue_sum=('revenue', 'sum'),
        revenue_mean=('revenue', 'mean'),
        count=('release_month', 'count'),
    )
    .reset_index()
)
display(data)

# Two panels side by side: movie count per month (left), mean revenue per month (right).
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
sns.barplot(data=data, x='release_month', y='count', ax=axes[0])

sns.barplot(data=data, x='release_month', y='revenue_mean', color='green', ax=axes[1])


## Popular Movie Average Ratings

In [None]:
# Collect pages 1-20 of the "popular" movie listing into one list of records.
list_of_popular_movies = []

url = "https://api.themoviedb.org/3/movie/popular"
headers = {"accept": "application/json"}  # identical for every page, so built once

for page_num in range(1, 21):
    # Passing the key and page through `params` lets requests build and escape
    # the query string; `timeout` stops a stalled connection from hanging the kernel.
    response = requests.get(
        url,
        headers=headers,
        params={"api_key": api_key, "page": page_num},
        timeout=30,
    )
    # Surface HTTP failures (bad key, rate limit, ...) here rather than as a
    # confusing KeyError on 'results' further down.
    response.raise_for_status()
    json_response = response.json()

    list_of_popular_movies.extend(json_response['results'])

popular_movies = pd.DataFrame(list_of_popular_movies)
# Keep only movies with more than 10 votes.
popular_movies = popular_movies[popular_movies['vote_count'] > 10]
popular_movies

In [None]:
# Distribution of the average vote across the filtered popular movies.
sns.displot(x="vote_average", data=popular_movies)