# Project 3, Part 4 - Hypothesis Testing

## Author:  Sheneka Allen



In [35]:
# For part 4 of the project, you will be using your MySQL database from part 3 
# to answer meaningful questions for your stakeholder. They want you to use 
# your hypothesis testing and statistics knowledge to answer 3 questions about 
# what makes a successful movie.

# Questions to Answer

# The stakeholder's first question is: does the MPAA rating of a movie (G/PG/PG-13/R) 
# affect how much revenue the movie generates?

# They want you to perform a statistical test to get a mathematically-supported answer.
# They want you to report if you found a significant difference between ratings.
# If so, what was the p-value of your analysis?
# And which rating earns the most revenue?
# They want you to prepare a visualization that supports your finding.
# Think of 2 additional hypotheses to test that your stakeholder may want to know.

# Examples:

# Do movies that are over 2.5 hours long earn more revenue than movies that 
# are 1.5 hours long (or less)?

# Do movies released in 2020 earn less revenue than movies released in 2018?
# How do the years compare for movie ratings?
# Do some movie genres earn more revenue than others?
# Are some genres higher rated than others? etc.


# Deliverables

# You should use the same project repository you have been using for Parts 1-3 (for your portfolio).
# Create a new notebook in your project repository just for the hypothesis testing 
# (like "Part 4 - Hypothesis Testing.ipynb")
# Make sure the results and visualization for all 3 hypotheses are in your notebook.

In [36]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os, json, math, time
import tmdbsimple as tmdb
from tqdm.notebook import tqdm_notebook

import pymysql
pymysql.install_as_MySQLdb()

from urllib.parse import quote_plus as urlquote # must have for special char pwd
from sqlalchemy import create_engine
from sqlalchemy_utils import create_database, database_exists

In [37]:
# Load the cleaned title-basics table (gzip-compressed CSV) into `basics`,
# then print its column dtypes and non-null counts as a quick sanity check.
basics = pd.read_csv('Data/title_basics_cleaned.csv.gz')
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205530 entries, 0 to 205529
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          205530 non-null  object
 1   titleType       205530 non-null  object
 2   primaryTitle    205530 non-null  object
 3   originalTitle   205530 non-null  object
 4   isAdult         205530 non-null  int64 
 5   startYear       205530 non-null  int64 
 6   endYear         205530 non-null  int64 
 7   runtimeMinutes  205530 non-null  int64 
 8   genres          205530 non-null  object
dtypes: int64(4), object(5)
memory usage: 14.1+ MB


In [38]:
# Preview the first rows to confirm the columns loaded as expected
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0011801,movie,Tötet nicht mehr,Tötet nicht mehr,0,2019,0,0,"Action,Crime"
1,tt0015414,movie,La tierra de los toros,La tierra de los toros,0,2000,0,60,Missing
2,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,0,118,"Comedy,Fantasy,Romance"
3,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,0,70,Drama
4,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,0,122,Drama


In [39]:
# List the distinct release years in the table; the collection loop below
# filters `basics` on `startYear`, so only these years can return any movies.
basics.startYear.unique()

array([2019, 2000, 2001, 2020, 2018, 2005, 2004, 2016, 2002, 2017, 2006,
       2008, 2009, 2003, 2007, 2010, 2012, 2011, 2015, 2021, 2013, 2014])

In [40]:
# Load my TMDb login credentials from the current user's home directory.
# `expanduser` resolves "~" at run time, so the notebook is not tied to one
# machine's absolute path (it resolves to the same file on the author's Mac).
with open(os.path.expanduser('~/.secret/tmdb_api.json'), 'r') as f:
    login = json.load(f)

## Display only the keys of the loaded dict -- never the values, which are secrets
login.keys()

dict_keys(['API Key', 'Authorization'])

In [41]:
# Set tmdbsimple's module-level API_KEY to the "API Key" (TMDb v3 auth) entry
# from the credentials JSON loaded above
tmdb.API_KEY =  login['API Key']

## Setup to use TMDB API

Define the helper functions, specify the movie years to extract, and set the folder where results are saved.

### Defined Function: get_movie_with_rating

In [42]:
# function that 1) accepts the movie_id as an argument and
# 2) returns a dictionary of results that includes certification
def get_movie_with_rating(movie_id):
    """Fetch a movie's TMDB details and attach its US certification.

    Parameters
    ----------
    movie_id : str or int
        Identifier handed straight to ``tmdb.Movies``. The collection loop in
        this notebook passes IMDb ``tconst`` values.

    Returns
    -------
    dict
        The dictionary returned by ``movie.info()`` with a ``'certification'``
        key added. The value is the certification of the US entry in the
        movie's release list (the last US entry wins if there are several),
        or ``None`` when the release list has no US entry.

    Notes
    -----
    Any exception raised by the ``tmdb`` calls propagates to the caller.
    """
    movie = tmdb.Movies(movie_id)
    movie_info = movie.info()
    releases = movie.releases()

    # Guarantee the key exists so every saved record has the same schema,
    # even for movies that were never released in the US.
    movie_info.setdefault('certification', None)

    # Tolerate a response with no (or an empty) 'countries' list.
    for country in releases.get('countries') or []:
        if country.get('iso_3166_1') == 'US' and 'certification' in country:
            movie_info['certification'] = country['certification']
    return movie_info

### Defined Function: write_json

In [43]:
def write_json(new_data, filename):
    """Add ``new_data`` to the JSON list stored in ``filename``.

    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/

    Parameters
    ----------
    new_data : list or any JSON-serializable object
        When both ``new_data`` and the stored value are lists, the stored
        list is extended with its items; otherwise ``new_data`` is appended
        as a single element.
    filename : str
        Path to an existing JSON file whose top-level value is a list.

    Raises
    ------
    FileNotFoundError
        If ``filename`` does not exist (it is opened in ``'r+'`` mode).
    AttributeError
        If the stored top-level value has no ``append`` method (e.g. a dict).
    TypeError
        If the combined data cannot be serialized to JSON. The file is left
        unmodified in that case.
    """
    with open(filename, 'r+') as file:
        # Load the existing records.
        file_data = json.load(file)

        ## Choose extend or append
        if isinstance(new_data, list) and isinstance(file_data, list):
            file_data.extend(new_data)
        else:
            file_data.append(new_data)

        # Serialize BEFORE touching the file: json.dump streams chunks into
        # the file, so a serialization error halfway through would leave a
        # half-overwritten, unparseable results file.
        payload = json.dumps(file_data)

        # Rewind, overwrite, and drop any leftover bytes past the new end.
        file.seek(0)
        file.write(payload)
        file.truncate()

In [44]:
# Release years to pull from the TMDB API; the collection loop below runs
# once for each year listed here
YEARS_TO_GET = [2009,2010]

In [45]:
# Folder where the per-year API results (.json) and final tables (.csv.gz) are saved
FOLDER = "Data/"
# Create the folder if it is missing so the listing and later writes cannot fail
os.makedirs(FOLDER, exist_ok=True)
# List the current files in Data/ (sorted so the displayed output is reproducible)
sorted(os.listdir(FOLDER))

['tmdb_api_results_2010.json',
 'title_basics_cleaned.csv.gz',
 'title.akas.tsv.gz',
 '.DS_Store',
 'title_ratings_cleaned.csv.gz',
 'title.akas.tsv',
 'tmdb_api_results_2000.json',
 'final_tmdb_data_2000.csv.gz',
 'title.basics.tsv.gz',
 'tmdb_api_results_2001.json',
 'final_tmdb_data_2010.csv.gz',
 'title.ratings.tsv.gz',
 'final_tmdb_data_2019.csv.gz',
 'final_tmdb_data_2009.csv.gz',
 'tmdb_api_results_2019.json',
 'final_tmdb_data_2001.csv.gz',
 '.ipynb_checkpoints',
 'tmdb_results_combined.csv.gz',
 'tmdb_api_results_2009.json',
 'title_akas_cleaned.csv.gz']

### Test data extraction for 2009 & 2010 Year Movie Releases

### OUTER Loop to collect data by YEAR

Checks whether the year's JSON file exists; if not, creates it with a single placeholder record (`[{"imdb_id": 0}]`) so that it can be loaded and appended to.

Identifies the designated FOLDER (Data/) and names the file based on the current year. Saves data in separate .csv.gz files BY YEAR.

### INNER Loop to fetch and save the TMDB record for each movie ID

In [46]:
# Start of OUTER loop: one pass per release year in YEARS_TO_GET
for YEAR in tqdm_notebook(YEARS_TO_GET, desc='YEARS', position=0):

    # JSON file that accumulates the raw API results for this year
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'
    # Check if file exists
    file_exists = os.path.isfile(JSON_FILE)
    if not file_exists:
        print('The year', YEAR, 'file does not exist.  Creating empty file.')
        # Seed the file with a one-record list holding just "imdb_id", so it
        # is valid JSON and the 'imdb_id' lookup below works on a new file.
        with open(JSON_FILE, 'w') as f:
            json.dump([{'imdb_id': 0}], f)
    else:
        print('The year', YEAR, 'file already exists.')

    # Rows of `basics` released in the current year
    df = basics.loc[basics['startYear'] == YEAR].copy()
    # IMDb ids for this year (kept as a Series so .isin can be used below)
    movie_ids = df['tconst'].copy()

    # Load the results already saved for this year
    previous_df = pd.read_json(JSON_FILE)
    # Skip ids that are already in JSON_FILE so an interrupted run can resume
    movie_id_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]

    # INNER loop. Uses two functions defined earlier in the notebook:
    #   1) get_movie_with_rating -- adds the certification to the .info results
    #   2) write_json            -- extends/appends the results to the .json file
    failed_ids = []
    for movie_id in tqdm_notebook(movie_id_to_get,
                                  desc=f'Movies from {YEAR}',
                                  position=1,
                                  leave=True):
        try:
            temp = get_movie_with_rating(movie_id)
            write_json(temp, JSON_FILE)
            # Short 20 ms sleep to prevent overwhelming server
            time.sleep(0.02)
        except Exception as e:
            # Best effort: skip this movie but remember why, rather than
            # discarding the error. Nothing is written for a failed id, so
            # it is requested again the next time this cell runs.
            failed_ids.append((movie_id, repr(e)))
            continue

    # Surface skipped movies so silent data loss is visible to the reader
    if failed_ids:
        print(f'{len(failed_ids)} of {len(movie_id_to_get)} movies from {YEAR} '
              'could not be retrieved and will be retried on the next run.')
        first_id, first_error = failed_ids[0]
        print('First failure:', first_id, '->', first_error)

    ## Save everything collected for the year as csv.gz
    # NOTE(review): if the JSON file was seeded by this loop, the placeholder
    # record (imdb_id == 0) is still in it and is written to the CSV as well.
    final_year_df = pd.read_json(JSON_FILE)
    final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz", compression="gzip", index=False)

YEARS:   0%|          | 0/2 [00:00<?, ?it/s]

The year 2009 file already exists.


Movies from 2009:   0%|          | 0/3933 [00:00<?, ?it/s]

KeyboardInterrupt: 

## Does the MPAA rating of a movie (G/PG/PG-13/R) affect how much revenue the movie generates?

In [None]:
#tmdb_data_2010 = pd.read_csv('Data/final_tmdb_data_2010.csv.gz')
#tmdb_data_2010.info()
#tmdb_data_2010.head()