# Project 3

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory=False)
ratings = pd.read_csv("Data/title_ratings.csv.gz", low_memory=False)
akas = pd.read_csv("Data/title_akas.csv.gz", low_memory=False)

basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020.0,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
3,tt0079644,movie,November 1828,November 1828,0,2001.0,,140,"Drama,War"
4,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"


In [3]:
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1910
1,tt0000002,5.8,256
2,tt0000003,6.5,1714
3,tt0000004,5.6,169
4,tt0000005,6.2,2528


In [4]:
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle


## Preprocessing

### Replace "\N" with np.nan

In [5]:
basics['runtimeMinutes'] = basics['runtimeMinutes'].replace({'\\N':np.nan})

### Eliminate movies that are null for runtimeMinutes and Eliminate movies that are null for genre

In [6]:
basics['runtimeMinutes'] = basics['runtimeMinutes'].dropna()
basics['genres'] = basics['genres'].dropna()

In [7]:
basics.isna().sum()

tconst                 0
titleType              0
primaryTitle           0
originalTitle          0
isAdult                0
startYear              0
endYear           136283
runtimeMinutes         0
genres                 0
dtype: int64

### keep only titleType==Movie

In [8]:
basics['titleType'] = basics['titleType'].replace({'\\N':np.nan})
basics['titleType'] = basics.drop(basics.loc[basics['titleType']!="movie"].index, inplace=True)
basics['titleType'].value_counts()

Series([], Name: titleType, dtype: int64)

### keep startYear 2000-2022

In [9]:
basics['startYear'].describe()

count    136283.000000
mean       2012.738038
std           5.737555
min        2000.000000
25%        2009.000000
50%        2014.000000
75%        2018.000000
max        2021.000000
Name: startYear, dtype: float64

In [10]:
basics['startYear'] = basics['startYear'].replace({'\\N':np.nan})
basics['startYear'].describe()
basics['startYear'] = basics['startYear'].dropna()
basics['startYear'] = basics['startYear'].astype("float")

#### There's a problem here, I don't know what

In [11]:
basics['startYear'] = basics.drop(basics.loc[(basics['startYear'] < 2022) & (basics['startYear'] > 1999)].index, inplace=True)
basics['startYear'].describe()

count       0
unique      0
top       NaN
freq      NaN
Name: startYear, dtype: object

In [12]:
basics.dtypes

tconst             object
titleType          object
primaryTitle       object
originalTitle      object
isAdult             int64
startYear          object
endYear           float64
runtimeMinutes      int64
genres             object
dtype: object

### Eliminate movies that include "Documentary" in genre (see tip below)

In [13]:
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]

### Keep only US movies

In [14]:
akas['region'] = akas['region'].replace({'\\N':np.nan})
akas['region'] = akas.drop(akas.loc[akas['region']!="us"].index, inplace=True)
akas['region'].value_counts()

Series([], Name: region, dtype: int64)

In [15]:
keepers =basics['tconst'].isin(akas['titleId'])
keepers

Series([], Name: tconst, dtype: bool)

In [16]:
basics = basics[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres


## Saving Files
### Creating a "Data" folder.

In [17]:
import os
os.makedirs('Data/',exist_ok=True) 
os.listdir("Data/")

['.ipynb_checkpoints',
 'final_tmdb_data_2000.csv.gz',
 'final_tmdb_data_2001.csv.gz',
 'title_akas.csv.gz',
 'title_basics.csv.gz',
 'title_ratings.csv.gz',
 'tmdb_api_results_2000.json',
 'tmdb_api_results_2001.json']

### Saving Compressed Files

In [18]:
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

In [19]:
ratings.to_csv("Data/title_ratings.csv.gz",compression='gzip',index=False)

In [20]:
akas.to_csv("Data/title_akas.csv.gz",compression='gzip',index=False)

## API Work

In [3]:
import json
with open('/Users/Test/.secret/tmdb_api.json', 'r') as f:
    login = json.load(f)
login.keys()

dict_keys(['api-key'])

In [4]:
import tmdbsimple as tmdb
tmdb.API_KEY =  login['api-key']
tmdb.API_KEY

'bdf430059a47190d8a36935999c0ad7a'

In [5]:
movie = tmdb.Movies(641)
info = movie.info()
info

{'adult': False,
 'backdrop_path': '/g1U311jgL4DJbGgvIZgiCfHp44Q.jpg',
 'belongs_to_collection': None,
 'budget': 4500000,
 'genres': [{'id': 80, 'name': 'Crime'}, {'id': 18, 'name': 'Drama'}],
 'homepage': 'http://www.requiemforadream.com/',
 'id': 641,
 'imdb_id': 'tt0180093',
 'original_language': 'en',
 'original_title': 'Requiem for a Dream',
 'overview': 'The hopes and dreams of four ambitious people are shattered when their drug addictions begin spiraling out of control. A look into addiction and how it overcomes the mind and body.',
 'popularity': 16.382,
 'poster_path': '/nOd6vjEmzCT0k4VYqsA2hwyi87C.jpg',
 'production_companies': [{'id': 380,
   'logo_path': None,
   'name': 'Thousand Words',
   'origin_country': 'US'},
  {'id': 2188,
   'logo_path': None,
   'name': 'Artisan Entertainment',
   'origin_country': 'US'},
  {'id': 7503,
   'logo_path': '/3K8wbNkTn7O4wX89ucnp1ZYR1XF.png',
   'name': 'Protozoa Pictures',
   'origin_country': 'US'}],
 'production_countries': [{'iso_

In [6]:
movie_good_time = tmdb.Movies('tt4846232')
info_gt = movie_good_time.info()
info_gt['tagline']

'Are you ready for a Good Time?'

Test is complete

## Create JSON

In [7]:
import os

In [8]:
JSON_FILE = "/Data/tmdb.json"
JSON_FILE

'/Data/tmdb.json'

In [9]:
file_exists = os.path.isfile(JSON_FILE)
if file_exists == False:    
    folder = os.path.dirname(JSON_FILE)
    if len(folder)>0:
        os.makedirs(folder,exist_ok=True)
        
        
    print(f"[i] {JSON_FILE} not found. Saving empty list to file.")
    
    
    with open(JSON_FILE,'w') as f:
        json.dump([],f)  
else:
    print(f"[i] {JSON_FILE} already exists.")

[i] /Data/tmdb.json already exists.


In [10]:
with open(JSON_FILE,'r') as f:
    previous_results = json.load(f)
    
n_results = len(previous_results)
print(f'- {n_results} previous results found.')

- 0 previous results found.


### Creating function for ratings

In [11]:
def get_movie_with_rating(movie_id):
    # From https://login.codingdojo.com/m/376/12529/88081
    movie = tmdb.Movies(movie_id)
    
    info = movie.info()
    
    releases = movie.releases()
    
    for c in releases['countries']:
        if c['iso_3166_1'] == 'US':
            info['certification'] = c['certification']
    return info

#### Test

In [12]:
test = get_movie_with_rating("tt0848228") #Avengers test
test

{'adult': False,
 'backdrop_path': '/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg',
 'belongs_to_collection': {'id': 86311,
  'name': 'The Avengers Collection',
  'poster_path': '/yFSIUVTCvgYrpalUktulvk3Gi5Y.jpg',
  'backdrop_path': '/zuW6fOiusv4X9nnW3paHGfXcSll.jpg'},
 'budget': 220000000,
 'genres': [{'id': 878, 'name': 'Science Fiction'},
  {'id': 28, 'name': 'Action'},
  {'id': 12, 'name': 'Adventure'}],
 'homepage': 'https://www.marvel.com/movies/the-avengers',
 'id': 24428,
 'imdb_id': 'tt0848228',
 'original_language': 'en',
 'original_title': 'The Avengers',
 'overview': 'When an unexpected enemy emerges and threatens global safety and security, Nick Fury, director of the international peacekeeping agency known as S.H.I.E.L.D., finds himself in need of a team to pull the world back from the brink of disaster. Spanning the globe, a daring recruitment effort begins!',
 'popularity': 268.601,
 'poster_path': '/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg',
 'production_companies': [{'id': 420,
   'logo_path

#### Try and Except loop

In [13]:
test_ids = ["tt0848228", "tt0115937","tt0848228","tt0332280"]
results = []
errors = []
for movie_id in test_ids:
    
    try:
        movie_info = get_movie_with_rating(movie_id)
        results.append(movie_info)
        
    except Exception as e: 
        errors.append([movie_id, e])
    
pd.DataFrame(results)

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,False,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,"{'id': 86311, 'name': 'The Avengers Collection...",220000000,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",https://www.marvel.com/movies/the-avengers,24428,tt0848228,en,The Avengers,...,1518815515,143,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Some assembly required.,The Avengers,False,7.705,27536,PG-13
1,False,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,"{'id': 86311, 'name': 'The Avengers Collection...",220000000,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",https://www.marvel.com/movies/the-avengers,24428,tt0848228,en,The Avengers,...,1518815515,143,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Some assembly required.,The Avengers,False,7.705,27536,PG-13
2,False,/qom1SZSENdmHFNZBXbtJAU0WTlC.jpg,,29000000,"[{'id': 10749, 'name': 'Romance'}, {'id': 18, ...",http://www.newline.com/properties/notebookthe....,11036,tt0332280,en,The Notebook,...,115603229,123,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Behind every great love is a great story.,The Notebook,False,7.88,9826,PG-13


In [14]:
print(f"- Number of errors: {len(errors)}")
errors

- Number of errors: 1


[['tt0115937',
  requests.exceptions.HTTPError('404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/tt0115937?api_key=bdf430059a47190d8a36935999c0ad7a')]]

In [15]:
import time
FOLDER = "Data/"
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)

['.ipynb_checkpoints',
 'final_tmdb_data_2000.csv.gz',
 'final_tmdb_data_2001.csv.gz',
 'title_akas.csv.gz',
 'title_basics.csv.gz',
 'title_ratings.csv.gz',
 'tmdb_api_results_2000.json',
 'tmdb_api_results_2001.json']

In [16]:
def write_json(new_data, filename):   # From https://login.codingdojo.com/m/376/12529/88082
    
    with open(filename,'r+') as file:
        # First we load existing data into a dict.
        file_data = json.load(file)
        ## Choose extend or append
        if (type(new_data) == list) & (type(file_data) == list):
            file_data.extend(new_data)
        else:
             file_data.append(new_data)
        # Sets file's current position at offset.
        file.seek(0)
        # convert back to json.
        json.dump(file_data, file)


### Load in the Title Basics data

In [17]:
basics = pd.read_csv('Data/title_basics.csv.gz')
YEARS_TO_GET = [2000,2001]
errors = [ ]

#### Progress bar

In [18]:
from tqdm.notebook import tqdm_notebook
for YEAR in tqdm_notebook(YEARS_TO_GET, desc='YEARS', position=0):
    time.sleep(.2)

    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'
    file_exists = os.path.isfile(JSON_FILE)
    if file_exists == False:
        with open(JSON_FILE,'w') as f:
            json.dump([{'imdb_id':0}],f)

    df = basics.loc[ basics['startYear']==YEAR].copy()
    movie_ids = df['tconst'].copy()

    previous_df = pd.read_json(JSON_FILE)

    movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]

    ### Inner Loop

    for movie_id in tqdm_notebook(movie_ids_to_get,
                          desc=f'Movies from {YEAR}',
                          position=1,
                          leave=True):
        try:
            temp = get_movie_with_rating(movie_id)  
            write_json(temp,JSON_FILE)
            time.sleep(0.02)
        except Exception as e:
            errors.append([movie_id, e])

    final_year_df = pd.read_json(JSON_FILE)
    final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz", compression="gzip", index=False)

YEARS:   0%|          | 0/2 [00:00<?, ?it/s]

Movies from 2000:   0%|          | 0/524 [00:00<?, ?it/s]

Movies from 2001:   0%|          | 0/2116 [00:00<?, ?it/s]

In [19]:
print(f"- Total errors: {len(errors)}")

- Total errors: 1108


In [40]:
table1 = pd.read_csv("Data/final_tmdb_data_2000.csv.gz", low_memory=False)
table1.head()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0113026,0.0,/vMFs7nw6P0bIV1jDsQpxAieAVnH.jpg,,10000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,62127.0,en,The Fantasticks,...,0.0,86.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Try to remember the first time magic happened,The Fantasticks,0.0,5.5,22.0,
2,tt0113092,0.0,,,0.0,"[{'id': 878, 'name': 'Science Fiction'}]",,110977.0,en,For the Cause,...,0.0,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The ultimate showdown on a forbidden planet.,For the Cause,0.0,5.1,8.0,
3,tt0116391,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,442869.0,hi,Gang,...,0.0,152.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,,Gang,0.0,0.0,0.0,
4,tt0116748,0.0,/wr0hTHwkYIRC82MwNbhOvqrw27N.jpg,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,579396.0,hi,Karobaar,...,0.0,180.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,The Business of Love,Karobaar,0.0,5.5,2.0,


In [41]:
table2 = pd.read_csv("Data/final_tmdb_data_2001.csv.gz", low_memory=False)
table2.head()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0035423,0.0,/ab5yL8zgRotrICzGbEl10z24N71.jpg,,48000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 14, 'nam...",,11232.0,en,Kate & Leopold,...,76019048.0,118.0,"[{'english_name': 'Italian', 'iso_639_1': 'it'...",Released,If they lived in the same century they'd be pe...,Kate & Leopold,0.0,6.32,1142.0,PG-13
2,tt0079644,0.0,/79axmuH1UGkB7m72jjB9rPff9om.jpg,,0.0,"[{'id': 10752, 'name': 'War'}]",,285529.0,id,November 1828,...,0.0,140.0,"[{'english_name': 'Indonesian', 'iso_639_1': '...",Released,,November 1828,0.0,0.0,0.0,
3,tt0089067,0.0,,,0.0,"[{'id': 35, 'name': 'Comedy'}]",,210258.0,es,El día de los albañiles 2,...,0.0,90.0,"[{'english_name': 'Spanish', 'iso_639_1': 'es'...",Released,The laborers are back full of love and laughs.,El día de los albañiles 2,0.0,7.2,71.0,
4,tt0114447,0.0,,,0.0,"[{'id': 53, 'name': 'Thriller'}, {'id': 28, 'n...",,151007.0,en,The Silent Force,...,0.0,90.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,They left him for dead... They should have fin...,The Silent Force,0.0,5.0,3.0,


#### Concatenate the data into 1 dataframe for the remainder of the analysis.

In [126]:
conc = pd.concat([table1, table2])
conc

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0113026,0.0,/vMFs7nw6P0bIV1jDsQpxAieAVnH.jpg,,10000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,62127.0,en,The Fantasticks,...,0.0,86.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Try to remember the first time magic happened,The Fantasticks,0.0,5.5,22.0,
2,tt0113092,0.0,,,0.0,"[{'id': 878, 'name': 'Science Fiction'}]",,110977.0,en,For the Cause,...,0.0,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The ultimate showdown on a forbidden planet.,For the Cause,0.0,5.1,8.0,
3,tt0116391,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,442869.0,hi,Gang,...,0.0,152.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,,Gang,0.0,0.0,0.0,
4,tt0116748,0.0,/wr0hTHwkYIRC82MwNbhOvqrw27N.jpg,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,579396.0,hi,Karobaar,...,0.0,180.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,The Business of Love,Karobaar,0.0,5.5,2.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2253,tt8795764,0.0,,,0.0,"[{'id': 27, 'name': 'Horror'}]",https://www.utahwolf.com/films/coming-soon-new...,871624.0,en,New Breed,...,0.0,57.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,New Breed,0.0,0.0,0.0,NR
2254,tt8929248,0.0,,,0.0,"[{'id': 10751, 'name': 'Family'}, {'id': 18, '...",,78417.0,ta,அழகான நாட்கள்,...,0.0,150.0,"[{'english_name': 'Tamil', 'iso_639_1': 'ta', ...",Released,,Azhagana Naatkal,0.0,0.0,0.0,
2255,tt9071078,0.0,,,0.0,"[{'id': 28, 'name': 'Action'}]",http://www.hkcinemagic.com/en/movie.asp?id=6627,201706.0,cn,致命密函,...,0.0,90.0,"[{'english_name': 'Cantonese', 'iso_639_1': 'c...",Released,,Chinese Heroes,0.0,3.0,2.0,
2256,tt9099724,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}]",,616033.0,ja,Rokushukan Private Moment,...,0.0,102.0,"[{'english_name': 'Japanese', 'iso_639_1': 'ja...",Released,,Rokushukan Private Moment,0.0,0.0,0.0,


### Once you have your data from the API, they would like you to perform some light EDA to show:

Please exclude any movies with 0's for budget AND revenue from the remaining visualizations.

How many movies are there in each of the certification categories (G/PG/PG-13/R)?

What is the average revenue per certification category?

What is the average budget per certification category?

#### How many movies had at least some valid financial information (values > 0 for budget OR revenue)?

In [104]:
conc['budget'].describe()

count    4.448000e+03
mean     3.027768e+06
std      1.318161e+07
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.400000e+08
Name: budget, dtype: float64

In [92]:
conc.isna().sum()

imdb_id                     0
adult                       2
backdrop_path            2518
belongs_to_collection    4202
budget                      2
genres                      2
homepage                 4219
id                          2
original_language           2
original_title              2
overview                  357
popularity                  2
poster_path               705
production_companies        2
production_countries        2
release_date               41
revenue                     2
runtime                     2
spoken_languages            2
status                      2
tagline                  3215
title                       2
video                       2
vote_average                2
vote_count                  2
certification            3630
dtype: int64

In [105]:
conc.dtypes

imdb_id                   object
adult                    float64
backdrop_path             object
belongs_to_collection     object
budget                   float64
genres                    object
homepage                  object
id                       float64
original_language         object
original_title            object
overview                  object
popularity               float64
poster_path               object
production_companies      object
production_countries      object
release_date              object
revenue                  float64
runtime                  float64
spoken_languages          object
status                    object
tagline                   object
title                     object
video                    float64
vote_average             float64
vote_count               float64
certification             object
dtype: object

In [106]:
print(f'There are {4.448000e+03} values for budget counting NaN and 0')

There are 4448.0 values for budget counting NaN and 0


In [107]:
conc['budget'].describe()

count    4.448000e+03
mean     3.027768e+06
std      1.318161e+07
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.400000e+08
Name: budget, dtype: float64

In [127]:
conc.duplicated().sum()

1

In [128]:
conc = conc.drop_duplicates()

In [129]:
conc['budget'].fillna(0, inplace=True)
conc['budget'].describe()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  conc['budget'].fillna(0, inplace=True)


count    4.449000e+03
mean     3.027088e+06
std      1.318021e+07
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.400000e+08
Name: budget, dtype: float64

In [130]:
conc['revenue'].fillna(0, inplace=True)
conc['revenue'].describe()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  conc['revenue'].fillna(0, inplace=True)


count    4.449000e+03
mean     6.414795e+06
std      4.043338e+07
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      9.764756e+08
Name: revenue, dtype: float64

In [112]:
conc.isna().sum()

imdb_id                     0
adult                       1
backdrop_path            2517
belongs_to_collection    4201
budget                      0
genres                      1
homepage                 4218
id                          1
original_language           1
original_title              1
overview                  356
popularity                  1
poster_path               704
production_companies        1
production_countries        1
release_date               40
revenue                     0
runtime                     1
spoken_languages            1
status                      1
tagline                  3214
title                       1
video                       1
vote_average                1
vote_count                  1
certification            3629
dtype: int64

There are 3847 movies without budgets listed and 3961 without revenue.

In [134]:
conc.drop(conc.loc[(conc['budget']== 0) & (conc['revenue']== 0)].index, inplace=True)
conc['budget'].describe()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  conc.drop(conc.loc[(conc['budget']== 0) & (conc['revenue']== 0)].index, inplace=True)


count    1.710000e+02
mean     1.945531e+07
std      2.545140e+07
min      0.000000e+00
25%      1.000000e+06
50%      1.000000e+07
75%      2.800000e+07
max      1.020000e+08
Name: budget, dtype: float64

In [135]:
conc['revenue'].describe()

count    1.710000e+02
mean     3.922740e+07
std      8.919252e+07
min      0.000000e+00
25%      0.000000e+00
50%      4.186931e+06
75%      3.823886e+07
max      8.713684e+08
Name: revenue, dtype: float64

In [137]:
conc.shape

(171, 26)

There are 171 movies with either budget or revenue info.

#### How many movies are there in each of the certification categories (G/PG/PG-13/R)?

In [140]:
conc['certification'].value_counts()

R        74
PG-13    29
PG       11
NR        2
G         1
Name: certification, dtype: int64

#### What is the average revenue per certification category?

In [141]:
conc['revenue'].mean()

39227395.25146199

#### What is the average budget per certification category?

In [143]:
conc.groupby('certification')['budget'].mean()

certification
G        3.700000e+07
NR       7.500000e+05
PG       3.729091e+07
PG-13    4.281034e+07
R        1.960811e+07
Name: budget, dtype: float64