# Data Exploration
This notebook opens and explores data from Box Office Mojo, IMDB, Rotten Tomatoes, and TheMovieDB

## Tables to Dataframes

In [1]:
import pandas as pd

In [2]:
bommoviegross_df = pd.read_csv('data/bom.movie_gross.csv.gz')
imdbnamebasics_df = pd.read_csv('data/imdb.name.basics.csv.gz')
imdbtitleakas_df = pd.read_csv('data/imdb.title.akas.csv.gz')
imdbtitlebasics_df = pd.read_csv('data/imdb.title.basics.csv.gz')
imdbtitlecrew_df = pd.read_csv('data/imdb.title.crew.csv.gz')
imdbtitleprincipals_df = pd.read_csv('data/imdb.title.principals.csv.gz')
imdbtitleratings_df = pd.read_csv('data/imdb.title.ratings.csv.gz')
rtmovieinfo_df = pd.read_csv("data/rt.movie_info.tsv.gz", delimiter = '\t')
tmdbmovies_df = pd.read_csv('data/tmdb.movies.csv.gz')
budgets_df = pd.read_csv('data/tn.movie_budgets.csv.gz')

## Table Analysis
Taking stock of the information each table contains, as well as how the tables relate to each other and what data is missing

### Box Office Mojo-Movie Gross
Note: roughly a third of the titles included are missing data for foreign earnings

In [3]:
bommoviegross_df.head()

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
3,Inception,WB,292600000.0,535700000,2010
4,Shrek Forever After,P/DW,238700000.0,513900000,2010


In [4]:
bommoviegross_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3387 entries, 0 to 3386
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           3387 non-null   object 
 1   studio          3382 non-null   object 
 2   domestic_gross  3359 non-null   float64
 3   foreign_gross   2037 non-null   object 
 4   year            3387 non-null   int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 132.4+ KB


In [5]:
bommoviegross_df.isnull().sum()/len(bommoviegross_df)

title             0.000000
studio            0.001476
domestic_gross    0.008267
foreign_gross     0.398583
year              0.000000
dtype: float64

### IMDB-Name Basics
Note: Large percentage of birth and death dates unknown.
Also, primary profession and known_for_titles may have to be separated into lists to be useful.
Alternatively, maybe insight can come from organizing table by movie.

In [None]:
imdbnamebasics_df.head()

In [None]:
imdbnamebasics_df.info()

In [None]:
imdbnamebasics_df.isna().sum()/len(imdbnamebasics_df)

### IMDB-Title AKAs
Note: majority of region and attribute columns are missing data. Types missing roughly 50%

In [None]:
imdbtitleakas_df.head()

In [None]:
imdbtitleakas_df.info()

In [None]:
imdbtitleakas_df.isna().sum()/len(imdbtitleakas_df)

### IMDB-Title Basics
Note: May be useful to separate genre list.

In [6]:
imdbtitlebasics_df.head()

Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama"
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019,114.0,"Biography,Drama"
2,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122.0,Drama
3,tt0069204,Sabse Bada Sukh,Sabse Bada Sukh,2018,,"Comedy,Drama"
4,tt0100275,The Wandering Soap Opera,La Telenovela Errante,2017,80.0,"Comedy,Drama,Fantasy"


In [7]:
imdbtitlebasics_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146144 entries, 0 to 146143
Data columns (total 6 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   tconst           146144 non-null  object 
 1   primary_title    146144 non-null  object 
 2   original_title   146123 non-null  object 
 3   start_year       146144 non-null  int64  
 4   runtime_minutes  114405 non-null  float64
 5   genres           140736 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 6.7+ MB


In [8]:
imdbtitlebasics_df.isna().sum()/len(imdbtitlebasics_df)

tconst             0.000000
primary_title      0.000000
original_title     0.000144
start_year         0.000000
runtime_minutes    0.217176
genres             0.037005
dtype: float64

### IMDB-Title Crew

In [None]:
imdbtitlecrew_df.head()

In [None]:
imdbtitlecrew_df.info()

In [None]:
imdbtitlecrew_df.isna().sum()/len(imdbtitlecrew_df)

### IMDB-Title Principals
83% of job column missing
62% characters column missing

In [None]:
imdbtitleprincipals_df.head()

In [None]:
imdbtitleprincipals_df.info()

In [None]:
imdbtitleprincipals_df.isna().sum()/len(imdbtitleprincipals_df)

### IMDB-Title Ratings
COMPLETE DATA SET

In [9]:
imdbtitleratings_df.head()

Unnamed: 0,tconst,averagerating,numvotes
0,tt10356526,8.3,31
1,tt10384606,8.9,559
2,tt1042974,6.4,20
3,tt1043726,4.2,50352
4,tt1060240,6.5,21


In [10]:
imdbtitleratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73856 entries, 0 to 73855
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   tconst         73856 non-null  object 
 1   averagerating  73856 non-null  float64
 2   numvotes       73856 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 1.7+ MB


In [11]:
imdbtitleratings_df.isna().sum()/len(imdbtitleratings_df)

tconst           0.0
averagerating    0.0
numvotes         0.0
dtype: float64

### Rotten Tomatoes-Movie Info

In [None]:
rtmovieinfo_df.head()

In [None]:
rtmovieinfo_df.info()

In [None]:
rtmovieinfo_df.isna().sum()/len(rtmovieinfo_df)

### The Movie DB
COMPLETE DATA SET

In [12]:
tmdbmovies_df.head()

Unnamed: 0.1,Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
1,1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610
2,2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368
3,3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174
4,4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186


In [13]:
tmdbmovies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26517 entries, 0 to 26516
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         26517 non-null  int64  
 1   genre_ids          26517 non-null  object 
 2   id                 26517 non-null  int64  
 3   original_language  26517 non-null  object 
 4   original_title     26517 non-null  object 
 5   popularity         26517 non-null  float64
 6   release_date       26517 non-null  object 
 7   title              26517 non-null  object 
 8   vote_average       26517 non-null  float64
 9   vote_count         26517 non-null  int64  
dtypes: float64(2), int64(3), object(5)
memory usage: 2.0+ MB


In [None]:
tmdbmovies_df.isna().sum()/len(tmdbmovies_df)

### Budgets
COMPLETE DATA SET

In [14]:
budgets_df.head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"


In [None]:
budgets_df.info()

In [None]:
budgets_df.isna().sum()/len(budgets_df)