# __Data Cleaning and Preparing Tables for Merge__


#### First we need to import the libraries and data sets.

In [83]:
import pandas as pd
# The plotting API lives in the pyplot submodule; aliasing the top-level
# matplotlib package as plt would make calls such as plt.subplots() fail.
import matplotlib.pyplot as plt

# List the compressed source files available in the zippedData directory.
!ls zippedData

bom.movie_gross.csv.gz
imdb.name.basics.csv.gz
imdb.title.akas.csv.gz
imdb.title.basics.csv.gz
imdb.title.crew.csv.gz
imdb.title.principals.csv.gz
imdb.title.ratings.csv.gz
rt.movie_info.tsv.gz
rt.reviews.tsv.gz
tmdb.movies.csv.gz
tn.movie_budgets.csv.gz


In [84]:
# Read the bom.movie_gross and tn.movie_budgets files into DataFrames.
bom, num = (
    pd.read_csv(csv_path)
    for csv_path in (
        'zippedData/bom.movie_gross.csv.gz',
        'zippedData/tn.movie_budgets.csv.gz',
    )
)

#### My team and I have decided to only focus on 'The Numbers' for financial data. <br>
- I will make a copy of the data to work with.

In [85]:
# Work on an independent (deep) copy so the loaded frame `num` stays untouched.
numcopy = num.copy(deep=True)

In [86]:
# Preview the first ten rows of the raw financial data.
numcopy.head(n=10)

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"
5,6,"Dec 18, 2015",Star Wars Ep. VII: The Force Awakens,"$306,000,000","$936,662,225","$2,053,311,220"
6,7,"Apr 27, 2018",Avengers: Infinity War,"$300,000,000","$678,815,482","$2,048,134,200"
7,8,"May 24, 2007",Pirates of the Caribbean: At Worldâs End,"$300,000,000","$309,420,425","$963,420,425"
8,9,"Nov 17, 2017",Justice League,"$300,000,000","$229,024,295","$655,945,209"
9,10,"Nov 6, 2015",Spectre,"$300,000,000","$200,074,175","$879,620,923"


### Preparing financial data for transition from object data to numeric data.

- First we have to strip all of the dollar signs.

- Then we have to remove the commas.

In [87]:
# Strip the '$' character from both ends of every value in the money columns.

for money_col in ('worldwide_gross', 'domestic_gross', 'production_budget'):
    numcopy[money_col] = numcopy[money_col].str.strip('$')

In [88]:
# Remove the thousands separators from the financial columns.
# The vectorized .str accessor passes missing values through as NaN, whereas
# calling str.replace inside .apply raises AttributeError on a non-string cell.
# regex=False makes the comma a plain literal match.

numcopy['production_budget'] = numcopy['production_budget'].str.replace(',', '', regex=False)
numcopy['domestic_gross'] = numcopy['domestic_gross'].str.replace(',', '', regex=False)
numcopy['worldwide_gross'] = numcopy['worldwide_gross'].str.replace(',', '', regex=False)

In [89]:
# Overwrite each cleaned object (string) column with its numeric equivalent.

for amount_col in ('production_budget', 'domestic_gross', 'worldwide_gross'):
    cleaned_text = numcopy[amount_col]
    numcopy[amount_col] = pd.to_numeric(cleaned_text)

#### Now I have to change my release_date column to a data type that we can manipulate easily.

In [90]:
# Parse the release-date strings into datetime values so the .dt accessor can be used later.
numcopy['release_date'] = pd.to_datetime(
    arg=numcopy['release_date'],
)

In [91]:
# Print the column dtypes and non-null counts to confirm the conversions above.
numcopy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   id                 5782 non-null   int64         
 1   release_date       5782 non-null   datetime64[ns]
 2   movie              5782 non-null   object        
 3   production_budget  5782 non-null   int64         
 4   domestic_gross     5782 non-null   int64         
 5   worldwide_gross    5782 non-null   int64         
dtypes: datetime64[ns](1), int64(4), object(1)
memory usage: 271.2+ KB


In [92]:
# Preview the first five rows after the numeric and datetime conversions.
numcopy.head(n=5)

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,2009-12-18,Avatar,425000000,760507625,2776345279
1,2,2011-05-20,Pirates of the Caribbean: On Stranger Tides,410600000,241063875,1045663875
2,3,2019-06-07,Dark Phoenix,350000000,42762350,149762350
3,4,2015-05-01,Avengers: Age of Ultron,330600000,459005868,1403013963
4,5,2017-12-15,Star Wars Ep. VIII: The Last Jedi,317000000,620181382,1316721747


### Creating extra columns to further explain our revenue data.

- Total profit = worldwide gross minus the production budget
- Return on investment (ROI) = total profit divided by the production budget
- Domestic profit = domestic gross minus the production budget
- Domestic profit margin = domestic profit divided by domestic gross
- Total profit margin = total profit divided by worldwide gross

In [93]:
# Derive profit and ratio columns from the cleaned budget and gross figures.
#
# The ratio columns mask a zero denominator to NaN with .where() before
# dividing. pandas does not raise on division by zero; it silently yields
# +/-inf (or NaN for 0/0), which would distort any later mean, sort, or plot
# of these columns. Rows with a non-zero denominator are unaffected.

# Absolute profit over the worldwide run.
numcopy['total profit'] = numcopy['worldwide_gross'] - numcopy['production_budget']

# Return on investment: profit earned per dollar of budget.
numcopy['ROI'] = (
    numcopy['total profit']
    / numcopy['production_budget'].where(numcopy['production_budget'] != 0)
)

# Absolute profit from the domestic run alone.
numcopy['domestic profit'] = numcopy['domestic_gross'] - numcopy['production_budget']

# Share of domestic gross that is profit (NaN when domestic gross is 0).
numcopy['domestic profit margin'] = (
    numcopy['domestic profit']
    / numcopy['domestic_gross'].where(numcopy['domestic_gross'] != 0)
)

# Share of worldwide gross that is profit (NaN when worldwide gross is 0).
numcopy['total profit margin'] = (
    numcopy['total profit']
    / numcopy['worldwide_gross'].where(numcopy['worldwide_gross'] != 0)
)

In [94]:
# Preview the first five rows, now including the derived profit columns.
numcopy.head(n=5)

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,total profit,ROI,domestic profit,domestic profit margin,total profit margin
0,1,2009-12-18,Avatar,425000000,760507625,2776345279,2351345279,5.532577,335507625,0.441163,0.846921
1,2,2011-05-20,Pirates of the Caribbean: On Stranger Tides,410600000,241063875,1045663875,635063875,1.546673,-169536125,-0.703283,0.607331
2,3,2019-06-07,Dark Phoenix,350000000,42762350,149762350,-200237650,-0.572108,-307237650,-7.18477,-1.337036
3,4,2015-05-01,Avengers: Age of Ultron,330600000,459005868,1403013963,1072413963,3.243841,128405868,0.279748,0.764364
4,5,2017-12-15,Star Wars Ep. VIII: The Last Jedi,317000000,620181382,1316721747,999721747,3.153696,303181382,0.488859,0.759251


### Creating a movie_id column (lower-cased title + release year) and setting it as the index for joining purposes.

In [97]:

# Build the join key: the trimmed, lower-cased title followed by the release year.
title_key = numcopy['movie'].str.strip().str.lower()
release_year = numcopy['release_date'].dt.year.astype(str)
numcopy['movie_id'] = title_key + release_year

In [98]:
# Preview the first five rows to check the new movie_id values.
numcopy.head(n=5)

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,total profit,ROI,domestic profit,domestic profit margin,total profit margin,movie_id
0,1,2009-12-18,Avatar,425000000,760507625,2776345279,2351345279,5.532577,335507625,0.441163,0.846921,avatar2009
1,2,2011-05-20,Pirates of the Caribbean: On Stranger Tides,410600000,241063875,1045663875,635063875,1.546673,-169536125,-0.703283,0.607331,pirates of the caribbean: on stranger tides2011
2,3,2019-06-07,Dark Phoenix,350000000,42762350,149762350,-200237650,-0.572108,-307237650,-7.18477,-1.337036,dark phoenix2019
3,4,2015-05-01,Avengers: Age of Ultron,330600000,459005868,1403013963,1072413963,3.243841,128405868,0.279748,0.764364,avengers: age of ultron2015
4,5,2017-12-15,Star Wars Ep. VIII: The Last Jedi,317000000,620181382,1316721747,999721747,3.153696,303181382,0.488859,0.759251,star wars ep. viii: the last jedi2017


In [101]:
# Promote movie_id from a regular column to the row index.
# NOTE: this cell is not re-runnable on its own; a second run raises KeyError
# because the 'movie_id' column no longer exists.
numcopy = numcopy.set_index(keys='movie_id', drop=True)

In [102]:
# Preview the first five rows with movie_id as the index.
numcopy.head(n=5)

Unnamed: 0_level_0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,total profit,ROI,domestic profit,domestic profit margin,total profit margin
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
avatar2009,1,2009-12-18,Avatar,425000000,760507625,2776345279,2351345279,5.532577,335507625,0.441163,0.846921
pirates of the caribbean: on stranger tides2011,2,2011-05-20,Pirates of the Caribbean: On Stranger Tides,410600000,241063875,1045663875,635063875,1.546673,-169536125,-0.703283,0.607331
dark phoenix2019,3,2019-06-07,Dark Phoenix,350000000,42762350,149762350,-200237650,-0.572108,-307237650,-7.18477,-1.337036
avengers: age of ultron2015,4,2015-05-01,Avengers: Age of Ultron,330600000,459005868,1403013963,1072413963,3.243841,128405868,0.279748,0.764364
star wars ep. viii: the last jedi2017,5,2017-12-15,Star Wars Ep. VIII: The Last Jedi,317000000,620181382,1316721747,999721747,3.153696,303181382,0.488859,0.759251


In [103]:
# Write the prepared table to disk; the movie_id index is saved along with the columns.
# NOTE(review): the file name has no '.csv' extension. Presumably other notebooks
# read 'tn_data' by this exact name, so confirm before renaming.
numcopy.to_csv(path_or_buf='tn_data')