# Cleaning movieBudgets

In [None]:
import gzip
import os
import shutil
import sqlite3
import zipfile

import numpy as np
import pandas as pd


### Unzip Data
This section unzips the archives in the `zippedData` folder and places the extracted files in a new `data` folder.

In [None]:
# Make sure the output folder exists before anything is written into it, so the
# gzip step below does not depend on the zip extraction having created it.
os.makedirs('data', exist_ok=True)

# Extract the im.db zip archive into data/
with zipfile.ZipFile('zippedData/im.db.zip', 'r') as zip_ref:
    zip_ref.extractall('data/')

# Flat files shipped as zippedData/<name>.gz; each is decompressed to data/<name>
gz_file_names = [
    'bom.movie_gross.csv',
    'rt.movie_info.tsv',
    'rt.reviews.tsv',
    'tmdb.movies.csv',
    'tn.movie_budgets.csv',
]

for file_name in gz_file_names:
    # Stream the decompressed bytes straight to disk instead of loading the
    # whole file into memory.
    with gzip.open(f'zippedData/{file_name}.gz', 'rb') as f_in, \
            open(f'data/{file_name}', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

### Import Data and Connect to Database

In [None]:
# Load every extracted flat file into its own DataFrame.

# Comma-separated files, read with pandas' default options
movieGross, tmdbMovies, movieBudgets = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/bom.movie_gross.csv',
        'data/tmdb.movies.csv',
        'data/tn.movie_budgets.csv',
    )
)

# Tab-separated files: movie_info uses its first column as the index,
# reviews is decoded as latin1
movieInfo = pd.read_csv(
    'data/rt.movie_info.tsv',
    sep='\t',
    index_col=0,
)
reviews = pd.read_csv(
    'data/rt.reviews.tsv',
    sep='\t',
    encoding='latin1',
)

In [None]:
# Open a connection to the SQLite database file at data/im.db.
# NOTE(review): no query or conn.close() appears in the visible cells — confirm
# the connection is actually used and closed elsewhere, or drop it.
conn = sqlite3.connect('data/im.db')

In [None]:
# Preview the first rows of movieBudgets to get a feel for the raw columns
# before any cleaning is applied
movieBudgets.head()

In [None]:
# Summarize column dtypes and non-null counts to spot what needs cleaning
movieBudgets.info()

The only problems with the dataset appear to be the column types:
- release_date should be of type datetime
- production_budget, domestic_gross, and worldwide_gross should be of type int

In [None]:
# Parse the release_date column into pandas datetime values
movieBudgets["release_date"] = movieBudgets["release_date"].pipe(pd.to_datetime)

In [None]:
# Convert the dollar-amount columns from strings such as "$1,234" to integers.
money_columns = ["production_budget", "domestic_gross", "worldwide_gross"]

for column in money_columns:
    amounts = movieBudgets[column]
    # Skip the string clean-up when the column is already numeric, so the cell
    # can be re-run without `.str` raising on an integer column.
    if not pd.api.types.is_numeric_dtype(amounts):
        # regex=False: '$' is a regex end-of-string anchor, and pandas' default
        # for `regex` differs between versions, so request literal matching.
        amounts = (
            amounts
            .str.replace('$', '', regex=False)
            .str.replace(',', '', regex=False)
        )
    # np.int64 for every column: plain `int` can map to a 32-bit integer on
    # some platforms, and all three columns should share one dtype.
    movieBudgets[column] = amounts.astype(np.int64)

In [None]:
# Re-check the column dtypes to confirm the datetime and integer conversions
# are reflected in the dataframe
movieBudgets.info()

In [None]:
# Preview the first rows again to inspect the converted values
movieBudgets.head()

In [None]:
# Export the cleaned movieBudgets frame as CSV.
# to_csv does not create missing folders, so make sure cleanedData/ exists
# first; exist_ok=True keeps the cell safe to re-run.
os.makedirs('cleanedData', exist_ok=True)
# NOTE(review): the DataFrame index is written as an unnamed first column;
# pass index=False if downstream readers do not expect it — confirm before changing.
movieBudgets.to_csv('cleanedData/movieBudgets.csv')