# Project Name
By Alec Plante, Deanna Hedges, Raul Cortez, Sunny Sanchez, Zachary Mitchell

### Import Libraries


In [None]:
import pandas as pd
import numpy as np
import sqlite3

### Unzip Data
This section is used to unzip data from the zippedData folder and place it into the new data folder

In [None]:
import zipfile
import gzip
import shutil

# im.db is stored as a regular zip archive: unpack every member into data/
with zipfile.ZipFile('zippedData/im.db.zip', 'r') as archive:
    archive.extractall('data/')

# The remaining datasets are individual gzip files. Each (source, destination)
# pair is decompressed by streaming the bytes from the .gz file to the target.
gz_targets = (
    ('zippedData/bom.movie_gross.csv.gz', 'data/bom.movie_gross.csv'),
    ('zippedData/rt.movie_info.tsv.gz', 'data/rt.movie_info.tsv'),
    ('zippedData/rt.reviews.tsv.gz', 'data/rt.reviews.tsv'),
    ('zippedData/tmdb.movies.csv.gz', 'data/tmdb.movies.csv'),
    ('zippedData/tn.movie_budgets.csv.gz', 'data/tn.movie_budgets.csv'),
)
for gz_path, out_path in gz_targets:
    with gzip.open(gz_path, 'rb') as f_in, open(out_path, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

### Import Data and connect to Database

In [None]:
# Import the flat files as pandas DataFrames.
# The comma-separated files need no extra options, so they are read in one pass.
movieGross, tmdbMovies, movieBudgets = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/bom.movie_gross.csv',
        'data/tmdb.movies.csv',
        'data/tn.movie_budgets.csv',
    )
)

# The rt.* files are tab-separated; the reviews file is decoded as latin1.
movieInfo = pd.read_csv('data/rt.movie_info.tsv', sep='\t')
reviews = pd.read_csv('data/rt.reviews.tsv', sep='\t', encoding='latin1')


In [None]:
# Connect to sql database
# `conn` is the connection object handed to pd.read_sql in the im.db exploration cells.
conn = sqlite3.connect('data/im.db')

### Data Exploration

#### tmdbMovies

In [None]:
# Start by looking at the first 5 rows of data (head() with no argument)
tmdbMovies.head()

At first glance, we can see that there is an extra column that matches the index. This should be removed.

In [None]:
# View the column names to find the label of the redundant index-like column
tmdbMovies.columns

In [None]:
# Drop 'Unnamed: 0' as it contains the same information as the index.
# errors='ignore' keeps the cell safe to re-run: once the column is gone,
# a second execution is a no-op instead of raising a KeyError.
tmdbMovies.drop(columns='Unnamed: 0', inplace=True, errors='ignore')

In [None]:
# View the column names again to confirm that 'Unnamed: 0' is no longer present
tmdbMovies.columns

After removing the unneeded column, the data types should be reviewed to ensure that we are able to work with the table.

In [None]:
# View the summary information pandas reports for each column of tmdbMovies
tmdbMovies.info()

A few columns should be investigated:
- genre_ids should be a list
- release_date should be datetime

In [None]:
# Check the Python type of the values stored in each column, using the
# second row (iloc[1]) as a sample. The label now matches the actual
# column name, genre_ids.
print(
    f"genre_ids type: {type(tmdbMovies['genre_ids'].iloc[1])}\n"
    f"release_date type: {type(tmdbMovies['release_date'].iloc[1])}"
)

Both are strings, which are not usable for data analysis. We must convert genre_ids and release_date to lists and datetimes, respectively.

Let's start with the genre_ids:

In [None]:
# View the values in genre_ids and check for null values:
print(tmdbMovies.genre_ids.value_counts())
print(f"There are {tmdbMovies['genre_ids'].isna().sum()} null values")
# There are no NA values, and they all seem to be close to lists. We can proceed by changing the type to a list

In [None]:
# Convert genre_ids from strings into lists.
# ast.literal_eval parses a string containing a Python literal without
# executing arbitrary code.
import ast

# Only string values are parsed. This makes the cell safe to re-run: once the
# column already holds lists they are passed through unchanged instead of
# making literal_eval raise on a non-string argument.
tmdbMovies['genre_ids'] = tmdbMovies['genre_ids'].map(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [None]:
# Sanity check: every entry of the genre_ids column must be a list
assert all(
    isinstance(entry, list) for entry in tmdbMovies['genre_ids']
), "ERROR: element is not a list"
print("all rows in genre_ids column are of type list :^)")

The genre_ids in tmdbMovies are numbers, which doesn't give us a lot of information. A new column reflecting the meaning of these numbers should be created. The dictionary of the meanings is listed below.

In [None]:
# Lookup table from the numeric genre ids used in tmdbMovies to genre names.
# Insertion order matters: the per-genre columns are created in this order.
genre_ids_dict = {
    28: 'Action',
    12: 'Adventure',
    16: 'Animation',
    35: 'Comedy',
    80: 'Crime',
    99: 'Documentary',
    18: 'Drama',
    10751: 'Family',
    14: 'Fantasy',
    36: 'History',
    27: 'Horror',
    10402: 'Music',
    9648: 'Mystery',
    10749: 'Romance',
    878: 'Science Fiction',
    10770: 'TV Movie',
    53: 'Thriller',
    10752: 'War',
    37: 'Western',
}

In [None]:
# Create a new column 'genres' that is a list of the genres as strings.
# Each id is translated with a plain dictionary lookup, which avoids building
# a temporary pandas Series for every row. An id that is missing from
# genre_ids_dict is kept as-is rather than dropped.
tmdbMovies['genres'] = tmdbMovies['genre_ids'].map(
    lambda ids: [genre_ids_dict.get(genre_id, genre_id) for genre_id in ids]
)
tmdbMovies.head()

In [None]:
# Inspect one sample entry (second row) of the new 'genres' column.
# The type is printed explicitly because a notebook cell only displays the
# value of its last expression; a bare type(...) on an earlier line is lost.
print(type(tmdbMovies['genres'].iloc[1]))
tmdbMovies['genres'].iloc[1]

When creating models and comparing data, it may be beneficial to have each genre as its own column with a boolean value indicating whether a given movie is of that genre.

In [None]:
# Create one indicator column per genre name in genre_ids_dict: 1 when the
# genre's id appears in the row's genre_ids list, otherwise 0.
# Iterating over items() avoids rebuilding the key/value lists for every row.
for genre_id, genre_name in genre_ids_dict.items():
    tmdbMovies[genre_name] = tmdbMovies['genre_ids'].map(
        # bind the current id as a default so the lambda does not depend on
        # the loop variable's later values
        lambda ids, genre_id=genre_id: genre_id in ids
    ).astype(int)

In [None]:
# Check that the per-genre indicator columns were added
# (the values are integers, since the columns were cast with astype(int))
tmdbMovies.head()

Now, the release_date column needs to be converted to a datetime.

In [None]:
# Investigate the values found in the release_date column and how often each occurs
print(tmdbMovies['release_date'].value_counts())
# Make sure there are no NA values before converting
print(f"There are {tmdbMovies['release_date'].isna().sum()} null values")

In [None]:
# Convert the column to datetimes.
# No explicit format is passed, so pandas infers it from the strings.
tmdbMovies['release_date'] = pd.to_datetime(tmdbMovies['release_date'])

In [None]:
# Make sure that release_date is now reported with a datetime dtype
tmdbMovies.dtypes

### Cleaning movieGross dataset

Questions:
- What is the growth of popularity by genre?
- Which genres are the most profitable?
- What's the relationship between run time and profitability?
- Which directors have the most popular movies?


#### im.db
Exploring the data for the im database

In [None]:
# List the names of all tables recorded in the database's sqlite_master catalog
pd.read_sql("""
SELECT name 
FROM sqlite_master 
WHERE type = 'table';""", conn)

In [None]:
# View the movie_basics table (all columns, all rows)
pd.read_sql("""
SELECT *
FROM movie_basics
""",conn)

In [None]:
# View the directors table (all columns, all rows)
pd.read_sql("""
SELECT *
FROM directors
""",conn)

In [None]:
# View the known_for table (all columns, all rows)
pd.read_sql("""
SELECT *
FROM known_for
""",conn)

In [None]:
# View the movie_akas table (all columns, all rows)
pd.read_sql("""
SELECT *
FROM movie_akas
""",conn)

In [None]:
# View the movie_ratings table (all columns, all rows)
pd.read_sql("""
SELECT *
FROM movie_ratings
""",conn)

In [None]:
# View the persons table (all columns, all rows)
pd.read_sql("""
SELECT *
FROM persons
""",conn)

In [None]:
# View the principals table (all columns, all rows)
pd.read_sql("""
SELECT *
FROM principals
""",conn)

In [None]:
# View the writers table (all columns, all rows)
pd.read_sql("""
SELECT *
FROM writers
""",conn)

In [None]:
# NOTE(review): the query string below contains no SQL statement, so this
# looks like an unfinished placeholder cell. It is expected to fail when
# executed; fill in a query or remove the cell (TODO confirm intent).
pd.read_sql("""

""",conn)