# Project Name
By Alec Plante ...

### Import Libraries


In [None]:
import pandas as pd
import numpy as np
import sqlite3


### Unzip Data
This section unzips the data files from the `zippedData` folder and places them in the new `data` folder.

In [None]:
import gzip
import shutil
import zipfile

# The im.db database is stored as a regular zip archive; unpack it into data/
with zipfile.ZipFile('zippedData/im.db.zip', 'r') as archive:
    archive.extractall('data/')

# The remaining datasets are gzip-compressed single files. Each one is
# decompressed from zippedData/<name>.gz to data/<name>.
gzipped_files = [
    'bom.movie_gross.csv',
    'rt.movie_info.tsv',
    'rt.reviews.tsv',
    'tmdb.movies.csv',
    'tn.movie_budgets.csv',
]
for file_name in gzipped_files:
    with gzip.open('zippedData/' + file_name + '.gz', 'rb') as compressed, \
            open('data/' + file_name, 'wb') as extracted:
        # Stream the decompressed bytes straight to the output file
        shutil.copyfileobj(compressed, extracted)

### Import Data and connect to Database

In [None]:
# Load each extracted file into its own pandas DataFrame.
# Comma-separated files:
movieBudgets = pd.read_csv('data/tn.movie_budgets.csv')
tmdbMovies = pd.read_csv('data/tmdb.movies.csv')
movieGross = pd.read_csv('data/bom.movie_gross.csv')
# Tab-separated files (read_table uses a tab separator by default).
# reviews is decoded as latin1; movieInfo uses its first column as the index.
reviews = pd.read_table('data/rt.reviews.tsv', encoding='latin1')
movieInfo = pd.read_table('data/rt.movie_info.tsv', index_col=0)

In [None]:
# Open a connection to the SQLite database file at data/im.db.
# NOTE(review): the connection is not closed anywhere in this section —
# confirm it is closed once the queries are done.
conn = sqlite3.connect('data/im.db')

### Cleaning movieInfo

In [None]:
# Preview the first rows of the raw movieInfo DataFrame
movieInfo.head()

In [None]:
# Preview the last rows of the raw movieInfo DataFrame
movieInfo.tail()

In [None]:
# Print the DataFrame.info() summary of the raw movieInfo data
movieInfo.info()

In [None]:
# Work on a copy so the cleaning steps below never modify the raw movieInfo
# DataFrame that was loaded from disk.
movieInfoClean = movieInfo.copy()

In [None]:
# Count how many rows are flagged as a repeat of an earlier row.
# (This is not the last expression of the cell, so a notebook does not display it.)
movieInfoClean.duplicated().value_counts()
# Show every row that belongs to a group of duplicates, ordered by id
is_duplicated = movieInfoClean.duplicated(keep=False)
movieInfoClean[is_duplicated].sort_values(by='id')

In [None]:
# Remove the duplicated rows (drop_duplicates keeps the first occurrence by default)
movieInfoClean = movieInfoClean.drop_duplicates()

In [None]:
# Re-count the duplicate flags to confirm that no duplicated rows remain
movieInfoClean.duplicated().value_counts()

In [None]:
# Replace missing values in the string columns with the '-' placeholder.
# theater_date and dvd_date are included here, so they also receive '-'.
movie_text_columns = [
    'synopsis',
    'rating',
    'genre',
    'director',
    'writer',
    'theater_date',
    'dvd_date',
    'currency',
    'studio',
]
for column_name in movie_text_columns:
    movieInfoClean[column_name] = movieInfoClean[column_name].fillna('-')


In [None]:
# Convert runtime to an int representing minutes: the text before the first
# space is parsed as an integer. Missing values are replaced with 0.
# pd.isna() is used instead of `x is np.nan` because an identity check only
# matches the np.nan singleton itself; any other missing marker (None, pd.NA,
# a NaN float created elsewhere) would slip through and crash on .split().
movieInfoClean['runtime'] = movieInfoClean['runtime'].map(
    lambda x: 0 if pd.isna(x) else int(x.split(' ')[0])
)

In [None]:
# Convert box_office to a number: commas are removed and the remaining text is
# parsed as a float. Missing values are replaced with 0.
# pd.isna() is used instead of `x is np.nan` because an identity check only
# matches the np.nan singleton itself; any other missing marker (None, pd.NA,
# a NaN float created elsewhere) would slip through and crash on .replace().
movieInfoClean['box_office'] = movieInfoClean['box_office'].map(
    lambda x: 0 if pd.isna(x) else float(x.replace(',', ''))
)

In [None]:
def _parse_release_date(value):
    """Parse a "%b %d, %Y" date string; return the '-' placeholder unchanged."""
    if value != '-':
        return pd.to_datetime(value, format="%b %d, %Y")
    return value


# Convert the theater and DVD release dates to datetime values.
# NOTE(review): entries holding '-' stay as strings, so these columns are
# presumably object dtype rather than datetime64 — confirm before using `.dt`.
for date_column in ('theater_date', 'dvd_date'):
    movieInfoClean[date_column] = movieInfoClean[date_column].map(_parse_release_date)

In [None]:
# Each genre cell holds '|'-separated labels; split every cell into a list.
split_genres = movieInfoClean['genre'].map(lambda cell: cell.split('|'))

# Every distinct genre label in the dataset, in order of first appearance
# (dict.fromkeys keeps insertion order and drops repeats).
genres = list(dict.fromkeys(label for row in split_genres for label in row))

# Replacement labels chosen to match the other datasets. One original label
# may expand into several new ones.
# NOTE(review): the pairing below is positional — genresUpdated[i] replaces
# the i-th distinct label found in the data — so this list must stay in the
# same order as `genres`. A change in the data's row order or contents would
# silently shift the mapping; verify `genres` against this list when the
# input changes.
genresUpdated = [
    ['Action', 'Adventure'],
    ['Classics'],
    ['Drama'],
    ['Science Fiction', 'Fantasy'],
    ['Music'],
    ['Mystery'],
    ['Romance'],
    ['Family'],
    ['Comedy'],
    ['-'],
    ['Documentary'],
    ['Special Interest'],
    ['Art House and International'],
    ['Horror'],
    ['Western'],
    ['TV Movie'],
    ['Sports and Fitness'],
    ['Animation'],
    ['Faith and Spirituality'],
    ['Cult Movies'],
    ['Anime and Manga'],
    ['Gay and Lesbian'],
]

# Map each original label to its list of replacement labels
genreDict = {
    old_label: genresUpdated[position]
    for position, old_label in enumerate(genres)
}

# Rebuild the genre column: every row becomes the flattened list of the
# replacement labels for its original labels.
finalGenres = [
    [new_label for old_label in row for new_label in genreDict[old_label]]
    for row in split_genres
]
movieInfoClean['genre'] = finalGenres

In [None]:
# Preview the first rows of the cleaned movieInfoClean DataFrame
movieInfoClean.head()

In [None]:
# Print the DataFrame.info() summary of the cleaned data to check the result
movieInfoClean.info()


### Cleaning reviews

In [None]:
# First, look at how the reviews data is organised, to determine the type of
# data in each column and spot unnecessary/problematic characters to remove.

# Check what type of object `reviews` is
type(reviews)

In [None]:
# Display the raw reviews DataFrame
reviews

In [None]:
# Count the missing values in each column of reviews
reviews.isna().sum()

In [None]:
# Create a copy of the reviews data set; the cleaning steps modify this copy
# and leave the raw `reviews` DataFrame untouched.
reviews2 = reviews.copy()

In [None]:
# Fill missing values in the text columns with '-', the placeholder string
# the team chose for unavailable data, for consistency.
review_text_columns = ['review', 'rating', 'critic', 'publisher']
reviews2[review_text_columns] = reviews2[review_text_columns].fillna('-')
reviews2

In [None]:
# Re-count the missing values per column of reviews2; the four columns filled
# with '-' above should now report 0.
reviews2.isna().sum()

### Cleaning movieGross

In [None]:
# foreign_gross: cast to text, strip the commas, then parse as float.
# astype(str) turns a missing value into the string 'nan', which parses back
# to NaN. The nulls must therefore be replaced with 0 *after* the float
# conversion and before the int cast: calling fillna on the strings is a
# no-op, and casting NaN to int raises.
foreign_gross_text = movieGross['foreign_gross'].astype(str).str.replace(',', '', regex=False)
movieGross['foreign_gross'] = foreign_gross_text.astype(float).fillna(0).astype(int)

# domestic_gross: replace null values with 0 and convert to integers
movieGross['domestic_gross'] = movieGross['domestic_gross'].fillna(0).astype(int)

# year: convert to datetime data type, parsed with the '%Y' format
movieGross['year'] = pd.to_datetime(movieGross['year'], format='%Y')