In [553]:
import json
import pandas as pd
import numpy as np
import re

In [554]:
file_dir = 'C://Users/Casey Chen/Desktop/Analysis Projects/Movies-ETL/'

In [555]:
#file_dir already ends with a path separator, so the file name is appended directly
#(the same convention the read_csv calls use). JSON is read explicitly as UTF-8 so the
#result does not depend on the platform's default text encoding.
with open(f'{file_dir}wikipedia-movies.json', mode='r', encoding='utf-8') as file:
    wiki_movies_raw = json.load(file)

In [556]:
len(wiki_movies_raw)

7311

In [557]:
#View the first 5 records
wiki_movies_raw[:5]

#View the last 5 records
wiki_movies_raw[-5:]

#View some records in the middle
wiki_movies_raw[3600:3605]

[{'url': 'https://en.wikipedia.org/wiki/Benji:_Off_the_Leash!',
  'year': 2004,
  'imdb_link': 'https://www.imdb.com/title/tt0315273/',
  'title': 'Benji: Off the Leash!',
  'Directed by': 'Joe Camp',
  'Written by': 'Joe Camp',
  'Starring': ['Benji', 'Nick Whitaker', 'Shaggy', 'Gypsy the Cockatoo'],
  'Music by': 'Antonio di Lorenzo',
  'Productioncompany ': 'Mulberry Square Productions',
  'Distributed by': 'Mulberry Square Productions',
  'Release date': ['March 26, 2004', '(', '2004-03-26', ')'],
  'Running time': '97 min',
  'Country': 'United States',
  'Language': 'English',
  'Box office': '$3,817,362'},
 {'url': 'https://en.wikipedia.org/wiki/The_Best_Thief_in_the_World',
  'year': 2004,
  'imdb_link': 'https://www.imdb.com/title/tt0389796/',
  'title': 'The Best Thief in the World',
  'Directed by': 'Jacob Kornbluth',
  'Produced by': ['Tim Perrell', 'Nicola Usborne'],
  'Written by': 'Jacob Kornbluth',
  'Starring': ['Marc Rozendaal',
   'Michael Silverman',
   'David Warsh

In [558]:
kaggle_metadata = pd.read_csv(f'{file_dir}movies_metadata.csv', low_memory=False)
ratings = pd.read_csv(f'{file_dir}ratings.csv')

In [559]:
#Convert the dictionary into a data frame
wiki_movies_df = pd.DataFrame(wiki_movies_raw)
wiki_movies_df.head()
wiki_movies_df.columns.tolist()

['url',
 'year',
 'imdb_link',
 'title',
 'Directed by',
 'Produced by',
 'Screenplay by',
 'Story by',
 'Based on',
 'Starring',
 'Narrated by',
 'Music by',
 'Cinematography',
 'Edited by',
 'Productioncompany ',
 'Distributed by',
 'Release date',
 'Running time',
 'Country',
 'Language',
 'Budget',
 'Box office',
 'Written by',
 'Genre',
 'Theme music composer',
 'Country of origin',
 'Original language(s)',
 'Producer(s)',
 'Editor(s)',
 'Production company(s)',
 'Original network',
 'Original release',
 'Productioncompanies ',
 'Executive producer(s)',
 'Production location(s)',
 'Distributor',
 'Picture format',
 'Audio format',
 'Voices of',
 'Followed by',
 'Composer(s)',
 'Created by',
 'Also known as',
 'Opening theme',
 'No. of episodes',
 'Preceded by',
 'Author',
 'Publisher',
 'Publication date',
 'Media type',
 'Pages',
 'ISBN',
 'OCLC',
 'LC Class',
 'Cover artist',
 'Series',
 'Set in',
 'Adaptation by',
 'Suggested by',
 'Biographical data',
 'Born',
 'Died',
 'Resti

In [560]:
#Filter the raw records with a list comprehension and keep the result in an intermediate variable
#General shapes of a list comprehension:
#   [expression for element in source_list]
#   [expression for element in source_list if filter_expression]
#A record is kept only when all three conditions hold:
#   - it has an IMDb link
#   - it has no "No. of episodes" key (that key marks TV shows, which we want to remove)
#   - it lists a director under either the "Director" or the "Directed by" key
wiki_movies = [
    record
    for record in wiki_movies_raw
    if 'imdb_link' in record
    and 'No. of episodes' not in record
    and ('Director' in record or 'Directed by' in record)
]

len(wiki_movies)

7076

## Create a Function to Clean the Data

In [561]:
def clean_movie(movie):
    """Return a shallow copy of ``movie`` as a new dictionary.

    Working on a copy keeps the function non-destructive: anything changed on
    the returned dictionary's top-level keys leaves the caller's object as it was.
    """
    # dict() builds a brand-new dictionary from the argument's key/value pairs
    cleaned = dict(movie)
    return cleaned

In [562]:
wiki_movies_df[wiki_movies_df['Arabic'].notnull()]['url']

7060    https://en.wikipedia.org/wiki/The_Insult_(film)
7293     https://en.wikipedia.org/wiki/Capernaum_(film)
Name: url, dtype: object

In [563]:
sorted(wiki_movies_df.columns.tolist())

['Actor control',
 'Adaptation by',
 'Alias',
 'Alma mater',
 'Also known as',
 'Animation by',
 'Arabic',
 'Area',
 'Area served',
 'Artist(s)',
 'Attraction type',
 'Audio format',
 'Author',
 'Based on',
 'Biographical data',
 'Bopomofo',
 'Born',
 'Box office',
 'Budget',
 'Camera setup',
 'Cantonese',
 'Characters',
 'Children',
 'Chinese',
 'Cinematography',
 'Closing date',
 'Color process',
 'Comics',
 'Composer(s)',
 'Coordinates',
 'Country',
 'Country of origin',
 'Cover artist',
 'Created by',
 'Date premiered',
 'Designer(s)',
 'Developed by',
 'Developer(s)',
 'Dewey Decimal',
 'Died',
 'Directed by',
 'Director',
 'Distributed by',
 'Distributor',
 'Divisions',
 'Duration',
 'Edited by',
 'Editor(s)',
 'Ending theme',
 'Engine',
 'Engine(s)',
 'Executive producer(s)',
 'Family',
 'Fate',
 'Film(s)',
 'Followed by',
 'Format(s)',
 'Formerly',
 'Founded',
 'Founder',
 'Founders',
 'French',
 'Full name',
 'Gender',
 'Genre',
 'Genre(s)',
 'Genres',
 'Gwoyeu Romatzyh',
 'Ha

### Handle the Alternative Titles & Combine Similar Column Names

In [564]:
#Step 1: Make an empty dictionary to hold all the alternative titles
#Step 2: Loop through a list of all alternative title keys
#Step 2a: Check if the current key exists in the movie object
#Step 2b: If so, remove the key-value pair and add to the alternative titles dictionary
#Step 3: After looping through every key, add the alternative titles dict to the movie object

def clean_movie(movie):
    """Return a cleaned copy of one scraped movie record.

    Two clean-ups are applied to a shallow copy of ``movie``:

    1. Every alternative-title key that is present (language names,
       romanization systems, 'Also known as', ...) is removed from the record
       and gathered into a single ``'alt_titles'`` dictionary. The
       ``'alt_titles'`` key is only added when at least one such key was found.
    2. Keys that describe the same thing under different names are merged
       into one canonical key (for example 'Directed by' -> 'Director').
       This always runs, whether or not the record had alternative titles.
       When several old keys map to the same new key, the value of the one
       renamed last replaces any earlier value.

    Parameters
    ----------
    movie : dict
        One movie record. It is not modified.

    Returns
    -------
    dict
        The cleaned copy of the record.
    """
    movie = dict(movie)  # shallow copy so the caller's dictionary is left untouched
    alt_titles = {}

    ## Combine alternate titles into one dictionary ##
    for key in ['Also known as', 'Arabic', 'Cantonese', 'Chinese', 'French', 
                'Hangul', 'Hebrew', 'Hepburn', 'Japanese', 'Literally', 'Mandarin', 
                'McCune-Reischauer', 'Original title', 'Polish', 'Revised Romanization', 
                'Romanized', 'Russian', 'Simplified', 'Traditional', 'Yiddish']:
        if key in movie:
            # pop() removes the key-value pair and returns the removed value
            alt_titles[key] = movie.pop(key)
    if len(alt_titles) > 0:
        movie['alt_titles'] = alt_titles

    ## Merge column names ##
    def change_column_name(old_name, new_name):
        """Move the value stored under ``old_name`` to ``new_name`` if present."""
        if old_name in movie:
            movie[new_name] = movie.pop(old_name)

    # The calls sit at function level (not inside the helper and not under the
    # alt_titles check) so that every record gets its keys normalised.
    change_column_name('Adaptation by', 'Writer(s)')
    change_column_name('Country of origin', 'Country')
    change_column_name('Directed by', 'Director')
    change_column_name('Edited by', 'Editor(s)')
    change_column_name('Length', 'Running time')
    change_column_name('Original release', 'Release date')
    change_column_name('Music by', 'Composer(s)')
    change_column_name('Produced by', 'Producer(s)')
    change_column_name('Producer', 'Producer(s)')
    change_column_name('Productioncompanies ', 'Production company(s)')
    change_column_name('Productioncompany ', 'Production company(s)')
    # 'Released' is first moved to 'Release Date', which the next call then
    # folds into the canonical 'Release date'
    change_column_name('Released', 'Release Date')
    change_column_name('Release Date', 'Release date')
    change_column_name('Screen story by', 'Writer(s)')
    change_column_name('Screenplay by', 'Writer(s)')
    change_column_name('Story by', 'Writer(s)')
    change_column_name('Theme music composer', 'Composer(s)')
    change_column_name('Written by', 'Writer(s)')

    return movie

In [None]:
#Create a list of cleaned movies with a list comprehension
clean_movies = [clean_movie(movie) for movie in wiki_movies]

#Set wiki_movies_df to be the data frame created from clean_movies and print out a list of the columns
wiki_movies_df = pd.DataFrame(clean_movies)
sorted(wiki_movies_df.columns.tolist())

### Remove Duplicate Rows

In [None]:
#First, extract the IMDb ID from the IMDb link using regular expressions and str.extract()
#IMDb links generally look like "https://www.imdb.com/title/tt1234567/," with "tt1234567" as the IMDb ID

#Extract the IMDb ID
wiki_movies_df['imdb_id'] = wiki_movies_df['imdb_link'].str.extract(r'(tt\d{7})')
print(len(wiki_movies_df))

#Drop duplicates of IMDb IDs using drop_duplicates()
wiki_movies_df.drop_duplicates(subset='imdb_id', inplace=True)
print(len(wiki_movies_df))
wiki_movies_df.head()

### Remove Mostly Null Columns

In [None]:
#To count the null values for each column, we can use a list comprehension
[[column,wiki_movies_df[column].isnull().sum()] for column in wiki_movies_df.columns]

#We want a list of columns that have less than 90% null values
#The output gives us the columns that we want to keep
[column for column in wiki_movies_df.columns if wiki_movies_df[column].isnull().sum() < len(wiki_movies_df) * 0.9]

In [None]:
#Select this list from the data frame
wiki_columns_to_keep = [column for column in wiki_movies_df.columns if wiki_movies_df[column].isnull().sum() < len(wiki_movies_df) * 0.9]
#Take an explicit copy of the column subset: later cells assign new columns to this frame and
#drop columns in place, which should act on an independent frame rather than on a selection of the old one
wiki_movies_df = wiki_movies_df[wiki_columns_to_keep].copy()
wiki_movies_df.head()

### Convert and Parse Data

In [None]:
#Identify which columns need to be converted
wiki_movies_df.dtypes

#Box office should be numeric
#Budget should be numeric
#Release date should be date object
#Running time should be numeric

In [None]:
#Look for rows where the box office data is defined and make a data series that drops missing values
box_office = wiki_movies_df['Box office'].dropna()

In [None]:
#Regular expressions only work on string, so we need to make sure box office data is entered as a string
def is_not_a_string(x):
    """Return True when the type of ``x`` is anything other than ``str``."""
    return type(x) is not str
box_office[box_office.map(is_not_a_string)]

#Having to create a new function every time we use the map() method is cumbersome and hurts readability.
#We also don't need to use the function outside of the map() call
#Instead of creating a new function with a block of code and the def keyword, we can create an anonymous lambda function inside the map() call

#The lambda version of 'is_not_a_string' function is:
box_office[box_office.map(lambda x: type(x) != str)]

#We can see that quite a few data points are stored as lists

In [None]:
#In order to concatenate the items on the list, we need to make a separator string and then use the join() method
#The space is the joining character
box_office = box_office.apply(lambda x: ' '.join(x) if type(x) == list else x)

#Many of the box office numbers are written like "\$123.4 million" or "\$123,456,789"
#Thus, we need to use regular expressions to find out how many of each style is in the data
#There is a built-in dependency for regular expressions: re

## Parse the Box Office Data

### Create the First Form

In [None]:
#The pattern will need to match 6 elements:
#1.) a dollar sign
#2.) an arbitrary (but non-zero) number of digits - use a + modifier to capture one or more digits
#3.) an optional decimal point - use a question mark modifier
#4.) an arbitrary (but possibly zero) number of more digits - use a * modifier because there may not be any digits after the decimal
#5.) a space (possibly more than one) - use \s to match whitespace and a * modifier to match any amount of whitespace
#6.) the word "million" or "billion" - use [] to match with a character set for the first letter
#Because we need the escape characters to remain, we need to preface the string with an 'r'
form_one = r"\$\d+\.?\d*\s*[mb]illion"

In [None]:
#Count the number of box office values that match the first form using 'str.contains()'
#To ignore upper and lower cases, add an argument called 'flags' and set it equal to 're.IGNORECASE'
box_office.str.contains(form_one, flags=re.IGNORECASE).sum()

### Create the Second Form

In [None]:
#The pattern will need to match 3 elements:
#1.) a dollar sign
#2.) a group of one to three digits - use {} to match one through three repetitions
#3.) at least one group starting with a comma and followed by exactly three digits - use ()+ to match any repetition of the group of three
    # use ?: to specify a non-capturing group
form_two = r"\$\d{1,3}(?:,\d{3})+"

In [None]:
box_office.str.contains(form_two, flags=re.IGNORECASE).sum()

### Compare Values in Forms

In [None]:
#See if any box office values are described by both
#Start by creating two Boolean Series, then select the box office values that don't match either
matches_form_one = box_office.str.contains(form_one, flags=re.IGNORECASE)
matches_form_two = box_office.str.contains(form_two, flags=re.IGNORECASE)

#This will render an error:
    #box_office[(not matches_form_one) and (not matches_form_two)]

#Pandas has element-wise logical operators:
    # element-wise negation operator: ~ (similar to "not")
    # element-wise logical "and": &
    # element-wise logical "or": |
box_office[~matches_form_one & ~matches_form_two]

### Fix Pattern Matches

In [None]:
#Some values have spaces in between the dollar sign and the number. To fix that, add '\s*' after the $ sign
form_one = r"\$\s*\d+\.?\d*\s*[mb]illion"
form_two = r"\$\s*\d{1,3}(?:,\d{3})+"

#Some values use a period as a thousands separator, not a comma. To fix that, allow for either a comma or period
#Don't forget escape the period with a \
form_two = r"\$\s*\d{1,3}(?:[,\.]\d{3})+"
#The results will match values like 1.234 billion, but we want to change raw numbers
#To solve this, add a negative lookahead group that looks for "million" or "billion" and rejects the match
form_two = r"\$\s*\d{1,3}(?:[,\.]\d{3})+(?!\s[mb]illion)"

#Some values are given as a range. To fix that, search for any string that starts with a $ and ends with a -
box_office = box_office.str.replace(r'\$.*[-–—](?![a-z])', '$', regex=True)

#"Million" is sometimes mispelled as "millon"
form_one = r"\$\s*\d+\.?\d*\s*[mb]illi?on"

## Extract and Convert the Box Office Values

In [None]:
#Create a regular expression that captures data when it matches either form_one or form_two
box_office.str.extract(f'({form_one}|{form_two})')

In [None]:
#Create a function to turn the extracted values into a numeric value
#Use re.match(pattern,string) to see if the string matches a pattern
#Use re.sub(pattern, replacement_string, string) to remove dollar signs, spaces, commas, and letters

def parse_dollars(s):
    """Convert a dollar-amount string into a float number of dollars.

    Three spellings are recognised, case-insensitively and anchored at the
    start of the string:

    * ``$###.# million`` (also the misspelling "millon") -> number * 10**6
    * ``$###.# billion`` (also "billon")                 -> number * 10**9
    * ``$###,###,###`` or ``$###.###.###``               -> the plain number;
      both the comma and the period are treated as thousands separators.

    Only the matched numeric part is converted, so text following the amount
    does not affect the result.

    Parameters
    ----------
    s : object
        Value to parse. Anything whose type is not exactly ``str`` yields NaN.

    Returns
    -------
    float
        The amount in dollars, or ``np.nan`` when ``s`` is not a string or
        matches none of the three forms.
    """
    #if s is not a string, return NaN
    if type(s) != str:
        return np.nan

    #if input is of the form $###.# million or $###.# billion
    #group 1 is the number, group 2 is the letter that tells the two words apart
    scaled = re.match(r'\$\s*(\d+\.?\d*)\s*([mb])illi?on', s, flags=re.IGNORECASE)
    if scaled:
        multiplier = 10**6 if scaled.group(2).lower() == 'm' else 10**9
        #convert to float and scale by a million / a billion
        return float(scaled.group(1)) * multiplier

    #if input is of the form $###,###,### (or $###.###.###)
    grouped = re.match(r'\$\s*(\d{1,3}(?:[,\.]\d{3})+)(?!\s[mb]illion)', s, flags=re.IGNORECASE)
    if grouped:
        #every comma or period in this form separates a group of exactly three digits,
        #so remove both kinds of separator before converting to float
        return float(re.sub(r'[,\.]', '', grouped.group(1)))

    #otherwise, return NaN
    return np.nan

In [None]:
#Extract the values from box_office with str.extract and apply parse_dollars to the first column in the df returned by str.extract
wiki_movies_df['box_office'] = box_office.str.extract(f'({form_one}|{form_two})', flags=re.IGNORECASE)[0].apply(parse_dollars)
wiki_movies_df['box_office']

In [None]:
#Drop the original Box office column now that the parsed box_office column exists
wiki_movies_df.drop('Box office', axis = 1, inplace=True)
wiki_movies_df.head()

## Parse Budget Data

In [None]:
#Create a budget variable
budget = wiki_movies_df['Budget'].dropna()

#Convert any lists to strings
budget = budget.map(lambda x: ' '.join(x) if type(x) == list else x)

#Remove any values between a dollar sign and a hyphen (for budgets given in ranges)
budget = budget.str.replace(r'\$.*[-–—](?![a-z])', '$', regex=True)

In [None]:
#Apply the same pattern matches from box_office to the budget_data without modifications
matches_form_one = budget.str.contains(form_one, flags=re.IGNORECASE)
matches_form_two = budget.str.contains(form_two, flags=re.IGNORECASE)
budget[~matches_form_one & ~matches_form_two]

In [None]:
#Remove the citation references (for example "[2]")
#The pattern is a regular expression, so regex=True is passed explicitly, as in the range clean-up above;
#without it, pandas versions whose default is regex=False would look for the pattern as literal text and remove nothing
budget = budget.str.replace(r'\[\d+\]\s*', '', regex=True)
budget[~matches_form_one & ~matches_form_two]

In [None]:
#Parse the budget values
wiki_movies_df['budget'] = budget.str.extract(f'({form_one}|{form_two})', flags=re.IGNORECASE)[0].apply(parse_dollars)

In [None]:
#Drop the original Budget column
wiki_movies_df.drop('Budget', axis=1, inplace=True)
wiki_movies_df.head()

## Parse Release Data

In [None]:
#Make a variable that holds the non-null values of Release date in the data frame, converting lists to strings
release_date = wiki_movies_df['Release date'].dropna().apply(lambda x: ' '.join(x) if type(x) == list else x)

In [None]:
#The forms that will be parsed include:
#1.) full month name, one- to two-digit day, four-digit year (i.e. January 1, 2000)
#2.) four-digit year, two-digit month, two-digit day, with any separator (i.e. 2000-01-01)
#3.) full month name, four-digit year (i.e. January 2000)
#4.) four-digit year

#Form 1: the leading day digit is optional so one-digit days ("January 1, 2000") match as well as two-digit days
date_form_one = r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s[0-3]?\d,\s\d{4}'
#Form 2: the day may start with 0 so that days 01-09 ("2000-01-01") match; "." accepts any separator character
date_form_two = r'\d{4}.[01]\d.[0-3]\d'
#Form 3: full month name followed by a four-digit year
date_form_three = r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s\d{4}'
#Form 4: a bare four-digit year
date_form_four = r'\d{4}'

In [None]:
#Extract the dates
release_date.str.extract(f'({date_form_one}|{date_form_two}|{date_form_three}|{date_form_four})', flags=re.IGNORECASE)

In [None]:
#Instead of creating our own function to parse the dates, we can use the built-in to_datetime()
#The extract is case-insensitive, like the exploratory extract in the previous cell, so the same text is parsed
#NOTE(review): infer_datetime_format is reported as deprecated in pandas 2.x - confirm against the installed version
wiki_movies_df['release_date'] = pd.to_datetime(release_date.str.extract(f'({date_form_one}|{date_form_two}|{date_form_three}|{date_form_four})', flags=re.IGNORECASE)[0], infer_datetime_format=True)

### Parse Running Time

In [None]:
#Make a variable that holds the non-null values of Running time in the data frame, converting lists into strings
running_time = wiki_movies_df['Running time'].dropna().apply(lambda x: ' '.join(x) if type(x) == list else x)

In [None]:
#Determine the number of running times that follow the format "100 minutes" using string boundaries
running_time.str.contains(r'^\d*\s*minutes$', flags=re.IGNORECASE).sum()

In [None]:
#View what the other data formats look like
running_time[running_time.str.contains(r'^\d*\s*minutes$', flags=re.IGNORECASE) != True]

In [None]:
#Make it more general by marking the beginning of the string and accepting other abbreviations of 'minutes'
running_time.str.contains(r'^\d*\s*m', flags=re.IGNORECASE).sum()

In [None]:
#View the remaining running times
running_time[running_time.str.contains(r'^\d*\s*m', flags=re.IGNORECASE) != True]

In [None]:
#Match all hour+minute patterns with one regular expression pattern:
#1.) start with one or more digits
#2.) have an optional space after the digit and before the letter 'h'
#3.) capture all possible abbreviations of 'hour(s).' (We would make every letter in 'hours' optional except the 'h')
#4.) have an optional space after the 'hours' marker
#5.) have an optional number of digits for minutes
#Add capture groups and alternating characters to ensure that we extract the digits and allow for both possible patterns
running_time_extract = running_time.str.extract(r'(\d+)\s*ho?u?r?s?\s*(\d*)|(\d+)\s*m')

In [None]:
#We need to convert this new data frame from strings to numeric values. Since we may have captured empty strings,
#we'll use the to_numeric() method and set the errors argument to 'coerce'. Coercing the errors will turn the empty strings
#into Not a Number (NaN). Then use fillna() to change all NaNs to zeros
running_time_extract = running_time_extract.apply(lambda col: pd.to_numeric(col, errors='coerce')).fillna(0)

In [None]:
#Apply a function that converts the hour capture groups and minute capture groups to minutes 
#if the pure minute capture group is zero and save the output to wiki_movies_df
wiki_movies_df['running_time'] = running_time_extract.apply(lambda row: row[0]*60 + row[1] if row[2] == 0 else row[2], axis=1)

In [None]:
#Drop Running time from the data set
wiki_movies_df.drop('Running time', axis=1, inplace=True)
wiki_movies_df.head()