In [1]:
import json
import pandas as pd
import numpy as np

import re

from sqlalchemy import create_engine
import psycopg2

from config import db_password

import time

In [2]:
# 1. Add the clean movie function that takes in the argument, "movie".
def clean_movie(movie):
    """Return a cleaned copy of one scraped movie record.

    Two clean-ups are applied to a shallow copy of ``movie``:

    1. Every alternate-language / alternate-title key that is present is
       removed and collected into a single ``'alt_titles'`` dict (the key is
       only added when at least one alternate title was found).
    2. Keys that are variants of the same column are renamed to one
       consolidated key. When several variants map to the same target, the
       one listed last in the mapping wins.

    Parameters
    ----------
    movie : dict
        One movie record; it is not modified.

    Returns
    -------
    dict
        The cleaned copy of the record.
    """
    # Shallow copy so that the caller's dict is left untouched.
    local_movie = dict(movie)

    # Gather all alternate titles under a single 'alt_titles' key.
    alt_title_keys = ['Also known as','Arabic','Cantonese','Chinese','French',
                      'Hangul','Hebrew','Hepburn','Japanese','Literally',
                      'Mandarin','McCune–Reischauer','Original title','Polish',
                      'Revised Romanization','Romanized','Russian',
                      'Simplified','Traditional','Yiddish']
    alt_titles = {key: local_movie.pop(key)
                  for key in alt_title_keys if key in local_movie}
    if alt_titles:
        local_movie['alt_titles'] = alt_titles

    def change_column_name(old_key, new_key):
        """Move the value stored under old_key (if present) to new_key."""
        if old_key in local_movie:
            local_movie[new_key] = local_movie.pop(old_key)

    # Repeated columns with similar names and the name they are merged into.
    old_key_dict = {'Directed by':'Director',
                    'Country of origin':'Country',
                    'Distributed by':'Distributor',
                    'Edited by':'Editor(s)',
                    'Produced by':'Producer',
                    'Producer(s)':'Producer',
                    'Production company(s)':'Productioncompany',
                    'Productioncompanies ':'Productioncompany',
                    'Productioncompany ':'Productioncompany',
                    'Released':'Release date',
                    'Running time':'Length',
                    'Screen story by':'Writer(s)',
                    'Screenplay by':'Writer(s)',
                    # Fixed: the target used to be 'Release date,' (stray
                    # comma), which created a separate, misspelled column.
                    'Release Date':'Release date',
                    'Story by':'Writer(s)',
                    'Theme music composer':'Composer(s)',
                    'Written by':'Writer(s)'}

    for old_key, new_key in old_key_dict.items():
        change_column_name(old_key, new_key)

    return local_movie


In [None]:
def extract_transform_load(wiki_data, kaggle_data, rating_data):
    """Load the three raw data sets and clean the Wikipedia movie records.

    Parameters
    ----------
    wiki_data : str or path-like
        Path to the Wikipedia JSON file (a list of movie dicts).
    kaggle_data : str or path-like
        Path to the Kaggle metadata CSV file.
    rating_data : str or path-like
        Path to the MovieLens ratings CSV file.

    Returns
    -------
    tuple
        ``(wiki_movies_df, kaggle_metadata, ratings)``: the cleaned Wikipedia
        DataFrame followed by the two CSV files as DataFrames, unmodified.

    Notes
    -----
    Steps 15 (release date) and 16 (running time) are not finished yet; see
    the TODO comments near the end of the function.
    """
    # 2. Read in the kaggle metadata and MovieLens ratings CSV files as Pandas DataFrames.
    #    (Use the function arguments, not notebook-level globals.)
    kaggle_metadata = pd.read_csv(kaggle_data)
    ratings = pd.read_csv(rating_data)

    # Open and read the Wikipedia data JSON file.
    with open(wiki_data, mode='r') as file:
        wiki_movies_raw = json.load(file)

    # 3. Filter out TV shows: any record carrying one of these keys is dropped.
    #    Both capitalisations of the seasons key are checked.
    tv_show_keys = ('Television series', 'No. of episodes',
                    'No. of seasons', 'No. of Seasons')
    wiki_movies_raw = [movie for movie in wiki_movies_raw
                       if not any(key in movie for key in tv_show_keys)]

    # 4. Clean every remaining record, 5. and load the result into a DataFrame.
    wiki_movies_cleaner = [clean_movie(movie) for movie in wiki_movies_raw]
    wiki_movies_df = pd.DataFrame(wiki_movies_cleaner)

    # 6. Extract the IMDb ID from the IMDb link and drop duplicate IDs.
    #    Errors are reported and the function carries on without the column.
    imdb_id_form = r'(tt\d{7})'
    try:
        # expand=False returns a Series for the single capture group.
        wiki_movies_df['imdb_id'] = wiki_movies_df['imdb_link'].str.extract(
            imdb_id_form, expand=False)
        wiki_movies_df = wiki_movies_df.drop_duplicates(subset='imdb_id')
    except Exception as error:
        print("Exception: {}".format(type(error).__name__))
        print("Exception message: {}".format(error))

    # 7. Keep the columns that are mostly populated: fewer than 90% null rows.
    #    (A strict "no nulls at all" filter would also drop 'Box office' and
    #    'Budget', which the next steps need.)
    null_limit = len(wiki_movies_df) * 0.9
    wiki_movies_non_null_columns = [column for column in wiki_movies_df.columns
                                    if wiki_movies_df[column].isnull().sum() < null_limit]
    # Explicit copy so the column assignments below act on an independent frame.
    wiki_movies_df = wiki_movies_df[wiki_movies_non_null_columns].copy()

    # 8. Create a variable that will hold the non-null values from the "Box office" column.
    box_office_non_null = wiki_movies_df['Box office'].dropna()

    # 9. Convert the box office data created in Step 8 to string values using the lambda and join functions.
    box_office_non_null = box_office_non_null.apply(
        lambda x: ' '.join(x) if type(x) == list else x)

    # 10. Regular expression for "form_one" amounts such as "$123.4 million".
    form_one = r'\$\s*\d+\.?\d*\s*[mb]illi?on'
    # 11. Regular expression for "form_two" amounts such as "$123,456,789".
    form_two = r'\$\s*\d{1,3}(?:[,\.]\d{3})+(?!\s[mb]illion)'

    # 12. Add the parse_dollars function.
    def parse_dollars(s):
        """Convert a dollar-amount string to a float; NaN when unparseable."""
        # if s is not a string, return NaN
        if not isinstance(s, str):
            return np.nan

        # if input is of the form $###.# million
        if re.match(r'\$\s*\d+\.?\d*\s*milli?on', s, flags=re.IGNORECASE):
            s = re.sub(r'\$|\s|[a-zA-Z]', '', s)
            return float(s) * 10**6

        # if input is of the form $###.# billion
        if re.match(r'\$\s*\d+\.?\d*\s*billi?on', s, flags=re.IGNORECASE):
            s = re.sub(r'\$|\s|[a-zA-Z]', '', s)
            return float(s) * 10**9

        # if input is of the form $###,###,###
        if re.match(form_two, s, flags=re.IGNORECASE):
            # form_two accepts ',' or '.' as the group separator, so strip
            # both; otherwise "$1.234.567" would make float() raise.
            s = re.sub(r'\$|\s|[,\.]', '', s)
            return float(s)

        return np.nan

    # 13. Clean the box office column in the wiki_movies_df DataFrame.
    wiki_movies_df['box_office'] = box_office_non_null.str.extract(
        f'({form_one}|{form_two})', flags=re.IGNORECASE)[0].apply(parse_dollars)
    wiki_movies_df = wiki_movies_df.drop(columns='Box office')

    # 14. Clean the budget column in the wiki_movies_df DataFrame.
    budget_non_null = wiki_movies_df['Budget'].dropna()
    budget_non_null = budget_non_null.apply(
        lambda x: ' '.join(x) if type(x) == list else x)
    # Collapse ranges such as "$1-2 million" to the upper bound.
    budget_non_null = budget_non_null.str.replace(r'\$.*[-—–](?![a-z])', '$', regex=True)
    # Remove citation markers such as "[4]"; regex=True is required for the
    # pattern to be treated as a regular expression.
    budget_non_null = budget_non_null.str.replace(r'\[\d+\]\s*', '', regex=True)
    wiki_movies_df['budget'] = budget_non_null.str.extract(
        f'({form_one}|{form_two})', flags=re.IGNORECASE)[0].apply(parse_dollars)
    wiki_movies_df = wiki_movies_df.drop(columns='Budget')

    # 15. Clean the release date column in the wiki_movies_df DataFrame.
    #     Only the list-to-string normalisation is done so far.
    # TODO: extract the date with regular expressions, parse it and store it
    #       as a 'release_date' column.
    release_date_non_null = wiki_movies_df['Release date'].dropna().apply(
        lambda x: ' '.join(x) if type(x) == list else x)

    # 16. Clean the running time column in the wiki_movies_df DataFrame.
    # TODO: not implemented yet.

    # Return three variables. The first is the wiki_movies_df DataFrame
    return wiki_movies_df, kaggle_metadata, ratings


In [3]:
# 2 Add the function that takes in three arguments;
# Wikipedia data, Kaggle metadata, and MovieLens rating data (from Kaggle)

def function_name():
    """Unfilled starter template for the extract/transform/load function.

    The original cell contained an empty ``try:`` and a bare ``except``, which
    is a SyntaxError, and returned names that were never defined. The step
    outline is kept below for reference, and the stub now fails loudly instead
    of breaking the whole notebook run.

    Raises
    ------
    NotImplementedError
        Always; this template has no implementation.
    """
    # Outline of the steps the finished function is expected to perform:
    #  2. Read in the kaggle metadata and MovieLens ratings CSV files as Pandas DataFrames,
    #     then open and read the Wikipedia data JSON file.
    #  3. Write a list comprehension to filter out TV shows.
    #  4. Write a list comprehension to iterate through the cleaned wiki movies list
    #     and call the clean_movie function on each movie.
    #  5. Read in the cleaned movies list from Step 4 as a DataFrame.
    #  6. Write a try-except block to catch errors while extracting the IMDb ID using a
    #     regular expression string and dropping any imdb_id duplicates. If there is an
    #     error, capture and print the exception.
    #  7. Write a list comprehension to keep the columns that don't have null values
    #     from the wiki_movies_df DataFrame.
    #  8. Create a variable that will hold the non-null values from the "Box office" column.
    #  9. Convert the box office data created in Step 8 to string values using the
    #     lambda and join functions.
    # 10. Write a regular expression to match the six elements of "form_one" of the box office data.
    # 11. Write a regular expression to match the three elements of "form_two" of the box office data.
    # 12. Add the parse_dollars function.
    # 13. Clean the box office column in the wiki_movies_df DataFrame.
    # 14. Clean the budget column in the wiki_movies_df DataFrame.
    # 15. Clean the release date column in the wiki_movies_df DataFrame.
    # 16. Clean the running time column in the wiki_movies_df DataFrame.
    # Finally, return three variables; the first is the wiki_movies_df DataFrame.
    raise NotImplementedError(
        "function_name is an unfilled template; call "
        "extract_transform_load(wiki_data, kaggle_data, rating_data) instead."
    )

In [4]:
# 17. Create the path to your file directory and variables for the three files.
# TODO: point file_dir at the directory that holds the three data files.
# '.' (the notebook's working directory) is only a placeholder so that this
# cell is valid Python; the original assignment had no value at all.
file_dir = '.'
# The Wikipedia data
wiki_file = f'{file_dir}/wikipedia.movies.json'
# The Kaggle metadata
kaggle_file = f'{file_dir}/movies_metadata.csv'
# The MovieLens rating data.
ratings_file = f'{file_dir}/ratings.csv'

In [5]:
# 18. Set the three variables equal to the function created in D1.
# The function requires the three file paths as arguments; calling it with no
# arguments raises a TypeError.
# NOTE: this rebinds the path variables to the returned DataFrames, so re-run
# the path cell (step 17) before running this cell a second time.
wiki_file, kaggle_file, ratings_file = extract_transform_load(wiki_file, kaggle_file, ratings_file)

In [6]:
# 19. Set the wiki_movies_df equal to the wiki_file variable.
# After the step 18 cell, wiki_file is expected to hold the first value
# returned by extract_transform_load rather than a file path.
wiki_movies_df = wiki_file

In [7]:
# 20. Preview the first rows of wiki_movies_df to check that it matches the
# expected output.
wiki_movies_df.head()

In [8]:
# 21. List the wiki_movies_df column names to check that they are correct.
wiki_movies_df.columns.to_list()