In [1]:
# Import dependencies
import json
import pandas as pd
import numpy as np

In [2]:
# Create path for data resources
file_dir = "../Movies-ETL/data"

In [3]:
# Read raw JSON data file into pandas
with open(f'{file_dir}/wikipedia-movies.json', mode='r') as file:
    wiki_movies_raw = json.load(file)

In [4]:
# Read raw Kaggle data into pandas
kaggle_metadata = pd.read_csv(f'{file_dir}/movies_metadata.csv', low_memory=False)
ratings = pd.read_csv(f'{file_dir}/ratings.csv')

In [5]:
# Trim down initial JSON movie list to clean data
wiki_movies = [movie for movie in wiki_movies_raw
               if ('Director' in movie or 'Directed by' in movie)
                   and 'imdb_link' in movie
                   and 'No. of episodes' not in movie]
wiki_movies_df = pd.DataFrame(wiki_movies)

In [6]:
# Define function to clean individual movies in list
def clean_movie(movie):
    movie = dict(movie) #create a non-destructive copy
    alt_titles = {}
    # combine alternate titles into one list
    for key in ['Also known as','Arabic','Cantonese','Chinese','French',
                'Hangul','Hebrew','Hepburn','Japanese','Literally',
                'Mandarin','McCune-Reischauer','Original title','Polish',
                'Revised Romanization','Romanized','Russian',
                'Simplified','Traditional','Yiddish']:
        if key in movie:
            alt_titles[key] = movie[key]
            movie.pop(key)
    if len(alt_titles) > 0:
        movie['alt_titles'] = alt_titles

    # merge column names
    def change_column_name(old_name, new_name):
        if old_name in movie:
            movie[new_name] = movie.pop(old_name)
    change_column_name('Adaptation by', 'Writer(s)')
    change_column_name('Country of origin', 'Country')
    change_column_name('Directed by', 'Director')
    change_column_name('Distributed by', 'Distributor')
    change_column_name('Edited by', 'Editor(s)')
    change_column_name('Length', 'Running time')
    change_column_name('Original release', 'Release date')
    change_column_name('Music by', 'Composer(s)')
    change_column_name('Produced by', 'Producer(s)')
    change_column_name('Producer', 'Producer(s)')
    change_column_name('Productioncompanies ', 'Production company(s)')
    change_column_name('Productioncompany ', 'Production company(s)')
    change_column_name('Released', 'Release Date')
    change_column_name('Release Date', 'Release date')
    change_column_name('Screen story by', 'Writer(s)')
    change_column_name('Screenplay by', 'Writer(s)')
    change_column_name('Story by', 'Writer(s)')
    change_column_name('Theme music composer', 'Composer(s)')
    change_column_name('Written by', 'Writer(s)')

    return movie

In [7]:
# Make a list of cleaned movies
clean_movies = [clean_movie(movie) for movie in wiki_movies]
# Create a DataFrame from the clean_movies list
wiki_movies_df = pd.DataFrame(clean_movies)

In [8]:
sorted(wiki_movies_df.columns.tolist())

['Animation by',
 'Audio format',
 'Based on',
 'Box office',
 'Budget',
 'Cinematography',
 'Color process',
 'Composer(s)',
 'Country',
 'Created by',
 'Director',
 'Distributor',
 'Editor(s)',
 'Executive producer(s)',
 'Followed by',
 'Genre',
 'Label',
 'Language',
 'McCune–Reischauer',
 'Narrated by',
 'Original language(s)',
 'Original network',
 'Picture format',
 'Preceded by',
 'Producer(s)',
 'Production company(s)',
 'Production location(s)',
 'Recorded',
 'Release date',
 'Running time',
 'Starring',
 'Suggested by',
 'Venue',
 'Voices of',
 'Writer(s)',
 'alt_titles',
 'imdb_link',
 'title',
 'url',
 'year']

In [10]:
# Extract the IMDB ID tag from each row
wiki_movies_df['imdb_id'] = wiki_movies_df['imdb_link'].str.extract(r'(tt\d{7})')
# Check length of DataFrame
print(len(wiki_movies_df))
# Drop duplicate rows from the DataFrame that contain the same IMDB ID 
wiki_movies_df.drop_duplicates(subset='imdb_id', inplace=True)
# Check length of DataFrame
print(len(wiki_movies_df))

7076
7033


In [15]:
wiki_columns_to_keep = [column for column in wiki_movies_df.columns if wiki_movies_df[column].isnull().sum() < len(wiki_movies_df) * 0.9]
wiki_movies_df = wiki_movies_df[wiki_columns_to_keep]

In [16]:
wiki_movies_df.sample(3)

Unnamed: 0,url,year,imdb_link,title,Based on,Starring,Cinematography,Release date,Running time,Country,...,Budget,Box office,Director,Distributor,Editor(s),Composer(s),Producer(s),Production company(s),Writer(s),imdb_id
4055,https://en.wikipedia.org/wiki/For_Your_Conside...,2006,https://www.imdb.com/title/tt0470765/,For Your Consideration,,"[Bob Balaban, Jennifer Coolidge, Christopher G...",Roberto Schaefer,"[November 17, 2006, (, 2006-11-17, )]",86 minutes,United States,...,$12 million,$5.9 million,Christopher Guest,Warner Independent Pictures,Robert Leighton,C. J. Vanston,Karen Murphy,"[Castle Rock Entertainment, Shangri-La Enterta...","[Christopher Guest, Eugene Levy]",tt0470765
1933,https://en.wikipedia.org/wiki/The_Ice_Storm_(f...,1997,https://www.imdb.com/title/tt0119349/,The Ice Storm,"[The Ice Storm, by, Rick Moody]","[Kevin Kline, Joan Allen, Henry Czerny, Adam H...",Frederick Elmes,"[May 12, 1997, (, 1997-05-12, ), (, Cannes, ),...",113 minutes,United States,...,$18 million,$8 million,Ang Lee,Fox Searchlight Pictures,Tim Squyres,Mychael Danna,"[Ted Hope, James Schamus, Ang Lee]",Good Machine,James Schamus,tt0119349
6805,https://en.wikipedia.org/wiki/Ferdinand_(film),2017,https://www.imdb.com/title/tt3411444/,Ferdinand,"[Ferdinand, by, Munro Leaf, Robert Lawson]","[John Cena, Kate McKinnon, Anthony Anderson, B...",Renato Falcão,"[December 10, 2017, (, 2017-12-10, ), (, Los A...",108 minutes,United States,...,$111 million,$296.1 million,Carlos Saldanha,20th Century Fox,Harry Hitner,John Powell,"[John Davis, Lisa Marie Stetler, Lori Forte, B...","[Blue Sky Studios, [1], 20th Century Fox Anima...","[Ron Burch, David Kidd, Don Rhymer]",tt3411444
