In [17]:
# Import dependencies
import json
import pandas as pd
import numpy as np
import re

In [2]:
# Create path for data resources
file_dir = "../Movies-ETL/data"

In [3]:
# Read raw JSON data file into pandas
with open(f'{file_dir}/wikipedia-movies.json', mode='r') as file:
    wiki_movies_raw = json.load(file)

In [4]:
# Read raw Kaggle data into pandas
kaggle_metadata = pd.read_csv(f'{file_dir}/movies_metadata.csv', low_memory=False)
ratings = pd.read_csv(f'{file_dir}/ratings.csv')

In [5]:
# Trim down initial JSON movie list to clean data
wiki_movies = [movie for movie in wiki_movies_raw
               if ('Director' in movie or 'Directed by' in movie)
                   and 'imdb_link' in movie
                   and 'No. of episodes' not in movie]
wiki_movies_df = pd.DataFrame(wiki_movies)

In [6]:
# Define function to clean individual movies in list
def clean_movie(movie):
    movie = dict(movie) #create a non-destructive copy
    alt_titles = {}
    # combine alternate titles into one list
    for key in ['Also known as','Arabic','Cantonese','Chinese','French',
                'Hangul','Hebrew','Hepburn','Japanese','Literally',
                'Mandarin','McCune-Reischauer','Original title','Polish',
                'Revised Romanization','Romanized','Russian',
                'Simplified','Traditional','Yiddish']:
        if key in movie:
            alt_titles[key] = movie[key]
            movie.pop(key)
    if len(alt_titles) > 0:
        movie['alt_titles'] = alt_titles

    # merge column names
    def change_column_name(old_name, new_name):
        if old_name in movie:
            movie[new_name] = movie.pop(old_name)
    change_column_name('Adaptation by', 'Writer(s)')
    change_column_name('Country of origin', 'Country')
    change_column_name('Directed by', 'Director')
    change_column_name('Distributed by', 'Distributor')
    change_column_name('Edited by', 'Editor(s)')
    change_column_name('Length', 'Running time')
    change_column_name('Original release', 'Release date')
    change_column_name('Music by', 'Composer(s)')
    change_column_name('Produced by', 'Producer(s)')
    change_column_name('Producer', 'Producer(s)')
    change_column_name('Productioncompanies ', 'Production company(s)')
    change_column_name('Productioncompany ', 'Production company(s)')
    change_column_name('Released', 'Release Date')
    change_column_name('Release Date', 'Release date')
    change_column_name('Screen story by', 'Writer(s)')
    change_column_name('Screenplay by', 'Writer(s)')
    change_column_name('Story by', 'Writer(s)')
    change_column_name('Theme music composer', 'Composer(s)')
    change_column_name('Written by', 'Writer(s)')

    return movie

In [7]:
# Make a list of cleaned movies
clean_movies = [clean_movie(movie) for movie in wiki_movies]
# Create a DataFrame from the clean_movies list
wiki_movies_df = pd.DataFrame(clean_movies)

In [8]:
sorted(wiki_movies_df.columns.tolist())

['Animation by',
 'Audio format',
 'Based on',
 'Box office',
 'Budget',
 'Cinematography',
 'Color process',
 'Composer(s)',
 'Country',
 'Created by',
 'Director',
 'Distributor',
 'Editor(s)',
 'Executive producer(s)',
 'Followed by',
 'Genre',
 'Label',
 'Language',
 'McCune–Reischauer',
 'Narrated by',
 'Original language(s)',
 'Original network',
 'Picture format',
 'Preceded by',
 'Producer(s)',
 'Production company(s)',
 'Production location(s)',
 'Recorded',
 'Release date',
 'Running time',
 'Starring',
 'Suggested by',
 'Venue',
 'Voices of',
 'Writer(s)',
 'alt_titles',
 'imdb_link',
 'title',
 'url',
 'year']

In [9]:
# Extract the IMDB ID tag from each row
wiki_movies_df['imdb_id'] = wiki_movies_df['imdb_link'].str.extract(r'(tt\d{7})')
# Check length of DataFrame
print(len(wiki_movies_df))
# Drop duplicate rows from the DataFrame that contain the same IMDB ID 
wiki_movies_df.drop_duplicates(subset='imdb_id', inplace=True)
# Check length of DataFrame
print(len(wiki_movies_df))

7076
7033


In [10]:
wiki_columns_to_keep = [column for column in wiki_movies_df.columns if wiki_movies_df[column].isnull().sum() < len(wiki_movies_df) * 0.9]
wiki_movies_df = wiki_movies_df[wiki_columns_to_keep]

In [11]:
wiki_movies_df.sample(3)

Unnamed: 0,url,year,imdb_link,title,Based on,Starring,Cinematography,Release date,Running time,Country,...,Budget,Box office,Director,Distributor,Editor(s),Composer(s),Producer(s),Production company(s),Writer(s),imdb_id
5391,https://en.wikipedia.org/wiki/Season_of_the_Wi...,2011,https://www.imdb.com/title/tt0479997/,Season of the Witch,,"[Nicolas Cage, Ron Perlman, Robert Sheehan, Cl...",Amir Mokri,"[January 4, 2011, (, 2011-01-04, ), (New York ...",98 minutes,United States,...,$40 million,$91.6 million,Dominic Sena,"[Lionsgate, (original), Rogue, (through, Relat...","[Mark Helfrich, Dan Zimmerman]",Atli Örvarsson,"[Alex Gartner, Charles Roven]",Atlas Entertainment,Bragi F. Schut,tt0479997
1189,https://en.wikipedia.org/wiki/Carnosaur_2,1995,https://www.imdb.com/title/tt0112634/,Carnosaur 2,,"[John Savage, Cliff De Young]",John B. Aronson,"[February 1995, (, 1995-02, )]",83 minutes,United States,...,,,Louis Morneau,New Horizon Picture Corp,"[Roderick Davis, Lorne Morris]",Ed Tomney,Roger Corman,,Michael Palmer,tt0112634
1678,https://en.wikipedia.org/wiki/Pandora%27s_Clock,1996,https://www.imdb.com/title/tt0115309/,Pandora's Clock(Doomsday Virus),"[Pandora's Clock, by, John J. Nance]","[Richard Dean Anderson, Stephen Root, Jane Lee...",,"[November 10, 1996 -, November 11, 1996]",176 mins.,United States,...,,,Eric Laneuville,,Stephen Lovejoy,Don Davis,Michael O. Gallant,,David Israel,tt0115309


In [18]:
# Create a function to clean up [Box office] data
box_office = box_office.apply(lambda x: ' '.join(x) if type(x) == list else x)

In [12]:
wiki_movies_df.dtypes

url                      object
year                      int64
imdb_link                object
title                    object
Based on                 object
Starring                 object
Cinematography           object
Release date             object
Running time             object
Country                  object
Language                 object
Budget                   object
Box office               object
Director                 object
Distributor              object
Editor(s)                object
Composer(s)              object
Producer(s)              object
Production company(s)    object
Writer(s)                object
imdb_id                  object
dtype: object

In [13]:
# Check length of DataFrame
print(len(wiki_movies_df))
# Drop duplicate rows from the DataFrame that contain the same IMDB ID 
box_office = wiki_movies_df['Box office'].dropna()
# Check length of DataFrame
print(len(wiki_movies_df))

7033
7033
