In [None]:
# Import dependencies

# json library to extract the Wikipedia data
import json

# Pandas to create DataFrames
import pandas as pd

# NumPy library for converting data types
import numpy as np

import os

In [None]:
# Load the JSON file; wiki_movies_raw is expected to be a list of dictionaries.
# The data needs to be cleaned before it is loaded into a DataFrame.
# The encoding is stated explicitly: JSON text is UTF-8, while open() without
# an encoding falls back to the platform's locale default, which can garble or
# reject the non-ASCII text in this file on some systems.

with open('wikipedia-movies.json', mode='r', encoding='utf-8') as file:
    wiki_movies_raw = json.load(file)

In [None]:
# Use len() to see how many records wiki_movies_raw holds
len(wiki_movies_raw)

In [None]:
# Review a few individual records to make sure the data isn't garbled.
# wiki_movies_raw is expected to be a list, so index slices are used to select specific chunks of it.

# First 5 records
wiki_movies_raw[:5]

In [None]:
# Last 5 records (the negative slice start counts back from the end)
wiki_movies_raw[-5:]

In [None]:
# Some records from the middle (indices 3600 through 3604)
wiki_movies_raw[3600:3605]

In [None]:
# Read the two CSV data sets directly into DataFrames; per the original note they are
# already "flat" files with all rows filled in, so nothing is pre-processed here.
# low_memory=False is passed for the metadata file only; per the pandas docs it makes
# read_csv parse the file without internal chunking, which avoids mixed-type inference.
kaggle_metadata_df = pd.read_csv('movies_metadata.csv', low_memory=False)
ratings_df = pd.read_csv('ratings.csv')

In [None]:
# Inspect the first 10 rows of the Kaggle metadata
kaggle_metadata_df.head(10)

In [None]:
# Inspect the last 10 rows of the Kaggle metadata
kaggle_metadata_df.tail(10)

In [None]:
# Inspect the Kaggle metadata by sampling 15 rows at random.
# random_state pins the sample so a fresh "Restart & Run All" shows the same
# rows every time; change the seed to look at a different handful.
kaggle_metadata_df.sample(15, random_state=42)

In [None]:
# Inspect the first 10 rows of the ratings data
ratings_df.head(10)

In [None]:
# Inspect the last 10 rows of the ratings data
ratings_df.tail(10)

In [None]:
# Inspect the ratings data by sampling 15 rows at random.
# random_state pins the sample so a fresh "Restart & Run All" shows the same
# rows every time; change the seed to look at a different handful.
ratings_df.sample(15, random_state=42)

In [None]:
# Turn wiki_movies_raw into a DataFrame and preview its first rows.
# NOTE(review): the name wiki_movies_df is reassigned later in this notebook (to the
# DataFrame built from clean_movies), so cells that read it depend on execution order.
wiki_movies_df = pd.DataFrame(wiki_movies_raw)
wiki_movies_df.head()

In [None]:
# 193 columns is a lot of columns!
# Convert the column names to a plain Python list to make them easier to read
wiki_movies_df.columns.tolist()

In [None]:
# Filter the raw records down to the entries of interest. An entry is kept only if it
#   - names a director (under either the 'Director' or the 'Directed by' key),
#   - has an 'imdb_link' key, and
#   - has no 'No. of episodes' key (presumably this drops TV series - confirm).
wiki_movies = list(filter(
    lambda entry: (
        ('Director' in entry or 'Directed by' in entry)
        and 'imdb_link' in entry
        and 'No. of episodes' not in entry
    ),
    wiki_movies_raw,
))
len(wiki_movies)

In [None]:
# Build a DataFrame from the filtered wiki_movies list
wiki_movies_v2_df = pd.DataFrame(wiki_movies)

In [None]:
# Inspection shows we are down to 78 columns. Better than 193, but still not great.
wiki_movies_v2_df.head(3)

In [None]:
# List the column names of the filtered DataFrame
wiki_movies_v2_df.columns.tolist()

In [None]:
# Show the rows (movies) that have a non-null value in the 'Arabic' column
wiki_movies_v2_df[wiki_movies_v2_df['Arabic'].notnull()]

In [None]:
# For the movies with a non-null 'Arabic' value, select just the 'url' column
# so their Wikipedia pages can be visited.
wiki_movies_v2_df.loc[wiki_movies_v2_df['Arabic'].notnull(), 'url']

In [None]:
# Sort the column names and go through them one by one to find all that hold alternate titles.
# The filtered DataFrame (wiki_movies_v2_df) is inspected here, matching the neighbouring
# cells; wiki_movies_df is built from the unfiltered records and is reassigned later in the
# notebook, so its columns depend on which cells have already run.
sorted(wiki_movies_v2_df.columns.tolist())

In [None]:
# For the movies with a non-null 'Also known as' value, select just the 'url' column
# so their Wikipedia pages can be visited.
wiki_movies_v2_df.loc[wiki_movies_v2_df['Also known as'].notnull(), 'url']

In [None]:
# Here is the complete list of columns that hold alternate title data:
# Also known as, Arabic, Cantonese, Chinese, French, Hangul, Hebrew, Hepburn, Japanese, Literally,
# Mandarin, McCune–Reischauer, Original title, Polish, Revised Romanization, Romanized, Russian, Simplified, Traditional,
# Yiddish

In [None]:
# Create our function to clean our movie data.
# Because the movies are dicts and we want to make non-destructive edits, work on a copy of the incoming movie.
def clean_movie(movie):
    """Return a cleaned copy of ``movie`` with its alternate titles grouped together.

    Every alternate-title key present in the record (for example 'Also known as'
    or 'Arabic') is removed from the top level and collected, with its value,
    in a single dict stored under the 'alt_titles' key.

    Parameters
    ----------
    movie : dict
        One movie record. It is not modified.

    Returns
    -------
    dict
        A shallow copy of ``movie`` without the alternate-title keys. The
        'alt_titles' key is added only when at least one of them was present.
    """
    movie = dict(movie)  # shallow copy, so the caller's dict is left untouched
    alt_titles = {}  # collects the alternate titles found in this record

    # Loop through all the keys that hold alternate titles
    for key in ['Also known as', 'Arabic', 'Cantonese', 'Chinese', 'French',
                'Hangul', 'Hebrew', 'Hepburn', 'Japanese', 'Literally',
                'Mandarin', 'McCune–Reischauer', 'Original title', 'Polish',
                'Revised Romanization', 'Romanized', 'Russian',
                'Simplified', 'Traditional', 'Yiddish']:
        if key in movie:
            # pop() removes the key from the movie and hands back its value in one step
            alt_titles[key] = movie.pop(key)

    # Only after every key has been checked, attach the collected titles (if any)
    if alt_titles:
        movie['alt_titles'] = alt_titles

    return movie

In [None]:
# Run every filtered movie through clean_movie and collect the cleaned copies in a list
clean_movies = list(map(clean_movie, wiki_movies))

In [None]:
# Rebuild wiki_movies_df from the cleaned movies (this replaces the DataFrame that was
# created from wiki_movies_raw earlier) and list its column names in sorted order.
wiki_movies_df = pd.DataFrame(clean_movies)
sorted(wiki_movies_df.columns.tolist())

In [None]:
# Pick up at 8.3.5