# Data Management
Use this file to import and manipulate the data before saving them to the database.

In [3]:
# Dependencies
import pandas as pd
from pathlib import Path
import requests

# Local modules
from config import omdb_api_key

## Movies, actor and character database
The CSV files were downloaded from https://data.world/jamesgaskin/movies (author: James Gaskin). The files are opened here to prepare the database.

In [48]:
# Read the movies CSV file into a DataFrame
movies_csv = Path('../Datasets/movies.csv')
movies_df = pd.read_csv(filepath_or_buffer=movies_csv)

In [49]:
# Show the column labels of the movies DataFrame
# (as the cell's last expression, the Index is displayed as the cell output)
movies_df.columns

Index(['movieid', 'title', 'mpaa_rating', 'budget', 'gross', 'release_date',
       'genre', 'runtime', 'rating', 'rating_count', 'summary'],
      dtype='object')

In [50]:
# Collect every value of the 'title' column into a plain Python list
movie_list = movies_df.loc[:, 'title'].to_list()

# Report how many titles were collected
print(f"There are {len(movie_list)} movies in the list.")

There are 636 movies in the list.


In [4]:
# Get the characters data
# The file path gets its own name (as `movies_csv` does for the movies data)
# so that `characters_df` only ever refers to the DataFrame, never to a Path.
characters_csv = Path('../Datasets/character.csv')
characters_df = pd.read_csv(characters_csv)

In [5]:
# Display the characters DataFrame (the notebook shows a truncated view:
# the first and last rows, with the middle elided)
characters_df

Unnamed: 0,movieid,actorid,character_name,creditorder,pay,screentime
0,1,1,James,1,,
1,1,2,Mollie,2,,
2,1,3,Rosie,3,,
3,1,4,Albert,4,,
4,1,5,Grandpa,5,,
...,...,...,...,...,...,...
4314,483,2494,Explorer Captain,6,,
4315,483,2495,Russian Space Station Captain (voice),7,,
4316,483,2495,Russian Space Station Captain (voice),8,,
4317,483,2495,Russian Space Station Captain (voice),9,,


In [6]:
# Get the actors data
# The file path gets its own name (as `movies_csv` does for the movies data)
# so that `actor_df` only ever refers to the DataFrame, never to a Path.
actor_csv = Path('../Datasets/actor.csv')
actor_df = pd.read_csv(actor_csv)

In [7]:
# Display the actors DataFrame (the notebook shows a truncated view:
# the first and last rows, with the middle elided)
actor_df

Unnamed: 0,actorid,name,date_of_birth,birth_city,birth_country,height_inches,biography,gender,ethnicity,networth
0,2169,Aaron Eckhart,1968-03-12,Cupertino,USA,61.0,"Aaron Eckhart was born on March 12, 1968 in Cu...",Male,White,16000000.0
1,2204,Aasif Mandvi,1966-03-05,Bombay,India,68.0,"Aasif Mandvi was born on March 5, 1966 in Bomb...",Male,Indian,3000000.0
2,5,Abe Vigoda,1921-02-24,Brooklyn,USA,73.0,Abraham Charles Vigoda was an American actor k...,Male,White,10000000.0
3,1618,Abigail Breslin,1996-04-14,New York City,USA,61.0,Academy Award-nominated actress Abigail Bresli...,Female,White,8000000.0
4,1062,Abraham Benrubi,1969-10-04,Indianapolis,USA,79.0,"Abraham Benrubi was born on October 4, 1969 in...",Male,White,12000000.0
...,...,...,...,...,...,...,...,...,...,...
2594,1337,Ziyi Zhang,1979-02-09,Beijing,China,65.0,Ziyi Zhang is a Chinese actress and model. She...,,,
2595,2566,Zoe Kravitz,1988-12-01,Los Angeles,USA,62.0,"Zoe Isabella Kravitz, the daughter of singer/a...",Female,,
2596,2232,Zoe Saldana,1978-06-19,Passaic,USA,66.0,"Zoe Saldana was born on June 19, 1978 in Passa...",Female,,35000000.0
2597,2534,Zoey Vargas,,,,,"Zoey Vargas is an actress, known for Neighbors...",Female,,


## OMDB
NOTE: Before proceeding, please make sure that the repo contains `config.py` with a variable `omdb_api_key` that holds your OMDB API key.

This section of the code loops through the movies included in the James Gaskin dataset and retrieves all of their information from OMDB.

In [51]:
# Base URL and key
# NOTE: `url` and `api_key` keep their original values. `api_key` is no longer
# used by the loop below but stays defined in case other cells rely on it.
url = "http://www.omdbapi.com/?"
api_key = "&apikey=" + omdb_api_key

# Seconds to wait for OMDB before giving up on a request. Without a timeout a
# stalled connection would block the loop indefinitely; with it, `requests`
# raises a Timeout exception instead.
request_timeout = 30

# Initialise list and movie counter
omdb_movies = []
counter = 0

# Loop through the list of movie titles, counting from 1 for the progress message
for counter, movie in enumerate(movie_list, start=1):

    # Display progress counter
    print(f"Get info for Movie {counter} of {len(movie_list)}: {movie}")

    # Pass the title and key as query parameters so that `requests` URL-encodes
    # them. Concatenating the raw title into the URL breaks for titles that
    # contain reserved characters (e.g. the "&" in "Turner & Hooch" would end
    # the `t` parameter early and search for the wrong title).
    query_params = {"t": movie, "apikey": omdb_api_key}

    # Add the decoded JSON response for this movie to the list
    response = requests.get(url, params=query_params, timeout=request_timeout)
    omdb_movies.append(response.json())


Get info for Movie 1 of 636: Look Who's Talking
Get info for Movie 2 of 636: Driving Miss Daisy
Get info for Movie 3 of 636: Turner & Hooch
Get info for Movie 4 of 636: Born on the Fourth of July
Get info for Movie 5 of 636: Field of Dreams
Get info for Movie 6 of 636: Uncle Buck
Get info for Movie 7 of 636: When Harry Met Sally...
Get info for Movie 8 of 636: Dead Poets Society
Get info for Movie 9 of 636: Parenthood
Get info for Movie 10 of 636: Lethal Weapon 2
Get info for Movie 11 of 636: The War of the Roses
Get info for Movie 12 of 636: National Lampoon's Christmas Vacation
Get info for Movie 13 of 636: Honey, I Shrunk the Kids
Get info for Movie 14 of 636: Batman
Get info for Movie 15 of 636: Ghostbusters II
Get info for Movie 16 of 636: The Little Mermaid
Get info for Movie 17 of 636: Back to the Future Part II
Get info for Movie 18 of 636: Indiana Jones and the Last Crusade
Get info for Movie 19 of 636: Tango & Cash
Get info for Movie 20 of 636: Steel Magnolias
Get info for Mo

In [52]:
# Build a DataFrame from the list of OMDB responses (one row per response)
omdb_movies_df = pd.DataFrame(data=omdb_movies)

# Write the DataFrame to a CSV file, leaving out the row index
# NOTE(review): the input CSVs are read from '../Datasets/', whereas this
# writes to 'Datasets/' relative to the working directory - confirm that the
# different location is intended.
omdb_movies_df.to_csv(path_or_buf='Datasets/omdb.csv', index=False)

In [54]:
# List the columns in the movies DataFrame
# (presumably repeated here for comparison with the OMDB columns listed in
# the following cell - confirm)
movies_df.columns

Index(['movieid', 'title', 'mpaa_rating', 'budget', 'gross', 'release_date',
       'genre', 'runtime', 'rating', 'rating_count', 'summary'],
      dtype='object')

In [55]:
# List the columns in the OMDB results DataFrame
# (as the cell's last expression, the Index is displayed as the cell output)
omdb_movies_df.columns

Index(['Title', 'Year', 'Rated', 'Released', 'Runtime', 'Genre', 'Director',
       'Writer', 'Actors', 'Plot', 'Language', 'Country', 'Awards', 'Poster',
       'Ratings', 'Metascore', 'imdbRating', 'imdbVotes', 'imdbID', 'Type',
       'DVD', 'BoxOffice', 'Production', 'Website', 'Response', 'Error',
       'totalSeasons'],
      dtype='object')