In [1]:
# Load the OMDb API key from a local file so the secret is not hard-coded in the notebook.
# Only the first line is read; whatever follows the last "=" on that line is used as the key
# (a line without "=" is used whole).
with open("API_KEY.txt") as f:
    CREDENTIALS = {"API_KEY" : f.readline().strip().split("=")[-1]}

# An empty key would make every later request fail without an obvious cause, so stop here.
if not CREDENTIALS["API_KEY"]:
    raise ValueError("API_KEY.txt does not contain an API key on its first line")

# Confirm which credentials were loaded without echoing the secret into the saved cell output.
print({name: "<redacted>" for name in CREDENTIALS})

{'API_KEY': '<redacted>'}


In [2]:
import requests
import json

# Shared prefix for every OMDb request; callers append their own query parameters after the
# trailing "&". HTTPS is used so the API key in the query string is not sent in plaintext.
BASE_URL = f"https://www.omdbapi.com/?apikey={CREDENTIALS['API_KEY']}&"

def fetch(query, timeout=10):
    """Request one record from OMDb and return the decoded JSON body.

    Parameters
    ----------
    query : str
        Query-string fragment appended verbatim to ``BASE_URL``.
    timeout : float, optional
        Seconds to wait for the server. Without a timeout a stalled connection
        would block the caller indefinitely.

    Returns
    -------
    dict or None
        The decoded JSON payload, or ``None`` when the request timed out, the
        HTTP status was 400 or above, or OMDb flagged the lookup as failed.
    """
    COMPOSITE_URL = f"{BASE_URL}{query}"
    try:
        with requests.get(COMPOSITE_URL, timeout=timeout) as r:
            if r.status_code >= 400:
                return None
            film_dict = json.loads(r.content)
    except requests.exceptions.Timeout:
        # A slow response is treated like any other unusable answer so one
        # stalled request does not abort a long batch.
        return None

    # OMDb answers unknown IDs with HTTP 200 and a body whose "Response" field
    # is the string "False"; such bodies carry no film data.
    if isinstance(film_dict, dict) and film_dict.get("Response") == "False":
        return None
    return film_dict

In [3]:
def _parse_score(raw, split_char):
    """Return the number before ``split_char`` in ``raw`` as a float, or None if it cannot be parsed."""
    try:
        return float(raw.split(split_char)[0])
    except (AttributeError, TypeError, ValueError):
        # Missing value (None), non-string value, or non-numeric text such as "N/A".
        return None


def prune(film):
    """Reduce a raw film record to the fields of interest with numeric scores.

    Parameters
    ----------
    film : dict or None
        Record containing some or all of the keys listed in ``relevant_fields``.

    Returns
    -------
    dict or None
        ``None`` when ``film`` is empty or ``None``. Otherwise a new dict with:
        the relevant fields that were present; "Genre" and "Director" split on
        ", " into lists; one key per entry of "Ratings" (source name -> value);
        and every key in ``scores`` set to a float or ``None``. The "Ratings"
        list itself is removed. ``film`` is not modified.
    """
    if not film:
        return None

    relevant_fields = ["Title", "Runtime", "Genre", "Director", "Ratings", "imdbRating", "BoxOffice"]
    scores = ["imdbRating", "Internet Movie Database", "Rotten Tomatoes", "Metacritic"]

    rel_dict = {key_: film[key_] for key_ in relevant_fields if key_ in film}

    # Each field is handled on its own so that a missing Genre does not stop
    # Director from being split.
    for list_field in ("Genre", "Director"):
        raw = rel_dict.get(list_field)
        if isinstance(raw, str):
            rel_dict[list_field] = raw.split(", ")

    # Flatten the list of {"Source": ..., "Value": ...} entries into top-level keys.
    # A missing or malformed list contributes nothing.
    try:
        sources = {d["Source"]: d["Value"] for d in rel_dict.get("Ratings")}
    except (TypeError, KeyError):
        sources = {}
    rel_dict.update(sources)

    for score in scores:
        # Rotten Tomatoes values look like "85%"; the others look like "7.5/10" or "7.5".
        split_char = "%" if score == "Rotten Tomatoes" else "/"
        # A parsed 0 is kept as 0.0: it is a real score, not a missing value.
        rel_dict[score] = _parse_score(rel_dict.get(score), split_char)

    rel_dict.pop("Ratings", None)
    return rel_dict

In [4]:
# Default destination for fetched records: one JSON document per line.
FILENAME = "data.json"

def write_to_file(FILENAME, rel_dict):
    """Append ``rel_dict`` to the file at ``FILENAME`` as a single compact JSON line."""
    # Compact separators drop the spaces json.dumps would otherwise insert.
    record_line = json.dumps(rel_dict, separators=(",", ":")) + "\n"
    with open(FILENAME, "a") as handle:
        handle.write(record_line)

In [5]:
from random import randint

def batch_fetch(count=100, max_id=1877830):
    """Fetch randomly chosen title IDs and append every usable record to ``FILENAME``.

    Parameters
    ----------
    count : int, optional
        Number of requests to make. Defaults to 100, the previous fixed batch size.
    max_id : int, optional
        Largest numeric ID to draw. IDs are drawn uniformly from 1 to ``max_id``
        inclusive and zero-padded to at least seven digits after the "tt" prefix.

    Notes
    -----
    The draw is not seeded, so each call requests a different set of IDs.
    Requests for which ``fetch`` returns nothing are skipped.
    """
    for _ in range(count):
        film = fetch(f"i=tt{randint(1, max_id):07d}")
        if film:
            write_to_file(FILENAME, prune(film))

In [6]:
# Run five batches back to back. The progress line assumes each batch makes 100 requests.
for batch_no in range(1, 6):
    batch_fetch()
    print(f"{batch_no * 100} links fetched!")
print("Batch Job Done!")

100 links fetched!
200 links fetched!
300 links fetched!
400 links fetched!
500 links fetched!
Batch Job Done!


In [10]:
import pandas as pd
# Load the line-delimited records, then keep only rows that have a title and at least one
# rating, renumbering the index from zero afterwards.
df = (
    pd.read_json("data.json", lines=True)
    .dropna(subset=["Title"])
    .dropna(subset=['imdbRating', 'Internet Movie Database', 'Rotten Tomatoes', 'Metacritic'], how="all")
    .reset_index(drop=True)
)

In [11]:
# Standard deviation and mean of the imdbRating column in df.
imdb_scores = df["imdbRating"]
print(f"STDDEV = {imdb_scores.std()}\nMEAN = {imdb_scores.mean()}")

STDDEV = 1.4066257661182244
MEAN = 6.644797687861272


In [14]:
def save_curated(frame, path="curated.json"):
    """Write ``frame`` to ``path`` as line-delimited JSON, one record per line.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to serialize with ``to_json(orient="records", lines=True)``.
    path : str, optional
        Destination file; any existing content is replaced. Defaults to
        "curated.json", the previous fixed location.
    """
    # Serialize before opening: opening in "w" mode truncates the file, so a
    # failure in to_json must happen first to leave an existing file intact.
    out = frame.to_json(orient="records", lines=True)
    with open(path, "w", encoding="utf-8") as f:
        f.write(out)

In [15]:
# Display summary statistics (count, mean, std, min, quartiles, max) for df's numeric columns.
# Kept as the cell's final bare expression so the notebook renders the resulting table.
df.describe()

Unnamed: 0,imdbRating,Internet Movie Database,Rotten Tomatoes,Metacritic
count,346.0,346.0,15.0,8.0
mean,6.644798,6.644798,55.133333,59.0
std,1.406626,1.406626,29.369242,18.165902
min,2.0,2.0,17.0,42.0
25%,5.9,5.9,26.0,45.0
50%,6.8,6.8,57.0,52.5
75%,7.6,7.6,79.0,71.25
max,10.0,10.0,95.0,86.0
