In [1]:
def retrieve_credentials(path="API_KEY.txt"):
    """Read an API key from the first line of a ``NAME=value`` text file.

    Only the first line is consulted. Everything after the FIRST ``=`` is
    treated as the key, so keys that themselves contain ``=`` (for example
    base64 padding) are preserved. A line without any ``=`` is taken as the
    key verbatim.

    Args:
        path: File to read. Defaults to ``API_KEY.txt`` in the current
            working directory.

    Returns:
        dict: ``{"API_KEY": <key string>}``; the key is an empty string when
        the file or the value is empty.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(path) as f:
        first_line = f.readline().strip()

    # partition() splits on the first "=" only; `sep` is empty when the line
    # has no "=" at all, in which case the whole line is the key.
    _name, sep, value = first_line.partition("=")
    return {"API_KEY": value.strip() if sep else first_line}

CREDENTIALS = retrieve_credentials()
# Confirm the load without echoing the secret: cell outputs are stored in the
# notebook file, so printing the dict would leak the key to anyone it is shared with.
print(f"Loaded credentials: {sorted(CREDENTIALS)}")

In [2]:
import requests
import json
from random import randint

# OMDb endpoint with the API key pre-filled; request code appends the remaining
# "param=value" query text directly after the trailing "&".
# HTTPS is used so the key in the query string is not sent in cleartext.
BASE_URL = f"https://www.omdbapi.com/?apikey={CREDENTIALS['API_KEY']}&"
# File that fetched film records are appended to, one JSON object per line.
FILENAME = "data.json"

def fetch(query, timeout=10):
    """Request one record from the OMDb API and return it as a dict.

    Args:
        query: Query-string fragment appended to ``BASE_URL``
            (e.g. ``"i=tt0111161"``).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        dict | None: The decoded JSON payload, or ``None`` when the request
        fails, the HTTP status is >= 400, the body is not valid JSON, or the
        API reports ``"Response": "False"`` (no such title).
    """
    composite_url = f"{BASE_URL}{query}"
    try:
        with requests.get(composite_url, timeout=timeout) as r:
            if r.status_code >= 400:
                return None
            film_dict = json.loads(r.content)
    except (requests.RequestException, ValueError):
        # Network failure/timeout, or a body that is not JSON
        # (json.JSONDecodeError is a ValueError). One bad request should not
        # abort a whole batch; callers already treat None as "nothing fetched".
        return None

    # OMDb answers unknown IDs with HTTP 200 and an error payload rather than
    # an HTTP error status; treat that as "no film".
    if isinstance(film_dict, dict) and film_dict.get("Response") == "False":
        return None
    return film_dict

In [3]:
def prune(film):
    """Reduce a raw film record to the fields used for analysis.

    Steps:
      * keep only the whitelisted fields that are present in ``film``;
      * turn the comma-separated ``Genre`` and ``Director`` strings into lists
        (each handled independently, so a missing one does not affect the other);
      * flatten the ``Ratings`` list of ``{"Source", "Value"}`` dicts into
        top-level ``source -> value`` entries;
      * convert every score column to ``float``; anything missing or
        unparsable (e.g. ``"N/A"``) becomes ``None``.

    Args:
        film: Mapping holding the raw record, or a falsy value.

    Returns:
        dict | None: The pruned record (always containing all four score
        keys and never ``"Ratings"``), or ``None`` when ``film`` is falsy.
    """
    if not film:
        return None

    relevant_fields = ["Title", "Runtime", "Genre", "Director", "Ratings", "imdbRating", "BoxOffice"]
    scores = ["imdbRating", "Internet Movie Database", "Rotten Tomatoes", "Metacritic"]

    rel_dict = {key_: film[key_] for key_ in relevant_fields if key_ in film}

    # Split each list-like field on its own; non-string or absent values are
    # left untouched.
    for list_field in ("Genre", "Director"):
        raw = rel_dict.get(list_field)
        if isinstance(raw, str):
            rel_dict[list_field] = raw.split(", ")

    # All-or-nothing: a missing or malformed "Ratings" value contributes no
    # per-source entries at all.
    sources = {}
    try:
        sources = {d["Source"]: d["Value"] for d in rel_dict["Ratings"]}
    except (KeyError, TypeError):
        pass
    rel_dict.update(sources)

    for score in scores:
        # Rotten Tomatoes values look like "91%"; the others like "9.3/10".
        split_char = "%" if score == "Rotten Tomatoes" else "/"
        try:
            # `or None` maps a score of exactly 0 to None as well.
            value = float(rel_dict[score].split(split_char)[0]) or None
        except (KeyError, AttributeError, TypeError, ValueError):
            value = None
        rel_dict[score] = value

    rel_dict.pop("Ratings", None)
    return rel_dict

In [4]:
def write_to_file(FILENAME, rel_dict):
    """Append ``rel_dict`` to ``FILENAME`` as a single compact JSON line.

    The record is serialised without whitespace between tokens and terminated
    with a newline, so repeated calls build up a JSON-lines file.
    """
    record_line = json.dumps(rel_dict, separators=(",", ":")) + "\n"
    with open(FILENAME, "a") as handle:
        handle.write(record_line)

In [5]:
def batch_fetch(n=100, max_id=1877830):
    """Fetch ``n`` randomly chosen titles and append the usable ones to ``FILENAME``.

    Each attempt draws a random integer in ``[1, max_id]``, formats it as an
    IMDb-style ID (``tt`` followed by the number zero-padded to 7 digits) and
    requests it via ``fetch``. Attempts for which ``fetch`` returns nothing
    are skipped, so fewer than ``n`` records may be written. IDs are drawn
    with replacement, so duplicates are possible.

    Args:
        n: Number of fetch attempts.
        max_id: Largest numeric ID to draw.
    """
    for _ in range(n):
        imdb_id = f"tt{randint(1, max_id):07d}"
        film = fetch(f"i={imdb_id}")
        if not film:
            continue
        write_to_file(FILENAME, prune(film))

In [6]:
def save_curated(frame, path="curated.json"):
    """Write ``frame`` to ``path`` as JSON lines (one record per line).

    The frame is serialised BEFORE the file is opened: opening with ``"w"``
    truncates immediately, so serialising first means a failure in
    ``to_json`` leaves any existing file untouched.

    Args:
        frame: Object exposing ``to_json(orient=..., lines=...)``, such as a
            pandas DataFrame.
        path: Destination file; defaults to ``curated.json``. Overwritten if
            it exists.
    """
    out = frame.to_json(orient="records", lines=True)
    with open(path, "w") as f:
        f.write(out)

In [7]:
#for i in range(5):
#    batch_fetch()
#    print(f"{(i+1)*100} links fetched!")
#print("Batch Job Done!")

In [8]:
#import pandas as pd
#df = pd.read_json("data.json", lines=True)

#df.dropna(subset=["Title"], inplace=True)
#df.dropna(subset=['imdbRating', 'Internet Movie Database', 'Rotten Tomatoes', 'Metacritic'], how="all", inplace=True)
#df.reset_index(drop=True, inplace=True)

In [9]:
#print(f"STDDEV = {df['imdbRating'].std()}\nMEAN = {df['imdbRating'].mean()}")

In [10]:
#df.describe()

In [14]:
from bs4 import BeautifulSoup

# IMDb title-search URL restricted to the "top_250" group, sorted by user rating.
top250 = "https://www.imdb.com/search/title/?groups=top_250&sort=user_rating"

def fetch_(query, timeout=10):
    """Download the ``top250`` listing page and print its first list entry.

    Prints the first ``<div class="lister-item-content">`` element of the
    page, or ``"bad"`` when the page contains no such element. Nothing is
    printed when the HTTP status is >= 400.

    Args:
        query: Currently unused; the function always requests ``top250``.
            Kept so existing calls keep working.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. The result is only printed.
    """
    with requests.get(top250, timeout=timeout) as r:
        if r.status_code < 400:
            soup = BeautifulSoup(r.content, "lxml")
            # find() signals "no match" by returning None rather than raising,
            # so the miss has to be tested for explicitly.
            item_pane = soup.find("div", class_="lister-item-content")
            if item_pane is None:
                print("bad")
            else:
                print(item_pane)
            
# NOTE: fetch_ does not use its argument; it always requests the `top250` URL.
fetch_("i=tt0111161")

<div class="lister-item-content">
<h3 class="lister-item-header">
<span class="lister-item-index unbold text-primary">1.</span>
<a href="/title/tt0111161/">The Shawshank Redemption</a>
<span class="lister-item-year text-muted unbold">(1994)</span>
</h3>
<p class="text-muted">
<span class="certificate">15</span>
<span class="ghost">|</span>
<span class="runtime">142 min</span>
<span class="ghost">|</span>
<span class="genre">
Drama            </span>
</p>
<div class="ratings-bar">
<div class="inline-block ratings-imdb-rating" data-value="9.3" name="ir">
<span class="global-sprite rating-star imdb-rating"></span>
<strong>9.3</strong>
</div>
<div class="inline-block ratings-user-rating">
<span class="userRatingValue" data-tconst="tt0111161" id="urv_tt0111161">
<span class="global-sprite rating-star no-rating"></span>
<span class="rate" data-no-rating="Rate this" data-value="0" name="ur">Rate this</span>
</span>
<div class="starBarWidget" id="sb_tt0111161">
<div class="rating rating-list" 