In [2]:
from bs4 import BeautifulSoup as BeautifulSoup
import requests
import json
import re
import os

from dotenv import load_dotenv

# Run python-dotenv's loader before reading any settings from the environment.
load_dotenv()

# TMDB credential, taken from the API_KEY environment variable (None when unset).
API_KEY = os.environ.get("API_KEY")
# Prefix that a TMDB poster path is appended to in order to form a full image URL.
base_image_url = "https://image.tmdb.org/t/p/original"

In [None]:
def make_tmbd_request(id, timeout=30):
    """Look up a title on TMDB's ``/find`` endpoint using its IMDb id.

    Args:
        id: IMDb title id as a string. The name shadows the ``id`` builtin but
            is kept so existing keyword callers continue to work.
        timeout: Seconds to wait for TMDB before ``requests`` gives up.
            Without a timeout a stalled connection would block the whole
            scrape indefinitely.

    Returns:
        The decoded JSON body of the response. Callers in this notebook read
        its ``"movie_results"`` list.

    Raises:
        RuntimeError: If ``API_KEY`` is not configured.
        requests.exceptions.RequestException: On network failure or timeout.
    """
    if not API_KEY:
        # Fail with a clear message instead of an obscure error further down.
        raise RuntimeError("API_KEY is not set; add it to the environment or .env file")

    url = "https://api.themoviedb.org/3/find/" + id
    # Let requests build and URL-encode the query string.
    query = {
        "api_key": API_KEY,
        "language": "en-US",
        "external_source": "imdb_id",
    }
    response = requests.get(url, params=query, timeout=timeout)

    return response.json()

In [None]:
def get_keywords(id, timeout=30) -> list:
    """Fetch the keyword names TMDB lists for a movie.

    Args:
        id: Movie id as a string, placed in the ``/movie/<id>/keywords`` path.
            This notebook passes the IMDb id scraped from the listing. The
            name shadows the ``id`` builtin but is kept for compatibility.
        timeout: Seconds to wait for TMDB before ``requests`` gives up, so a
            stalled connection cannot hang the scrape.

    Returns:
        A list with the ``"name"`` of every entry under the response's
        ``"keywords"`` key, or an empty list when that key is absent or null.

    Raises:
        RuntimeError: If ``API_KEY`` is not configured.
        requests.exceptions.RequestException: On network failure or timeout.
    """
    if not API_KEY:
        raise RuntimeError("API_KEY is not set; add it to the environment or .env file")

    url = "https://api.themoviedb.org/3/movie/" + id + "/keywords"
    response = requests.get(url, params={"api_key": API_KEY}, timeout=timeout)
    payload = response.json()

    # An error payload carries no "keywords" key; treat that as "no keywords".
    return [entry["name"] for entry in (payload.get("keywords") or [])]

In [None]:
def get_popularity(response) -> float:
    """Extract the popularity score of the first movie in a TMDB find response.

    Args:
        response: Decoded JSON dict as returned by ``make_tmbd_request``.

    Returns:
        The ``"popularity"`` value of the first ``"movie_results"`` entry, or
        the sentinel ``-1`` when there are no movie results, when the entry
        has no popularity field, or when the payload has no
        ``"movie_results"`` key at all (e.g. an API error body).
    """
    # .get() keeps an error payload from raising KeyError mid-scrape.
    results = response.get("movie_results") or []
    if not results:
        return -1
    return results[0].get("popularity", -1)

In [None]:
def get_poster(response) -> str:
    """Build the poster URL for the first movie in a TMDB find response.

    Args:
        response: Decoded JSON dict as returned by ``make_tmbd_request``.

    Returns:
        ``base_image_url`` joined with the first result's ``"poster_path"``,
        or the placeholder ``"N/A"`` when there are no movie results or the
        poster path is missing, null or empty.
    """
    # .get() keeps an error payload (no "movie_results") from raising KeyError.
    results = response.get("movie_results") or []
    if not results:
        return "N/A"

    poster_path = results[0].get("poster_path")
    if not poster_path:
        # A null or empty path would only yield the bare base URL.
        return "N/A"
    return base_image_url + poster_path

In [None]:
# Shared accumulator: get_feature_films() appends one dict per scraped film here.
feature_films = list()

In [None]:
def _parse_year(year_text):
    """Return the first four-digit year found in an IMDb year label."""
    # Labels may carry a roman-numeral disambiguator before the year, e.g.
    # "(I) (2019)" or "(X) (2010)"; searching for the digits handles any numeral.
    found = re.search(r"\d{4}", year_text)
    if found is None:
        raise ValueError(f"no four-digit year in {year_text!r}")
    return int(found.group())


def _count_directors(crew_info, ghost):
    """Estimate how many leading links in the credits paragraph are directors."""
    if ghost is None:
        # No separator between directors and stars: assume a single director.
        return 1

    steps = 0
    node = crew_info.next_element
    while node.next_element != ghost:
        steps += 1
        node = node.next_element
    # Heuristic kept from the original: three parse-tree nodes per director
    # (presumably the <a> tag, its text and the trailing text; verify against
    # the live markup if the counts look off).
    return steps // 3


def get_feature_films(start, end, timeout=30):
    """Scrape IMDb's top-rated US feature list and append films to ``feature_films``.

    For every result page in ``range(start, end)`` the IMDb advanced-search
    listing is downloaded and parsed. Each listed film is enriched with TMDB
    data (popularity, poster, keywords) and appended as a dict to the
    module-level ``feature_films`` list.

    Args:
        start: First page index to fetch (page 0 is the first result page).
        end: Page index to stop before (exclusive).
        timeout: Seconds to wait for each IMDb page request before
            ``requests`` gives up. It is not forwarded to the TMDB helpers.

    Returns:
        None. Results accumulate in ``feature_films``, so calling this twice
        for the same pages stores those films twice.

    Raises:
        ValueError: If a film's year label contains no four-digit year.
        requests.exceptions.RequestException: On network failure or timeout.
    """
    for page in range(start, end):
        print(f"Page: {page}")
        ## Assemble URL
        if page == 0:
            url = "https://www.imdb.com/search/title/?title_type=feature&num_votes=10000,&countries=us&sort=user_rating,desc&view=advanced"
        else:
            # 50 titles per page; kept in its own local so the ``start``
            # parameter is not overwritten.
            offset = str(1 + page * 50)
            url = "https://www.imdb.com/search/title/?title_type=feature&num_votes=10000,&countries=us&view=advanced&sort=user_rating,desc&start=" + offset

        ## Make the request
        req = requests.get(url, headers={'Accept-Language': 'en-US, en;q=0.5'}, timeout=timeout)
        # Name the parser explicitly so results do not depend on which optional
        # parsers happen to be installed.
        soup = BeautifulSoup(req.text, "html.parser")

        ### Get the divs containing the movies
        list_of_movies = soup.find_all("div", attrs={"class": "lister-item mode-advanced"})

        for movie in list_of_movies:
            ### Helpful elements
            header = movie.find("h3", attrs={"class": "lister-item-header"})
            paragraphs = movie.find_all("p", attrs={"class": "text-muted"})
            ratings_bar = movie.find("div", attrs={"class": "ratings-bar"})
            crew_info = movie.find("p", attrs={"class": ""})
            vote_spans = movie.find("p", attrs={"class": "sort-num_votes-visible"}).find_all("span", attrs={"name": "nv"})

            ## Primary title
            title = header.find("a").text

            ## IMDb title id, used for both TMDB lookups below
            imdb_id = movie.find("img", attrs={"class": "loadlate"})["data-tconst"]

            ## Ask TMDB for the data that is not available by scraping
            response = make_tmbd_request(imdb_id)

            ## Release year
            year = _parse_year(header.find("span", attrs={"class": "lister-item-year"}).text)

            ## Age rating ('N/A' when the listing shows none)
            details = paragraphs[0]
            certificate_tag = details.find("span", attrs={"class": "certificate"})
            certificate = certificate_tag.text if certificate_tag is not None else 'N/A'

            ## Runtime: first token of the runtime label, as an integer
            runtime = int(details.find("span", attrs={"class": "runtime"}).text.split()[0])

            ## Genres, as a list with all spaces removed
            genre_text = details.find("span", attrs={"class": "genre"}).text.strip()
            genres = [genre.replace(' ', '') for genre in genre_text.split(',')]

            ## User rating
            rating = float(ratings_bar.find("strong").text)

            ## Description, stripped of surrounding whitespace
            description = paragraphs[1].text.strip()

            ## Keywords from TMDB
            keywords = get_keywords(imdb_id)

            ## The "ghost" span separates directors from stars
            ghost = crew_info.find("span", attrs={"class": "ghost"})
            crew = crew_info.find_all("a")
            num_directors = _count_directors(crew_info, ghost)

            # Slices (unlike indexing) tolerate a film with no crew links.
            directors = [person.text for person in crew[:num_directors]]
            stars = [person.text for person in crew[num_directors:]]

            ## Vote count (only the first "nv" span is read)
            votes = int(vote_spans[0]["data-value"])

            ## Key order is kept stable so the dumped JSON layout is unchanged
            feature_films.append({
                "title": title,
                "id": imdb_id,
                "year": year,
                "certificate": certificate,
                "runtime": runtime,
                "genres": genres,
                "rating": rating,
                "description": description,
                "keywords": keywords,
                "directors": directors,
                "stars": stars,
                "popularity": get_popularity(response),
                "votes": votes,
                "poster": get_poster(response),
            })


In [None]:
# First batch: result pages 0-49 (the end index is exclusive).
get_feature_films(start=0, end=50)

In [None]:
# Second batch: result pages 50-99 (the end index is exclusive).
get_feature_films(start=50, end=100)

In [None]:
# Final batch: result pages 100-146 (the end index is exclusive).
get_feature_films(start=100, end=147)

In [None]:
# Swap the run-together "duringcreditsstinger" keyword for a readable label.
for movie in feature_films:
    try:
        # Drops the first occurrence; raises ValueError when the keyword is absent.
        movie["keywords"].remove("duringcreditsstinger")
    except ValueError:
        # This film does not carry the keyword; leave its list untouched.
        continue
    movie["keywords"].append("post credit scene")

In [12]:
# Persist everything scraped so far as pretty-printed JSON.
output_path = "output-new/movies-no-reviews.json"
# Create the target folder first so a fresh checkout does not fail with
# FileNotFoundError after the scrape has finished.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# Explicit encoding keeps the file identical across platforms.
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(feature_films, f, indent=4)