In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd
import numpy as np
import time

In [2]:
# Site root; relative movie paths returned by the search are appended to it.
BASE_URL = "https://www.rottentomatoes.com"
# Root of the JSON API that lives under the same host.
API_URL = BASE_URL + "/api/private/v2.0"
# Endpoint queried by search_for_movie.
SEARCH_URL = API_URL + "/search"
        
def search_for_movie(title, year, timeout=10):
    """Search Rotten Tomatoes for a movie and pick the best matching result.

    Sends ``title`` as the ``q`` query parameter to ``SEARCH_URL`` and reads
    the ``"movies"`` list from the JSON response.

    Args:
        title: Text to search for.
        year: Release year used to disambiguate results; compared with each
            result's ``"year"`` field using ``==``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on a stalled connection.

    Returns:
        The first result whose ``"year"`` equals ``year``; otherwise the first
        result; ``None`` when the response contains no movies.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: For timeouts and connection failures.
    """
    r = requests.get(url=SEARCH_URL, params={"q": title}, timeout=timeout)
    r.raise_for_status()
    # A payload without a "movies" key is treated the same as an empty result.
    movies = r.json().get("movies") or []
    for movie in movies:
        # Results lacking a "year" field are skipped instead of raising KeyError.
        if "year" in movie and movie["year"] == year:
            return movie
    return movies[0] if movies else None

In [3]:
def _first_score_token(node):
    """Return the first whitespace-delimited token of ``node.text``, or NaN if blank."""
    tokens = node.text.split()
    return tokens[0] if tokens else np.nan


def get_scores_from_movie(movie, timeout=10):
    """Scrape the critic and audience scores from a movie's page.

    Args:
        movie: Search result with a ``"url"`` entry that is appended to
            ``BASE_URL``; any falsy value (e.g. ``None``) means "no movie".
        timeout: Seconds to wait for the page before ``urlopen`` gives up.

    Returns:
        A ``(score_rotten, score_audience)`` tuple. Each element is the first
        token of the matching ``div.mop-ratings-wrap__half`` text, or
        ``np.nan`` when the movie is falsy, the div is missing, or its text
        is blank.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    if not movie:
        return (np.nan, np.nan)
    # The with-block closes the HTTP response once parsing is done, so a long
    # scraping loop does not accumulate open sockets.
    with urlopen(BASE_URL + movie["url"], timeout=timeout) as page_movie:
        soup = BeautifulSoup(page_movie, "lxml")
    score = soup.find_all('div', class_='mop-ratings-wrap__half')
    # Fewer than two rating blocks yields NaN for the missing ones instead of
    # an IndexError.
    score_rotten = _first_score_token(score[0]) if len(score) > 0 else np.nan
    score_audience = _first_score_token(score[1]) if len(score) > 1 else np.nan
    return (score_rotten, score_audience)

In [4]:
# Smoke test: one real lookup, then the "no movie" path.
movie = search_for_movie("titanic", 1997)
for candidate in (movie, None):
    print(get_scores_from_movie(candidate))

HTTPError: 403 Client Error: Forbidden for url: https://www.rottentomatoes.com/api/private/v2.0/search?q=titanic

In [5]:
# Load the movie catalogue and preview its first rows.
df = pd.read_csv(filepath_or_buffer="movies.csv")
df.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# Preview the last rows of the catalogue.
df.tail(n=5)

Unnamed: 0,movieId,title,genres
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)
62422,209171,Women of Devil's Island (1962),Action|Adventure|Drama


In [7]:
def clean_title_data(title):
    """Split a catalogue title such as ``"Toy Story (1995)"`` into name and year.

    The year is read from the last four characters inside the final
    parenthesised group. The search title is the text before the first
    parenthesis, which drops any alternate title, e.g.
    ``"City of Lost Children, The (Cité des enfants perdus, La) (1995)"``
    becomes ``('City of Lost Children, The ', 1995)``.

    Trailing whitespace after the closing parenthesis is ignored. When the
    title itself starts with a parenthesis (``"(500) Days of Summer (2009)"``)
    the whole text before the year group is kept, rather than returning an
    empty search string. The space that precedes a parenthesis is retained so
    results for already-working titles are unchanged.

    Args:
        title: Raw title string ending in a ``(YYYY)`` group.

    Returns:
        A ``(search_title, year)`` tuple with ``year`` as an ``int``.

    Raises:
        ValueError: If the title has no parenthesis or the year is not numeric.
    """
    trimmed = title.rstrip()
    year = int(trimmed[-5:-1])
    # rindex raises ValueError when no "(" exists, like the original lookup.
    year_open = trimmed.rindex("(")
    name_part = trimmed[:year_open]
    alt_open = name_part.find("(")
    # Cut at the first parenthesis only if that leaves a non-blank title.
    if alt_open != -1 and name_part[:alt_open].strip():
        return (name_part[:alt_open], year)
    return (name_part, year)

In [8]:
# Example with an alternate title in parentheses ahead of the year.
sample_title = "City of Lost Children, The (Cité des enfants perdus, La) (1995)"
print(clean_title_data(sample_title))

('City of Lost Children, The ', 1995)


In [9]:
# Label range to process; `.loc` slicing below treats both bounds as labels.
start = 0
end = len(df.index)

# Parallel accumulators: one entry per row that was fully processed.
scored_rows, scores_rotten, scores_audience = [], [], []

for index, row in df.loc[start:end].iterrows():
    title = row["title"]

    # Step 1: turn the catalogue title into a search result (or None).
    try:
        search_title, year = clean_title_data(title)
        movie = search_for_movie(search_title, year)
    except requests.HTTPError as http_err:
        # An HTTP error from the search aborts the run; the current row is
        # not recorded, so its index is where a restart should begin.
        print(http_err)
        print("Exiting... Restart on index", index)
        break
    except Exception as lookup_err:
        # Any other failure (e.g. an unparsable title) just means "no movie".
        movie = None
        print(lookup_err)

    # Step 2: scrape the scores, falling back to NaN on any failure.
    try:
        score_rotten, score_audience = get_scores_from_movie(movie)
    except Exception as scrape_err:
        score_rotten = score_audience = np.nan
        print(scrape_err, index)

    # Progress line every 100th index label.
    if not index % 100:
        print(title, index, score_rotten, score_audience)

    scored_rows.append(row)
    scores_rotten.append(score_rotten)
    scores_audience.append(score_audience)

403 Client Error: Forbidden for url: https://www.rottentomatoes.com/api/private/v2.0/search?q=Toy+Story+
Exiting... Restart on index 0


In [10]:
# Take as many rows (by label, starting at `start`) as scores were collected,
# then attach the two score lists as new columns.
scored_count = len(scores_rotten)
scored_df = df.loc[start:start + scored_count - 1].copy()
for column_name, column_values in (
    ("Rotten Tomatoes Critic Score", scores_rotten),
    ("Rotten Tomatoes Audience Score", scores_audience),
):
    scored_df[column_name] = column_values
scored_df.head()

Unnamed: 0,movieId,title,genres,Rotten Tomatoes Critic Score,Rotten Tomatoes Audience Score


In [11]:
# Persist the scored rows next to the notebook.
scored_df.to_csv(path_or_buf="scored_movies.csv")