In [5]:
import pandas as pd
from imdb import Cinemagoer
from rotten_tomatoes_scraper.rt_scraper import MovieScraper



basics = pd.read_csv('data/data.tsv', sep='\t')


In [6]:
def get_IMDB_movie_data(movie_ID=''):
    """
    Returns the following information of a title in a list form:
        IMDB rating
        genres
        MPAA rating
        Description
        maybe later: movie_title='',
    """
    ia = Cinemagoer()
    if movie_ID=='':
        raise Exception("No title or ID provided")
    
    movie_ID = movie_ID.replace('t', '')
    
    try:
        movie = ia.get_movie(movie_ID)
    except:
        return (pd.NA,pd.NA,pd.NA,pd.NA,pd.NA)

    try:
        rating = movie.data['rating']
        # result['rating'] = movie.data['rating']
    except:
        rating = pd.NA
        # result['rating'] = pd.NA

    try:
        # result['genres'] = 0
        genres = [genre.lower() for genre in movie.data['genres']]
    except:
        # result['genres'] = pd.NA
        genres = pd.NA

    try:
        # result['mpaa'] = [c.split(':')[1] for c in movie.data['certificates'] if (c.startswith('United States') or 'USA' in c)][0]
        MPAA = [c.split(':')[1] for c in movie.data['certificates'] if (c.startswith('United States') or 'USA' in c)][0]
    except:
        # result['mpaa'] = pd.NA
        MPAA = pd.NA

    try:
        # result['description'] = movie.data['plot outline']
        description = movie.data['plot']
    except:
        # result['description'] = pd.NA
        description = pd.NA

    try:
        # result['votes'] = movie.data['votes']
        votes = movie.data['votes']
    except:
        # result['votes'] = pd.NA
        votes = pd.NA

    # print(result)
    return (rating,genres,MPAA,description,votes)

In [7]:
def get_RT_ratings(movie_title):
    """
    Returns the Rotten Tomatoes critic score and audience score of a title
    """

    # Extract URL
    RT_search = MovieScraper()
    try:
        search_res = RT_search.search(movie_title)

        # Exact match
        url_list = [movie_dict['url'] for movie_dict in search_res['movies']
                    if movie_dict['name'].lower() == movie_title.lower()]
        if len(url_list) == 1:
            url = url_list[0]
        # No exact match -  return the latest one
        elif not url_list:
            url_list = sorted([(movie_dict['url'], movie_dict['year']) for movie_dict in search_res['movies']],
                            key=lambda x: x[1], reverse=True)
            try:
                url = url_list[0][0]
            except:
                return pd.NA, pd.NA
            # print(f'No exact match found. Going with {url}')
        # More than one exact match - return the latest one
        elif len(url_list) > 1:
            url_list = sorted([(movie_dict['url'], movie_dict['year']) for movie_dict in search_res['movies']
                            if movie_dict['name'].lower() == movie_title.lower()],
                            key=lambda x: x[1], reverse=True)
            url = url_list[0][0]
            # print(f'More than one exact match found. Going with {url}')

        movie_scraper = MovieScraper(movie_url='https://www.rottentomatoes.com' + url)
        movie_scraper.extract_metadata()
    except:
        return pd.NA, pd.NA
        
    try:
        rt_critics_score = int(movie_scraper.metadata['Score_Rotten'])
    except:
        rt_critics_score = pd.NA

    try:
        rt_audience_score = int(movie_scraper.metadata['Score_Audience'])
    except:
        rt_audience_score = pd.NA
        
    return rt_critics_score, rt_audience_score

In [8]:
def merge_data(table):
    # table = table.reset_index()
    extra = table['tconst'].apply(get_IMDB_movie_data) #gets extra info from imdb
    extra = pd.DataFrame(extra.to_list(), columns=['rating','genres','mpaa','description','votes'])

    rotten = table['primaryTitle'].apply(get_RT_ratings) # gets more ratings from rotten tomatoes
    rotten = pd.DataFrame(rotten.to_list(), columns=['rt_critics_score','rt_audience_score'])
    table_big = pd.concat([table.reset_index(), extra.reset_index(), rotten.reset_index()], axis=1)

    return table_big

In [9]:
basics_movies = basics.loc[basics['titleType'] == 'movie']
set_movies = basics_movies.loc[(basics_movies['genres'] != '\\N')]
set_movies['startYear'] = set_movies['startYear'].replace('\\N',0).astype('int32')
set_movies = set_movies.loc[(set_movies['startYear'] >= 1930)]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  set_movies['startYear'] = set_movies['startYear'].replace('\\N',0).astype('int32')


In [10]:
set_movies_big = merge_data(set_movies)

2022-03-05 17:56:16,467 CRITICAL [imdbpy] C:\Users\carlo\anaconda3\lib\site-packages\imdb\_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt0080419/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': <HTTPError 503: 'Service Temporarily Unavailable'>},); kwds: {}
Traceback (most recent call last):
  File "C:\Users\carlo\anaconda3\lib\site-packages\imdb\parser\http\__init__.py", line 221, in retrieve_unicode
    response = uopener.open(url)
  File "C:\Users\carlo\anaconda3\lib\urllib\request.py", line 523, in open
    response = meth(req, response)
  File "C:\Users\carlo\anaconda3\lib\urllib\request.py", line 632, in http_response
    response = self.parent.error(
  File "C:\Users\carlo\anaconda3\lib\urllib\request.py", line 561, in error
    return self._call_chain(*args)
  File "C:\Users\carlo\anaconda3\lib\urllib\request.py", line 494, in _call_chain
    result = func(*arg