In [3]:
import pandas as pd
from imdb import Cinemagoer
from rotten_tomatoes_scraper.rt_scraper import MovieScraper
from tqdm.notebook import tqdm #progress bar

In [2]:
# !pip install tqdm




In [7]:

basics = pd.read_csv('data/data.tsv', sep='\t')


  exec(code_obj, self.user_global_ns, self.user_ns)


In [8]:
def get_IMDB_movie_data(movie_ID=''):
    """
    Returns the following information of a title in a list form:
        IMDB rating
        genres
        MPAA rating
        Description
        maybe later: movie_title='',
    """
    ia = Cinemagoer()
    if movie_ID=='':
        raise Exception("No title or ID provided")
    
    movie_ID = movie_ID.replace('t', '')
    
    try:
        movie = ia.get_movie(movie_ID)
    except:
        return (pd.NA,pd.NA,pd.NA,pd.NA,pd.NA)

    try:
        rating = movie.data['rating']
        # result['rating'] = movie.data['rating']
    except:
        rating = pd.NA
        # result['rating'] = pd.NA

    try:
        # result['genres'] = 0
        genres = [genre.lower() for genre in movie.data['genres']]
    except:
        # result['genres'] = pd.NA
        genres = pd.NA

    try:
        # result['mpaa'] = [c.split(':')[1] for c in movie.data['certificates'] if (c.startswith('United States') or 'USA' in c)][0]
        MPAA = [c.split(':')[1] for c in movie.data['certificates'] if (c.startswith('United States') or 'USA' in c)][0]
    except:
        # result['mpaa'] = pd.NA
        MPAA = pd.NA

    try:
        # result['description'] = movie.data['plot outline']
        description = movie.data['plot']
    except:
        # result['description'] = pd.NA
        description = pd.NA

    try:
        # result['votes'] = movie.data['votes']
        votes = movie.data['votes']
    except:
        # result['votes'] = pd.NA
        votes = pd.NA

    # print(result)
    return (rating,genres,MPAA,description,votes)

In [9]:
def get_RT_ratings(movie_title):
    """
    Returns the Rotten Tomatoes critic score and audience score of a title
    """

    # Extract URL
    RT_search = MovieScraper()
    try:
        search_res = RT_search.search(movie_title)

        # Exact match
        url_list = [movie_dict['url'] for movie_dict in search_res['movies']
                    if movie_dict['name'].lower() == movie_title.lower()]
        if len(url_list) == 1:
            url = url_list[0]
        # No exact match -  return the latest one
        elif not url_list:
            url_list = sorted([(movie_dict['url'], movie_dict['year']) for movie_dict in search_res['movies']],
                            key=lambda x: x[1], reverse=True)
            try:
                url = url_list[0][0]
            except:
                return pd.NA, pd.NA
            # print(f'No exact match found. Going with {url}')
        # More than one exact match - return the latest one
        elif len(url_list) > 1:
            url_list = sorted([(movie_dict['url'], movie_dict['year']) for movie_dict in search_res['movies']
                            if movie_dict['name'].lower() == movie_title.lower()],
                            key=lambda x: x[1], reverse=True)
            url = url_list[0][0]
            # print(f'More than one exact match found. Going with {url}')

        movie_scraper = MovieScraper(movie_url='https://www.rottentomatoes.com' + url)
        movie_scraper.extract_metadata()
    except:
        return pd.NA, pd.NA
        
    try:
        rt_critics_score = int(movie_scraper.metadata['Score_Rotten'])
    except:
        rt_critics_score = pd.NA

    try:
        rt_audience_score = int(movie_scraper.metadata['Score_Audience'])
    except:
        rt_audience_score = pd.NA
        
    return rt_critics_score, rt_audience_score

In [None]:
def merge_data(table):
    # table = table.reset_index()
    extra = table['tconst'].apply(get_IMDB_movie_data) #gets extra info from imdb
    extra = pd.DataFrame(extra.to_list(), columns=['rating','genres','mpaa','description','votes'])

    rotten = table['primaryTitle'].apply(get_RT_ratings) # gets more ratings from rotten tomatoes
    rotten = pd.DataFrame(rotten.to_list(), columns=['rt_critics_score','rt_audience_score'])
    table_big = pd.concat([table.reset_index(), extra.reset_index(), rotten.reset_index()], axis=1)

    return table_big

In [10]:
def merge_data_imdb(table):
    # table = table.reset_index()
    extra = table['tconst'].apply(get_IMDB_movie_data) #gets extra info from imdb
    extra = pd.DataFrame(extra.to_list(), columns=['rating','genres','mpaa','description','votes'])

    # rotten = table['primaryTitle'].apply(get_RT_ratings) # gets more ratings from rotten tomatoes
    # rotten = pd.DataFrame(rotten.to_list(), columns=['rt_critics_score','rt_audience_score'])
    table_big = pd.concat([table.reset_index(), extra.reset_index()], axis=1)

    return table_big

In [11]:
basics_movies = basics.loc[basics['titleType'] == 'movie']
set_movies = basics_movies.loc[(basics_movies['genres'] != '\\N')]
set_movies['startYear'] = set_movies['startYear'].replace('\\N',0).astype('int32')
set_movies = set_movies.loc[(set_movies['startYear'] >= 1930)]

table = set_movies.copy()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  set_movies['startYear'] = set_movies['startYear'].replace('\\N',0).astype('int32')


In [24]:
result = pd.DataFrame()
i=0

pbar = tqdm(total=len(table))
while len(table) > 1000:

# while i < 2:
    set = table.head(1000)

    set_big = merge_data_imdb(set)

    result = pd.concat([result,set_big], ignore_index=True)

    table = table.iloc[1000: , :]
    i = i + 1
    result.to_csv('data\imdb_full.csv', index=False)
    pbar.update(1000)
pbar.close

  0%|          | 0/435918 [00:00<?, ?it/s]

<bound method tqdm_notebook.close of <tqdm.notebook.tqdm_notebook object at 0x0000022AAE37E9A0>>

In [26]:
table.shape

(433918, 9)

In [25]:
result

Unnamed: 0,index,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,index.1,rating,genres.1,mpaa,description,votes
0,3816,tt0003854,movie,Dodge City Trail,Dodge City Trail,0,1936,\N,56,"Adventure,Music,Western",0,3.7,"[adventure, music, western]",Passed,[Let's start with some house cleaning first. C...,28
1,11059,tt0011216,movie,Spanish Fiesta,La fête espagnole,0,2019,\N,67,Drama,1,6.9,[drama],,"[Coveted by two different men, a woman turns t...",30
2,11636,tt0011801,movie,Tötet nicht mehr,Tötet nicht mehr,0,2019,\N,\N,"Action,Crime",2,,"[action, crime]",,"[In Russia's revolution, a violinist 's son di...",
3,15487,tt0015724,movie,Dama de noche,Dama de noche,0,1993,\N,102,"Drama,Mystery,Romance",3,6.2,"[drama, mystery, romance, thriller]",,,25
4,15789,tt0016029,movie,The Little Colonel,The Little Colonel,0,1935,\N,81,"Comedy,Family,Musical",4,7.0,"[comedy, family, musical]",Approved,"[In the post-Civil war south, a darling little...",1678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,23124,tt0023520,movie,A Strange Adventure,A Strange Adventure,0,1932,\N,60,"Mystery,Romance,Thriller",995,5.1,"[mystery, romance, thriller]",Passed,[A police lieutenant and a female reporter inv...,184
1996,23125,tt0023521,movie,The Strange Case of Clara Deane,The Strange Case of Clara Deane,0,1932,\N,60,Drama,996,6.8,[drama],Passed,[A young dress designer marries an insurance a...,46
1997,23127,tt0023523,movie,Strange Interlude,Strange Interlude,0,1932,\N,109,Drama,997,5.6,[drama],Passed,[After Nina Leeds finds out that insanity runs...,741
1998,23128,tt0023524,movie,Strange Justice,Strange Justice,0,1932,\N,64,"Drama,Romance",998,5.7,"[drama, romance]",Passed,[Socialite banker Henry Judson maintains his e...,144


In [22]:
# set = table.head(1000)
del(set)
set

NameError: name 'set' is not defined

In [None]:
# set_movies_big = merge_data(set_movies)