# 1 - Data gathering

I am going to use as the main source of info the dataset offered by IMDB in https://datasets.imdbws.com/  
Specifically, I choose the file **title.basics.tsv.gz**, which has this structure according to the documentation:

* tconst (string) - alphanumeric unique identifier of the title
* titleType (string) – the type/format of the title (e.g. movie, short, tvseries, tvepisode, video, etc)
* primaryTitle (string) – the more popular title / the title used by the filmmakers on promotional materials at the point of release
* originalTitle (string) - original title, in the original language
* isAdult (boolean) - 0: non-adult title; 1: adult title
* startYear (YYYY) – represents the release year of a title. In the case of TV Series, it is the series start year
* endYear (YYYY) – TV Series end year. ‘\N’ for all other title types
* runtimeMinutes – primary runtime of the title, in minutes
* genres (string array) – includes up to three genres associated with the title

In [130]:
import pandas as pd

basics = pd.read_csv('data/data.tsv', sep='\t')
basics

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
8738245,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2010,\N,\N,"Action,Drama,Family"
8738246,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,\N,\N,"Action,Drama,Family"
8738247,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,\N,\N,"Action,Drama,Family"
8738248,tt9916856,short,The Wind,The Wind,0,2015,\N,27,Short


In [131]:
basics.titleType.value_counts(dropna=False)

tvEpisode       6541970
short            858445
movie            603535
video            257832
tvSeries         221817
tvMovie          135194
tvMiniSeries      42352
tvSpecial         36106
videoGame         30513
tvShort           10484
tvPilot               2
Name: titleType, dtype: int64

The amount of items is too big, so I will keep only the movies, with more than 600k items.

In [132]:
basics_movies = basics.loc[basics['titleType'] == 'movie']
basics_movies

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
498,tt0000502,movie,Bohemios,Bohemios,0,1905,\N,100,\N
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama
610,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,\N,\N,Drama
625,tt0000630,movie,Hamlet,Amleto,0,1908,\N,\N,Drama
...,...,...,...,...,...,...,...,...,...
8738140,tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,0,2015,\N,57,Documentary
8738167,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0,2007,\N,100,Documentary
8738179,tt9916706,movie,Dankyavar Danka,Dankyavar Danka,0,2013,\N,\N,Comedy
8738190,tt9916730,movie,6 Gunn,6 Gunn,0,2017,\N,116,\N


But that is not enough, I need more information, mainly the description, and for that I will use the CinemaGoer library (https://cinemagoer.github.io):

In [133]:
# !pip install cinemagoer
from imdb import Cinemagoer

In [134]:
def get_IMDB_movie_data(movie_ID=''):
    """
    Returns the following information of a title in a list form:
        IMDB rating
        genres
        MPAA rating
        Description
        maybe later: movie_title='',
    """
    ia = Cinemagoer()
    if movie_ID=='':
        raise Exception("No title or ID provided")
    
    movie_ID = movie_ID.replace('t', '')
    
    try:
        movie = ia.get_movie(movie_ID)
    except:
        return (pd.NA,pd.NA,pd.NA,pd.NA,pd.NA)

    try:
        rating = movie.data['rating']
        # result['rating'] = movie.data['rating']
    except:
        rating = pd.NA
        # result['rating'] = pd.NA

    try:
        # result['genres'] = 0
        genres = [genre.lower() for genre in movie.data['genres']]
    except:
        # result['genres'] = pd.NA
        genres = pd.NA

    try:
        # result['mpaa'] = [c.split(':')[1] for c in movie.data['certificates'] if (c.startswith('United States') or 'USA' in c)][0]
        MPAA = [c.split(':')[1] for c in movie.data['certificates'] if (c.startswith('United States') or 'USA' in c)][0]
    except:
        # result['mpaa'] = pd.NA
        MPAA = pd.NA

    try:
        # result['description'] = movie.data['plot outline']
        description = movie.data['plot']
    except:
        # result['description'] = pd.NA
        description = pd.NA

    try:
        # result['votes'] = movie.data['votes']
        votes = movie.data['votes']
    except:
        # result['votes'] = pd.NA
        votes = pd.NA

    # print(result)
    return (rating,genres,MPAA,description,votes)

In [135]:
movies_sample = basics_movies.sample(500, random_state=42).reset_index()
# movies_sample

In [136]:
extra = movies_sample['tconst'].apply(get_IMDB_movie_data)
extra = pd.DataFrame(extra.to_list(), columns=['rating','genres','mpaa','description','votes'])
# df

2022-03-05 18:18:44,040 CRITICAL [imdbpy] C:\Users\carlo\anaconda3\lib\site-packages\imdb\_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt1381833/reference', 'proxy': '', 'exception type': 'IOError', 'original exception': <HTTPError 404: ''>},); kwds: {}
Traceback (most recent call last):
  File "C:\Users\carlo\anaconda3\lib\site-packages\imdb\parser\http\__init__.py", line 221, in retrieve_unicode
    response = uopener.open(url)
  File "C:\Users\carlo\anaconda3\lib\urllib\request.py", line 523, in open
    response = meth(req, response)
  File "C:\Users\carlo\anaconda3\lib\urllib\request.py", line 632, in http_response
    response = self.parent.error(
  File "C:\Users\carlo\anaconda3\lib\urllib\request.py", line 561, in error
    return self._call_chain(*args)
  File "C:\Users\carlo\anaconda3\lib\urllib\request.py", line 494, in _call_chain
    result = func(*args)
  File "C:\Users\carlo\anacond

In [137]:
movies_sample_big = pd.concat([movies_sample, extra], axis=1)
# movies_sample_big

In [138]:
# !pip install rotten-tomatoes-scraper
from rotten_tomatoes_scraper.rt_scraper import MovieScraper


In [139]:
def get_RT_ratings(movie_title):
    """
    Returns the Rotten Tomatoes critic score and audience score of a title
    """

    # Extract URL
    RT_search = MovieScraper()
    try:
        search_res = RT_search.search(movie_title)

        # Exact match
        url_list = [movie_dict['url'] for movie_dict in search_res['movies']
                    if movie_dict['name'].lower() == movie_title.lower()]
        if len(url_list) == 1:
            url = url_list[0]
        # No exact match -  return the latest one
        elif not url_list:
            url_list = sorted([(movie_dict['url'], movie_dict['year']) for movie_dict in search_res['movies']],
                            key=lambda x: x[1], reverse=True)
            try:
                url = url_list[0][0]
            except:
                return pd.NA, pd.NA
            # print(f'No exact match found. Going with {url}')
        # More than one exact match - return the latest one
        elif len(url_list) > 1:
            url_list = sorted([(movie_dict['url'], movie_dict['year']) for movie_dict in search_res['movies']
                            if movie_dict['name'].lower() == movie_title.lower()],
                            key=lambda x: x[1], reverse=True)
            url = url_list[0][0]
            # print(f'More than one exact match found. Going with {url}')

        movie_scraper = MovieScraper(movie_url='https://www.rottentomatoes.com' + url)
        movie_scraper.extract_metadata()
    except:
        return pd.NA, pd.NA
        
    try:
        rt_critics_score = int(movie_scraper.metadata['Score_Rotten'])
    except:
        rt_critics_score = pd.NA

    try:
        rt_audience_score = int(movie_scraper.metadata['Score_Audience'])
    except:
        rt_audience_score = pd.NA
        
    return rt_critics_score, rt_audience_score


In [140]:
get_RT_ratings('The Big City')

(<NA>, <NA>)

In [141]:
rotten = movies_sample_big['primaryTitle'].apply(get_RT_ratings)
rotten = pd.DataFrame(rotten.to_list(), columns=['rt_critics_score','rt_audience_score'])
rotten

Unnamed: 0,rt_critics_score,rt_audience_score
0,,
1,,
2,,
3,,
4,,
...,...,...
495,,
496,,
497,,
498,,


In [142]:
movies_sample_big.to_csv('data\movies_sample_big.csv', index=False)
# rotten.to_csv('data\rotten.csv', index=False)

With the information coming from the samples, it seems that there are too many movies without important fields, so I will clean a bit the original dataset:

In [143]:
set_movies = basics_movies.loc[(basics_movies['genres'] != '\\N')]
# basics_movies = basics.loc[basics['titleType'] == 'movie']
set_movies.shape

(532117, 9)

In [144]:
set_movies['startYear'] = set_movies['startYear'].replace('\\N',0).astype('int32')
set_movies = set_movies.loc[(set_movies['startYear'] >= 1930)]
set_movies.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  set_movies['startYear'] = set_movies['startYear'].replace('\\N',0).astype('int32')


(435918, 9)

Now I hope the scraping will have less errors

In [145]:
set_movies_sample = set_movies.sample(500, random_state=42)


In [146]:
def merge_data(table):
    table = table.reset_index()
    extra = table['tconst'].apply(get_IMDB_movie_data) #gets extra info from imdb
    extra = pd.DataFrame(extra.to_list(), columns=['rating','genres','mpaa','description','votes'])

    rotten = movies_sample_big['primaryTitle'].apply(get_RT_ratings) # gets more ratings from rotten tomatoes
    rotten = pd.DataFrame(rotten.to_list(), columns=['rt_critics_score','rt_audience_score'])
    table_big = pd.concat([table, extra, rotten], axis=1)

    return table_big

In [147]:
set_movies_sample_big = merge_data(set_movies_sample)
set_movies_sample_big.head()

Unnamed: 0,index,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,index.1,rating,genres.1,mpaa,description,votes,index.2,rt_critics_score,rt_audience_score
0,5252956,tt2258239,movie,8 First Dates,8 pervykh svidaniy,0,2012,\N,87,Comedy,0,6.1,[comedy],Not Rated,"[Vera is a TV star, Nikita is a veterinarian. ...",1051.0,0,,
1,282345,tt0294888,movie,Sakariba blues,Sakariba blues,0,1968,\N,108,"Action,Crime",1,,"[action, crime]",,,,1,,
2,5843848,tt3503004,movie,Polk Road: The Homecoming,Polk Road: The Homecoming,0,2012,\N,79,"Comedy,Mystery,Thriller",2,7.6,"[comedy, mystery, thriller]",Not Rated,"[After the events that occurred on Polk Road, ...",7.0,2,,
3,5194598,tt2191618,movie,"Every Everything: The Music, Life & Times of G...","Every Everything: The Music, Life & Times of G...",0,2013,\N,97,"Documentary,Music",3,7.1,"[documentary, music]",Not Rated,"[In marked contrast to ""Color Me Obsessed,"" di...",105.0,3,,
4,29878,tt0030409,movie,Man from Music Mountain,Man from Music Mountain,0,1938,\N,58,Western,4,5.9,[western],Passed,[When nasty land developers try to bilk honest...,153.0,4,,


In [148]:
set_movies_sample_big.shape

(500, 19)

In [149]:
set_movies_sample_big.to_csv('data\set_movies_sample_big.csv', index=False)

# testing grounds

In [159]:
movie_title = '8 First Dates'
from rotten_tomatoes_scraper.rt_scraper import MovieScraper

RT_search = MovieScraper()

search_res = RT_search.search(movie_title)

# Exact match
url_list = [movie_dict['url'] for movie_dict in search_res['movies']
            if movie_dict['name'].lower() == movie_title.lower()]
if len(url_list) == 1:
    url = url_list[0]
# No exact match -  return the latest one
elif not url_list:
    url_list = sorted([(movie_dict['url'], movie_dict['year']) for movie_dict in search_res['movies']],
                    key=lambda x: x[1], reverse=True)
    try:
        url = url_list[0][0]
    except:
        print('except')
        # return pd.NA, pd.NA
    # print(f'No exact match found. Going with {url}')
# More than one exact match - return the latest one
elif len(url_list) > 1:
    url_list = sorted([(movie_dict['url'], movie_dict['year']) for movie_dict in search_res['movies']
                    if movie_dict['name'].lower() == movie_title.lower()],
                    key=lambda x: x[1], reverse=True)
    url = url_list[0][0]
    # print(f'More than one exact match found. Going with {url}')

movie_scraper = MovieScraper(movie_url='https://www.rottentomatoes.com' + url)
movie_scraper.extract_metadata()

    
try:
    rt_critics_score = int(movie_scraper.metadata['Score_Rotten'])
except:
    rt_critics_score = int(movie_scraper.metadata['Score_Rotten'])

    # rt_critics_score = pd.NA

try:
    rt_audience_score = int(movie_scraper.metadata['Score_Audience'])
except:
    # rt_audience_score = pd.NA
    rt_audience_score = int(movie_scraper.metadata['Score_Audience'])

    
# return rt_critics_score, rt_audience_score


HTTPError: 403 Client Error: Forbidden for url: https://www.rottentomatoes.com/api/private/v2.0/search?q=8+First+Dates&limit=10

In [156]:
get_RT_ratings('Sakariba blues')

(<NA>, <NA>)

In [153]:
print(movie.keys())

['localized title', 'cast', 'genres', 'runtimes', 'countries', 'country codes', 'language codes', 'color info', 'aspect ratio', 'box office', 'certificates', 'original air date', 'rating', 'votes', 'cover url', 'imdbID', 'languages', 'title', 'year', 'kind', 'original title', 'director', 'writer', 'producer', 'cinematographer', 'editor', 'editorial department', 'sound crew', 'visual effects', 'stunt performer', 'camera and electrical department', 'music department', 'akas', 'production companies', 'distributors', 'other companies', 'plot', 'canonical title', 'long imdb title', 'long imdb canonical title', 'smart canonical title', 'smart long imdb canonical title', 'full-size cover url']


In [155]:
rotten

Unnamed: 0,rt_critics_score,rt_audience_score
0,,
1,,
2,,
3,,
4,,
...,...,...
495,,
496,,
497,,
498,,


In [154]:
movie.data['plot']

["Vera is a TV star, Nikita is a veterinarian. One morning, after a fun-filled evening, Vera wakes up in bed with Nikita. They decide it's a party consequence and leave in different directions. The next day, everything repeats."]