# 1 - Data gathering

As the main source of information, I am going to use the datasets offered by IMDb at https://datasets.imdbws.com/  
Specifically, I chose the file **title.basics.tsv.gz**, which has the following structure according to the documentation:

* tconst (string) - alphanumeric unique identifier of the title
* titleType (string) – the type/format of the title (e.g. movie, short, tvseries, tvepisode, video, etc)
* primaryTitle (string) – the more popular title / the title used by the filmmakers on promotional materials at the point of release
* originalTitle (string) - original title, in the original language
* isAdult (boolean) - 0: non-adult title; 1: adult title
* startYear (YYYY) – represents the release year of a title. In the case of TV Series, it is the series start year
* endYear (YYYY) – TV Series end year. ‘\N’ for all other title types
* runtimeMinutes – primary runtime of the title, in minutes
* genres (string array) – includes up to three genres associated with the title

In [None]:
import pandas as pd

# Load the tab-separated IMDb title dump into a DataFrame and display it.
basics = pd.read_csv(filepath_or_buffer='data/data.tsv', delimiter='\t')
basics

In [None]:
# Count how many titles exist per title type, keeping missing values visible.
basics['titleType'].value_counts(dropna=False)

The number of items is too large, so I will keep only the movies, which still amount to more than 600k items.

In [None]:
# Keep only the rows whose title type is exactly 'movie'.
basics_movies = basics[basics['titleType'].eq('movie')]
basics_movies

But that is not enough: I need more information, mainly the description, and for that I will use the Cinemagoer library (https://cinemagoer.github.io):

In [None]:
# !pip install cinemagoer
from imdb import Cinemagoer

In [None]:

_IMDB_CLIENT = None


def _get_imdb_client():
    """Return a shared Cinemagoer client, creating it on first use."""
    global _IMDB_CLIENT
    if _IMDB_CLIENT is None:
        # Reuse a notebook-level `ia` client when one has been created;
        # otherwise build one so the lookup never depends on hidden state.
        _IMDB_CLIENT = globals().get('ia') or Cinemagoer()
    return _IMDB_CLIENT


def get_IMDB_movie_data(movie_ID='', client=None):
    """
    Fetch extra information about one IMDb title.

    Args:
        movie_ID: IMDb identifier of the title, with or without the leading
            't' characters (e.g. 'tt0111161').
        client: optional object exposing ``get_movie(movie_id)``; when omitted
            a shared Cinemagoer client is used.

    Returns:
        A 5-tuple ``(rating, genres, MPAA, description, votes)`` where
        ``genres`` is a list of lower-cased genre names, ``MPAA`` is the
        second ':'-separated field of the first certificate mentioning the
        United States / USA, and ``description`` is the 'plot outline' entry.
        Any field that is not available is ``pd.NA``; if the title cannot be
        fetched at all, every field is ``pd.NA``.

    Raises:
        ValueError: if ``movie_ID`` is an empty string.
    """
    if movie_ID == '':
        raise ValueError("No title or ID provided")

    # Only drop the leading 't' prefix instead of every 't' in the string.
    movie_ID = movie_ID.lstrip('t')

    # Resolve the client outside the try block: a missing or misconfigured
    # client must fail loudly rather than turn every row into NA values.
    if client is None:
        client = _get_imdb_client()

    try:
        movie = client.get_movie(movie_ID)
    except Exception:
        # Best-effort scraping: a failed lookup yields an all-NA row, while
        # KeyboardInterrupt / SystemExit still propagate.
        return (pd.NA, pd.NA, pd.NA, pd.NA, pd.NA)

    data = getattr(movie, 'data', None) or {}

    rating = data.get('rating', pd.NA)

    try:
        genres = [genre.lower() for genre in data['genres']]
    except (KeyError, TypeError, AttributeError):
        genres = pd.NA

    MPAA = pd.NA
    for certificate in data.get('certificates') or ():
        if certificate.startswith('United States') or 'USA' in certificate:
            fields = certificate.split(':')
            if len(fields) > 1:
                MPAA = fields[1]
                break

    description = data.get('plot outline', pd.NA)
    votes = data.get('votes', pd.NA)

    return (rating, genres, MPAA, description, votes)

In [None]:
# Draw a reproducible 200-movie sample, then move the original row labels
# into an 'index' column so the sample is numbered from 0.
movies_sample = basics_movies.sample(n=200, random_state=42)
movies_sample = movies_sample.reset_index()

In [None]:
# Look up every sampled title on IMDb and spread the returned tuples
# into one column per field.
extra = pd.DataFrame(
    movies_sample['tconst'].apply(get_IMDB_movie_data).to_list(),
    columns=['rating', 'genres', 'mpaa', 'description', 'votes'],
)

In [None]:
# Put the scraped IMDb fields next to the sampled rows (column-wise join).
movies_sample_big = pd.concat(objs=[movies_sample, extra], axis='columns')

In [None]:
# !pip install rotten-tomatoes-scraper
from rotten_tomatoes_scraper.rt_scraper import MovieScraper


In [None]:
def _pick_rt_url(movies, movie_title):
    """Pick the search-result URL for a title, or None if there are no results."""
    wanted = movie_title.lower()
    exact = [movie for movie in movies if movie['name'].lower() == wanted]
    if len(exact) == 1:
        return exact[0]['url']
    # Several exact matches, or none at all: fall back to the most recent
    # entry among the exact matches (or among all results when none match).
    candidates = exact or movies
    if not candidates:
        return None
    return max(candidates, key=lambda movie: movie['year'])['url']


def _to_score(metadata, key):
    """Convert one scraped metadata entry to int, or pd.NA if unavailable."""
    try:
        return int(metadata[key])
    except (KeyError, TypeError, ValueError):
        return pd.NA


def get_RT_ratings(movie_title):
    """
    Returns the Rotten Tomatoes critic score and audience score of a title.

    The title is searched on Rotten Tomatoes; a single case-insensitive exact
    name match is preferred, otherwise the entry with the highest 'year' is
    used (among the exact matches if there are several, else among all
    results).

    Args:
        movie_title: title to search for.

    Returns:
        A tuple ``(rt_critics_score, rt_audience_score)`` of ints, read from
        the 'Score_Rotten' and 'Score_Audience' metadata entries. A score
        that is missing or not numeric is ``pd.NA``; if the search or the
        page scrape fails, both values are ``pd.NA``.
    """
    RT_search = MovieScraper()
    try:
        search_res = RT_search.search(movie_title)
        url = _pick_rt_url(search_res['movies'], movie_title)
        if url is None:
            return pd.NA, pd.NA

        movie_scraper = MovieScraper(movie_url='https://www.rottentomatoes.com' + url)
        movie_scraper.extract_metadata()
    except Exception:
        # Best-effort scraping: any lookup/parsing failure yields NA scores,
        # but KeyboardInterrupt / SystemExit are no longer swallowed, so a
        # long-running scrape can still be interrupted.
        return pd.NA, pd.NA

    metadata = getattr(movie_scraper, 'metadata', None)
    rt_critics_score = _to_score(metadata, 'Score_Rotten')
    rt_audience_score = _to_score(metadata, 'Score_Audience')

    return rt_critics_score, rt_audience_score


In [None]:
# Quick manual check of the scraper on a single title.
get_RT_ratings(movie_title='The Big City')

In [None]:
# Scrape Rotten Tomatoes for every sampled title and split the returned
# (critics, audience) pairs into two columns.
rotten = pd.DataFrame(
    movies_sample_big['primaryTitle'].apply(get_RT_ratings).to_list(),
    columns=['rt_critics_score', 'rt_audience_score'],
)
rotten

In [None]:
# Persist both scraped tables without writing the row index.
movies_sample_big.to_csv(path_or_buf='movies_sample_big.csv', index=False)
rotten.to_csv(path_or_buf='rotten.csv', index=False)

Judging from the samples, too many movies are missing important fields, so I will clean the original dataset a bit:

In [None]:
# Drop the movies whose genres field is the '\N' placeholder, then report
# how many rows and columns remain.
set_movies = basics_movies[basics_movies['genres'].ne('\\N')]
set_movies.shape

In [None]:
# Map the '\N' placeholder to year 0 so the column can be cast to int32.
# `assign` builds a new frame instead of writing into the filtered slice of
# `basics_movies`, which avoids pandas' chained-assignment
# (SettingWithCopy) problem.
set_movies = set_movies.assign(
    startYear=set_movies['startYear'].replace('\\N', 0).astype('int32')
)
# Keep only titles released from 1930 onwards (this also drops the year-0
# placeholders introduced above).
set_movies = set_movies.loc[set_movies['startYear'] >= 1930]
set_movies.shape

Now I hope the scraping will produce fewer errors.

In [None]:
# Reproducible 200-row sample of the cleaned movie set.
set_movies_sample = set_movies.sample(n=200, random_state=42)


In [None]:
def merge_data(table):
    """
    Enrich a table of titles with IMDb and Rotten Tomatoes information.

    Args:
        table: DataFrame with at least the columns 'tconst' and
            'primaryTitle'.

    Returns:
        A new DataFrame with the columns of ``table`` followed by 'rating',
        'genres', 'mpaa', 'description', 'votes', 'rt_critics_score' and
        'rt_audience_score'. The index of ``table`` is preserved.
    """
    # Build both frames on the table's own index so that the column-wise
    # concat pairs each scraped row with the title it was fetched for.
    extra = pd.DataFrame(
        table['tconst'].apply(get_IMDB_movie_data).to_list(),  # extra info from IMDb
        columns=['rating', 'genres', 'mpaa', 'description', 'votes'],
        index=table.index,
    )

    # Scores are looked up for the titles of `table` itself, not for an
    # unrelated notebook-level sample.
    rotten = pd.DataFrame(
        table['primaryTitle'].apply(get_RT_ratings).to_list(),  # ratings from Rotten Tomatoes
        columns=['rt_critics_score', 'rt_audience_score'],
        index=table.index,
    )

    return pd.concat([table, extra, rotten], axis=1)

In [None]:
def merge_data(table):
    """
    Enrich a table of titles with IMDb and Rotten Tomatoes information.

    Args:
        table: DataFrame with at least the columns 'tconst' and
            'primaryTitle'.

    Returns:
        A new DataFrame, numbered from 0, holding the previous index as a
        column, the columns of ``table``, and then 'rating', 'genres',
        'mpaa', 'description', 'votes', 'rt_critics_score' and
        'rt_audience_score'.
    """
    # Renumber from 0 so the positional frames built below line up with the
    # table rows in the column-wise concat.
    table = table.reset_index()

    extra = table['tconst'].apply(get_IMDB_movie_data)  # gets extra info from imdb
    extra = pd.DataFrame(extra.to_list(), columns=['rating', 'genres', 'mpaa', 'description', 'votes'])

    # Look the scores up for the titles of `table` itself, not for an
    # unrelated notebook-level sample.
    rotten = table['primaryTitle'].apply(get_RT_ratings)  # gets more ratings from rotten tomatoes
    rotten = pd.DataFrame(rotten.to_list(), columns=['rt_critics_score', 'rt_audience_score'])

    table_big = pd.concat([table, extra, rotten], axis=1)

    return table_big

In [None]:
# Enrich the cleaned sample with the scraped data and preview the first rows.
set_movies_sample_big = merge_data(table=set_movies_sample)
set_movies_sample_big.head(5)