# 1 - Data gathering

I am going to use as the main source of info the dataset offered by IMDB in https://datasets.imdbws.com/  
Specifically, I choose the file **title.basics.tsv.gz**, which has this structure according to the documentation:

* tconst (string) - alphanumeric unique identifier of the title
* titleType (string) – the type/format of the title (e.g. movie, short, tvseries, tvepisode, video, etc)
* primaryTitle (string) – the more popular title / the title used by the filmmakers on promotional materials at the point of release
* originalTitle (string) - original title, in the original language
* isAdult (boolean) - 0: non-adult title; 1: adult title
* startYear (YYYY) – represents the release year of a title. In the case of TV Series, it is the series start year
* endYear (YYYY) – TV Series end year. ‘\N’ for all other title types
* runtimeMinutes – primary runtime of the title, in minutes
* genres (string array) – includes up to three genres associated with the title

In [9]:
import pandas as pd

basics = pd.read_csv('data/data.tsv', sep='\t')
basics

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
8738245,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2010,\N,\N,"Action,Drama,Family"
8738246,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,\N,\N,"Action,Drama,Family"
8738247,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,\N,\N,"Action,Drama,Family"
8738248,tt9916856,short,The Wind,The Wind,0,2015,\N,27,Short


In [11]:
basics.titleType.value_counts(dropna=False)

tvEpisode       6541970
short            858445
movie            603535
video            257832
tvSeries         221817
tvMovie          135194
tvMiniSeries      42352
tvSpecial         36106
videoGame         30513
tvShort           10484
tvPilot               2
Name: titleType, dtype: int64

The amount of items is too big, so I will keep only the movies, with more than 600k items.

In [12]:
basics_movies = basics.loc[basics['titleType'] == 'movie']
basics_movies

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
498,tt0000502,movie,Bohemios,Bohemios,0,1905,\N,100,\N
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama
610,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,\N,\N,Drama
625,tt0000630,movie,Hamlet,Amleto,0,1908,\N,\N,Drama
...,...,...,...,...,...,...,...,...,...
8738140,tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,0,2015,\N,57,Documentary
8738167,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0,2007,\N,100,Documentary
8738179,tt9916706,movie,Dankyavar Danka,Dankyavar Danka,0,2013,\N,\N,Comedy
8738190,tt9916730,movie,6 Gunn,6 Gunn,0,2017,\N,116,\N


But that is not enough, I need more information, mainly the description, and for that I will use the CinemaGoer library (https://cinemagoer.github.io):

In [103]:
# !pip install cinemagoer
from imdb import Cinemagoer

# create an instance of the Cinemagoer class
ia = Cinemagoer()

# get a movie
movie = ia.get_movie('0133093')

# print the genres of the movie
print('Genres:')
for genre in movie['genres']:
    print(genre)


Genres:
Action
Sci-Fi


In [34]:
print(ia.get_movie_infoset())

['airing', 'akas', 'alternate versions', 'awards', 'connections', 'crazy credits', 'critic reviews', 'episodes', 'external reviews', 'external sites', 'faqs', 'full credits', 'goofs', 'keywords', 'list', 'locations', 'main', 'misc sites', 'news', 'official sites', 'parents guide', 'photo sites', 'plot', 'quotes', 'recommendations', 'release dates', 'release info', 'reviews', 'sound clips', 'soundtrack', 'synopsis', 'taglines', 'technical', 'trivia', 'tv schedule', 'video clips', 'vote details']


In [47]:
print(movie.keys())

['original title', 'localized title', 'cast', 'genres', 'runtimes', 'countries', 'country codes', 'language codes', 'color info', 'aspect ratio', 'sound mix', 'box office', 'certificates', 'original air date', 'rating', 'votes', 'cover url', 'imdbID', 'plot outline', 'languages', 'title', 'year', 'kind', 'director', 'writer', 'producer', 'composer', 'cinematographer', 'editor', 'editorial department', 'casting director', 'production design', 'art direction', 'set decoration', 'costume designer', 'make up', 'production manager', 'assistant director', 'art department', 'sound crew', 'special effects', 'visual effects', 'stunt performer', 'camera and electrical department', 'animation department', 'casting department', 'costume department', 'location management', 'music department', 'script department', 'transportation department', 'miscellaneous crew', 'akas', 'top 250 rank', 'production companies', 'distributors', 'special effects companies', 'other companies', 'plot', 'synopsis', 'cano

In [105]:
movie = ia.get_movie('0133093')
print(movie.data['rating'])

8.7


In [156]:

def get_IMDB_movie_data(movie_ID=''):
    """
    Returns the following information of a title in a list form:
        IMDB rating
        genres
        MPAA rating
        Description
        maybe later: movie_title='',
    """
    # # Get movie data from IMDB API
    # ia = Cinemagoer()
    # if movie_title!='':
    #     res = ia._search_movie(movie_title, results=True)
    #     movie_ID = res[0][0]
    #     if res[0][1]['title'] != movie_title:
    #         print('Titles do not exactly match: ', movie_title, res[0][1]['title'])
    # result = pd.DataFrame()
    if movie_ID=='':
        raise Exception("No title or ID provided")
    
    movie_ID = movie_ID.replace('t', '')
    
    movie = ia.get_movie(movie_ID)
    try:
        rating = movie.data['rating']
        # result['rating'] = movie.data['rating']
    except:
        rating = pd.NA
        # result['rating'] = pd.NA

    try:
        # result['genres'] = 0
        genres = [genre.lower() for genre in movie.data['genres']]
    except:
        # result['genres'] = pd.NA
        genres = pd.NA

    try:
        # result['mpaa'] = [c.split(':')[1] for c in movie.data['certificates'] if (c.startswith('United States') or 'USA' in c)][0]
        MPAA = [c.split(':')[1] for c in movie.data['certificates'] if (c.startswith('United States') or 'USA' in c)][0]
    except:
        # result['mpaa'] = pd.NA
        MPAA = pd.NA

    try:
        # result['description'] = movie.data['plot outline']
        description = movie.data['plot outline']
    except:
        # result['description'] = pd.NA
        description = pd.NA

    try:
        # result['votes'] = movie.data['votes']
        votes = movie.data['votes']
    except:
        # result['votes'] = pd.NA
        votes = pd.NA

    # print(result)
    return (rating,genres,MPAA,description,votes)

In [153]:
get_IMDB_movie_data('0133093')

8.7


(8.7,
 ['action', 'sci-fi'],
 'R',
 'Thomas A. Anderson is a man living two lives. By day he is an average computer programmer and by night a hacker known as Neo. Neo has always questioned his reality, but the truth is far beyond his imagination. Neo finds himself targeted by the police when he is contacted by Morpheus, a legendary computer hacker branded a terrorist by the government. As a rebel against the machines, Neo must confront the agents: super-powerful computer programs devoted to stopping Neo and the entire human rebellion.',
 1840533)

In [165]:
get_IMDB_movie_data('0468569')

(9.1,
 ['action', 'crime', 'drama', 'thriller'],
 'TV-14',
 'Set within a year after the events of Batman Begins (2005), Batman, Lieutenant James Gordon, and new District Attorney Harvey Dent successfully begin to round up the criminals that plague Gotham City, until a mysterious and sadistic criminal mastermind known only as "The Joker" appears in Gotham, creating a new wave of chaos. Batman\'s struggle against The Joker becomes deeply personal, forcing him to "confront everything he believes" and improve his technology to stop him. A love triangle develops between Bruce Wayne, Dent, and Rachel Dawes.',
 2504092)

Now that it works, I will take a small sample of the movie dataset to grow the table.

In [166]:
movies_sample = basics_movies.sample(20, random_state=42).reset_index()
movies_sample

Unnamed: 0,index,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,443576,tt0461760,movie,Cut Sleeve Boys,Cut Sleeve Boys,0,2006,\N,86,"Comedy,Romance"
1,4889474,tt1869525,movie,Las piedras,Las piedras,0,2011,\N,75,\N
2,987145,tt10034258,movie,Sergiu Celibidache: The Triumphant Return,Sergiu Celibidache: The Triumphant Return,0,1992,\N,55,Documentary
3,2420451,tt12619178,movie,Sojiga,Sojiga,0,2016,\N,\N,Action
4,6556911,tt5116988,movie,Under the Sun,Under the Sun,0,\N,\N,\N,"Adventure,Biography,Drama"
5,267985,tt0279867,movie,Hockey Girl,Hockey Girl,0,2002,\N,\N,Sport
6,5461315,tt2599006,movie,Tribyville,Tribyville,0,\N,\N,\N,Musical
7,2205273,tt12219112,movie,Heaven Beneath My Feet,Heaven Beneath My Feet,0,2020,\N,90,Documentary
8,8007012,tt8332260,movie,Anambra Boys,Anambra Boys,0,2018,\N,\N,Drama
9,391498,tt0408296,movie,Too Loud a Solitude,Une trop bruyante solitude,0,1996,\N,110,"Comedy,Drama"


In [167]:
extra = movies_sample['tconst'].apply(get_IMDB_movie_data)

In [168]:
df = pd.DataFrame(extra.to_list(), columns=['rating','genres','mpaa','description','votes'])

In [169]:
df

Unnamed: 0,rating,genres,mpaa,description,votes
0,6.0,"[comedy, romance]",R,,481.0
1,6.3,,,"In a house in Delta, in a quiet interrupted on...",14.0
2,,[documentary],G,,
3,,[action],,,
4,,"[adventure, biography, drama]",,,
5,,[sport],,,
6,,[musical],,,
7,5.6,[documentary],,Heaven Beneath my Feet is the story of three L...,5.0
8,,[drama],,,
9,5.6,"[comedy, drama]",,,51.0


In [173]:
movies_sample_big = pd.concat([movies_sample, df], axis=1)
movies_sample_big

Unnamed: 0,index,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,rating,genres.1,mpaa,description,votes
0,443576,tt0461760,movie,Cut Sleeve Boys,Cut Sleeve Boys,0,2006,\N,86,"Comedy,Romance",6.0,"[comedy, romance]",R,,481.0
1,4889474,tt1869525,movie,Las piedras,Las piedras,0,2011,\N,75,\N,6.3,,,"In a house in Delta, in a quiet interrupted on...",14.0
2,987145,tt10034258,movie,Sergiu Celibidache: The Triumphant Return,Sergiu Celibidache: The Triumphant Return,0,1992,\N,55,Documentary,,[documentary],G,,
3,2420451,tt12619178,movie,Sojiga,Sojiga,0,2016,\N,\N,Action,,[action],,,
4,6556911,tt5116988,movie,Under the Sun,Under the Sun,0,\N,\N,\N,"Adventure,Biography,Drama",,"[adventure, biography, drama]",,,
5,267985,tt0279867,movie,Hockey Girl,Hockey Girl,0,2002,\N,\N,Sport,,[sport],,,
6,5461315,tt2599006,movie,Tribyville,Tribyville,0,\N,\N,\N,Musical,,[musical],,,
7,2205273,tt12219112,movie,Heaven Beneath My Feet,Heaven Beneath My Feet,0,2020,\N,90,Documentary,5.6,[documentary],,Heaven Beneath my Feet is the story of three L...,5.0
8,8007012,tt8332260,movie,Anambra Boys,Anambra Boys,0,2018,\N,\N,Drama,,[drama],,,
9,391498,tt0408296,movie,Too Loud a Solitude,Une trop bruyante solitude,0,1996,\N,110,"Comedy,Drama",5.6,"[comedy, drama]",,,51.0


In [175]:
# !pip install rotten-tomatoes-scraper
from rotten_tomatoes_scraper.rt_scraper import MovieScraper

# RT_search = MovieScraper()
# search_res = RT_search.search('The Big City')

In [180]:
def get_RT_ratings(movie_title):
    """
    Returns the Rotten Tomatoes critic score and audience score of a title
    """

    # Extract URL
    RT_search = MovieScraper()
    search_res = RT_search.search(movie_title)

    # Exact match
    url_list = [movie_dict['url'] for movie_dict in search_res['movies']
                if movie_dict['name'].lower() == movie_title.lower()]
    if len(url_list) == 1:
        url = url_list[0]
    # No exact match -  return the latest one
    elif not url_list:
        url_list = sorted([(movie_dict['url'], movie_dict['year']) for movie_dict in search_res['movies']],
                          key=lambda x: x[1], reverse=True)
        url = url_list[0][0]
        print(f'No exact match found. Going with {url}')
    # More than one exact match - return the latest one
    elif len(url_list) > 1:
        url_list = sorted([(movie_dict['url'], movie_dict['year']) for movie_dict in search_res['movies']
                           if movie_dict['name'].lower() == movie_title.lower()],
                          key=lambda x: x[1], reverse=True)
        url = url_list[0][0]
        print(f'More than one exact match found. Going with {url}')

    movie_scraper = MovieScraper(movie_url='https://www.rottentomatoes.com' + url)
    movie_scraper.extract_metadata()
    try:
        rt_critics_score = int(movie_scraper.metadata['Score_Rotten'])
    except:
        rt_critics_score = pd.NA
    rt_audience_score = int(movie_scraper.metadata['Score_Audience'])
    return rt_critics_score, rt_audience_score


In [182]:
get_RT_ratings('The Big City')

No exact match found. Going with /m/big_heart_city


(<NA>, 0)

In [184]:
RT_search = MovieScraper()
search_res = RT_search.search('The Prodigal Son')
search_res

{'actorCount': 0,
 'actors': [],
 'criticCount': 0,
 'critics': [],
 'franchiseCount': 0,
 'franchises': [],
 'movieCount': 9,
 'movies': [{'name': 'The Prodigal Son',
   'year': 1983,
   'url': '/m/the_prodigal_son_1983',
   'image': 'https://resizing.flixster.com/YqA7gOe3e90cgo4iRWIeH_Ok5nE=/fit-in/80x80/v1.bTsxMjE4NjI3ODtqOzE5MDg2OzEyMDA7MjI1OzQwMA',
   'meterClass': 'N/A',
   'castItems': [{'name': 'Biao Yuen', 'url': '/celebrity/yuen_biao'},
    {'name': 'Ching-Ying Lam', 'url': '/celebrity/chingying_lam'},
    {'name': 'Sammo Kam-Bo Hung', 'url': '/celebrity/sammo_kam_bo_hung'}],
   'subline': 'Biao Yuen, Ching-Ying Lam, Sammo Kam-Bo Hung, '},
  {'name': 'Tuhlaajapoika (Kotikatsomo: Tuhlaajapoika) (The Prodigal Son)',
   'year': 1992,
   'url': '/m/tuhlaajapoika-kotikatsomo-tuhlaajapoika-the-prodigal-son',
   'image': 'https://staticv2-4.rottentomatoes.com/static/images/redesign/poster_default_redesign.gif',
   'meterClass': 'N/A',
   'castItems': [{'name': 'Hannu Kivioja', 'url'

In [171]:
import requests
from bs4 import BeautifulSoup

class IMDBScraper:
   def __init__(self, url):
       self.url = url
       self.download_page()

   def download_page(self):
       # method for downloading the hotel page
       self.page = requests.get(self.url).text

   def scrape_data(self):
       #method for scraping out movie title and description
       soup = BeautifulSoup(self.page, "html.parser")
       movie_title = soup.find("h1", {"data-testid": "hero-title-block__title"}).text
       movie_description = soup.find("span", {"data-testid": "plot-xl"}).text
       return {"title": movie_title,
               "description": movie_description,
               }

urls = ["https://www.imdb.com/title/tt2382320/?ref_=hm_fanfav_tt_i_3_pd_fp1",]

for url in urls:
   x = IMDBScraper(url)
   print(x.scrape_data())

{'title': 'Sin tiempo para morir', 'description': 'James Bond has left active service. His peace is short-lived when Felix Leiter, an old friend from the CIA, turns up asking for help, leading Bond onto the trail of a mysterious villain armed with dangerous new technology.'}
