In [96]:
!pip install tqdm
!pip install imdbpy



In [161]:
from requests_html import HTMLSession # for making request
import requests # for making request
import json # for API parsing
import pandas as pd # for data processing
import numpy as np # for data processing
from tqdm import tqdm # for count time of iteration
import re
from tqdm import tqdm
from imdb import IMDb
from imdb.Person import Person

# Problem

Prediction plays an important role in many **Recommendation Systems**.

In this task, data from IMDb will be used to:
- recommend movies similar to a given movie based on its content, such as title, directors, cast, genres, and keywords (Content-Based Filtering, using TF-IDF and a cosine similarity matrix)
- predict the rating that a user would give to a movie (Collaborative Filtering, using SVD)
- recommend movies to a user based on other, similar users (Collaborative Filtering, using KNN)

The first prediction helps us to recommend movies when not knowing anything about users' tastes.

The next two predictions help us personalize recommendations once we know how users have rated movies.

In [None]:
title, directors, casts, genres, keywords, overview

In [None]:
userid, movieid, rating

# Parse HTML

## Get genre

In [98]:
# Fetch the IMDb top chart page; the response `r` is parsed by the following cell.
url = 'https://www.imdb.com/chart/top'
session = HTMLSession()
r = session.get(url)

In [99]:
# Collect the text of every element matching the `.subnav_item_main` selector on the fetched page.
# NOTE(review): presumably these are the genre links of the chart page — confirm the selector still matches.
genres = [genre.text for genre in r.html.find(".subnav_item_main")]

## Get item

In [123]:
# Search settings: page size requested from IMDb and the minimum vote count filter.
num_item_per_page = 250
min_vote = 10000
session = HTMLSession()

# Title anchor elements gathered across every genre and every result page.
movies_elements = []

for genre in tqdm(genres):
    # The `start` offset begins at 1 and advances by one page per request.
    index = 1
    while True:
        search_url = f'https://www.imdb.com/search/title/?title_type=feature&num_votes={min_vote},&genres={genre.lower()}&count={num_item_per_page}&start={index}'
        r = session.get(search_url)
        items = r.html.find('.lister-item-header > a')
        if not items:
            # A page without any title links ends the paging for this genre.
            break
        movies_elements += items
        index = index + num_item_per_page

100%|██████████████████████████████████████████████████████████████████████████████████| 21/21 [10:21<00:00, 29.61s/it]


In [124]:
len(movies_elements)

26522

## Extract movie ID from list of movie elements

In [125]:
# Unique IMDb title IDs (the part following "tt" in a title URL).
# Initialised here so this cell also works on a freshly restarted kernel.
set_id = set()

for item in movies_elements:
    # Scan the element's links for the title-ID pattern; an element without
    # links (or without a matching link) is skipped instead of raising.
    for link in item.links:
        id_match = re.search('/tt(.+?)/', link)
        if id_match:
            # set.add ignores IDs that were already collected from another genre.
            set_id.add(id_match.group(1))
            break

In [189]:
len(set_id)

8681

In [237]:
set_id

{'0337578',
 '0113253',
 '0995868',
 '4901306',
 '0361696',
 '1287878',
 '0410730',
 '0431308',
 '5096470',
 '6988116',
 '0251114',
 '0120789',
 '0437800',
 '0785006',
 '0339135',
 '9426210',
 '0435761',
 '0075784',
 '0160862',
 '0042530',
 '0331933',
 '0044837',
 '1714203',
 '0805184',
 '1389127',
 '1798684',
 '0077269',
 '0169858',
 '0080117',
 '0815178',
 '0120910',
 '0031885',
 '1618434',
 '0385004',
 '0455967',
 '2193215',
 '0400426',
 '1921149',
 '0443274',
 '1028528',
 '0059274',
 '0019421',
 '0125659',
 '1655441',
 '0486946',
 '0113986',
 '0293007',
 '0130827',
 '0762073',
 '5690360',
 '0374339',
 '5113040',
 '0120746',
 '8442644',
 '0051337',
 '0052572',
 '0095294',
 '1588170',
 '1981115',
 '0096118',
 '0373469',
 '3175038',
 '0040369',
 '0304711',
 '0102768',
 '1833673',
 '0058888',
 '0454987',
 '3504048',
 '0120390',
 '0212985',
 '0455590',
 '1355683',
 '0112401',
 '0460890',
 '1104001',
 '0105665',
 '2784936',
 '3152592',
 '3097204',
 '3721964',
 '0093677',
 '5360952',
 '42

# API

In [220]:
# IMDbPY client instance; the fetch loop later in the notebook calls `get_movie` on it.
imdb_handler = IMDb()

In [256]:
def get_people_info(movie, people_type):
    """Return the id/name pairs of the people credited under ``people_type``.

    Args:
        movie: Mapping-like movie record supporting ``.get`` and ``[]`` lookup.
        people_type: Key of the credit list to read (e.g. ``'cast'``).

    Returns:
        A list of ``{'id': ..., 'name': ...}`` dicts in the order the people
        appear in the credit list. People whose id or name is ``None`` are
        left out; a missing (or ``None``) credit list yields ``[]``.
    """
    if movie.get(people_type, None) is None:
        return []
    # Pair every person's ID with their name first, then keep the complete pairs.
    id_name_pairs = (
        (person.getID(), person.get('name', None))
        for person in movie[people_type]
    )
    return [
        {'id': person_id, 'name': person_name}
        for person_id, person_name in id_name_pairs
        if person_id is not None and person_name is not None
    ]

In [273]:
# 'movies_info' holds each movie's own attributes (nothing related to the people involved).
movies_info = []
# 'movies_people_info' holds the people credited on each movie (cast, directors, ...).
movies_people_info = []
# (movie ID, error description) for every movie that could not be fetched or
# parsed, so skipped titles can be inspected or retried instead of vanishing silently.
failed_movies = []

# Credit lists copied into the people table for every movie.
people_types = ['cast', 'directors', 'writers', 'producers', 'composers']

for movie_id in tqdm(set_id):
    try:
        movie = imdb_handler.get_movie(movie_id)

        dict_row_movie = {}
        dict_row_movie['id'] = movie['imdbID']
        dict_row_movie['title'] = movie['title']
        dict_row_movie['runtimes'] = movie['runtimes'][0]
        dict_row_movie['genres'] = ";".join(movie['genres'])
        dict_row_movie['vote_counts'] = movie['votes']
        dict_row_movie['average_rating'] = movie['rating']

        overview = movie.get('plot outline', None)
        if overview is None:
            plot = movie.get('plot', None)
            # `if plot` also guards against an empty list, on which max() would raise.
            if plot:
                # No outline available: fall back to the longest plot summary.
                chosen_plot = max(plot, key=len)
                # Drop the trailing '::...' suffix when there is one
                # (presumably 'summary::author' — confirm against the IMDbPY docs).
                # rfind returns -1 when the separator is absent, and slicing with
                # -1 would wrongly cut off the summary's last character.
                end_index = chosen_plot.rfind('::')
                overview = chosen_plot if end_index == -1 else chosen_plot[:end_index]
        dict_row_movie['overview'] = overview

        dict_row_movie_person = {}
        dict_row_movie_person['id'] = movie['imdbID']
        for people_type in people_types:
            dict_row_movie_person[people_type] = get_people_info(movie, people_type)

        movies_info.append(dict_row_movie)
        movies_people_info.append(dict_row_movie_person)
    except Exception as error:
        # Best-effort scrape: keep going, but remember what was skipped and why.
        # Catching Exception (not a bare except) lets KeyboardInterrupt stop the loop.
        failed_movies.append((movie_id, repr(error)))

 35%|█████████████████████████▉                                                | 3043/8681 [3:06:02<5:50:52,  3.73s/it]2020-12-31 02:52:45,457 CRITICAL [imdbpy] C:\ProgramData\Anaconda3\lib\site-packages\imdb\_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt0112625/reference', 'proxy': '', 'exception type': 'IOError', 'original exception': <HTTPError 500: 'Internal Server Error'>},); kwds: {}
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\imdb\parser\http\__init__.py", line 220, in retrieve_unicode
    response = uopener.open(url)
  File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 531, in open
    response = meth(req, response)
  File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 640, in http_response
    response = self.parent.error(
  File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 569, in error
    return self._call_chain(*arg

In [269]:
# Disable pandas' column-width truncation so long text cells are displayed in full.
pd.set_option('display.max_colwidth', None)

In [274]:
# Turn each collected list of row dicts into its own DataFrame.
movies_info_df = pd.DataFrame(movies_info)
movies_people_info_df = pd.DataFrame(movies_people_info)

In [281]:
movies_info_df.head()

Unnamed: 0,id,title,runtimes,genres,vote_counts,average_rating,overview
0,337578,Gardener,181,Drama;Romance,15084,7.4,"Raj Malhotra and wife Pooja have four sons. The sons have settled down professionally and are quite independent. However, when Raj Malhotra retires, none of his children want to be burdened with the responsibility of taking care of their parents. Strangely, it is the adopted son who proves to be the most kind hearted of them all. Salman's girlfriend eventually marries him. The question is, will Raj and Pooja's sons learn the folly of their ways and turn over a new leaf?"
1,113253,Halloween: The Curse of Michael Myers,87,Horror;Thriller,31301,4.8,"Six years after Michael Myers' last massacre in Haddonfield, Jamie Lloyd has a child, but is then killed by Michael. Michael is allied with the Cult of Thorn, who both protect him and gave him the Curse of Thorn, the reason he killed all of his family. The Strode Family also moved into the Myers' house and are stalked by Michael. Sam Loomis and Tommy Doyle, a boy Laurie babysat during Michael's first rampage, are now out to stop Michael and the cult. Michael heads to kill Jamie's son and the rest of Laurie's family, but Dr. Loomis and Tommy plan to stop the curse, defeat the cult, stop Michael, and put an end to his murderous rampages, once and for all."
2,995868,Pele: Birth of a Legend,107,Biography;Drama;Sport,16402,7.2,Pele's meteoric rise from the slums of Sao Paulo to leading Brazil to its first World Cup victory at the age of 17 is chronicled in this biographical drama.
3,4901306,Perfect Strangers,96,Comedy;Drama,56387,7.8,"On a warm summer evening, the loving couple of Rocco, a plastic surgeon, and Eva, a therapist, are expecting their good friends to share a pleasant gathering over dinner. Everything is in order: The first course is ready; the roast is in the oven; the table is set, and without a doubt, this is going to be a meeting of true friends. Before long, the group begins the feast; however, in this nice but somewhat ordinary dinner, there is certainly something missing. Perhaps, if everyone placed their mobile phones on the table--and like a dangerous Russian roulette shared whatever arrived (texts, WhatsApp messages, and calls)--it would spice things up. Clearly, this uncommon truth-or-dare game has no point among honest companions who share everything with each other; nevertheless, when the phones start ringing, who will be the one with the sweatiest palms?"
4,361696,Raise Your Voice,107,Family;Music;Musical;Romance,26730,5.9,"This film is about a teenage girl who is very upset about her brother's death in a car crash. Terri has a love of singing and making up her own songs. Her brother (before his death) secretly submits a DVD of her singing to a musical summer camp down in L.A. Her father doesn't want her to go, but secretly her mom lets her go and everything goes fine, except she has stage fright. She gets through her stage fright with the help of her new friend Jay. At the end of the contest everyone at the musical school have to perform something. And if they win, they win a scholarship along with it. Her dad finds out, comes down to L.A. and threatens to take her home! Will he let her stay? and will she win the contest? Viewers are on the edges of their seats to find out what happens..."


In [282]:
movies_people_info_df.head()

Unnamed: 0,id,cast,directors,writers,producers,composers
0,337578,"[{'id': '0000821', 'name': 'Amitabh Bachchan'}, {'id': '0004564', 'name': 'Hema Malini'}, {'id': '0006795', 'name': 'Salman Khan'}, {'id': '0154274', 'name': 'Mahima Chaudhry'}, {'id': '0990090', 'name': 'Aman Verma'}, {'id': '1267095', 'name': 'Samir Soni'}, {'id': '1265436', 'name': 'Saahil Chadha'}, {'id': '1177893', 'name': 'Nasirr Khan'}, {'id': '0710046', 'name': 'Suman Ranganath'}, {'id': '0244890', 'name': 'Divya Dutta'}, {'id': '1247644', 'name': 'Arzoo Govitrikar'}, {'id': '1267008', 'name': 'Rimi Sen'}, {'id': '0310698', 'name': 'Yash Gawli'}, {'id': '0712546', 'name': 'Paresh Rawal'}, {'id': '0239267', 'name': 'Lillete Dubey'}, {'id': '0768296', 'name': 'Sharat Saxena'}, {'id': '1266956', 'name': 'Sanjeeda Sheikh'}, {'id': '2147526', 'name': 'Asrani'}, {'id': '0318622', 'name': 'Avtar Gill'}, {'id': '0430803', 'name': 'Mohan Joshi'}, {'id': '0220807', 'name': 'Anang Desai'}, {'id': '0883165', 'name': 'Nakul Vaid'}, {'id': '0456094', 'name': 'Shashi Kiran'}, {'id': '1024111', 'name': 'Gajendra Chauhan'}, {'id': '0747195', 'name': 'Sudipta Roy'}, {'id': '1869031', 'name': 'Neha Ghanekar'}, {'id': '1866697', 'name': 'Titiksha Burman'}, {'id': '1587122', 'name': 'Smit Sheth'}, {'id': '1249326', 'name': 'Harsh Vashisht'}]","[{'id': '0159162', 'name': 'Ravi Chopra'}]","[{'id': '1338353', 'name': 'Shafiq Ansari'}, {'id': '1338448', 'name': 'Satish Bhatnagar'}, {'id': '0159148', 'name': 'B.R. Chopra'}, {'id': '1338988', 'name': 'Ram Govind'}, {'id': '0492854', 'name': 'Josephine Lawrence'}, {'id': '0495259', 'name': 'Helen Leary'}, {'id': '0495270', 'name': 'Nolan Leary'}, {'id': '1221396', 'name': 'Achala Nagar'}, {'id': '1221396', 'name': 'Achala Nagar'}]","[{'id': '1388826', 'name': 'Ashwani Chopra'}, {'id': '0159148', 'name': 'B.R. Chopra'}]","[{'id': '0795411', 'name': 'Aadesh Shrivastava'}, {'id': '0802252', 'name': 'Uttam Singh'}]"
1,113253,"[{'id': '0000587', 'name': 'Donald Pleasence'}, {'id': '0748620', 'name': 'Paul Rudd'}, {'id': '0353238', 'name': 'Marianne Hagan'}, {'id': '0752751', 'name': 'Mitchell Ryan'}, {'id': '0200981', 'name': 'Kim Darby'}, {'id': '0257588', 'name': 'Bradford English'}, {'id': '0091569', 'name': 'Keith Bogart'}, {'id': '0639686', 'name': 'Mariah O'Brien'}, {'id': '0315210', 'name': 'Leo Geter'}, {'id': '0105046', 'name': 'J.C. Brandy'}, {'id': '0306894', 'name': 'Devin Gardner'}, {'id': '0842630', 'name': 'Susan Swift'}, {'id': '0928111', 'name': 'George P. Wilbur'}, {'id': '0460758', 'name': 'Janice Knickrehm'}, {'id': '0248430', 'name': 'Alan Echeverria'}, {'id': '0750539', 'name': 'Hildur Ruriks'}, {'id': '0382959', 'name': 'Sheri Hicks'}, {'id': '0698354', 'name': 'Tom Proctor'}, {'id': '0606408', 'name': 'Bryan Morris'}, {'id': '0156806', 'name': 'Lee Ju Chew'}, {'id': '0027309', 'name': 'Raquelle Anderson'}, {'id': '0838741', 'name': 'Kristine Summers'}, {'id': '0232172', 'name': 'Elyse Donalson'}, {'id': '0503583', 'name': 'A. Michael Lerner'}, {'id': '1115786', 'name': 'Jimmy Chunga'}, {'id': '0242941', 'name': 'Ellen Dunning'}, {'id': '1355220', 'name': 'Brad Hardin'}, {'id': '0503610', 'name': 'Fred Lerner'}, {'id': '1083984', 'name': 'James Woodson'}]","[{'id': '0152640', 'name': 'Joe Chappelle'}]","[{'id': '0384185', 'name': 'Debra Hill'}, {'id': '0000118', 'name': 'John Carpenter'}, {'id': '0268107', 'name': 'Daniel Farrands'}]","[{'id': '0015443', 'name': 'Malek Akkad'}, {'id': '0002160', 'name': 'Moustapha Akkad'}, {'id': '0293551', 'name': 'Paul Freeman'}]","[{'id': '0397697', 'name': 'Alan Howarth'}, {'id': '0704979', 'name': 'Paul Rabjohns'}]"
2,995868,"[{'id': '5991572', 'name': 'Kevin de Paula'}, {'id': '5991573', 'name': 'Leonardo Lima Carvalho'}, {'id': '1179580', 'name': 'Seu Jorge'}, {'id': '4178453', 'name': 'Mariana Nunes'}, {'id': '0328604', 'name': 'Milton Gonçalves'}, {'id': '0584492', 'name': 'Seth Michaels'}, {'id': '0000352', 'name': 'Vincent D'Onofrio'}, {'id': '0560412', 'name': 'André Mattos'}, {'id': '0679888', 'name': 'Phil Miler'}, {'id': '4158426', 'name': 'Rafael Henriques'}, {'id': '5991243', 'name': 'Felipe Simas'}, {'id': '2186742', 'name': 'Adriano Aragon'}, {'id': '4309059', 'name': 'Mariana Balsa'}, {'id': '6534539', 'name': 'Eric Bell Jr.'}, {'id': '1546300', 'name': 'Diego Boneta'}, {'id': '1832579', 'name': 'Fernando Caruso'}, {'id': '1397601', 'name': 'Tonya Cornelisse'}, {'id': '6644313', 'name': 'Jon Cotterill'}, {'id': '6565973', 'name': 'Vivi Devereaux'}, {'id': '1499052', 'name': 'Thelmo Fernandes'}, {'id': '5991245', 'name': 'Jerome Franz'}, {'id': '5991247', 'name': 'Roger Haag'}, {'id': '2003849', 'name': 'Sven Holmberg'}, {'id': '6331142', 'name': 'Arthur Jansen'}, {'id': '0433784', 'name': 'Garcia Júnior'}, {'id': '0506492', 'name': 'Julio Levy'}, {'id': '0000538', 'name': 'Colm Meaney'}, {'id': '0616564', 'name': 'Charles Myara'}, {'id': '4550048', 'name': 'Ivan Orlic'}, {'id': '0671446', 'name': 'Pelé'}, {'id': '0763928', 'name': 'Rodrigo Santoro'}, {'id': '1518261', 'name': 'Brandon Wilson'}, {'id': '3681119', 'name': 'Celine Zapata'}]","[{'id': '1905310', 'name': 'Jeff Zimbalist'}, {'id': '3104562', 'name': 'Michael Zimbalist'}]","[{'id': '1905310', 'name': 'Jeff Zimbalist'}, {'id': '3104562', 'name': 'Michael Zimbalist'}]","[{'id': '6453234', 'name': 'Alexandre Dauman'}, {'id': '0247524', 'name': 'Guy East'}, {'id': '0274288', 'name': 'Caíque Martins Ferreira'}, {'id': '0004976', 'name': 'Brian Grazer'}, {'id': '3070587', 'name': 'Paul Kemsley'}, {'id': '3092126', 'name': 'Benjamin Mathes'}, {'id': '4550048', 'name': 'Ivan Orlic'}, {'id': '0671446', 'name': 
'Pelé'}, {'id': '1909034', 'name': 'Patrick Tendai Pfupajena'}, {'id': '0744907', 'name': 'Kim Roth'}, {'id': '5966439', 'name': 'Shana Salazar'}, {'id': '0933213', 'name': 'Colin Wilson'}]","[{'id': '0006246', 'name': 'A.R. Rahman'}]"
3,4901306,"[{'id': '0061484', 'name': 'Giuseppe Battiston'}, {'id': '2063290', 'name': 'Anna Foglietta'}, {'id': '0316074', 'name': 'Marco Giallini'}, {'id': '0502412', 'name': 'Edoardo Leo'}, {'id': '0557609', 'name': 'Valerio Mastandrea'}, {'id': '2130040', 'name': 'Alba Rohrwacher'}, {'id': '0810738', 'name': 'Kasia Smutniak'}, {'id': '8111444', 'name': 'Benedetta Porcaroli'}, {'id': '0210812', 'name': 'Elisabetta De Palo'}, {'id': '8217329', 'name': 'Tommaso Tatafiore'}, {'id': '8217330', 'name': 'Noemi Pagotto'}]","[{'id': '0002645', 'name': 'Paolo Genovese'}]","[{'id': '3798121', 'name': 'Filippo Bologna'}, {'id': '0182499', 'name': 'Paolo Costella'}, {'id': '0002645', 'name': 'Paolo Genovese'}, {'id': '0002645', 'name': 'Paolo Genovese'}, {'id': '2601560', 'name': 'Paola Mammini'}, {'id': '0712330', 'name': 'Rolando Ravello'}]","[{'id': '1471794', 'name': 'Marco Belardi'}, {'id': '3056604', 'name': 'Ughetta Curto'}, {'id': '0316315', 'name': 'Marco Giannoni'}]","[{'id': '1304832', 'name': 'Maurizio Filardo'}]"
4,361696,"[{'id': '0240381', 'name': 'Hilary Duff'}, {'id': '1268888', 'name': 'Oliver James'}, {'id': '0001418', 'name': 'David Keith'}, {'id': '1048128', 'name': 'Dana Davis'}, {'id': '0507381', 'name': 'Johnny Lewis'}, {'id': '0001854', 'name': 'Rita Wilson'}, {'id': '0562674', 'name': 'Lauren C. Mayhew'}, {'id': '0993507', 'name': 'Kat Dennings'}, {'id': '0728762', 'name': 'Jason Ritter'}, {'id': '0000360', 'name': 'Rebecca De Mornay'}, {'id': '0179173', 'name': 'John Corbett'}, {'id': '1703463', 'name': 'Carly Reeves'}, {'id': '0043041', 'name': 'James Avery'}, {'id': '0871636', 'name': 'Robert Trebor'}, {'id': '1736564', 'name': 'Steven T. Palmer'}, {'id': '0930397', 'name': 'Davida Williams'}, {'id': '0542000', 'name': 'Marshall Manesh'}, {'id': '0104462', 'name': 'Gibby Brand'}, {'id': '0573732', 'name': 'Sean McNamara'}, {'id': '0583541', 'name': 'Fred Meyers'}, {'id': '1112597', 'name': 'Mitch Rotter'}, {'id': '1734679', 'name': 'Seis Cuerdas'}, {'id': '0320567', 'name': 'John Gipson'}, {'id': '0862328', 'name': 'T.J. Thyne'}, {'id': '1734739', 'name': 'Adam Gontier'}, {'id': '1736594', 'name': 'Neil Sanderson'}, {'id': '1736625', 'name': 'Brad Walst'}, {'id': '1734952', 'name': 'Barry Stock'}, {'id': '1949925', 'name': 'Josh Berger'}, {'id': '1179559', 'name': 'Steven Gridley'}, {'id': '1742061', 'name': 'Portis Hershey'}, {'id': '3528523', 'name': 'Judith Hill'}, {'id': '0474121', 'name': 'Josh Kuhn'}, {'id': '11356849', 'name': 'Kaelyn Lyverse'}, {'id': '2200128', 'name': 'Michael Marchand'}, {'id': '1692361', 'name': 'Christopher Mauldin'}, {'id': '1567548', 'name': 'Shauney Recke'}, {'id': '2015607', 'name': 'Raquel C. Rischard'}, {'id': '1756395', 'name': 'Lindsay Roth'}, {'id': '2491332', 'name': 'J.R. 
Taylor'}, {'id': '1659641', 'name': 'Carly Alyssa Thorne'}, {'id': '4905234', 'name': 'Brooke Ventre'}]","[{'id': '0573732', 'name': 'Sean McNamara'}]","[{'id': '1112597', 'name': 'Mitch Rotter'}, {'id': '1368026', 'name': 'Sam Schreiber'}]","[{'id': '0112335', 'name': 'David Brookwell'}, {'id': '0228690', 'name': 'A.J. Dix'}, {'id': '0256497', 'name': 'Toby Emmerich'}, {'id': '1236446', 'name': 'Brad Jensen'}, {'id': '0438195', 'name': 'Avram 'Butch' Kaplan'}, {'id': '0442212', 'name': 'Mark Kaufman'}, {'id': '0483091', 'name': 'Christina Lambert'}, {'id': '0006998', 'name': 'Todd Lewis'}, {'id': '0573732', 'name': 'Sean McNamara'}, {'id': '0601597', 'name': 'Matt Moore'}, {'id': '0722603', 'name': 'Anthony Rhulen'}, {'id': '0728260', 'name': 'Sara Risher'}, {'id': '1387141', 'name': 'Troy Rowland'}, {'id': '0999995', 'name': 'William Shively'}, {'id': '1520636', 'name': 'Matthew A. Thomas'}]","[{'id': '1573324', 'name': 'Machine Head'}, {'id': '0956374', 'name': 'Aaron Zigman'}]"


# Save data

In [280]:
# Write the per-movie people table to CSV, omitting the DataFrame index.
# NOTE(review): assumes the ./data directory already exists — confirm or create it first.
movies_people_info_df.to_csv('./data/credit.csv', index=False)

In [278]:
# Write the per-movie attribute table to CSV, omitting the DataFrame index.
# NOTE(review): assumes the ./data directory already exists — confirm or create it first.
movies_info_df.to_csv('./data/movie.csv', index=False)