In [1]:
## IMDbPY API package

# IMDbPY is exposed through the `imdb` module.
import imdb

# Build the accessor object; called with no arguments it goes through the web.
ia = imdb.IMDb()

# Run a title search -- the result is a list of Movie objects.
s_result = ia.search_movie('The Untouchables')
print(s_result)

[<Movie id:0094226[http] title:_The Untouchables (1987)_>, <Movie id:1987680[http] title:_Untouchable (2018)_>, <Movie id:0052522[http] title:_"The Untouchables" (1959)_>, <Movie id:0106165[http] title:_"The Untouchables" (1993)_>, <Movie id:0065895[http] title:_Machine Gun McCain (1969)_>, <Movie id:1877895[http] title:_The Untouchables (2017)_>, <Movie id:0056449[http] title:_The Scarface Mob (1959) (TV)_>, <Movie id:0425602[http] title:_The Untouchables: Capone Rising (in development) (????)_>, <Movie id:1675434[http] title:_The Intouchables (2011)_>, <Movie id:0810077[http] title:_The Untouchable (2006)_>, <Movie id:5509634[http] title:_Untouchable (I) (2016)_>, <Movie id:0287778[http] title:_The Untouchable 2 (2001) (VG)_>, <Movie id:0287779[http] title:_The Untouchable (1997) (VG)_>, <Movie id:4191792[http] title:_"Untouchable" (2015)_>, <Movie id:2806760[http] title:_Untouchable (I) (2013)_>, <Movie id:0335509[http] title:_The Untouchables (1991) (VG)_>, <Movie id:3001590[http] 

In [113]:
## Using the Python Requests library to access the TMDb API via URI

import os
import time

import pandas as pd
import requests

# Read the TMDb key from the environment instead of committing it to the notebook.
TMDB_API_KEY = os.environ.get('TMDB_API_KEY')
if not TMDB_API_KEY:
    raise RuntimeError('Set the TMDB_API_KEY environment variable to your TMDb API key.')

# pandas >= 1.0 exposes json_normalize at the top level; older releases only
# provide it under pd.io.json.  (The module is imported as `pd`, so the bare
# name `pandas` is not defined in this notebook.)
json_normalize = getattr(pd, 'json_normalize', None) or pd.io.json.json_normalize

# set up a timer to check speed
start = time.time()

# API request: first page of the discover endpoint, most popular first.
# NOTE(review): the release-date bounds are bare years rather than full dates --
# confirm how TMDb interprets them.
payload = {'api_key': TMDB_API_KEY,
           'sort_by': 'popularity.desc',
           'primary_release_date.gte': '2006',
           'primary_release_date.lte': '2007',
           'page': '1'}

r = requests.get('https://api.themoviedb.org/3/discover/movie?', params=payload)
# Surface HTTP errors (bad key, rate limiting, ...) here instead of as a
# KeyError on the missing fields below.
r.raise_for_status()

# Decode the body once and reuse it.
response = r.json()
pages = response['total_pages']

# create data frame from JSON
tmdb_movies = json_normalize(response['results'])

# check timing
timing = time.time() - start

# Single pre-formatted strings print identically under the Python 2 print
# statement and the Python 3 print function.
print('There are  {}  total results across  {}  total pages.'.format(response['total_results'], pages))
print('Query took  {}  seconds.'.format(timing))

There are  8760  total results across  438  total pages.
Query took  0.128619909286  seconds.


In [159]:
# Loop over release years 2006 through 2015 (range's upper bound is exclusive)
# and fetch the first 50 pages -- the 1000 most popular movies -- for each year.
import os

# Read the TMDb key from the environment instead of committing it to the notebook.
TMDB_API_KEY = os.environ.get('TMDB_API_KEY')
if not TMDB_API_KEY:
    raise RuntimeError('Set the TMDB_API_KEY environment variable to your TMDb API key.')

# pandas >= 1.0 exposes json_normalize at the top level; older releases only
# provide it under pd.io.json.  (The module is imported as `pd`, so the bare
# name `pandas` is not defined in this notebook.)
json_normalize = getattr(pd, 'json_normalize', None) or pd.io.json.json_normalize

years = range(2006, 2016)

# One frame per fetched page; concatenated once after the loop so the work
# stays linear instead of re-copying the growing frame on every request.
page_frames = []

for year in years:
    for page in range(1, 51):
        start = time.time()
        # NOTE(review): consecutive years share a boundary value (year + 1 is
        # the next year's lower bound) and the bounds are bare years rather
        # than full dates -- confirm how TMDb interprets them and whether
        # duplicates across years are possible.
        payload = {'api_key': TMDB_API_KEY,
                   'sort_by': 'popularity.desc',
                   'primary_release_date.gte': year,
                   'primary_release_date.lte': year + 1,
                   'page': page}

        r = requests.get('https://api.themoviedb.org/3/discover/movie?', params=payload)
        # Surface HTTP errors (bad key, rate limiting, ...) here instead of as
        # a KeyError on the missing 'results' field below.
        r.raise_for_status()
        response = r.json()

        # Report the totals once, for the very first request (2006, page 1).
        if not page_frames:
            print('For  {}  there are  {}  total results across  {}  total pages.'.format(
                year, response['total_results'], response['total_pages']))

        page_frames.append(json_normalize(response['results']))

        # Throttle: leave at least 0.25 s between the starts of two requests.
        delay = time.time() - start
        if delay < 0.25:
            time.sleep(0.25 - delay)

# Row labels are kept as produced per page (no ignore_index), matching what the
# incremental concatenation produced before.
tmdb_movies = pd.concat(page_frames)

For  2006  there are  8760  total results across  438  total pages.


In [169]:
from collections import Counter

# Tally how often each combination of genre ids occurs. Each row's list of ids
# is turned into a tuple so it can be used as a Counter key.
# NOTE(review): tuples are order-sensitive, so the same genres listed in a
# different order are counted as separate combinations -- confirm that is intended.
Counter(tuple(genre_ids) for genre_ids in tmdb_movies.genre_ids.tolist())

Counter({(28, 18, 27, 878, 53): 2,
         (27, 53, 10769): 4,
         (28, 12, 80, 18): 1,
         (12, 14, 28, 35, 10751): 1,
         (35, 80, 18, 9648): 1,
         (14, 35, 10749): 4,
         (28, 80, 9648): 2,
         (18, 53, 9648, 28): 1,
         (10751, 35, 12): 1,
         (18, 878, 10769): 1,
         (28, 12, 14, 10752): 2,
         (80, 10769, 9648, 53): 1,
         (53, 80, 18): 17,
         (28, 12, 27, 53): 3,
         (35, 10751, 18): 1,
         (36, 99, 16): 1,
         (16, 35, 10769): 1,
         (12, 36, 28, 878, 35, 14): 1,
         (28, 35, 18, 14, 878): 1,
         (28, 12, 35, 10751): 7,
         (16, 14, 35, 878): 1,
         (18, 35, 80): 3,
         (36, 18, 53, 10752): 1,
         (16, 10751, 10749): 1,
         (18, 14, 10749): 2,
         (18, 10769, 10749, 10752): 1,
         (18, 28, 80): 11,
         (12, 28, 53): 10,
         (10770, 12, 14, 878): 1,
         (14, 28, 12, 35): 1,
         (35, 10751, 14): 6,
         (18, 80): 33,
         (107

In [170]:
# Dimensions of the collected frame as (rows, columns).
tmdb_movies.shape

(10000, 14)

In [172]:
# Save the collected movies as UTF-8 encoded CSV text in the file
# 'movies_10000' (the name carries no extension). to_csv writes the row index
# as the first column by default.
tmdb_movies.to_csv(path_or_buf='movies_10000', encoding='utf8')