In [10]:
import pandas as pd
import numpy as np
import requests
import os
from IPython.display import display
import urllib.request
import time
import shutil

import random
random.seed(1)

In [11]:
# Fetch and unpack the MovieLens 20M dataset unless an 'ml-20m' path already
# exists in the working directory (i.e. an earlier run already extracted it).
if not os.path.exists('ml-20m'):
    import zipfile  # local import: only needed for this one-off download

    archive_path = 'ml-20m.zip'
    try:
        urllib.request.urlretrieve('http://files.grouplens.org/datasets/movielens/ml-20m.zip', archive_path)

        # extractall() without a target unpacks into the current working directory.
        with zipfile.ZipFile(archive_path, "r") as zip_ref:
            zip_ref.extractall()
    finally:
        # Remove the archive even when the download or extraction failed, so a
        # truncated/corrupt zip is never left behind on disk.
        if os.path.exists(archive_path):
            os.remove(archive_path)

In [12]:
# Movie catalogue: drop incomplete rows, then keep only titles that have a
# real genre listed and whose genre string does not mention IMAX.
movies = pd.read_csv('ml-20m/movies.csv').dropna()
has_listed_genre = movies['genres'] != '(no genres listed)'
is_not_imax = movies['genres'].map(lambda genre_string: 'IMAX' not in genre_string)
movies = movies[has_listed_genre & is_not_imax]

# External ids (IMDb / TMDB): drop rows with a missing id so that tmdbId can
# be stored as an integer.
links = (
    pd.read_csv('ml-20m/links.csv')
    .dropna()
    .assign(tmdbId=lambda frame: frame['tmdbId'].astype(int))
)

# Attach the external ids to every remaining movie.
movies = movies.merge(links, how='inner', on=['movieId']).dropna()
movies.head()

#links.head()
#movies.head()

Unnamed: 0,movieId,title,genres,imdbId,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357
4,5,Father of the Bride Part II (1995),Comedy,113041,11862


In [13]:
# `collections.Set` was removed in Python 3.10; the ABC lives in `collections.abc`.
from collections.abc import Set

def calc_existing_genres(movies):
    """Return the set of distinct genre names found in ``movies['genres']``.

    Parameters
    ----------
    movies : mapping or DataFrame
        Object whose ``'genres'`` entry is an iterable of strings, each
        holding one or more genre names separated by ``'|'``.

    Returns
    -------
    set of str
        Every individual genre name that occurs at least once.
    """
    return {
        genre
        for genre_list in movies['genres']
        for genre in genre_list.split('|')
    }
        
# Last expression of the cell: the set of genres found in `movies` is rendered as the cell output.
calc_existing_genres(movies)

{'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

In [14]:
# Directory holding the downloaded poster images; created on first use.
posters_dir = os.path.join('.', 'posters')
os.makedirs(posters_dir, exist_ok=True)

# Ids of posters already on disk, taken from the part of each file name
# before the first '.'.
existing_posters = set()
for file in os.listdir(posters_dir):
    try:
        existing_posters.add(int(file.split('.')[0]))
    except ValueError:
        # Not a poster file (e.g. '.DS_Store', '.ipynb_checkpoints'): the name
        # does not start with an integer id, so ignore it instead of crashing.
        continue

In [15]:
# Download one randomly chosen poster per movie from TMDB into `posters_dir`.
#
# The TMDB key is read from the TMDB_API_KEY environment variable when set.
# NOTE(review): the fallback key below is committed in this notebook; rotate it
# and drop the fallback once every environment provides TMDB_API_KEY.
tmdb_api_key = os.environ.get('TMDB_API_KEY', 'c4d5a44a2b7b36e4d40d0d1e713a8fee')
posters_base_url = 'https://api.themoviedb.org/3/movie/{}/images?api_key=' + tmdb_api_key

poster_base_url = 'https://image.tmdb.org/t/p/w300{}'

max_posters = 2500    # stop once `count` reaches this (posters already on disk count too)
request_timeout = 10  # seconds; without a timeout a stalled connection blocks the cell forever

# Visit the movies in a reproducible shuffled order.
random_movies = movies.sample(frac=1, random_state=1)

count = 0
for index, row in random_movies.iterrows():

    if row.movieId in existing_posters:
        # Downloaded by an earlier run: counts towards the cap.
        count += 1
    else:
        time.sleep(0.5)  # pause between API calls

        # Ask TMDB which images exist for this movie.
        try:
            response = requests.get(posters_base_url.format(row.tmdbId), timeout=request_timeout)
            json_response = response.json()
        except (requests.RequestException, ValueError):
            # Network failure or a body that is not valid JSON.
            print('Data {} couldn\'t be retrieved'.format(row.tmdbId))
            continue

        # Responses without a non-empty 'posters' list are skipped.
        if 'posters' in json_response and len(json_response['posters']) > 0:
            posters = json_response['posters']
            random_poster = random.choice(posters)
            poster_url = poster_base_url.format(random_poster['file_path'])
            # Saved as '<movieId>.<extension of the remote file>'.
            poster_file_name = str(row.movieId) + '.' + random_poster['file_path'].split('.')[-1]
            poster_path = os.path.join(posters_dir, poster_file_name)
            try:
                with requests.get(poster_url, stream=True, timeout=request_timeout) as poster_response:
                    # Fail on 4xx/5xx instead of saving an error body as an image.
                    poster_response.raise_for_status()
                    with open(poster_path, 'wb') as out_file:
                        for chunk in poster_response.iter_content(chunk_size=8192):
                            out_file.write(chunk)
            except (requests.RequestException, OSError):
                # Remove a partially written file so a later run does not
                # mistake it for a finished download.
                if os.path.exists(poster_path):
                    os.remove(poster_path)
                print('Poster {} couldn\'t be retrieved'.format(poster_url))
                continue
            count += 1
            if count % 10 == 0:
                print('{} posters saved'.format(count))

    if count >= max_posters:
        break

1790 posters saved
1800 posters saved
1810 posters saved
1820 posters saved
1830 posters saved
1840 posters saved
1850 posters saved
1860 posters saved
1870 posters saved
1880 posters saved
1890 posters saved
1900 posters saved
1910 posters saved
1920 posters saved
1930 posters saved
1940 posters saved
1950 posters saved
1960 posters saved
1970 posters saved
1980 posters saved
1990 posters saved
2000 posters saved
2010 posters saved
2020 posters saved
2030 posters saved
2040 posters saved
2050 posters saved
2060 posters saved
2070 posters saved
2080 posters saved
2090 posters saved
2100 posters saved
2110 posters saved
2120 posters saved
2130 posters saved
2140 posters saved
2150 posters saved
2160 posters saved
2170 posters saved
2180 posters saved
2190 posters saved
2200 posters saved
2210 posters saved
2220 posters saved
2230 posters saved
2240 posters saved
2250 posters saved
2260 posters saved
2270 posters saved
2280 posters saved
2290 posters saved
2300 posters saved
2310 posters

In [16]:
# Pair every poster file in `posters_dir` with the movie id encoded in the
# part of its file name before the first '.'.
movieIds = []
files = []
for file in os.listdir(posters_dir):
    movieId = file.split('.')[0]
    try:
        movieIds.append(int(movieId))
    except ValueError:
        # Stray entry such as '.DS_Store' or '.ipynb_checkpoints': not a poster.
        continue
    files.append(file)

# Cast the key explicitly: with no posters the column would otherwise be
# object-typed and the merge against the integer `movieId` would raise.
ids_and_files = pd.DataFrame({'movieId': movieIds, 'poster_img': files}).astype({'movieId': 'int64'})

# Keep only movies that have a poster, and persist the table for later use.
movies_and_posters = pd.merge(movies, ids_and_files, how='inner', on=['movieId'])
movies_and_posters.to_csv('movies_and_posters.csv', sep=';')
display(movies_and_posters.head())

Unnamed: 0,movieId,title,genres,imdbId,tmdbId,poster_img
0,12,Dracula: Dead and Loving It (1995),Comedy|Horror,112896,12110,12.jpg
1,24,Powder (1995),Drama|Sci-Fi,114168,12665,24.jpg
2,36,Dead Man Walking (1995),Crime|Drama,112818,687,36.jpg
3,40,"Cry, the Beloved Country (1995)",Drama,112749,34615,40.jpg
4,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,114369,807,47.jpg
