In [62]:
import pandas as pd
import numpy as np
import requests
import os
from IPython.display import display
import urllib.request
import time
import shutil

import random
# Seed the stdlib RNG so the random.choice poster pick further down is repeatable across runs.
random.seed(1)

In [25]:
# Download and unpack the MovieLens 20M dataset once; the cell is skipped
# entirely when the extracted 'ml-20m' directory is already present.
if not os.path.exists('ml-20m'):
    import zipfile

    try:
        # HTTPS so the archive cannot be tampered with in transit.
        urllib.request.urlretrieve('https://files.grouplens.org/datasets/movielens/ml-20m.zip', 'ml-20m.zip')

        with zipfile.ZipFile('ml-20m.zip', "r") as zip_ref:
            zip_ref.extractall()
    finally:
        # Remove the archive even when the download or extraction fails, so a
        # partial zip is never left behind for the next run.
        if os.path.exists('ml-20m.zip'):
            os.remove('ml-20m.zip')

In [26]:
# Load the movie catalogue, discarding incomplete rows, titles without any
# listed genre and every title tagged IMAX.
movies = pd.read_csv('ml-20m/movies.csv').dropna()
has_genre = movies['genres'] != '(no genres listed)'
is_imax = movies['genres'].str.contains('IMAX', regex=False)
movies = movies[has_genre & ~is_imax]

# Rows with a missing id are dropped first so tmdbId can be cast to int.
links = (
    pd.read_csv('ml-20m/links.csv')
    .dropna()
    .assign(tmdbId=lambda frame: frame['tmdbId'].astype(int))
)

# Keep only the movies that have a links entry.
movies = movies.merge(links, how='inner', on=['movieId']).dropna()
movies.head()

In [53]:
# The abstract base classes live in collections.abc; the old `collections.Set`
# alias was removed in Python 3.10 and raises ImportError there.
from collections.abc import Set

def calc_existing_genres(movies):
    """Return the set of distinct genre names found in ``movies['genres']``.

    Each entry of ``movies['genres']`` is expected to be a string of genre
    names separated by ``'|'`` (e.g. ``'Drama|Horror'``).

    Parameters:
        movies: any mapping-like object (such as a DataFrame) whose
            ``'genres'`` item is an iterable of pipe-separated strings.

    Returns:
        set[str]: every genre name that occurs at least once; empty when
        there are no rows.
    """
    return {
        genre
        for genre_list in movies['genres']
        for genre in genre_list.split('|')
    }
        
# Show the distinct genres remaining in `movies` as this cell's output.
calc_existing_genres(movies)

{'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

In [None]:
# Download one randomly chosen poster per movie from TMDB until `max_posters`
# images have been saved to `posters_dir`.

# The TMDB key is read from the environment so it is never committed with the notebook.
tmdb_api_key = os.environ.get('TMDB_API_KEY')
if not tmdb_api_key:
    raise RuntimeError('Set the TMDB_API_KEY environment variable before running this cell.')

posters_base_url = 'https://api.themoviedb.org/3/movie/{}/images?api_key=' + tmdb_api_key

poster_base_url = 'https://image.tmdb.org/t/p/w300{}'

posters_dir = os.path.join('.', 'posters2')
os.makedirs(posters_dir, exist_ok=True)

max_posters = 1000      # stop after this many posters were actually saved
request_timeout = 10    # seconds; without a timeout a stalled connection hangs the cell forever
request_delay = 0.5     # pause between movies to stay gentle on the API

# Shuffle with a fixed seed so the same movies are visited in the same order on every run.
random_movies = movies.sample(frac=1, random_state=1)

count = 0
for index, row in random_movies.iterrows():
    response = requests.get(posters_base_url.format(row.tmdbId), timeout=request_timeout)
    if response.status_code in (401, 403):
        # A rejected key fails for every movie, so abort instead of crawling the whole list.
        response.raise_for_status()

    # Any other unsuccessful lookup (e.g. unknown movie) is skipped, as before.
    posters = (response.json().get('posters') or []) if response.ok else []

    if posters:
        random_poster = random.choice(posters)
        poster_url = poster_base_url.format(random_poster['file_path'])
        poster_file_name = str(row.movieId) + '.' + random_poster['file_path'].split('.')[-1]
        poster_path = os.path.join(posters_dir, poster_file_name)

        poster_response = requests.get(poster_url, timeout=request_timeout)
        if poster_response.ok:
            # `.content` is the fully decoded body; writing it only on success keeps
            # error pages from being stored as image files.
            with open(poster_path, 'wb') as out_file:
                out_file.write(poster_response.content)
            # Count only posters that really reached the disk.
            count += 1

    if count >= max_posters:
        break
    time.sleep(request_delay)

In [84]:
# Pair every downloaded poster file with its movie row and persist the result.
movieIds = []
files = []
# Sorted so the listing order does not depend on the file system.
for file in sorted(os.listdir(posters_dir)):
    movieId = file.split('.')[0]
    # Posters are named '<movieId>.<ext>'; skip anything else in the directory
    # (e.g. '.DS_Store', '.ipynb_checkpoints') instead of crashing in int().
    if not movieId.isdecimal():
        continue
    movieIds.append(int(movieId))
    files.append(file)

ids_and_files = pd.DataFrame({'movieId': movieIds, 'poster_img': files})
# Inner join: only movies that have a poster on disk are kept.
movies_and_posters = pd.merge(movies, ids_and_files, how='inner', on=['movieId'])
movies_and_posters.to_csv('movies_and_posters.csv', sep=';')
display(movies_and_posters.head())

Unnamed: 0,movieId,title,genres,imdbId,tmdbId,poster_path
0,2256,Parasite (1982),Horror|Sci-Fi,84472,48311,2256.jpg
1,6530,"Tenant, The (Locataire, Le) (1976)",Drama|Horror|Mystery|Thriller,74811,11482,6530.jpg
2,52202,His Kind of Woman (1951),Comedy|Crime|Drama|Film-Noir|Thriller,43643,33673,52202.jpg
3,61868,Jam (2006),Drama,482528,71139,61868.jpg
4,74297,Electric Shadows (Meng ying tong nian) (2004),Drama,424273,26137,74297.jpg
