In [1]:
import pandas as pd
import numpy as np
import requests
import os
from IPython.display import display
import urllib.request
import time
import shutil

import random
random.seed(1)

In [2]:
if not os.path.exists('ml-20m'):

    import urllib.request
    urllib.request.urlretrieve('http://files.grouplens.org/datasets/movielens/ml-20m.zip', 'ml-20m.zip')

    import zipfile
    with zipfile.ZipFile('ml-20m.zip',"r") as zip_ref:
        zip_ref.extractall()
        
    os.remove('ml-20m.zip')

In [3]:
movies = pd.read_csv('ml-20m/movies.csv')
movies.dropna(inplace=True)
movies = movies[(movies['genres'] != '(no genres listed)') & (movies['genres'].map(lambda r: 'IMAX' not in r))]

links = pd.read_csv('ml-20m/links.csv')
links.dropna(inplace=True)
links.tmdbId = links.tmdbId.astype(int)

movies = pd.merge(movies, links, how='inner', on=['movieId'])
movies.dropna(inplace=True)
movies.head()

#links.head()
#movies.head()

Unnamed: 0,movieId,title,genres,imdbId,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357
4,5,Father of the Bride Part II (1995),Comedy,113041,11862


In [4]:
from collections import Set

def calc_existing_genres(movies):
    genres_set = set()
    for genre_list in movies['genres']:
        for genre in genre_list.split('|'):
            genres_set.add(genre)
    return genres_set
        
calc_existing_genres(movies)

{'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

In [5]:
if not os.path.exists('extended_movie_data.csv'):
    movies['release_year'] = ''
    movies['poster_url'] = ''
    movies.to_csv('extended_movie_data.csv', sep=';', index=False)
    
movies = pd.read_csv('extended_movie_data.csv', sep=';', keep_default_na=False)
movies.head()

Unnamed: 0,movieId,title,genres,imdbId,tmdbId,release_year,poster_url,language
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862,1995,https://image.tmdb.org/t/p/w300/rhIRbceoE9lR4v...,en
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844,1995,https://image.tmdb.org/t/p/w300/vgpXmVaVyUL7GG...,en
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602,1995,https://image.tmdb.org/t/p/w300/6ksm1sjKMFLbO7...,en
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357,1995,https://image.tmdb.org/t/p/w300/16XOMpEaLWkrcP...,en
4,5,Father of the Bride Part II (1995),Comedy,113041,11862,1995,https://image.tmdb.org/t/p/w300/e64sOI48hQXyru...,en


In [6]:
from dateutil.parser import parse

tmdb_movie_url = 'https://api.themoviedb.org/3/movie/{}?api_key=c4d5a44a2b7b36e4d40d0d1e713a8fee'
poster_base_url = 'https://image.tmdb.org/t/p/w300{}'


def get_more_movie_data(tmdbId): 
    try:
        response = requests.get(tmdb_movie_url.format(tmdbId))
    except:
        print('Data {} couldn\'t be retrieved'.format(tmdbId))
        return None
    
    tmdb_movie_json = response.json()
    
    year = 'not found'
    language = 'not found'
    poster_url = 'not found'
    
    try:
        date_string = tmdb_movie_json['release_date']
    
        try:
            dt = parse(date_string)
            year = dt.year
        except:
            print(tmdb_movie_json)
            print('Date {} can\'t be parsed'.format(date_string))
    except:
        pass
    
    try:
        language = tmdb_movie_json['original_language']
    except:
        pass
    
    try:
        poster_url = poster_base_url.format(tmdb_movie_json['poster_path'])
    except:
        pass
    
    return year, poster_url, language
    

In [7]:
count = 0

for index, row in movies.iterrows():
    if row['release_year'] == '':
        time.sleep(0.5)
        extended_movie_data = get_more_movie_data(row.tmdbId)
        
        if extended_movie_data is None:
            movies.iat[index, movies.columns.get_loc('release_year')] = 'not found'
            movies.iat[index, movies.columns.get_loc('poster_url')] = 'not found'
            movies.iat[index, movies.columns.get_loc('language')] = 'not found'
        else:
            release_year, poster_url, language = extended_movie_data
            
            movies.iat[index, movies.columns.get_loc('release_year')] = release_year
            movies.iat[index, movies.columns.get_loc('poster_url')] = poster_url
            movies.iat[index, movies.columns.get_loc('language')] = language
            
        count += 1
        if count % 20 == 0:
            print('{} movies extended'.format(count))
            movies.to_csv('extended_movie_data.csv', sep=';', index=False)
 

movies.to_csv('extended_movie_data.csv', sep=';', index=False)

20 movies extended
40 movies extended
60 movies extended
80 movies extended
100 movies extended
120 movies extended
140 movies extended
160 movies extended
180 movies extended
200 movies extended
220 movies extended
240 movies extended
{'adult': False, 'backdrop_path': None, 'belongs_to_collection': None, 'budget': 0, 'genres': [{'id': 18, 'name': 'Drama'}, {'id': 10749, 'name': 'Romance'}], 'homepage': None, 'id': 99885, 'imdb_id': 'tt0979876', 'original_language': 'en', 'original_title': 'Divine Intervention', 'overview': "Four friends fight an insane man's crusade to eliminate sinners from a small Western town.", 'popularity': 0.6, 'poster_path': '/iXZ6bn4G3nzGChawX9PLLnjMTqI.jpg', 'production_companies': [], 'production_countries': [], 'release_date': '', 'revenue': 0, 'runtime': 87, 'spoken_languages': [], 'status': 'Released', 'tagline': 'His Judgement Cometh...', 'title': 'Divine Intervention', 'video': False, 'vote_average': 0.0, 'vote_count': 0}
Date  can't be parsed
260 movie

4380 movies extended
{'adult': False, 'backdrop_path': None, 'belongs_to_collection': None, 'budget': 0, 'genres': [], 'homepage': None, 'id': 139909, 'imdb_id': 'tt0140826', 'original_language': 'fi', 'original_title': 'Aurinkotuuli', 'overview': '', 'popularity': 0.6, 'poster_path': None, 'production_companies': [], 'production_countries': [], 'release_date': '', 'revenue': 0, 'runtime': None, 'spoken_languages': [], 'status': 'Released', 'tagline': '', 'title': 'Aurinkotuuli', 'video': False, 'vote_average': 0.0, 'vote_count': 0}
Date  can't be parsed
4400 movies extended
{'adult': False, 'backdrop_path': None, 'belongs_to_collection': None, 'budget': 0, 'genres': [], 'homepage': None, 'id': 367678, 'imdb_id': 'tt0185048', 'original_language': 'en', 'original_title': 'Enola Gay and the Atomic Bombing of Japan', 'overview': 'American Documentary', 'popularity': 0.6, 'poster_path': None, 'production_companies': [], 'production_countries': [], 'release_date': '', 'revenue': 0, 'runtime

6160 movies extended
6180 movies extended
6200 movies extended
6220 movies extended
6240 movies extended
6260 movies extended
6280 movies extended
6300 movies extended
6320 movies extended
6340 movies extended
6360 movies extended
6380 movies extended
6400 movies extended
6420 movies extended
6440 movies extended
6460 movies extended


In [10]:
movies['local_poster_file'] = ''
movies.to_csv('extended_movie_data_with_local_files.csv', sep=';', index=False)

In [12]:
filtered_movies = movies[movies.release_year != 'not found']
filtered_movies['release_year'] = pd.to_numeric(filtered_movies.release_year)
filtered_movies = filtered_movies[(filtered_movies.release_year >= 1995) & (movies.language == 'en')]
filtered_movies.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
  This is separate from the ipykernel package so we can avoid doing imports until


(10874, 9)

In [None]:
for index, row in filtered_movies.iterrows():
    poster_url = row.poster_url
    if row.local_poster_file == '' and poster_url != '' and poster_url != 'not found':
        try:
            poster_response = requests.get(poster_url, stream=True)
            local_poster_file = row.movieId + '.' + poster_url.split('.')[-1]
            poster_save_path = os.path.join('posters', local_poster_file)
            with open(poster_save_path, 'wb') as out_file:
                shutil.copyfileobj(poster_response.raw, out_file)
            
            movies.iat[index, movies.columns.get_loc('local_poster_file')] = local_poster_file 
            
            count += 1
            if count % 10 == 0:
                print('{} posters saved'.format(count))
                movies.to_csv('extended_movie_data_with_local_files.csv', sep=';', index=False)
        except:
            print('Poster {} couldn\'t be retrieved'.format(poster_url))
            continue
            
        
movies.iat[index, movies.columns.get_loc('local_poster_file')] = local_poster_file         

Poster https://image.tmdb.org/t/p/w300/rhIRbceoE9lR4veEXuwCC2wARtG.jpg couldn't be retrieved
Poster https://image.tmdb.org/t/p/w300/vgpXmVaVyUL7GGiDeiK1mKEKzcX.jpg couldn't be retrieved
Poster https://image.tmdb.org/t/p/w300/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg couldn't be retrieved
Poster https://image.tmdb.org/t/p/w300/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg couldn't be retrieved
Poster https://image.tmdb.org/t/p/w300/e64sOI48hQXyru7naBFyssKFxVd.jpg couldn't be retrieved
Poster https://image.tmdb.org/t/p/w300/zMyfPUelumio3tiDKPffaUpsQTD.jpg couldn't be retrieved
Poster https://image.tmdb.org/t/p/w300/zENmHYYCsR6xgdyh2uHXPX4tyXE.jpg couldn't be retrieved
Poster https://image.tmdb.org/t/p/w300/sGO5Qa55p7wTu7FJcX4H4xIVKvS.jpg couldn't be retrieved
Poster https://image.tmdb.org/t/p/w300/eoWvKD60lT95Ss1MYNgVExpo5iU.jpg couldn't be retrieved
Poster https://image.tmdb.org/t/p/w300/5c0ovjT41KnYIHYuF4AWsTe3sKh.jpg couldn't be retrieved
Poster https://image.tmdb.org/t/p/w300/lymPNGLZgPHuqM29rKMGV46ANij.jpg

Poster https://image.tmdb.org/t/p/w300/8TfLAfIh5Qxp2J4ZjOafHYhWtDb.jpg couldn't be retrieved
Poster https://image.tmdb.org/t/p/w300/5A8gKzOrF9Z7tSUX6xd5dEx4NXf.jpg couldn't be retrieved
Poster https://image.tmdb.org/t/p/w300/yhFcbTCnsWjg3nH3PLL6RoltjqS.jpg couldn't be retrieved
Poster https://image.tmdb.org/t/p/w300/tQ6HEWNxvbeF2WklTVEp3su446F.jpg couldn't be retrieved
Poster https://image.tmdb.org/t/p/w300/2qAgGeYdLjelOEqjW9FYvPHpplC.jpg couldn't be retrieved
Poster https://image.tmdb.org/t/p/w300/yTDzXDKNGlmGYk5HuEeZUAypo1p.jpg couldn't be retrieved
Poster https://image.tmdb.org/t/p/w300/l00ALsdVyRGmPIp5EWlEoplki0X.jpg couldn't be retrieved
Poster https://image.tmdb.org/t/p/w300/x2n4YIVsms3JL8qCfzrbOg2yx0x.jpg couldn't be retrieved
Poster https://image.tmdb.org/t/p/w300/yLEdmdH8sgK0hbHT4Oie1l1pt5d.jpg couldn't be retrieved
Poster https://image.tmdb.org/t/p/w300/kFjffUg7zcST9AW4GoWiJFoNsH3.jpg couldn't be retrieved
Poster https://image.tmdb.org/t/p/w300/dMSEHHiHaCS5lZ7gfkuAH701sBW.jpg

In [52]:


#posters_base_url = 'https://api.themoviedb.org/3/movie/{}/images?api_key=c4d5a44a2b7b36e4d40d0d1e713a8fee'

poster_base_url = 'https://image.tmdb.org/t/p/w300{}'

#random_movies = movies.sample(frac=1, random_state=1)

count = 0
for index, row in random_movies.iterrows():
    

    #break
    
    if row.movieId not in existing_posters:
        
        #print(row.movieId)
        
        time.sleep(0.5)
        
        poster_data = get_poster_data(row.tmdbId)
        
        if poster_data is not None:
            
            poster_url, poster_save_path = poster_data
            try:
                poster_response = requests.get(poster_url, stream=True)
                with open(poster_save_path, 'wb') as out_file:
                    shutil.copyfileobj(poster_response.raw, out_file)
                count += 1
                if count % 10 == 0:
                    print('{} posters saved'.format(count))
            except:
                print('Poster {} couldn\'t be retrieved'.format(poster_url))
                continue
    else:
        count += 1
        if count % 10 == 0:
            print('{} posters saved'.format(count))
        
    if count >= 2500:
        break
    
#response.json()

74297
2256
109667
98609
52202
93838
6530
126546
66118
117513
2637
6136
97730
491
81075
109671
1891
79565
99169
4276
7770
10 posters saved
4205
67869
2388
25737
65780
90897
4619
111119
26726
5005
108542
20 posters saved
552
47736
2518
117527
6054
88977
84730
6850
1143
83962
515
4939
30 posters saved
106495
73390
3265
32076
121143
39777
4634
80659
90556
3014
40 posters saved
102874
26410
50898
97008
2245
103519
123252
5184
91537
46790
101287
4981
6730
6141
60365
96543
50 posters saved
62925
33972
70978
78272
98385
126957
4869
60 posters saved
8802
5781
127044
111445
2565
117545
6073
100298
110649
7565
36850
27722
31502
84467
122045
1069
121677
2035
60475
114172
74791
7991
70 posters saved
8039
Data 11574 couldn't be retrieved
3333
1299


KeyboardInterrupt: 

In [None]:
posters_dir = os.path.join('.', 'posters_1995_up')
os.makedirs(posters_dir, exist_ok=True)

existing_posters = set()
for file in os.listdir(posters_dir):
    existing_posters.add(int(file.split('.')[0]))
    
print(existing_posters)

In [16]:
movieIds = []
files = []
for file in os.listdir(posters_dir):
    movieId = file.split('.')[0]
    movieIds.append(int(movieId))
    files.append(file)
    
ids_and_files = pd.DataFrame({'movieId': movieIds, 'poster_img': files})
movies_and_posters = pd.merge(movies, ids_and_files, how='inner', on=['movieId'])
movies_and_posters.to_csv('movies_and_posters.csv', sep=';')
display(movies_and_posters.head())

Unnamed: 0,movieId,title,genres,imdbId,tmdbId,poster_img
0,12,Dracula: Dead and Loving It (1995),Comedy|Horror,112896,12110,12.jpg
1,24,Powder (1995),Drama|Sci-Fi,114168,12665,24.jpg
2,36,Dead Man Walking (1995),Crime|Drama,112818,687,36.jpg
3,40,"Cry, the Beloved Country (1995)",Drama,112749,34615,40.jpg
4,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,114369,807,47.jpg
