In [1]:
#Import the required modules
import pandas as pd
import numpy as np
import io
import sys
import os.path
import urllib.request
from tqdm import tqdm
from os import listdir
from PIL import Image
import glob
import warnings
warnings.filterwarnings('ignore')

In [2]:
#Load the movie metadata (one row per title, including its poster URL)
df = pd.read_csv(filepath_or_buffer="Data/new.csv")
#Preview the first five rows
df.head(5)

Unnamed: 0,Title,Release_Year,Genre,Synopsis,Poster_URL,Movie_ID,Movie_URL,Duration,Certification,Voters,Rating
0,John Wick: Chapter 2,(2017),"Action,Crime,Thriller",After returning to the criminal underworld to ...,https://m.media-amazon.com/images/M/MV5BMjE2ND...,4425200,https://www.imdb.com/title/tt4425200/,122 min,A,438276,7.4
1,FBI: Most Wanted,(2020– ),"Action,Crime,Drama",It follows the division of the FBI tasked with...,https://m.media-amazon.com/images/M/MV5BZWE5Zm...,9742936,https://www.imdb.com/title/tt9742936/,45 min,,8681,6.9
2,Heartland,(II) (2007– ),"Drama,Family","A multi-generational saga set in Alberta, Cana...",https://m.media-amazon.com/images/M/MV5BY2Q2Zj...,1094229,https://www.imdb.com/title/tt1094229/,45 min,16,17385,8.5
3,Don't Look Up,(2021),"Comedy,Drama,Sci-Fi",Two low-level astronomers must go on a giant m...,https://m.media-amazon.com/images/M/MV5BZjcwZj...,11286314,https://www.imdb.com/title/tt11286314/,138 min,18,541908,7.2
4,Bosch,(2014–2021),"Crime,Drama",An L.A.P.D. homicide detective works to solve ...,https://m.media-amazon.com/images/M/MV5BZDBiYj...,3502248,https://www.imdb.com/title/tt3502248/,51 min,18,73251,8.5


In [3]:
#Dimensions of the dataframe as (rows, columns)
row_count, column_count = df.shape
(row_count, column_count)

(10000, 11)

In [4]:
#Drop duplicates
# Rows sharing a Movie_ID are collapsed to their first occurrence (pandas default).
# The result is rebound to `df` instead of using inplace=True, so the object
# produced by the loading cell is not mutated behind the reader's back.
df = df.drop_duplicates(subset="Movie_ID")

#Check null values(any column)
# True as soon as a single cell anywhere in the frame is missing.
check_nan_in_df = df.isnull().values.any()
print(check_nan_in_df)

True


In [5]:
#Keep only the rows in which every column has a value
df = df[df.notna().all(axis=1)]
df.shape

(6910, 11)

In [7]:
#Disabled: when uncommented, these lines sample 100 rows, cast Movie_ID to str and sort by it
#df = df.sample(100)
#df['Movie_ID'] = df['Movie_ID'].astype(str)
#df = df.sort_values(by = 'Movie_ID')

In [8]:
#Method to download images using urllib
# Each poster is saved as Posters/<Movie_ID>.jpg.
#   found     -> file paths of the posters that were written successfully
#   not_found -> Movie_IDs whose download or write failed

# Make sure the target folder exists; otherwise every open() below fails and
# all rows are silently reported as "not found".
os.makedirs("Posters", exist_ok=True)

found = []
not_found = []
for index, row in tqdm(df.iterrows(), total=len(df)):
    url = row['Poster_URL']
    imdb_id = row['Movie_ID']

    file_path = "Posters/" + str(imdb_id) + ".jpg"

    try:
        # The with-blocks close the HTTP response and the output file even when
        # an error occurs; the timeout keeps one stalled connection from
        # blocking the whole run.
        with urllib.request.urlopen(url, timeout=30) as response:
            data = response.read()
        with open(file_path, 'wb') as poster_file:
            poster_file.write(data)
        found.append(file_path)
    except Exception:
        # Best-effort download: any failure marks the poster as missing.
        # Catching Exception (not a bare except) lets Ctrl-C / kernel
        # interrupt stop the loop instead of being recorded as a miss.
        not_found.append(imdb_id)

print("Number of posters not found = ", len(not_found))
print("Following IMDB_ID posters were not found::", not_found)

6910it [1:09:24,  1.66it/s]

Number of posters not found =  5
Following IMDB_ID posters were not found:: [11703244, 6431312, 12968224, 22375730, 26007910]





In [13]:
# Keep only the rows whose Movie_ID is not listed in not_found
df = df.loc[lambda frame: ~frame['Movie_ID'].isin(not_found)]
df.shape

(6905, 12)

In [10]:
# Check which downloaded images are corrupt and record them in bad_images
# (this cell only reports them; nothing is deleted from disk or from df)
bad_images = []
for file in glob.glob("Posters/*.jpg"):
    try:
        # The with-block closes the file handle that Image.open leaves open
        with Image.open(file) as img:
            img.verify()  # raises if the file is not a valid image
    except (IOError, SyntaxError):
        print('Bad file:', file)
        bad_images.append(file)

print("Number of corrupt files:", len(bad_images))

Number of corrupt files: 0


In [11]:
# #Checking if all the imdb_id listed here actually have its poster image
# for file in glob.glob("Posters/*.jpg"):
#     image_list.append(file)
# print("Number of files found:", len(image_list))

# Build each row's poster path from its own Movie_ID (same "Posters/<id>.jpg"
# pattern the download cell uses). Assigning the positional `found` list
# instead only works while `found` and `df` have identical length and order,
# which depends on the order the cells were executed in.
df["Path"] = "Posters/" + df["Movie_ID"].astype(str) + ".jpg"

# Every path must belong to a poster that was actually downloaded.
paths_without_download = set(df["Path"]) - set(found)
assert not paths_without_download, "rows without a downloaded poster are still in df"

In [12]:
# Write the dataframe (now including the Path column) to disk without the index, then show it
df.to_csv(path_or_buf='Data/data_with_path.csv', index=False)
display(df)

Unnamed: 0,Title,Release_Year,Genre,Synopsis,Poster_URL,Movie_ID,Movie_URL,Duration,Certification,Voters,Rating,Path
0,John Wick: Chapter 2,(2017),"Action,Crime,Thriller","After returning to the criminal underworld to repay a debt, John Wick discovers that a large bounty has been put on his life.","https://m.media-amazon.com/images/M/MV5BMjE2NDkxNTY2M15BMl5BanBnXkFtZTgwMDc2NzE0MTI@._V1_QL75_UX280_CR0,3,280,414_.jpg",4425200,https://www.imdb.com/title/tt4425200/,122 min,A,438276,7.4,Posters/4425200.jpg
2,Heartland,(II) (2007– ),"Drama,Family","A multi-generational saga set in Alberta, Canada and centered on a family getting through life together in both happy and trying times.","https://m.media-amazon.com/images/M/MV5BY2Q2ZjdhZGYtMmMwMS00ZWJhLTlhMjktNDU0ZDJiN2M3ZGNkXkEyXkFqcGdeQXVyMzc0ODEwMg@@._V1_QL75_UX280_CR0,3,280,414_.jpg",1094229,https://www.imdb.com/title/tt1094229/,45 min,16,17385,8.5,Posters/1094229.jpg
3,Don't Look Up,(2021),"Comedy,Drama,Sci-Fi",Two low-level astronomers must go on a giant media tour to warn humankind of an approaching comet that will destroy planet Earth.,"https://m.media-amazon.com/images/M/MV5BZjcwZjY3NjAtNzkxZS00NmFjLTg1OGYtODJmMThhY2UwMTc5XkEyXkFqcGdeQXVyODE5NzE3OTE@._V1_QL75_UX280_CR0,3,280,414_.jpg",11286314,https://www.imdb.com/title/tt11286314/,138 min,18,541908,7.2,Posters/11286314.jpg
4,Bosch,(2014–2021),"Crime,Drama",An L.A.P.D. homicide detective works to solve the murder of a 13-year-old boy while standing trial in federal court for the murder of a serial killer.,"https://m.media-amazon.com/images/M/MV5BZDBiYjg4OTgtZDg2YS00ZGIzLTk3ZWMtZWRlZDQ5M2I0MDNhXkEyXkFqcGdeQXVyMTEyMjM2NDc2._V1_QL75_UX280_CR0,3,280,414_.jpg",3502248,https://www.imdb.com/title/tt3502248/,51 min,18,73251,8.5,Posters/3502248.jpg
5,The Good Wife,(2009–2016),"Crime,Drama,Mystery","Alicia Florrick has been a good wife to her husband, a former state's attorney. After a very humiliating sex and corruption scandal, he is behind bars. She must now provide for her family and returns to work as a litigator in a law firm.","https://m.media-amazon.com/images/M/MV5BMTI2OTk4MDk3OF5BMl5BanBnXkFtZTcwMTY3NTc3Mg@@._V1_QL75_UX280_CR0,3,280,414_.jpg",1442462,https://www.imdb.com/title/tt1442462/,43 min,16,77212,8.4,Posters/1442462.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...
9992,Reign Over Me,(2007),Drama,A man who lost his family in the September 11 attack on New York City runs into his old college roommate. Rekindling the friendship is the one thing that appears able to help the man recover from his grief.,"https://m.media-amazon.com/images/M/MV5BOTYyMTExNTgwNF5BMl5BanBnXkFtZTcwMDY4MTEzMw@@._V1_QL75_UX280_CR0,3,280,414_.jpg",490204,https://www.imdb.com/title/tt0490204/,124 min,R,97522,7.4,Posters/490204.jpg
9993,The Humans,(2021),Drama,"During one evening, the Blake family gathers to celebrate thanksgiving in a broken-down flat newly rented by the daughter and her new man. As the darkness falls, we find that all have less to be thankful about.","https://m.media-amazon.com/images/M/MV5BZDFhNWJjZDQtODgxNC00ZGZmLTkyYzItNTc0ZTVlODZkYjZkXkEyXkFqcGdeQXVyMDM2NDM2MQ@@._V1_QL75_UX280_CR0,3,280,414_.jpg",10023286,https://www.imdb.com/title/tt10023286/,108 min,R,8597,6.2,Posters/10023286.jpg
9995,Man with a Plan,(2016–2020),Comedy,A dad finds out that parenting is harder than he thought after his wife goes back to work and he's left at home to take care of the kids.,"https://m.media-amazon.com/images/M/MV5BNjYwMzJiOGEtMjk4Ni00NDI0LTkxMDMtNTI3M2ZmZjFhZTgwXkEyXkFqcGdeQXVyNjg4NzAyOTA@._V1_QL75_UX280_CR0,3,280,414_.jpg",5536400,https://www.imdb.com/title/tt5536400/,30 min,All,10645,7.0,Posters/5536400.jpg
9996,Kung Pow: Enter the Fist,(2002),"Action,Comedy",A rough-around-the-edges martial arts master seeks revenge for his parents' death.,"https://m.media-amazon.com/images/M/MV5BMGQxZDEwZDctMjNkMi00YmIxLTgyN2MtYmJhYjEzZGY0NjljXkEyXkFqcGdeQXVyMTQxNzMzNDI@._V1_QL75_UX280_CR0,3,280,414_.jpg",240468,https://www.imdb.com/title/tt0240468/,81 min,PG-13,46970,6.2,Posters/240468.jpg
