In [None]:
import pandas as pd
import numpy as np
import io
import sys
import os.path
import urllib.request
from tqdm import tqdm
from os import listdir
from PIL import Image
import glob

# Display configuration: never truncate long cell text in dataframes
# (poster URLs are long) and always print numpy arrays in full.
pd.options.display.max_colwidth = None
np.set_printoptions(threshold=sys.maxsize)


In [None]:
# Load the movie metadata CSV (the file is Latin-1 encoded, not UTF-8)
# and preview the first rows.
df = pd.read_csv(
    "csv-files/MovieGenre.csv",
    encoding="ISO-8859-1",
)
df.head()

In [None]:
# Show the (rows, columns) dimensions of df.
df.shape

In [None]:
# True if at least one cell anywhere in df is missing (NaN/None).
check_nan_in_df = df.isna().to_numpy().any()
print(check_nan_in_df)

In [None]:
# Drop every row that has a missing value in any column,
# then show the remaining (rows, columns) count.
df = df.dropna(axis=0, how="any")
df.shape

In [None]:
# Method 1 to download images
#
# Fetch every poster URL listed in df and save it as Posters/<imdbId>.jpg.
# The imdbId of each poster that could not be downloaded or written is
# collected in `not_found`, which the following cell uses to filter df.

# Make sure the output folder exists; without it every open() below fails
# and all posters would be silently reported as "not found".
os.makedirs("Posters", exist_ok=True)

not_found = []
# total= lets tqdm show a percentage/ETA (iterrows() has no length of its own).
for index, row in tqdm(df.iterrows(), total=len(df)):

    url = row['Poster']
    imdb_id = row['imdbId']

    file_path = "Posters/" + str(imdb_id) + ".jpg"

    try:
        # Context managers close the HTTP response and the output file
        # even when reading or writing raises.
        with urllib.request.urlopen(url) as response:
            data = response.read()
        with open(file_path, 'wb') as poster_file:
            poster_file.write(data)
    except Exception:
        # Best-effort download: any failure (bad URL, HTTP error, network or
        # disk error) just marks the poster as missing. Catching Exception
        # rather than using a bare except keeps Ctrl-C / kernel interrupt
        # working during this long-running loop.
        not_found.append(imdb_id)

print("Number of posters not found = ", len(not_found))
print("Following IMDB_ID posters were not found::", not_found)

In [None]:
# Keep only the rows whose imdbId is NOT in the list of failed downloads,
# then show the resulting (rows, columns) count.
df_2 = df.loc[~df["imdbId"].isin(not_found)]
df_2.shape

In [None]:
# Check which downloaded images are corrupt.
# Corrupt files are only reported and collected in `bad_images`;
# nothing is deleted from disk here.
bad_images = []
for file in glob.glob("Posters/*.jpg"):

    try:
        # The context manager closes the underlying file handle after each
        # check, so scanning thousands of posters does not leak descriptors.
        with Image.open(file) as img:  # open image file
            img.verify()  # verify it is an image
    except (OSError, SyntaxError):
        print('Bad file:', file)

        bad_images.append(file)

print("Number of corrupt files:", len(bad_images))

In [None]:
# No corrupt files were found, so no further cleaning of missing or bad poster files is performed at this step

In [None]:
# Inspect the available columns and preview the filtered dataframe.
# Only the last expression of a cell is displayed automatically, so the
# column index has to be printed explicitly to be visible.
print(df_2.columns)
df_2.head()

In [None]:
# Reduce the dataframe to the three columns used downstream:
# IMDB id, title and genre.
df_3 = df_2.loc[:, ["imdbId", "Title", "Genre"]]
print(df_3.shape)
df_3.head()

In [None]:
# Count the poster files present on disk so the number can be compared
# with the number of rows left in the dataframe.

image_list = glob.glob("Posters/*.jpg")  # glob already returns a list of paths

print("Number of files found:", len(image_list))

In [None]:
# A mismatch between dataframe rows and poster files still exists, so
# further data cleaning is needed.
# That is performed in the next python script: "Clean_data".
# Export df_3 as CSV, to be used together with the images downloaded into
# the Posters folder.

# index is documented as a bool; pass False explicitly instead of relying on
# None being falsy, so the row index is left out of the file.
df_3.to_csv("MovieGenre_cleaned.csv", index=False)