In [1]:
import os
import sys
import requests
import functools
import numpy as np
import pandas as pd
import multiprocessing
from bs4 import BeautifulSoup

In [2]:
def is_ethnicity(given_tag):
    """Tell whether a tag is an ethnicity tag.

    Parameters
    ----------
    given_tag : the tag to inspect (presumably a keyword string scraped
        from an image page -- confirm against the caller).

    Returns
    -------
    bool
        True when 'Ethnicity' is contained in ``given_tag``, else False.
    """
    return 'Ethnicity' in given_tag

In [3]:
# Demonyms taken from the first column of the header-less 'demonyms.csv'.
nationality_list = pd.read_csv('demonyms.csv', header=None)[0]


def is_nationality(given_tag, nationality = nationality_list):
    """Tell whether a tag mentions one of the known demonyms.

    Parameters
    ----------
    given_tag : the tag to inspect (presumably a keyword string -- confirm
        against the caller).
    nationality : iterable of demonyms to look for; defaults to the column
        loaded from 'demonyms.csv' when this cell was executed.

    Returns
    -------
    bool
        True as soon as one demonym is found inside ``given_tag``,
        False when none is.
    """
    for demonym in nationality:
        if demonym in given_tag:
            return True
    return False

In [4]:
def is_gender(given_tag):
    """Tell whether a tag mentions a gender word.

    Parameters
    ----------
    given_tag : the tag to inspect (presumably a keyword string -- confirm
        against the caller).

    Returns
    -------
    bool
        True when any of the gender words is contained in ``given_tag``.

    Notes
    -----
    The check is a case-sensitive containment test, so e.g. 'Man' also
    matches inside 'Manager'.
    """
    # Every entry needs its own comma: adjacent string literals are silently
    # concatenated ('Boys' 'Males' would become the single word 'BoysMales').
    gender_words = ('Men', 'Man', 'Boy', 'Boys', 'Males', 'Male',
                    'Girl', 'Girls', 'Women', 'Woman', 'Female', 'Females')

    return any(word in given_tag for word in gender_words)

In [5]:
def is_maturity(given_tag):
    """Tell whether a tag mentions an age/maturity word.

    Parameters
    ----------
    given_tag : the tag to inspect (presumably a keyword string -- confirm
        against the caller).

    Returns
    -------
    bool
        True when 'Young', 'Old', 'Adult', 'Child' or 'Mature' is contained
        in ``given_tag`` (case-sensitive), else False.
    """
    for maturity_word in ('Young', 'Old', 'Adult', 'Child', 'Mature'):
        if maturity_word in given_tag:
            return True
    return False

In [6]:
def filter_tags(tags):
    """Sort scraped tags into ethnicity, nationality, gender and maturity.

    A tag may land in the gender and maturity buckets at the same time.
    Ethnicity takes precedence over nationality: a tag is only tested as a
    nationality when it is not an ethnicity tag. Ethnicity tags are stored
    with the " Ethnicity" suffix removed.

    Parameters
    ----------
    tags : iterable of tags to classify.

    Returns
    -------
    dict
        Keys 'Ethnicity', 'Nationality', 'Gender' and 'Maturity', each
        mapping to the list of matching tags in input order.
    """
    buckets = {'Ethnicity': [], 'Nationality': [],
               'Gender': [], 'Maturity': []}

    for current in tags:
        if is_gender(current):
            buckets['Gender'].append(current)

        if is_maturity(current):
            buckets['Maturity'].append(current)

        if is_ethnicity(current):
            buckets['Ethnicity'].append(current.replace(" Ethnicity", ""))
            continue

        if is_nationality(current):
            buckets['Nationality'].append(current)

    return buckets

In [7]:
def get_photo_tags(image_page_url):
    """Download an image detail page and classify its keyword tags.

    Parameters
    ----------
    image_page_url : URL of the image detail page to fetch.

    Returns
    -------
    The result of ``filter_tags`` applied to the text of every element on
    the page whose class is 'keyword'.
    """
    response = requests.get(image_page_url)
    parsed_page = BeautifulSoup(response.content, 'html.parser')

    keyword_texts = [node.get_text()
                     for node in parsed_page.find_all(class_ = 'keyword')]

    return filter_tags(keyword_texts)

In [8]:
def download_image():
    """Ask the user whether image files or only their URLs are wanted.

    Keeps prompting until the answer is 'I'/'i' or 'U'/'u'.

    Returns
    -------
    bool
        True for 'I'/'i' (download images), False for 'U'/'u' (URLs only).
    """
    while True:
        answer = input("Do You Want To Download Image Or URL (I/U) : ")

        if answer in ('I', 'i'):
            return True
        if answer in ('U', 'u'):
            return False

        print("Wrong Option Try Again\n")

In [9]:
def save_image(image):
    """Download one image and store it as a .jpg under ./data/img/.

    Parameters
    ----------
    image : mapping with the keys 'Image URL' (address to download) and
        'Image Name' (used as the file name, without extension).

    Notes
    -----
    Nothing is written when the server does not answer with status 200.
    Path separators inside the image name are replaced by '_' so that a
    name such as 'Man/Woman' cannot point into a non-existent sub-directory.
    """
    response = requests.get(image['Image URL'])
    if response.status_code != 200:
        return

    target_dir = os.path.join(os.getcwd(), 'data', 'img')
    # exist_ok avoids the check-then-create race when several worker
    # processes save their first image at the same moment.
    os.makedirs(target_dir, exist_ok=True)

    file_name = image['Image Name']
    for separator in ('/', os.sep, os.altsep):
        if separator:
            file_name = file_name.replace(separator, '_')

    with open(os.path.join(target_dir, file_name + '.jpg'), 'wb') as file:
        file.write(response.content)

In [10]:
def get_search_page(search_text, number_of_image):
    """Yield the <article> elements of successive Getty search result pages.

    The search URL is built from the lower-cased words of ``search_text``
    joined by '-'. The first page is requested without a page parameter,
    later pages with '?page=<n>'. ``int(number_of_image/60) + 1`` pages are
    requested in total. The ./data/ directory is created on first
    iteration if it is missing.

    Parameters
    ----------
    search_text : search phrase; split on whitespace.
    number_of_image : number of images wanted, used only to derive the
        page count.

    Yields
    ------
    The ``find_all('article')`` result of each parsed page, one per page.
    """
    data_dir = os.getcwd() + "/data/"
    if not os.path.isdir(data_dir):
        os.makedirs(data_dir)

    slug = ''.join(word.lower() + '-' for word in search_text.split())
    # Dropping the last character removes the trailing '-' (or, for an empty
    # search text, the trailing '/' of the base URL, as before).
    base_url = ("https://www.gettyimages.in/photos/" + slug)[:-1]

    total_pages = int(number_of_image/60) + 1

    for current_page in range(1, total_pages + 1):
        if current_page == 1:
            page_url = base_url
        else:
            page_url = base_url + '?page=' + str(current_page)

        html = requests.get(page_url).content
        yield BeautifulSoup(html, 'html.parser').find_all('article')

In [11]:
def get_page_data(image_webpage, download_option):
    """Collect name, URL and classified tags for every image on one page.

    Parameters
    ----------
    image_webpage : iterable of parsed <article> elements of one search
        result page; each must offer ``find('img')``.
    download_option : when truthy, every image is also saved to disk via
        ``save_image``.

    Returns
    -------
    pandas.DataFrame
        One row per image with the columns 'Image Name', 'Ethnicity',
        'Nationality', 'Gender', 'Maturity' and 'Image URL'. Tag columns
        hold lists of tags; categories without any tag are left missing.
        Articles without an <img> element are skipped.
    """
    image_page_start = "https://www.gettyimages.in/detail/photo/"
    image_link = "https://media.gettyimages.com/photos/"
    image_page_tag = "-royalty-free-image/"

    columns_name = ['Image Name', 'Ethnicity', 'Nationality',
                    'Gender', 'Maturity', 'Image URL']

    # Rows are gathered in a plain list and turned into a frame once:
    # DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
    records = []

    for image_data in image_webpage:
        image_extracted_data = image_data.find('img')
        if image_extracted_data is None:
            # No thumbnail in this article, so there is nothing to record.
            continue

        # Keep only the part of the thumbnail address before the query string.
        image_src = image_extracted_data['src']
        image_url = image_src.replace('?', " ").split()[0]

        # The detail page address is derived from the thumbnail address.
        image_page_url = image_url.replace(image_link, image_page_start)
        image_page_url = image_page_url.replace('-id', image_page_tag)

        image_information = {'Image Name': image_extracted_data['alt'],
                             'Image URL': image_url}

        if download_option:
            save_image(image_information)

        # Only categories that actually have tags become cells of the row.
        for tag_name, tag_element in get_photo_tags(image_page_url).items():
            if tag_element:
                image_information[tag_name] = tag_element

        records.append(image_information)

    return pd.DataFrame(records, columns=columns_name)

In [12]:
def scrap_page():
    """Interactively scrape Getty search results into data/image_data.csv.

    Prompts for a search keyword, the number of pictures wanted and whether
    image files should be downloaded, then processes every search result
    page in a pool of worker processes and writes the combined table to
    'data/image_data.csv'.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        When the picture count typed by the user is not an integer.

    Notes
    -----
    Does nothing unless the module runs as ``__main__``. All search result
    pages are fetched before the worker pool starts.
    """
    # Checked first so that a re-imported copy of this module (for example
    # inside a spawned worker process) neither prompts nor scrapes.
    if __name__ != '__main__':
        return

    keyword = str(input("\nEnter Keyword: "))
    pic_number = int(input("\nEnter The Number of Picture You Want To Download : "))
    download_option = download_image()

    # NOTE(review): presumably raised so that pickling the parsed pages for
    # the worker processes does not hit RecursionError -- confirm. A limit
    # this high also disables Python's protection against runaway recursion.
    sys.setrecursionlimit(1000000000)

    columns_name = ['Image Name', 'Ethnicity', 'Nationality',
                    'Gender', 'Maturity', 'Image URL']

    get_page_data_option = functools.partial(get_page_data,
                                             download_option = download_option)

    # Materialising the generator gives the exact page count, so the pool is
    # never asked for more pages than exist and never sized to zero.
    search_pages = list(get_search_page(keyword, pic_number))

    page_frames = []
    if search_pages:
        pool_size = max(1, min(multiprocessing.cpu_count(), len(search_pages)))
        with multiprocessing.Pool(processes=pool_size) as pool:
            page_frames = pool.map(get_page_data_option, search_pages)

    if page_frames:
        # One concat replaces the repeated DataFrame.append calls, which
        # were removed in pandas 2.0.
        image_data = pd.concat(page_frames, ignore_index=True)
    else:
        image_data = pd.DataFrame(columns=columns_name)

    os.makedirs('data', exist_ok=True)
    image_data.to_csv('data/image_data.csv')

In [None]:
# Start the interactive scraper: it prompts for a keyword, a picture count
# and the download mode, then writes its table to data/image_data.csv.
scrap_page()