In [1]:
import os
import sys
import requests
import functools
import numpy as np
import pandas as pd
import multiprocessing
from bs4 import BeautifulSoup

In [2]:
#This function searches for the word "Ethnicity".
#The Getty Images page usually uses the word "Ethnicity"
#in a tag to describe the ethnicity of a person in the image.
#If it finds the word, it returns True.

def is_ethnicity(given_tag):
    """Tell whether a tag is an ethnicity tag.

    Args:
        given_tag: The tag text to inspect.

    Returns:
        True when the case-sensitive text 'Ethnicity' occurs in
        ``given_tag``, otherwise False.
    """
    return 'Ethnicity' in given_tag

In [3]:
#demonyms.csv holds the list of words describing the nationality
#of people. If any of these words is found in the tag, it returns True.

# Column 0 of the header-less demonyms.csv, read once when this cell runs.
nationality_list = pd.read_csv('demonyms.csv', header=None).loc[:, 0]

def is_nationality(given_tag, nationality = nationality_list):
    """Tell whether a tag mentions one of the known nationality words.

    Args:
        given_tag: The tag text to inspect.
        nationality: Iterable of words to look for; defaults to the
            module-level ``nationality_list``.

    Returns:
        True as soon as one word of ``nationality`` is contained in
        ``given_tag`` (case-sensitive containment), otherwise False.
    """
    for demonym in nationality:
        if demonym in given_tag:
            return True
    return False

In [4]:
#A few of the image tags denote the gender of the person.
#This function looks for those words and returns True
#if it finds any of them.

def is_gender(given_tag):
    """Tell whether a tag contains one of the gender words.

    The check is a case-sensitive substring test, so a tag such as
    'Young Men' matches through the word 'Men'.

    Args:
        given_tag: The tag text to inspect.

    Returns:
        True if any gender word occurs in ``given_tag``, otherwise False.
    """
    # Every word needs its own comma: without one, adjacent string
    # literals are concatenated ('Boys' 'Males' became 'BoysMales').
    gender_words = ('Men', 'Man', 'Boy', 'Boys', 'Males', 'Male',
                    'Girl', 'Girls', 'Women', 'Woman', 'Female',
                    'Females')

    return any(word in given_tag for word in gender_words)

In [5]:
#A few of the image tags denote the maturity of the person.
#This function looks for those words and returns True
#if it finds any of them.

def is_maturity(given_tag):
    """Tell whether a tag contains one of the maturity words.

    Args:
        given_tag: The tag text to inspect.

    Returns:
        True if 'Young', 'Old', 'Adult', 'Child' or 'Mature' occurs in
        ``given_tag`` (case-sensitive containment), otherwise False.
    """
    for stage in ('Young', 'Old', 'Adult', 'Child', 'Mature'):
        if stage in given_tag:
            return True
    return False

In [6]:
#This function detects the tags which indicate ethnicity, nationality,
#gender and maturity, and returns a dictionary containing a list of each.

def filter_tags(tags):
    """Sort tags into ethnicity, nationality, gender and maturity lists.

    Args:
        tags: Iterable of tag texts.

    Returns:
        A dict with the keys 'Ethnicity', 'Nationality', 'Gender' and
        'Maturity', each mapping to the list of matching tags. Ethnicity
        entries have the text " Ethnicity" removed.
    """
    grouped = {'Ethnicity': [], 'Nationality': [],
               'Gender': [], 'Maturity': []}

    for label in tags:
        if is_gender(label):
            grouped['Gender'].append(label)

        if is_maturity(label):
            grouped['Maturity'].append(label)

        # The same word can describe both an ethnicity and a nationality,
        # so a tag recognised as ethnicity is never also counted as a
        # nationality.
        if is_ethnicity(label):
            grouped['Ethnicity'].append(label.replace(" Ethnicity", ""))
        elif is_nationality(label):
            grouped['Nationality'].append(label)

    return grouped

In [7]:
#It takes the URL of the page that has the tags, extracts the tags and
#uses filter_tags to categorize them.

def get_photo_tags(image_page_url):
    """Download an image detail page and categorise its keyword tags.

    Args:
        image_page_url: URL of the page that lists the image's tags.

    Returns:
        The dictionary produced by ``filter_tags`` for the texts of all
        elements with the class "keyword" on that page.
    """
    response = requests.get(image_page_url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Text of every HTML element carrying the class "keyword".
    keywords = [node.get_text() for node in soup.find_all(class_ = 'keyword')]

    return filter_tags(keywords)

In [8]:
#This function extracts the image name and image URL from an image tag.
def extract_image_info(image_extracted_data):
    """Build the information dictionary for one image element.

    Args:
        image_extracted_data: Mapping-like image element providing the
            'src' and 'alt' entries.

    Returns:
        A dict with 'Image Name' (the 'alt' text), 'Image URL' and the
        categorised tags returned by ``get_photo_tags``.
    """
    media_prefix = "https://media.gettyimages.com/photos/"
    detail_prefix = "https://www.gettyimages.in/detail/photo/"
    detail_suffix = "-royalty-free-image/"

    # Treat '?' as whitespace and keep the first chunk of 'src'.
    image_url = image_extracted_data['src'].replace('?', " ").split()[0]

    # The URL of the page containing the tags is derived from the image
    # URL by swapping the prefix and replacing '-id' with the page suffix.
    tag_page_url = (image_url
                    .replace(media_prefix, detail_prefix)
                    .replace('-id', detail_suffix))

    image_information = {'Image Name': image_extracted_data['alt'],
                         'Image URL': image_url}

    # Add the filtered tags to the dictionary.
    image_information.update(get_photo_tags(tag_page_url))

    return image_information

In [9]:
#It checks whether the folder named in the argument exists and, if not,
#creates it.

def create_folder(folder_name):
    """Create ``folder_name`` under the current working directory.

    Missing parent folders are created as well. Nothing happens when the
    folder already exists.

    Args:
        folder_name: Folder path relative to the current working directory.
    """
    file_location = os.path.join(os.getcwd(), folder_name)
    # exist_ok avoids the race between a separate "exists?" check and the
    # creation when several worker processes create the same folder.
    os.makedirs(file_location, exist_ok=True)

In [10]:
#This function asks the user whether they want to download the scraped images
#or just save the URLs of the pictures.

def image_download_choice():
    """Ask the user whether to download images or only keep their URLs.

    The prompt is repeated until a valid option is entered.

    Returns:
        True for 'I' or 'i' (download the images), False for 'U' or 'u'
        (URLs only).
    """
    prompt = "Do You Want To Download Image Or URL (I/U) : "

    while True:
        answer = input(prompt)

        if answer in ('U', 'u'):
            return False
        if answer in ('I', 'i'):
            return True

        print("Wrong Option Try Again\n")

In [11]:
#This function saves the image found at the URL, under the name given in
#the argument dictionary `image`, if the user wants to save it.

def save_image(image, download_option):
    """Download one image into data/img/ when downloading is enabled.

    Args:
        image: Dict providing 'Image URL' (where to fetch the image) and
            'Image Name' (file name without extension).
        download_option: When falsy, nothing is fetched or written.

    The file is written as '<Image Name>.jpg' below the current working
    directory, and only if the request answers with status code 200.
    """
    if not download_option:
        return

    response = requests.get(image['Image URL'])
    if response.status_code != 200:
        return

    create_folder("data/img/")

    target_path = os.getcwd() + "/data/img/" + image['Image Name'] + '.jpg'
    with open(target_path, 'wb') as file:
        file.write(response.content)

In [12]:
#This function returns a dataframe with the template required to store data.
def get_data_frame():
    """Return an empty DataFrame with the columns used to store image data.

    Returns:
        A DataFrame without rows whose columns are, in order: 'Image Name',
        'Ethnicity', 'Nationality', 'Gender', 'Maturity', 'Image URL'.
    """
    return pd.DataFrame(columns = ['Image Name', 'Ethnicity', 'Nationality',
                                   'Gender', 'Maturity', 'Image URL'])

In [13]:
#This function is a generator which yields the list of 'article'
#HTML tags found on a search results page. Every time it yields a result it
#moves on to the next page of the search results and yields it in the same
#form, until the required number of pages has been yielded.

def get_search_page(search_text, number_of_image):
    """Yield the 'article' elements of each search results page in turn.

    The "data" folder is created when the generator is first advanced.

    Args:
        search_text: Search phrase; its words are lower-cased and joined
            with '-' to form the search URL.
        number_of_image: Number of images wanted. The page count is
            ``int(number_of_image/60) + 1``, i.e. the code assumes 60
            images per results page.

    Yields:
        The result of ``find_all('article')`` for each results page,
        starting with page 1.
    """
    create_folder("data")
    number_of_page = int(number_of_image/60) + 1

    # Initial part of the URL followed by the dash-joined search words.
    search_url = ("https://www.gettyimages.in/photos/"
                  + '-'.join(word.lower() for word in search_text.split()))
    page_tag = '?page='  # only added when fetching beyond the 1st page

    for page_no in range(1, number_of_page + 1):
        if page_no == 1:
            search_link = search_url
        else:
            search_link = search_url + page_tag + str(page_no)

        html_page = requests.get(search_link).content
        # Parse the page that was just downloaded (the original passed the
        # undefined name `webpage`, which raised NameError).
        web_page = BeautifulSoup(html_page, 'html.parser')

        yield web_page.find_all('article')

In [14]:
#It extracts all the image data present in the search result page

def get_page_data(search_webpage, download_option):
    """Extract the data of every image on one search results page.

    Args:
        search_webpage: Iterable of result elements; each must offer
            ``find('img')``.
        download_option: Passed to ``save_image``; truthy means the image
            files are downloaded too.

    Returns:
        A DataFrame with the columns of ``get_data_frame()`` and one row
        per image. Entries whose value is empty are left out of the row
        and therefore show up as missing values.
    """
    records = []

    for image_data in search_webpage:  # individual image info
        image_extracted_data = image_data.find('img')
        if image_extracted_data is None:
            # No <img> inside this result element: nothing to extract.
            continue

        image_information = extract_image_info(image_extracted_data)
        save_image(image_information, download_option)

        # Drop the entries that carry no data. A new dict is built because
        # deleting keys while iterating over the dict raises RuntimeError.
        records.append({tag_name: tag_element
                        for tag_name, tag_element in image_information.items()
                        if tag_element})

    # Build the frame once instead of appending row by row
    # (DataFrame.append no longer exists in pandas 2.x).
    return pd.DataFrame(records, columns=get_data_frame().columns)

In [15]:
#This function utilizes multicore processing to process data, where each
#core processes one search result page. Here core_use is equal to the number
#of cores that will be used and to the number of search result pages that
#will be processed.

def multiprocessor_scrapping(search_page, core_use, download_option):
    """Scrape up to ``core_use`` search result pages, one per process.

    Args:
        search_page: Iterator yielding search result pages; at most
            ``core_use`` pages are taken from it.
        core_use: Number of pages to process and of worker processes to
            start. Zero or less means there is nothing to do.
        download_option: Forwarded to ``get_page_data``.

    Returns:
        One DataFrame holding the rows of all processed pages, or the empty
        template frame when no page was processed.
    """
    image_data = get_data_frame()
    if core_use <= 0:
        # Pool(processes=0) raises ValueError; nothing to scrape anyway.
        return image_data

    get_page_data_option  = functools.partial(get_page_data,
                                              download_option = download_option)

    # Take the pages one by one so that an exhausted iterator just ends the
    # batch instead of raising StopIteration.
    search_pages = []
    for _ in range(core_use):
        page = next(search_page, None)
        if page is None:
            break
        search_pages.append(page)

    if not search_pages:
        return image_data

    if __name__ == '__main__':
        with multiprocessing.Pool(processes=len(search_pages)) as pool:
            result = pool.map(get_page_data_option, search_pages)
    else:
        # Outside the main module no pool is started; process the pages
        # in this process so that `result` is always defined.
        result = [get_page_data_option(page) for page in search_pages]

    # Combine the frames of ALL pages (the original loop kept only the
    # last page because it overwrote its result on every iteration).
    return pd.concat(result, ignore_index=True)

In [16]:
def scrap_page():
    """Ask for a keyword and an image count, scrape, and save the result.

    The search result pages are processed in batches of
    ``multiprocessing.cpu_count()`` pages, followed by one smaller batch
    for the remaining pages. The collected data is written to
    'data/image_data.csv'.
    """
    keyword = str(input("\nEnter Keyword: "))
    pic_number = int(input("\nEnter The Number of Picture You Want To Download : "))
    download_option = image_download_choice()

    # NOTE(review): presumably raised so that the parsed pages can be
    # handed to the worker processes without hitting RecursionError --
    # confirm. A limit this high disables Python's guard against deep
    # recursion.
    sys.setrecursionlimit(1000000000)
    processes = multiprocessing.cpu_count()

    searched_page = get_search_page(keyword, pic_number)

    number_of_page = int(pic_number/60) + 1
    # Number of full batches and size of the last, smaller batch
    # (the original referenced the undefined `number_of_full_process`).
    full_process, incomplete_process = divmod(number_of_page, processes)

    batch_sizes = [processes] * full_process
    if incomplete_process:
        # Only run the last batch when pages are left; a batch of size 0
        # would ask for a pool with zero processes.
        batch_sizes.append(incomplete_process)

    frames = [multiprocessor_scrapping(searched_page, batch_size, download_option)
              for batch_size in batch_sizes]

    # Concatenate once at the end (DataFrame.append no longer exists in
    # pandas 2.x).
    if frames:
        image_data = pd.concat(frames, ignore_index=True)
    else:
        image_data = get_data_frame()

    image_data.to_csv('data/image_data.csv')

In [None]:
# Start the scraper only when this code runs as the main module. Worker
# processes started by multiprocessing may re-import the main module, and
# an unguarded call would launch the whole scrape again in every worker.
if __name__ == '__main__':
    scrap_page()