In [34]:
import os
import sys
import requests
import numpy as np
import pandas as pd
import multiprocessing
from bs4 import BeautifulSoup
from contextlib import suppress

In [61]:
class Scrapper:
    
    def __init__(self, search_page, search_keyword_prefix,
                 search_text_postfix, page_num_prefix,
                 white_space_subtitute, search_img_class,
                 img_per_page):
        
        self.search_page           = search_page
        self.img_per_page          = img_per_page
        self.page_num_prefix       = page_num_prefix
        self.search_img_class      = search_img_class
        self.search_text_prefix    = search_text_prefix
        self.search_text_postfix   = search_text_postfix
        self.white_space_subtitute = white_space_subtitute
        self.search_keyword_prefix = search_keyword_prefix
        
        self.download_img = False
        self.create_folder("")
        
        
    def create_folder(self, folder_name):
        class_name      = self.__class__.__name__
        new_folder_name = "/" + "data" + "/" + class_name
        
        if(folder_name != ""):
            new_folder_name += "/" + folder_name
        
        file_location = os.getcwd() + new_folder_name
        if not os.path.isdir(file_location):
            os.makedirs(file_location)
            
        return file_location


    def search_image(self, keyword, num_of_img):
        total_search_pages = int(num_of_img/self.img_per_page) + 1
        
        key_text = keyword.replace(" ", self.white_space_subtitute)
        key_text = key_text.lower()
        page_no  = 1
        
        search_url  = self.search_page + self.search_keyword_prefix + \
                      key_text + self.search_text_postfix
        search_page = search_url
            
        while(page_no <= total_search_pages):
            search_request= requests.get(search_page).content
            web_page      = BeautifulSoup(search_request,'html.parser')
            search_result = web_page.find_all("a", 
                                              class_=self.search_img_class)
            page_no      += 1
            search_page   =search_url + self.page_num_prefix + str(page_no)
            
            for image_data in search_result:
                yield self.search_page + image_data["href"]
    
    
    def save_image(self, image, image_name):
        img = requests.get(image)
        folder = self.create_folder("Image")
        file_name = folder + "/" + image_name
        
        with suppress(RuntimeError):
            while img.status_code != 200:
                continue
        
        with open(file_name, 'wb') as file:
            file.write(img.content)
            

    def multiprocessor_scrapping(self, image_page):
        images     = self.get_data_frame()
        core_count = multiprocessing.cpu_count()
        sys.setrecursionlimit(1000000000)
        
        while(True):
            searched_images = []
            core_utilise    = 0
            
            for _ in range(core_count):
                with suppress(StopIteration):
                    page_link     = next(image_page)
                    core_utilise += 1
                    searched_images.append(page_link)
            
            if __name__ == '__main__':
                with multiprocessing.Pool(processes=core_utilise) as pool:
                    result = pool.map(self.image_data, searched_images)
            
            for page_image_data in result:
                images = images.append(page_image_data, ignore_index=True)
            
            if core_utilise < core_count:
                break
        
        return images
    
    def image_data(self, image_link):
        pass
    
    
    def get_data_frame(self):
        pass
    
    def scrap_it(self):
        pass

In [62]:
class gettyimages_scrapper(Scrapper):
    def __init__(self, keyword, num_of_img, download_img):
        self.search_page           = "https://www.gettyimages.in"
        self.img_per_page          = 60
        self.page_num_prefix       = "?page="
        self.search_img_class      = "search-result-asset-link"
        self.search_text_postfix   = ""
        self.white_space_subtitute = "-"
        self.search_keyword_prefix = "/photos/"
        
        self.keyword      = keyword
        self.num_of_img   = num_of_img
        self.download_img = download_img
        self.create_folder("")
    
    
    def image_data(self, image_link):
        tag_list      = []
        image_request = requests.get(image_link).content
        image_page    = BeautifulSoup(image_request,'html.parser')
        image         = image_page.find_all("img", src=True)[-2]
        
        for image_tag in image_page.find_all(class_ = 'keyword'):
            tag_list.append(image_tag.get_text())
            
        tags = self.filter_tag(tag_list)
        tags.update({'Name':image['alt'], 'URL':image['src']})
        
        if self.download_img:
            self.save_image(tags['URL'], tags['Name'])
            
        return tags
    
    def get_data_frame(self):
        columns_name = ['Image Name', 'Ethnicity', 'Nationality',
                        'Gender', 'Maturity', 'Image URL']
        return pd.DataFrame(columns = columns_name)
    
    def filter_tag(self, tag_list):
        gender           = []
        maturity         = []
        ethnicity        = []
        nationality      = []
        
        folder           = self.create_folder("")
        nationality_file = folder + "/" + 'demonyms.csv'
        nationality_list = pd.read_csv(nationality_file, header=None)[0]
        maturity_words   = ('Young', 'Old', 'Adult', 'Child', 'Mature')
        gender_words     = ('Men', 'Man', 'Boy', 'Boys' 'Males', 'Male',
                            'Girl', 'Girls', 'Women', 'Woman', 'Female',
                            'Females')
        
        for tag in tag_list:
            if any(word in tag for word in gender_words):
                gender.append(tag)
            if any(word in tag for word in maturity_words):
                maturity.append(tag)
            if 'Ethnicity' in tag:
                ethnicity.append(tag.replace(" Ethnicity", ""))
            elif any(word in tag for word in nationality_list):
                nationality.append(tag)
        
        tags = {'Ethnicity': ethnicity, 'Nationality': nationality,
                'Gender': gender, 'Maturity': maturity}
        
        for key in list(tags):
            if not tags[key]:
                tags.pop(key)
                
        return tags
    
    def scrap_it(self):
        image_pages = self.search_image(self.keyword, self.num_of_img)
        image_data  = self.multiprocessor_scrapping(image_pages)
        
        image_data.to_csv(self.create_folder("") + '/image_data.csv')

In [63]:
sc = gettyimages_scrapper("Hot Women", 50, True)
sc.scrap_it()

MissingSchema: Invalid URL '/gallery-app/assets/consumer_gallery/stories_mobile_453227066-c5ed1b83adbfce1fa55c51901ec3401a929e24d87c7d73e410724682f482e20e.jpg': No schema supplied. Perhaps you meant http:///gallery-app/assets/consumer_gallery/stories_mobile_453227066-c5ed1b83adbfce1fa55c51901ec3401a929e24d87c7d73e410724682f482e20e.jpg?

In [40]:
r = sc.search_image("Hot Women", 50)