In [1]:
import os, sys
import shutil
from pathlib import Path
from pathlib import Path
import urllib.request
import urllib
import imghdr
import posixpath
import re


class Bing:
    """Download images found by a Bing image search into a local directory.

    The search is performed against ``https://www.bing.com/images/async``;
    every ``murl`` entry found in the returned page is treated as an image
    URL and fetched with :mod:`urllib.request`.

    Args:
        query: Search phrase; it is URL-quoted before being sent.
        limit: Number of images to download. Must be an ``int``.
        output_dir: ``pathlib.Path`` of an existing directory that receives
            the files (``joinpath`` is called on it).
        adult: Value sent verbatim as the ``adlt`` query parameter.
        timeout: Socket timeout in seconds for image requests. Must be an
            ``int``.
        filters: Value sent as the ``qft`` query parameter; ``None`` is sent
            as an empty string.
        verbose: When true, progress messages are printed.

    Raises:
        AssertionError: If ``limit`` or ``timeout`` is not an ``int``.
    """

    # Extensions kept when they end the URL path; anything else is saved as ".jpg".
    _KNOWN_EXTENSIONS = frozenset(
        ("jpe", "jpeg", "jfif", "exif", "tiff", "gif", "bmp", "png", "webp", "jpg")
    )

    # Image URLs are embedded in the result page as HTML-escaped JSON fragments.
    _MURL_PATTERN = re.compile(r'murl&quot;:&quot;(.*?)&quot;')

    # The index request previously had no timeout and could block forever.
    # A floor is used so that a very small per-image timeout does not make
    # the search request itself fail.
    _MIN_INDEX_TIMEOUT = 30

    def __init__(self, query, limit, output_dir, adult, timeout,  filters='', verbose=True):
        # Raised explicitly (not via ``assert``) so the validation is not
        # stripped under ``python -O``; AssertionError is kept so existing
        # callers see the same exception type.
        if type(limit) is not int:
            raise AssertionError("limit must be integer")
        if type(timeout) is not int:
            raise AssertionError("timeout must be integer")

        self.download_count = 0
        self.query = query
        self.output_dir = output_dir
        self.adult = adult
        self.filters = filters
        self.verbose = verbose
        self.limit = limit
        self.timeout = timeout

        self.page_counter = 0
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
                          'AppleWebKit/537.11 (KHTML, like Gecko) '
                          'Chrome/23.0.1271.64 Safari/537.11',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Accept-Encoding': 'none',
            'Accept-Language': 'en-US,en;q=0.8',
            'Connection': 'keep-alive',
        }

    @staticmethod
    def _image_kind(data):
        """Return a short image-type name for ``data`` (bytes), or ``None``.

        ``imghdr`` is used when it can be imported. It is deprecated since
        Python 3.11 and removed in 3.13, so the import is guarded and a small
        magic-number check covering the common web formats is the fallback.
        """
        try:
            import imghdr
        except ImportError:
            imghdr = None
        if imghdr is not None:
            return imghdr.what(None, data)

        signatures = (
            (b"\x89PNG\r\n\x1a\n", "png"),
            (b"\xff\xd8\xff", "jpeg"),
            (b"GIF87a", "gif"),
            (b"GIF89a", "gif"),
            (b"BM", "bmp"),
            (b"II*\x00", "tiff"),
            (b"MM\x00*", "tiff"),
        )
        for magic, kind in signatures:
            if data.startswith(magic):
                return kind
        if data[:4] == b"RIFF" and data[8:12] == b"WEBP":
            return "webp"
        return None

    @classmethod
    def _extract_links(cls, page_source):
        """Return the image URLs found in ``page_source``, in page order.

        The captured text is HTML-escaped (the pattern itself is delimited by
        ``&quot;``), so entities such as ``&amp;`` are decoded to obtain a
        usable URL.
        """
        from html import unescape

        return [unescape(raw) for raw in cls._MURL_PATTERN.findall(page_source)]

    def _request_url(self):
        """Build the search URL for the current ``page_counter``."""
        qft = '' if self.filters is None else str(self.filters)
        return (
            'https://www.bing.com/images/async?q=' + urllib.parse.quote_plus(self.query)
            + '&first=' + str(self.page_counter) + '&count=' + str(self.limit)
            + '&adlt=' + self.adult + '&qft=' + qft
        )

    def save_image(self, link, file_path):
        """Fetch ``link`` and write it to ``file_path`` if it is an image.

        Raises:
            ValueError: If the downloaded bytes are not recognised as an image.
            Exception: Whatever ``urllib.request.urlopen`` or ``open`` raise.
        """
        request = urllib.request.Request(link, None, self.headers)
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(request, timeout=self.timeout) as response:
            image = response.read()
        if not self._image_kind(image):
            print('[Error]Invalid image, not saving {}\n'.format(link))
            raise ValueError('Invalid image, not saving {}\n'.format(link))
        with open(str(file_path), 'wb') as f:
            f.write(image)

    def download_image(self, link):
        """Download one image to ``output_dir`` as ``Image_<n>.<ext>``.

        ``download_count`` is incremented only when the download succeeds.
        Failures are reported on stdout and never raised, so one bad link
        does not abort the whole run.
        """
        # Optimistically claim the next number; released again on failure.
        self.download_count += 1
        try:
            path = urllib.parse.urlsplit(link).path
            filename = posixpath.basename(path).split('?')[0]
            file_type = filename.split(".")[-1]
            if file_type.lower() not in self._KNOWN_EXTENSIONS:
                file_type = "jpg"

            if self.verbose:
                print("[%] Downloading Image #{} from {}".format(self.download_count, link))

            self.save_image(link, self.output_dir.joinpath("Image_{}.{}".format(
                str(self.download_count), file_type)))
            if self.verbose:
                print("[%] File Downloaded !\n")

        except Exception as e:
            # Deliberately broad: any network, decoding or file error for a
            # single link is logged and skipped.
            self.download_count -= 1
            print("[!] Issue getting: {}\n[!] Error:: {}".format(link, e))

    def run(self):
        """Index result pages and download images until ``limit`` is reached.

        Stops early when a page is empty or contains no link that has not
        already been tried. Errors from the index request itself propagate.
        """
        # Links already attempted. Consecutive pages can repeat links, and
        # without this the same image would be saved more than once.
        seen = set()
        while self.download_count < self.limit:
            if self.verbose:
                print('\n\n[!!]Indexing page: {}\n'.format(self.page_counter + 1))
            request = urllib.request.Request(self._request_url(), None, headers=self.headers)
            index_timeout = max(self.timeout, self._MIN_INDEX_TIMEOUT)
            with urllib.request.urlopen(request, timeout=index_timeout) as response:
                page_source = response.read().decode('utf8')
            if page_source == "":
                print("[%] No more images are available")
                break
            links = self._extract_links(page_source)
            if self.verbose:
                print("[%] Indexed {} Images on Page {}.".format(len(links), self.page_counter + 1))
                print("\n===============================================\n")

            # dict.fromkeys drops in-page duplicates while keeping page order.
            fresh = [link for link in dict.fromkeys(links) if link not in seen]
            if not fresh:
                # Nothing new on this page: requesting more pages would loop
                # forever without ever reaching the limit.
                print("[%] No more images are available")
                break
            seen.update(fresh)

            for link in fresh:
                if self.download_count >= self.limit:
                    break
                self.download_image(link)

            # NOTE(review): 'first' advances by 1 per iteration while 'count'
            # is `limit`, so pages presumably overlap heavily - confirm the
            # meaning of 'first' before changing the step.
            self.page_counter += 1
        print("\n\n[%] Done. Downloaded {} images.".format(self.download_count))
        print("===============================================\n")
        print("Khatam.. bye-bye... tata... good-bye.. gaya")
        print("\n===============================================\n")

def download(query, limit=100, output_dir='dataset', adult_filter_off=True,force_replace=False, timeout=60, verbose=True):
    """Download up to ``limit`` Bing image results for ``query``.

    Images are written to ``<output_dir>/<query>/`` (made absolute), which is
    created if necessary.

    Args:
        query: Search phrase; also used as the sub-directory name.
        limit: Number of images to download.
        output_dir: Parent directory for the per-query image directory.
        adult_filter_off: Accepted for interface compatibility; see the note
            in the body - it is currently not consulted.
        force_replace: When true, an existing image directory is deleted
            (with all its contents) before downloading.
        timeout: Timeout in seconds passed on to ``Bing``.
        verbose: Passed on to ``Bing`` to enable progress output.

    Raises:
        SystemExit: With status 1 if the image directory cannot be created.
    """
    # NOTE(review): `adult_filter_off` is ignored and 'on' is always sent.
    # Left unchanged on purpose, because honouring the parameter would alter
    # what every default caller requests - confirm the intended behaviour.
    adult = 'on'
    image_dir = Path(output_dir).joinpath(query).absolute()

    # The previous `Path.isdir(...)` does not exist on pathlib.Path and raised
    # AttributeError whenever force_replace was requested.
    if force_replace and image_dir.is_dir():
        shutil.rmtree(image_dir)

    # Create the directory (and parents); exist_ok avoids the
    # check-then-create race and still fails if the path is a regular file.
    try:
        image_dir.mkdir(parents=True, exist_ok=True)
    except (OSError, ValueError) as e:
        print('[Error]Failed to create directory.', e)
        sys.exit(1)

    print("\nPerfect Okay! baa panni thudangam\n")
    print("[%] Downloading Images to {}".format(str(image_dir.absolute())))
    # `verbose` must be passed by keyword: positionally it lands in the
    # `filters` parameter of Bing and ends up in the request URL.
    bing = Bing(query, limit, image_dir, adult, timeout, verbose=verbose)
    bing.run()


# Interactive entry point: ask for the search phrase and the image count,
# then download into ./dataset/<search phrase>/ with a 1-second timeout.
search_term = input("Enter the search Term ")
n = input("Enter the required number of images ")

download(
    query=search_term,
    limit=int(n),  # raises ValueError if the answer is not a whole number
    output_dir="dataset",
    timeout=1,
)



Enter the search Term  Car maruti 800 white back
Enter the required number of images  10



Perfect Okay! baa panni thudangam

[%] Downloading Images to /home/achu/luree Ai internship/Bing-Image scrapper/dataset/Car maruti 800 white back


[!!]Indexing page: 1

[%] Indexed 10 Images on Page 1.


[%] Downloading Image #1 from https://c1.staticflickr.com/3/2739/4224084933_655e882743.jpg
[%] File Downloaded !

[%] Downloading Image #2 from https://ic1.maxabout.us/autos/cars_india/M/2014/9/maruti-alto-800-superior-white.jpg
[%] File Downloaded !

[%] Downloading Image #3 from https://cheapbazzar.com/wp-content/uploads/2020/09/Maruti-Alto-800.jpg
[!] Issue getting: https://cheapbazzar.com/wp-content/uploads/2020/09/Maruti-Alto-800.jpg
[!] Error:: <urlopen error timed out>
[%] Downloading Image #3 from https://www.autovista.in/assets/img/new_cars_colour_variants/alto-800-colour-superior-white.jpg
[%] File Downloaded !

[%] Downloading Image #4 from https://media.mahindrafirstchoice.com/live_web_images/usedcarsimg/mfc/7/371720/front_view-20191219175350.jpg
[%] File Downloaded !

[%