**Dataset Collection**

Collecting datasets of ~300 images for:

*   eggplant
*   olives
*   green beans
*   bell peppers
*   onions
*   potatoes
*   spinach
*   tomatoes
*   lettuce

using Bing Image Search API

In [None]:
import requests
from PIL import Image
from io import BytesIO
import os

import torch, torchvision
from torch import nn, optim
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import math

# Read the Bing subscription key from the environment so the secret never has
# to be pasted into (and later scrubbed from) the notebook. The fallback is an
# empty string, which must be replaced with a real key before any search runs.
subscription_key = os.environ.get("BING_SEARCH_V7_SUBSCRIPTION_KEY", "")
# Bing Image Search v7 endpoint that downloadImages sends its queries to.
search_url = "https://api.bing.microsoft.com/v7.0/images/search"

In [None]:
# Request headers sent with every search call; the API authenticates the
# caller through the subscription key carried in this header.
headers = {
    "Ocp-Apim-Subscription-Key": subscription_key,
}

Creating Directories for Datasets:

In [None]:
# One sub-folder per vegetable class, all located under "dataset/".
eggplant_dir = os.path.join("dataset", "eggplant")
olives_dir = os.path.join("dataset", "olives")
greenbeans_dir = os.path.join("dataset", "greenbeans")
peppers_dir = os.path.join("dataset", "bellpeppers")
onions_dir = os.path.join("dataset", "onions")
potatoes_dir = os.path.join("dataset", "potatoes")
spinach_dir = os.path.join("dataset", "spinach")
tomato_dir = os.path.join("dataset", "tomato")
lettuce_dir = os.path.join("dataset", "lettuce")

# exist_ok=True keeps the cell safe to re-run when the folders already exist.
for _class_dir in (
    eggplant_dir,
    olives_dir,
    greenbeans_dir,
    peppers_dir,
    onions_dir,
    potatoes_dir,
    spinach_dir,
    tomato_dir,
    lettuce_dir,
):
    os.makedirs(_class_dir, exist_ok=True)

**Function for Scraping Images via Bing Image Search API**

For this part, the quickstart guide at https://docs.microsoft.com/en-us/bing/search-apis/bing-image-search/quickstarts/rest/python was used as a reference

The function downloadImages will collect numImages images for the query and store them at the path specified by destination


Inputs:
*   query - the search term
*   numImages - number of images to collect
*   destination - destination folder to store scraped images
*   offset - offset at which to start scraping images
*   namingoffset - by default, images will be named from 0 to (numImages - 1). If namingoffset is not 0, images will be named from namingoffset to (namingoffset + numImages - 1) 


In [None]:
def downloadImages(query, numImages, destination, offset=0, namingoffset=0):
    """Scrape thumbnail images for a search query and save them as JPEG files.

    Results are requested from the image-search endpoint in pages of at most
    150, and every thumbnail that comes back is written into ``destination``
    under a zero-padded sequential name (``000.jpg``, ``001.jpg``, ...).

    Args:
        query: The search term.
        numImages: Number of images to collect. No request is made when this
            is zero or negative.
        destination: Folder in which the images are stored. It must already
            exist; this function does not create it.
        offset: Offset into the search results at which to start scraping.
        namingoffset: Number used to name the first saved image. Images are
            named from ``namingoffset`` to ``namingoffset + numImages - 1``.

    Raises:
        requests.HTTPError: If the search request or a thumbnail download
            comes back with an error status.

    Note:
        Fewer than ``numImages`` files are written when the search runs out
        of results before the requested number has been collected.
    """
    page_limit = 150  # largest batch requested from the API in one call
    imgNum = namingoffset
    numLeft = numImages

    # The caller's `offset` is used as-is for the first request; it is only
    # advanced from the API's paging information afterwards.
    while numLeft > 0:
        batch_count = min(numLeft, page_limit)
        params = {
            "q": query,
            "license": "public",
            "imageType": "photo",
            "count": batch_count,
            "offset": offset,
        }
        response = requests.get(search_url, headers=headers, params=params)
        response.raise_for_status()
        search_results = response.json()

        # A page can contain fewer results than requested, so iterate over
        # what actually came back rather than indexing up to batch_count.
        thumbnail_urls = [
            img["thumbnailUrl"]
            for img in search_results.get("value", [])[:batch_count]
        ]
        if not thumbnail_urls:
            break  # the search has no further results

        for url in thumbnail_urls:
            image_data = requests.get(url)
            image_data.raise_for_status()
            image = Image.open(BytesIO(image_data.content))
            # The JPEG writer rejects modes such as RGBA or P, so convert
            # those before saving instead of failing mid-download.
            if image.mode not in ("RGB", "L", "CMYK"):
                image = image.convert("RGB")
            full_path = os.path.join(destination, f"{imgNum:03}.jpg")
            image.save(full_path, "JPEG")
            imgNum += 1

        numLeft -= len(thumbnail_urls)

        # Without paging information there is no reliable next page to ask for.
        next_offset = search_results.get("nextOffset")
        if next_offset is None:
            break
        offset = next_offset

Eggplant Dataset:

In [None]:
# Collect 400 images for the query "eggplant" into the eggplant folder.
downloadImages("eggplant", numImages=400, destination=eggplant_dir)

In [None]:
# Zip the eggplant folder and download the archive from the Colab runtime.
!zip -r ./eggplant_set.zip ./dataset/eggplant
from google.colab import files
files.download("./eggplant_set.zip")

Olive Dataset:

In [None]:
# Collect 350 images for the query "olives" into the olives folder.
downloadImages("olives", numImages=350, destination=olives_dir)

In [None]:
# Zip the olives folder and download the archive from the Colab runtime.
!zip -r ./olives_set.zip ./dataset/olives
from google.colab import files
files.download("./olives_set.zip")

Green Beans Dataset:

In [None]:
# Collect 350 images for the query "green beans" into the green-beans folder.
downloadImages("green beans", numImages=350, destination=greenbeans_dir)

In [None]:
# Zip the green-beans folder and download the archive from the Colab runtime.
!zip -r ./greenbeans_set.zip ./dataset/greenbeans
from google.colab import files
files.download("./greenbeans_set.zip")

Bell pepper Dataset:

In [None]:
# Collect 350 images for the query "bell peppers" into the bell-pepper folder.
downloadImages("bell peppers", numImages=350, destination=peppers_dir)

In [None]:
# Zip the bell-pepper folder and download the archive from the Colab runtime.
!zip -r ./bellpepper_set.zip ./dataset/bellpeppers
from google.colab import files
files.download("./bellpepper_set.zip")

Onion Dataset

In [None]:
# Collect 350 images for the generic query "onions" into the onions folder.
downloadImages("onions", numImages=350, destination=onions_dir)

In [None]:
# Zip the onions folder and download the archive from the Colab runtime.
!zip -r ./onions_set.zip ./dataset/onions
from google.colab import files
files.download("./onions_set.zip")

Alternate Queries for Onions

More specific queries were needed for the onion dataset: the query "onions" also returned green onions, whereas this class is meant to cover yellow/white/red onions

In [None]:
# Folder that receives the images scraped with the "red onions" query.
redonions_dir = os.path.join("dataset","onions_red")
os.makedirs(redonions_dir, exist_ok=True)

In [None]:
# Folder that receives the images scraped with the "white onions" query.
whiteonions_dir = os.path.join("dataset","onions_white")
os.makedirs(whiteonions_dir, exist_ok=True)

In [None]:
# Folder that receives the images scraped with the "yellow onions" query.
yellowonions_dir = os.path.join("dataset","onions_yellow")
os.makedirs(yellowonions_dir, exist_ok=True)

In [None]:
# Collect 120 images for the query "red onions"; with the default naming
# offset the files are numbered from 000.
downloadImages("red onions", numImages=120, destination=redonions_dir)

In [None]:
# Collect 120 images for the query "white onions". File names start at 120,
# presumably so they continue after the 120 red-onion images -- confirm.
downloadImages(
    "white onions",
    numImages=120,
    destination=whiteonions_dir,
    offset=0,
    namingoffset=120,
)

In [None]:
# Offset 101 is requested for yellow onions because significant overlap
# between the white and yellow onion results is expected. File names start
# at 240, presumably continuing after the white-onion images -- confirm.
downloadImages(
    "yellow onions",
    numImages=120,
    destination=yellowonions_dir,
    offset=101,
    namingoffset=240,
)

In [None]:
# Zip the red-onion folder and download the archive from the Colab runtime.
!zip -r ./redonions_set.zip ./dataset/onions_red
from google.colab import files
files.download("./redonions_set.zip")

In [None]:
# Zip the white-onion folder and download the archive from the Colab runtime.
!zip -r ./whiteonions_set.zip ./dataset/onions_white
from google.colab import files
files.download("./whiteonions_set.zip")

In [None]:
# Zip the yellow-onion folder and download the archive from the Colab runtime.
!zip -r ./yellowonions_set.zip ./dataset/onions_yellow
from google.colab import files
files.download("./yellowonions_set.zip")

Potato Dataset:

In [None]:
# Collect 350 images for the generic query "potatoes" into the potatoes folder.
downloadImages("potatoes", numImages=350, destination=potatoes_dir)

In [None]:
# Zip the potatoes folder and download the archive from the Colab runtime.
!zip -r ./potato_set.zip ./dataset/potatoes
from google.colab import files
files.download("./potato_set.zip")

Alternate Queries for Potatoes

More specific queries were needed for the potato dataset since using "potatoes" as the query also gave many images of cooked potato dishes as results

In [None]:
# Folder that receives the images scraped with the "raw potatoes" query.
rawpotatoes_dir = os.path.join("dataset","potatoes_raw")
os.makedirs(rawpotatoes_dir, exist_ok=True)

In [None]:
# Collect 100 images for the query "raw potatoes"; file names start at 301.
downloadImages(
    "raw potatoes",
    numImages=100,
    destination=rawpotatoes_dir,
    offset=0,
    namingoffset=301,
)

In [None]:
# Zip the raw-potato folder and download the archive from the Colab runtime.
!zip -r ./rawpotato_set.zip ./dataset/potatoes_raw
from google.colab import files
files.download("./rawpotato_set.zip")

In [None]:
# The archive step for this set zips ./dataset/potatoes_russet, so the russet
# images need their own folder (mirroring the raw-potato set) instead of being
# written into the general potatoes folder.
russetpotatoes_dir = os.path.join("dataset", "potatoes_russet")
os.makedirs(russetpotatoes_dir, exist_ok=True)

# Collect 100 images for the query "russet potato", requesting result offset
# 200; file names start at 401.
downloadImages(
    "russet potato",
    numImages=100,
    destination=russetpotatoes_dir,
    offset=200,
    namingoffset=401,
)

In [None]:
# Zip the potatoes_russet folder and download the archive from the Colab runtime.
!zip -r ./russetpotato_set.zip ./dataset/potatoes_russet
from google.colab import files
files.download("./russetpotato_set.zip")

Spinach Dataset:

In [None]:
# Collect 350 images for the query "spinach" into the spinach folder.
downloadImages("spinach", numImages=350, destination=spinach_dir)

In [None]:
# Zip the spinach folder and download the archive from the Colab runtime.
!zip -r ./spinach_set.zip ./dataset/spinach
from google.colab import files
files.download("./spinach_set.zip")

Tomato Dataset:

In [None]:
# Collect 350 images for the query "tomato" into the tomato folder.
downloadImages("tomato", numImages=350, destination=tomato_dir)

In [None]:
# Zip the tomato folder and download the archive from the Colab runtime.
!zip -r ./tomato_set.zip ./dataset/tomato
from google.colab import files
files.download("./tomato_set.zip")

Lettuce Dataset:

In [None]:
# Collect 350 images for the query "lettuce" into the lettuce folder.
downloadImages("lettuce", numImages=350, destination=lettuce_dir)

In [None]:
# Zip the lettuce folder and download the archive from the Colab runtime.
!zip -r ./lettuce_set.zip ./dataset/lettuce
from google.colab import files
files.download("./lettuce_set.zip")