In [10]:
from googleapiclient.discovery import build
import requests
import os
import pandas as pd
from datetime import datetime
from google.oauth2 import service_account
import json
import time
import sys
import re
import logging
import argparse
import yaml
import traceback


In [17]:
input_data_dir = '../../data/input/'
output_image_dir = '../../data/output/Downloaded_Images/'
output_df_dir = '../../data/output/Image_df/'

input_file_name = 'Crop_Varieties.xlsx'

now = datetime.now()
formatted_date = now.strftime('%Y%m%d%H%M')

In [15]:
params = {
    'number_of_references': 5,
    'fuzzy_threshold_min': 80
}

In [16]:
# Construct the full path to the Excel file
excel_file_path = os.path.join(input_data_dir, input_file_name)

# Load the Excel file
excel_data = pd.ExcelFile(excel_file_path)

# Initialize a dictionary to store the crop names and their varieties
crop_varieties = {}

# Iterate through each sheet in the Excel file
for sheet_name in excel_data.sheet_names:
    # Read the sheet into a DataFrame
    df = pd.read_excel(excel_file_path, sheet_name=sheet_name)
    
    # Assuming the varieties are listed in the first column
    varieties = df.iloc[:, 0].tolist()
    
    # Store the crop name (sheet name) and its varieties in the dictionary
    crop_varieties[sheet_name] = varieties

# Print the resulting dictionary
print(crop_varieties)

{'Tomatoes': ['Blondkopfchen', 'Carmelo', 'Casady’s Folly', 'Currant Sweet Pea', 'Czech’s Bush', 'Joe’s Pink Oxheart', 'Ladybug', 'Midnight Snack'], 'Carrots': ['Chantenay', 'mperator', 'Danvers']}


In [None]:
search_engine_id = os.getenv('GOOGLE_SEARCH_ENGINE_ID')
api_key  = os.getenv('GOOGLE_TENDTEST_KEY')

In [18]:
# Initialize an empty DataFrame to store image data
image_df = pd.DataFrame(columns=["title", "link", "displayLink", "snippet", "mime", "fileFormat", "filename"])

# Search configuration
search_engine_id = "13de7e9156ab54440"
api_key = os.getenv('GOOGLE_TENDTEST_KEY')
num_images = 10
output_dir = "images"

CROP_NAME = "Tomatoes"
VARIETY = "San Marzano"
VARIETY = "Blondkopfchen"
VARIETY = "Czech's Bush"
VARIETY = "Ladybug"

CROP_NAME = "Carrots"
VARIETY = "Caravel"
VARIETY = "Purple Star"
VARIETY = "Candy"
VARIETY = "Bolero"
VARIETY = "Gold Nugget"
VARIETY = "Sugarsnax 54"
VARIETY = "Deep Purple"


search_term = f"Buy {VARIETY} {CROP_NAME} seeds"
search_term = f"Please provide images of fresh {VARIETY} {CROP_NAME} from seed catalogs so that I can use in a photo shoot."  #1
#search_term = f"{VARIETY} {CROP_NAME}, close-up, high quality, mature, ready for harvest, vibrant colors, detailed texture."
search_term = f"Where can I buy  {VARIETY} {CROP_NAME} seeds online? give me only images."  #2
#search_term = f"Where can I buy  {VARIETY} {CROP_NAME} seeds online? give me only images of the specific crop and variety"  #3
search_term = f"Where can I buy  {VARIETY} {CROP_NAME} seeds?"  #4

print(f'Prompt: {search_term}')

service = build("customsearch", "v1", developerKey=api_key)

# Track domains to ensure uniqueness
used_domains = set()
image_count = 0
start_index = 1

# Create output directory
os.makedirs(output_dir, exist_ok=True)

while image_count < num_images:
    try:
        # Make the request to the Custom Search API
        res = service.cse().list(
            q=search_term,
            cx=search_engine_id,
            searchType="image",  # Specify image search
            num=min(num_images - image_count, 10),  # Request up to 10 images
            start=start_index
        ).execute()

        if "items" not in res:
            print("No more items found.")
            break

        for item in res["items"]:
            try:
                # Get the domain of the current item
                domain_name = item.get("displayLink", "unknown_domain").replace("www.", "").replace("/", "_")
                
                # Skip if the domain is already used
                if domain_name in used_domains:
                    continue

                # Mark the domain as used
                used_domains.add(domain_name)

                # Proceed with downloading the image
                image_url = item["link"]
                response = requests.get(image_url, stream=True)
                response.raise_for_status()

                # Create a more meaningful filename that includes the domain name
                sanitized_title = item.get("title", "image").replace(" ", "_").replace("/", "_")[:50]  # Truncate title to avoid overly long filenames
                filename = os.path.join(output_dir, f"{image_count + 1}_{domain_name}_{sanitized_title}.jpg")

                # Save the image to the specified filename
                with open(filename, 'wb') as file:
                    for chunk in response.iter_content(chunk_size=8192):
                        file.write(chunk)

                print(f"Downloaded image: {filename}")

                # Append the details to the DataFrame
                image_df = pd.concat([image_df, pd.DataFrame([{
                    "title": item.get("title"),
                    "link": item.get("link"),
                    "displayLink": item.get("displayLink"),
                    "snippet": item.get("snippet"),
                    "mime": item.get("mime"),
                    "fileFormat": item.get("fileFormat"),
                    "filename": filename
                }])], axis=0, ignore_index=True)

                # Increment the image count
                image_count += 1

                # Stop if we have downloaded the required number of images
                if image_count >= num_images:
                    break

            except requests.exceptions.RequestException as e:
                print(f"Error downloading image: {e}")

        # Update the start index to get the next set of results (pagination)
        start_index += 10

    except Exception as e:
        print(f"Error during Google API request: {e}")
        break

# Display the resulting DataFrame
print(image_df)


Prompt: Where can I buy  Deep Purple Carrots seeds?
Downloaded image: images/1_osborneseed.com_Deep_Purple_Untreated_Carrot_Seeds_|_Osborne_Farm_.jpg
Downloaded image: images/2_johnnyseeds.com_Deep_Purple_-_Pelleted_(F1)_Carrot_Seed_|_Johnny's.jpg
Downloaded image: images/3_nicholsgardennursery.com_Deep_Purple_Carrot_Hybrid_–_Nichols_Garden_Nursery.jpg
Downloaded image: images/4_ufseeds.com_Purple_Sun,_(F1)_Carrot_Seeds_|_Urban_Farmer.jpg
Downloaded image: images/5_westcoastseeds.com_Deep_Purple_Carrot_Seeds_–_West_Coast_Seeds.jpg
Downloaded image: images/6_superseeds.com_Deep_Purple_Carrot_(F1_Hybrid_72_Days)_–_Pinetree_.jpg
Downloaded image: images/7_reneesgarden.com_Purple_Sun'_Rainbow_Carrots_|_Renee's_Garden_Seeds.jpg
Downloaded image: images/8_seedway.com_Deep_Purple_Carrot_(Pelleted)_|_Seedway.jpg
Downloaded image: images/9_totallytomato.com_Carrot_Seeds:_Totally_Tomatoes.jpg
Downloaded image: images/10_burpee.com_Carrot,_Deep_Purple_Hybrid_-_Vegetable_Seeds_&_Pla.jpg
          

In [32]:
#item

{'kind': 'customsearch#result',
 'title': 'Buy 150 Tomato San Marzano Seeds, Italian Tomato Seeds, Red ...',
 'htmlTitle': '<b>Buy</b> 150 <b>Tomato San Marzano Seeds</b>, Italian <b>Tomato Seeds</b>, Red ...',
 'link': 'https://i.etsystatic.com/28558470/r/il/339f41/3038656282/il_1080xN.3038656282_7gda.jpg',
 'displayLink': 'www.etsy.com',
 'snippet': 'il_1080xN.3038656282_7gda.jpg',
 'htmlSnippet': 'il_1080xN.3038656282_7gda.jpg',
 'mime': 'image/jpeg',
 'fileFormat': 'image/jpeg',
 'image': {'contextLink': 'https://www.etsy.com/in-en/listing/999851953/150-tomato-san-marzano-seeds-italian',
  'height': 1080,
  'width': 1080,
  'byteSize': 186768,
  'thumbnailLink': 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQAWXDIifekX2aAgPIwjGzqA-2LLBcvg9T5IQ8-l2vYT2znWA3aWOONk2I&s',
  'thumbnailHeight': 150,
  'thumbnailWidth': 150}}