## Importing required libraries

In [1]:
import ast
import os

from PIL import Image

from computer_vision import VisionTransformer, BLIPImageCaptioning, download_image
from web_crawler import CustomWebCrawler

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Model locations handed to the vision wrappers (classifier and captioner).
MODEL_PATH = "./models/vit-base-patch16-224"
CAPTION_MODEL_PATH = "./models/blip-image-captioning-large"

# Crawl settings: topic to crawl, overall photo limit, and photos per group.
PROJECT_TOPIC = "travel"
IMAGE_LIMIT = 10_000
IMAGES_PER_GROUP = 1_000

## Scraping Images

In [3]:
# Crawl once and cache the result on disk; later runs reload the cached file
# instead of crawling again.
if not os.path.exists("./outputs/crawler.json"):
    os.makedirs("./outputs", exist_ok=True)

    spider = CustomWebCrawler(photo_limit=IMAGE_LIMIT, photos_per_group=IMAGES_PER_GROUP)
    try:
        data = list(spider.crawl(PROJECT_TOPIC))
    finally:
        # Always close the crawler, even when crawling fails part-way through.
        spider.close()

    # NOTE: despite the .json extension, the cache stores the Python repr of
    # `data`, so it must be read back as Python literals (not with json.load).
    with open("./outputs/crawler.json", "w") as f:
        f.write(str(data))
else:
    with open("./outputs/crawler.json", "r") as f:
        # literal_eval parses literals only, so a modified cache file cannot
        # execute arbitrary code the way eval() would.
        data = ast.literal_eval(f.read())

## Extract features from Images

In [4]:
def save_image(id, image):
    """Save `image` to ./images/<id>.jpg, creating the directory if needed.

    Args:
        id: Identifier used as the file name stem.
        image: Object exposing `save(path)` (presumably a PIL image; confirm
            against `download_image`).
    """
    # exist_ok avoids the check-then-create race of exists() + mkdir() and
    # matches how ./outputs is created elsewhere in this notebook.
    os.makedirs("./images", exist_ok=True)

    image.save(f"./images/{id}.jpg")

def load_image(id, src):
    """Return the image for `id`, preferring the local copy over a download.

    Args:
        id: Identifier; the local copy is looked up at ./images/<id>.jpg.
        src: Source passed to `download_image` when no local copy exists.

    Returns:
        The locally opened or downloaded image, or None if the download fails.
    """
    local_path = f"./images/{id}.jpg"
    if os.path.exists(local_path):
        return Image.open(local_path)

    try:
        return download_image(src)
    except Exception:
        # Best-effort download: report and skip this image. Catching Exception
        # (not a bare except) lets KeyboardInterrupt/SystemExit propagate so a
        # long-running loop can still be interrupted.
        print(f"Failed to download image {id} from {src}")
        return None

# Classify and caption every crawled image once, then cache the enriched
# records on disk; later runs reload the cached file instead.
if not os.path.exists("./outputs/search-data.json"):
    os.makedirs("./outputs", exist_ok=True)

    vit = VisionTransformer(model_path=MODEL_PATH)
    blip = BLIPImageCaptioning(model_path=CAPTION_MODEL_PATH)

    for i, item in enumerate(data):
        print(f"\rProcessing Image {i+1}/{len(data)}...", end="")

        # Records that already carry a caption are left untouched, so
        # re-running this cell does not redo finished work.
        if item.get('caption') is not None:
            continue

        # A failed load leaves the record without a caption.
        image = load_image(i, item["image_src"])
        if image is None:
            continue

        classification = vit.classify(image)
        caption = blip.caption(image)

        item["cv"] = classification
        item["caption"] = caption
        # The list position doubles as the document number and image file name.
        item["docno"] = i
        save_image(i, image)

    print("\nDone!")

    # NOTE: despite the .json extension, the cache stores the Python repr of
    # `data`, so it must be read back as Python literals (not with json.load).
    with open("./outputs/search-data.json", "w") as f:
        f.write(str(data))
else:
    with open("./outputs/search-data.json", "r") as f:
        # literal_eval parses literals only, so a modified cache file cannot
        # execute arbitrary code the way eval() would.
        data = ast.literal_eval(f.read())

In [5]:
# Keep only the records that have a caption (images that failed to download have none).
data = list(filter(lambda record: record.get('caption') is not None, data))

## Data Preprocessing

In [8]:
# Number of records currently held in `data`.
len(data)

9999

## Deployment

### Uploading data to Firestore

In [17]:
from google.cloud import firestore

# Firestore client bound to the project and the named database.
firestore_settings = {"project": "ca6005-search-engine", "database": "search-engine"}
db = firestore.Client(**firestore_settings)

In [None]:
# Target collection and the number of writes grouped into one commit.
COLLECTION_NAME = "image-data"
batch_size = 500

# First write batch; the upload loop replaces it after each commit.
batch = db.batch()

def add_to_batch(doc, batch):
    """Queue `doc` on `batch` as a set() of the document named after its docno.

    Args:
        doc: Record to store; its "docno" value (stringified) is the document ID.
        batch: Write batch that receives the queued set operation.
    """
    document_id = str(doc.get("docno"))
    target = db.collection(COLLECTION_NAME).document(document_id)
    batch.set(target, doc)

# Upload all records, committing every `batch_size` queued writes.
# A pending counter (rather than `i % batch_size == 0`) avoids committing a
# one-item batch at i == 0 and keeps every commit at exactly `batch_size`
# writes, except possibly the last one.
pending_writes = 0

for i, item in enumerate(data):
    print(f"\rUploading Image {i+1}/{len(data)}...", end="")

    add_to_batch(item, batch)
    pending_writes += 1

    if pending_writes == batch_size:
        batch.commit()
        batch = db.batch()
        pending_writes = 0

# Flush the remainder; skip the round trip when nothing is queued.
if pending_writes:
    batch.commit()

### Uploading images to Cloud Storage

In [9]:
from google.cloud.storage import transfer_manager
from google.cloud import storage

storage_client = storage.Client(project="ca6005-search-engine")
bucket = storage_client.bucket("ca6005-search-engine")

# Upload images to bucket
source_directory = "./images"
# Object names are `blob_name_prefix + filename`, so the files are listed
# relative to `source_directory` and prefixed explicitly. Passing
# "./images/<docno>.jpg" as the filename would name the objects
# "./images/<docno>.jpg" instead of "images/<docno>.jpg".
blob_name_prefix = "images/"
workers = 10

# File names relative to `source_directory`, one per record.
filenames = [f"{item.get('docno')}.jpg" for item in data]

if filenames:
    results = transfer_manager.upload_many_from_filenames(
        bucket,
        filenames,
        source_directory=source_directory,
        blob_name_prefix=blob_name_prefix,
        max_workers=workers,
    )

    # Failed uploads come back as exception objects in `results`, in input order.
    for name, result in zip(filenames, results):
        local_path = f"{source_directory}/{name}"

        if isinstance(result, Exception):
            print("Failed to upload {} due to exception: {}".format(local_path, result))
        else:
            print("Uploaded {} to {}.".format(local_path, bucket.name))

In [25]:
# Names of all objects currently in the bucket.
ls = [blob.name for blob in bucket.list_blobs()]

In [27]:
# Show the last object name in the listing.
ls[-1]

'images/9999.jpg'

## Building Index

In [7]:
from indexing import Indexing

# Build the index over the processed records.
index = Indexing()
index.build_index(data)

# Persist the index object to disk.
index_file = "./index-data.pkl"
index.save_object(index_file)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rajat\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rajat\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Indexing... (9999 / 9999)
Indexing completed.
