In [1]:
import glob
import os
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import random

import seaborn as sn
import boto3
import re
from io import BytesIO
import base64
import tqdm

import torch
import torch.nn as nn
import torchvision.models as models
from torch.autograd import Variable
from torchvision import transforms

In [2]:
import boto3
import sagemaker
import time
import json
from sagemaker.utils import name_from_base

In [3]:
# Create the SageMaker session first: the default bucket and the region are both
# read from it.
sess = sagemaker.Session()
bucket = sess.default_bucket()
region = sess.boto_region_name

# Execution role of the environment this notebook runs in.
role = sagemaker.get_execution_role()

# Helper functions

In [4]:
#return all s3 keys
def get_all_s3_keys(bucket, filt=None):
    """Return the S3 URIs of every object in ``bucket``.

    Pages through ``list_objects_v2`` with the module-level ``s3`` client, which
    must be created before this function is called.

    Args:
        bucket: Name of the S3 bucket to list.
        filt: Optional substring; when given, only keys containing it are kept.

    Returns:
        List of ``s3://<bucket>/<key>`` strings, in listing order. Empty when
        the bucket holds no objects.
    """
    keys = []

    kwargs = {'Bucket': bucket}
    while True:
        resp = s3.list_objects_v2(**kwargs)
        # 'Contents' is absent from a page that holds no objects (e.g. an empty
        # bucket), so fall back to an empty list instead of raising KeyError.
        for obj in resp.get('Contents', []):
            key = obj['Key']
            if filt is None or filt in key:
                keys.append('s3://' + bucket + '/' + key)

        # A missing continuation token marks the last page.
        token = resp.get('NextContinuationToken')
        if token is None:
            break
        kwargs['ContinuationToken'] = token

    return keys

# Make Model

In [None]:

# Load the pretrained ResNet-50 and put it in inference mode.
resnet50 = models.resnet50(pretrained=True)
_ = resnet50.eval()
# _ = resnet50.cuda()

# Rebuild the network from all child modules except the last one, so the forward
# pass ends before the final layer.
modules = list(resnet50.children())[:-1]
resnet50 = nn.Sequential(*modules)
# Freeze every parameter; the network is only used for inference here.
resnet50.requires_grad_(False)

transform = transforms.Compose([transforms.ToTensor()])

# Use the first GPU when one is available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else "cpu")
resnet50 = resnet50.to(device)

device


# Test model with a single image

In [5]:
s3 = boto3.client('s3')

In [6]:
data_bucket = 'sagemaker-us-east-2-333209439517'

In [7]:
s3_uris = get_all_s3_keys(data_bucket, filt='jpg')

s3_uris[0], len(s3_uris)

('s3://sagemaker-us-east-2-333209439517/geological_similarity/andesite/012L6.jpg',
 29998)

In [8]:
s3_uri = s3_uris[0]
s3_uri

's3://sagemaker-us-east-2-333209439517/geological_similarity/andesite/012L6.jpg'

In [None]:
s3_uri = 's3://sagemaker-us-east-2-333209439517/geological_similarity/andesite/012L6.jpg'

In [9]:
payload = s3.get_object(Bucket=data_bucket,Key=s3_uri.replace(f's3://{data_bucket}/', ''))['Body'].read()

In [None]:
im_file = BytesIO(payload)  # convert image to file-like object
img = Image.open(im_file)   # img is now PIL Image object

img

In [None]:
im = np.asarray(img)# convert image to numpy array

In [None]:
im.shape

In [None]:
im = np.moveaxis(im, -1, 0)

In [None]:
im.shape

In [None]:
# `im` has already been moved to channel-first (C, H, W) layout by np.moveaxis,
# whereas transforms.ToTensor expects an H x W x C array and would permute the
# axes a second time, leaving the channels in the wrong dimension. Convert the
# array directly instead. Dividing by 255 reproduces ToTensor's [0, 1] scaling
# (assumes `im` is a uint8 image array — TODO confirm).
img = torch.from_numpy(im.copy()).float().div(255)
img = torch.unsqueeze(img, 0)  # add the leading batch dimension
img = img.to(device)


In [None]:
img.shape

In [None]:

with torch.no_grad():
    feature = resnet50(img)

feature = feature.cpu().detach().numpy().reshape(-1)

feature.shape

# Save model to s3

In [None]:
img.shape

In [None]:
import tarfile

In [None]:
input_shape = [1, 3, 28, 28]
# Build the example input on the same device as the model's weights: the model
# was moved to `device` earlier, and tracing it with a CPU tensor fails with a
# device mismatch whenever that device is a GPU.
# NOTE(review): a model traced on a GPU may need map_location when it is loaded
# on a CPU instance — confirm against inference.py.
example_input = torch.zeros(input_shape, device=next(resnet50.parameters()).device)
trace = torch.jit.trace(resnet50.float().eval(), example_input)

In [None]:
trace.save("model.pth")

In [None]:
with tarfile.open("model.tar.gz", "w:gz") as f:
    f.add("model.pth")

In [None]:
bucket

In [None]:
compilation_job_name = name_from_base("TorchVision-ResNet50")
prefix = compilation_job_name + "/model"

In [None]:
compilation_job_name, prefix

In [None]:
model_path = sess.upload_data(path="model.tar.gz", key_prefix=prefix)

In [None]:
# NOTE(review): this hard-coded URI replaces the value returned by
# sess.upload_data in the previous cell, and it names a us-east-1 bucket while
# the data bucket used elsewhere in this notebook is in us-east-2 — confirm it
# points at the intended model artifact.
model_path = 's3://sagemaker-us-east-1-333209439517/TorchVision-ResNet50-2021-09-13-00-30-10-117/model/model.tar.gz'

# Deploy model

In [None]:
import sagemaker
from sagemaker.pytorch import PyTorchModel
from sagemaker import get_execution_role, Session

In [None]:
role

In [None]:
# `predictor` is only created by model.deploy further down, so on a fresh kernel
# there is nothing to delete. Only tear down an endpoint left over from an
# earlier run in this kernel.
if 'predictor' in globals():
    predictor.delete_endpoint()

In [None]:
# Describe the model to SageMaker: the uploaded artifact at `model_path`, served
# with the inference script from the local "code" directory under `role`.
model = PyTorchModel(
    entry_point="inference.py",  # serving script, located inside source_dir
    source_dir="code",
    role=role,
    model_data=model_path,
    framework_version="1.5.0",  # PyTorch version requested for the serving container
    py_version="py3",
)

In [None]:
# AmazonSageMakerFullAccess is an AWS managed policy that includes the permissions required to perform most actions on SageMaker

In [None]:
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

# Deployment target: leave local_mode as False to deploy on a remote SageMaker
# instance, set it to True to use the "local" instance type instead.
local_mode = False
instance_type = "local" if local_mode else "ml.t2.medium"  # alternative: "ml.c4.xlarge"

# Requests and responses are exchanged as JSON.
predictor = model.deploy(
    instance_type=instance_type,
    initial_instance_count=1,
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer(),
)

In [None]:
predictor.__dict__

In [None]:
predictor.endpoint_name

In [None]:
predictor.content_type, predictor.serializer, predictor.deserializer

In [10]:
encoded_image = base64.b64encode(payload).decode('utf-8')
req = {'inputs':encoded_image}

In [None]:
im_bytes = base64.b64decode(encoded_image)   # im_bytes is a binary image
im_file = BytesIO(im_bytes)  # convert image to file-like object
image = Image.open(im_file)   # img is now PIL Image object
im = np.asarray(image)# convert image to numpy array
print(im.shape)
image

## Predict using 'predict' function

In [None]:
res = predictor.predict(req)

In [None]:
embedding = np.array(res).reshape(-1)

In [None]:
embedding.shape

In [None]:
ENDPOINT_NAME = predictor.endpoint_name

## Predict using 'invoke_endpoint'

In [11]:
# SageMaker runtime client for invoking the endpoint; `sleep` is used by extract_features below
from time import sleep 

sm_client = boto3.client('sagemaker-runtime')

In [12]:
# NOTE(review): hard-coded endpoint name; it overrides the value taken from
# predictor.endpoint_name in an earlier cell — confirm this endpoint still exists.
ENDPOINT_NAME = 'pytorch-inference-2021-09-13-14-49-32-180'

In [13]:
resp = sm_client.invoke_endpoint(EndpointName=ENDPOINT_NAME, Body=json.dumps(req),ContentType='application/json') 
embedding = json.loads((resp['Body'].read()))
embedding = list(np.array(embedding).reshape(-1))

In [14]:
len(embedding)

2048

## Utility functions for prediction on an s3 object

In [15]:
def get_predictions(payload):
    """Send ``payload`` as a JSON request body to the endpoint and return the raw response.

    Uses the module-level ``sm_client`` and ``ENDPOINT_NAME``.
    """
    request = {
        'EndpointName': ENDPOINT_NAME,
        'Body': payload,
        'ContentType': 'application/json',
    }
    return sm_client.invoke_endpoint(**request)

def extract_features(s3_uri, bucket):
    """Download one image from S3 and return its embedding from the endpoint.

    Args:
        s3_uri: Full ``s3://<bucket>/<key>`` URI of the image.
        bucket: Bucket name; the ``s3://<bucket>/`` prefix is stripped from
            ``s3_uri`` to obtain the object key.

    Returns:
        Tuple ``(s3_uri, feature_lst)`` where ``feature_lst`` is the endpoint's
        JSON response flattened into a one-dimensional list.
    """
    uri_prefix = f's3://{bucket}/'
    image_bytes = s3.get_object(Bucket=bucket, Key=s3_uri.replace(uri_prefix, ''))['Body'].read()

    # short pause between the download and the endpoint call
    sleep(0.1)

    # the endpoint receives the image base64-encoded inside a JSON document
    request_body = json.dumps({'inputs': base64.b64encode(image_bytes).decode('utf-8')})
    raw_response = get_predictions(request_body)['Body'].read()

    # flatten whatever nesting the response has into a single vector
    flat = np.array(json.loads(raw_response)).reshape(-1)
    return s3_uri, list(flat)

## Get features for all images

In [16]:
s3_uri = s3_uris[0]
s3_uri

's3://sagemaker-us-east-2-333209439517/geological_similarity/andesite/012L6.jpg'

In [17]:
uri, features = extract_features(s3_uri, data_bucket)

In [18]:
uri

's3://sagemaker-us-east-2-333209439517/geological_similarity/andesite/012L6.jpg'

In [19]:
len(features)

2048

In [None]:
# Embed every image by calling the endpoint once per URI. This is a long-running
# cell: the captured progress bar estimates well over an hour for the ~30k images.
all_features = []
for s3_uri in tqdm.tqdm(s3_uris):
    # extract_features returns the URI it was given, so only the features are kept
    _, features = extract_features(s3_uri, data_bucket)
    all_features.append(features)

  1%|          | 227/29998 [00:46<1:41:25,  4.89it/s]

In [None]:
len(all_features)

In [None]:
data = {'uris':s3_uris, 'features':all_features}

In [None]:
s3

In [None]:
str(json.dumps(data))

In [None]:
bucket

In [None]:
import json
import boto3

s3 = boto3.client('s3')
# Upload the `data` dict of URIs and features built in an earlier cell as one
# JSON document to the session's default bucket. `data` is deliberately not
# reassigned here, so the extracted features are what gets stored.
features_key = 'your_key_here'  # TODO: set the destination object key
s3.put_object(
    Body=json.dumps(data),
    Bucket=bucket,
    Key=features_key,
)

In [None]:
# Stack the per-image feature lists into one array (one row per image).
np_feats = np.array(all_features)

# Connect to Elasticsearch

In [None]:
# !pip install requests_aws4auth
# !pip install elasticsearch=='7.13.4' #https://opensearch.org/docs/clients/index/

In [None]:
# !pip install  nltk
# !pip install jsonlines
# !pip install pandarallel
# !pip install --upgrade grpcio 
# !pip install --upgrade s3fs


In [None]:

from requests_aws4auth import AWS4Auth
from elasticsearch import Elasticsearch, RequestsHttpConnection


In [None]:
import elasticsearch

In [None]:
elasticsearch.__version__

In [None]:
region

In [None]:
service = 'es'

In [None]:
# ssm = boto3.client('ssm', region_name=region)

In [None]:
credentials = boto3.Session().get_credentials()
awsauth = AWS4Auth(credentials.access_key, credentials.secret_key,
                   region, service, session_token=credentials.token)

In [None]:
role

In [None]:
credentials

In [None]:
host = "search-search-all-wroz4g764qrjndse4eqbika2ia.us-east-1.es.amazonaws.com"
es_index = 'knn-test-1'
# "search-domainname-yourDomainEndpoint.REGION.es.amazonaws.com"

In [None]:
def connect_to_ES(esEndPoint, http_auth=None):
    """Create an Elasticsearch client for ``esEndPoint`` over HTTPS on port 443.

    Args:
        esEndPoint: Host name of the Elasticsearch domain.
        http_auth: Optional credentials handed to the client unchanged, e.g. a
            ``(username, password)`` tuple. When omitted, the username and
            password are read from the ``ES_USERNAME`` / ``ES_PASSWORD``
            environment variables, prompting for whichever is missing, so that
            no secret is stored in the notebook.

    Returns:
        The ``Elasticsearch`` client, or ``None`` if constructing it raised
        (the error is printed).
    """
    import getpass  # local import: only needed for the interactive fallback

    if http_auth is None:
        username = os.environ.get('ES_USERNAME') or input('Elasticsearch username: ')
        password = os.environ.get('ES_PASSWORD') or getpass.getpass('Elasticsearch password: ')
        http_auth = (username, password)

    print ('Connecting to the ES Endpoint {0}'.format(esEndPoint))
    try:
        esClient = Elasticsearch(
            hosts=[{'host': esEndPoint, 'port': 443}],
            http_auth=http_auth,
            use_ssl=True,
            verify_certs=True,
            connection_class=RequestsHttpConnection)
        return esClient
    except Exception as E:
        print("Unable to connect to {0}".format(esEndPoint))
        print(E)
        return None

In [None]:
es = connect_to_ES(host)
es

In [None]:
es.indices.exists(index=es_index)

In [None]:
def create_index(index):
    """Create ``index`` with k-NN enabled unless it already exists.

    The new index maps an ``embeddings`` field as a 2048-dimensional
    ``knn_vector``. Uses the module-level ``es`` client and prints which branch
    was taken.
    """
    # Guard clause: nothing to do when the index is already there.
    if es.indices.exists(index=index):
        print("elasticsearch index already exists")
        return

    knn_settings = {
        "index.knn": True,
        "index.mapping.total_fields.limit": "2000",
    }
    embedding_field = {"type": "knn_vector", "dimension": 2048}
    index_settings = {
        "settings": knn_settings,
        "mappings": {"properties": {"embeddings": embedding_field}},
    }

    es.indices.create(index=index, body=json.dumps(index_settings))
    print("Created the elasticsearch index successufly ")

In [None]:
es_index

In [None]:
#Create the index using knn settings
create_index(es_index)

In [None]:
# You can check if the index is created within your es cluster
es.indices.get_alias("*")

In [None]:
def es_import(i, vector_id=None):
    """Index one ``(image, vector)`` pair into the notebook's k-NN index.

    The document goes to ``es_index`` with the vector stored under
    ``embeddings``, matching the mapping created by ``create_index`` and the
    field used by the k-NN query.

    Args:
        i: Two-item sequence; ``i[0]`` is the image reference and ``i[1]`` its
            embedding vector.
        vector_id: Optional document id. When left as ``None`` no id is sent
            and Elasticsearch assigns one.
    """
    es.index(index=es_index,
             id=vector_id,
             body={"embeddings": i[1],
                   "image": i[0]}
            )
def ingest_data_into_es(event):
    """Load a JSON array of records from S3 and index each one into ``es_index``.

    Uses the module-level ``s3`` and ``es`` clients. A record that cannot be
    indexed is logged and counted as lost; it does not stop the run.

    Args:
        event: Mapping with ``'bucket'`` and ``'key'`` naming the S3 object. The
            object must contain a JSON array of records, each with an ``'id'``
            (used as the document id) and optionally an ``'embeddings'`` value.

    Returns:
        Dict with ``statusCode`` 200 and a JSON-encoded ``body`` stating how
        many records were indexed.
    """
    # Local imports: neither module is imported at the top of the notebook.
    import ast
    import logging

    log = logging.getLogger(__name__)

    obj = s3.get_object(Bucket=event['bucket'], Key=event['key'])
    records = json.loads(obj['Body'].read().decode('utf-8'))

    count = 0
    lost_records = 0

    for record in records:
        try:
            # Looked up inside the try so a record without an id is counted as
            # lost instead of aborting the whole ingestion.
            record_id = record['id']

            # Embeddings may arrive serialized as a string literal; values that
            # are already parsed are indexed unchanged.
            if isinstance(record.get('embeddings'), str):
                record['embeddings'] = ast.literal_eval(record['embeddings'])

            es.index(index=es_index, id=record_id, doc_type='_doc', body=record)

            count += 1
        except Exception as error:
            log.error(f"An error {error} for record {record}")
            lost_records += 1

    log.info(f'{lost_records} out of {len(records)} are lost records')
    log.info(f'{count} out of {len(records)} records has been processed')

    return {
        'statusCode': 200,
        'body': json.dumps(str(count) + ' records processed.')
    }



In [None]:

# Sanity check that the index holds data: fetch up to 10 documents with match_all.
match_all_query = {"query": {"match_all": {}}}
res = es.search(index=es_index, body=match_all_query, size=10)



# k-NN search against the `embeddings` field. The query vector and page size
# were never defined in this notebook; `features` is the embedding returned by
# extract_features in an earlier cell, and any vector of the index's dimension
# can be substituted.
query_embeddings = features
page_size = 5  # number of hits to return

es_query = {
    "query": {
        "knn": {
            "embeddings": {
                "vector": query_embeddings,
                "k": 5
            }
        }
    }
}

res = es.search(index=es_index, body=es_query, size=page_size)

