In [None]:
#Need to run when data loading for the first time
!pip install opensearch-py

In [None]:
# ---- Target index and load mode ----
index = "adadis_shopping_demo_0912"
# One of: "text", "image", "text_and_image".
loadType = "text"
# "Yes" sends each product's text through the LLM below and embeds the
# extracted key features instead of the raw text; any other value skips it.
llmKeywords = "No"
llmModel = "anthropic.claude-3-haiku-20240307-v1:0"
# Passed through to the SageMaker text-embedding call.
language = "chinese"

# ---- Embedding back ends ----
# For each modality a non-empty Bedrock model id takes precedence over the
# SageMaker endpoint name; leave both empty to disable that modality.
textEmbeddingEndpoint = "bge-m3-2024-03-31-02-43-25-634-endpoint"
imageEmbeddingEndpoint = ""
textEmbeddingModelId = ""
imageEmbeddingModelId = ""

# ---- Input data ----
# JSON file holding the list of product records to load.
file = "aws_product_description-06-01.json"

# ---- LLM key-feature extraction ----
# NOTE(review): Claude 3 models on Bedrock are served through the Messages API,
# whose length limit is named `max_tokens`; confirm that invoke_model_local
# translates `max_tokens_to_sample` before enabling llmKeywords.
model_parameters = {
    "max_tokens_to_sample": 700,
    "temperature": 0,
}

# Prompt sent to the LLM; `{product_information}` is filled in per product via
# str.format. Built line by line so the exact whitespace stays visible.
prompt_template = "".join(
    [
        "\n",
        "you are a e-commerce website product manager, Your task is to extract the product's features base on product infomation, to make it easier for customers to search for the product, \n",
        "and the product's key features include: name, category, color, size, material and functionality, The feature text needs to be short, output as json format.\n",
        "\n",
        "<product information>\n",
        "{product_information}\n",
        "</product information>\n",
        "\n",
        "No need to preface, directly output the product’s key features as json format.\n",
        "\n",
    ]
)

In [None]:
import sys
sys.path.append(r"../../lambda/lambda_layer_folder/python")

import json
import re
import os
import json
import traceback
import urllib.parse
import boto3
from datetime import datetime
import time
from opensearch_search import add_products,get_opensearch_client
from embeddings import get_image_embedding_sagemaker,get_embedding_sagemaker,get_embedding_bedrock,get_embedding_bedrock_multimodal
from bedrock_model import invoke_model_local
from tqdm.notebook import trange 


# Load product records from `file`, build the text (and optionally image)
# embeddings selected by `loadType`, and index them into OpenSearch in batches.
#
# Per-batch accumulators; each one is reset after a batch has been sent.
product_info_list = []
product_embedding_list = []
image_embedding_list = []
metadatas = []
# Metadata of products whose embedding call raised, kept for inspection after
# the run. (Previously referenced but never defined, which turned every
# embedding failure into a NameError.)
error_records = []
# Number of products accepted for indexing so far.
i = 0

# Products are sent to OpenSearch every BATCH_SIZE accepted records.
BATCH_SIZE = 100

# loadType never changes during the run, so resolve the modalities once.
load_image = 'image' in loadType
load_text = 'text' in loadType


def _non_empty(record, key, default=''):
    """Return record[key] when the key is present with a non-empty value, else default."""
    value = record.get(key)
    return value if value else default


# Explicit UTF-8 so non-ASCII product text does not depend on the platform's
# default encoding.
with open(file, encoding='utf-8') as f:
    items = json.load(f)

for item in items:
    image_urls = _non_empty(item, 'pdp_image_urls', [])
    image_url = image_urls[0] if image_urls else ''
    product_code = _non_empty(item, 'product_code')
    product_name = _non_empty(item, 'product_name')
    reverse = _non_empty(item, 'reverse')

    # Default to an empty dict (not a string) so the lookups below are safe
    # when the field is missing or empty.
    formatted_description_info = _non_empty(item, 'formatted_description_info', {})
    if not isinstance(formatted_description_info, dict):
        formatted_description_info = {}

    # product_info is the text that gets embedded (bullets + product name);
    # description_info is stored as metadata (bullets + contents + title).
    product_info = ''
    description_info = ''

    bullets = formatted_description_info.get('bullets') or []
    if bullets:
        bullets_info = ','.join(bullet.replace('•   ', '') for bullet in bullets)
        product_info += bullets_info
        description_info += bullets_info

    for content in formatted_description_info.get('contents') or []:
        text = content.get('text')
        if text:
            description_info += ',' + text.strip()

    title = (formatted_description_info.get('title') or '').strip()
    if title:
        description_info += ',' + title

    if product_name:
        product_info += ',' + product_name

    # Nothing to embed or search on: skip the record entirely.
    if not product_info:
        continue

    metadata = {
        'media_url': image_url,
        'product_code': product_code,
        'product_name': product_name,
        'reverse': reverse,
        'description_info': description_info,
    }

    if llmKeywords == "Yes":
        keyFeatures = {'product_info': product_info}
        # ensure_ascii=False keeps non-ASCII product text readable in the
        # prompt instead of expanding it into \uXXXX escapes.
        product_information = json.dumps(keyFeatures, ensure_ascii=False)
        print('product_information:', product_information)
        prompt = prompt_template.format(product_information=product_information)
        # A failure here is intentionally not caught: it aborts the load, as before.
        search_term = invoke_model_local(prompt, llmModel, model_parameters)
        metadata['SEARCH_TERM'] = search_term
        # From here on the LLM output replaces the raw text as the embedded content.
        product_info = search_term

    # Embedding results are checked with len() rather than truthiness because
    # the concrete return type of the embedding helpers is not visible here.
    image_embedding = ''
    if load_image and len(image_url) > 0:
        try:
            if len(imageEmbeddingModelId) > 0:
                image_embedding = get_embedding_bedrock_multimodal(url=image_url, model_id=imageEmbeddingModelId)
            elif len(imageEmbeddingEndpoint) > 0:
                image_embedding = get_image_embedding_sagemaker(imageEmbeddingEndpoint, image_url)
        except Exception:
            # Best effort: record the failure and let the checks below skip the product.
            error_records.append(metadata)
            print("image embedding error")
            traceback.print_exc()

    text_embedding = ''
    if load_text and len(product_info) > 0:
        try:
            if len(textEmbeddingModelId) > 0:
                text_embedding = get_embedding_bedrock(product_info, textEmbeddingModelId)
            elif len(textEmbeddingEndpoint) > 0:
                text_embedding = get_embedding_sagemaker(textEmbeddingEndpoint, product_info, language=language)
        except Exception:
            error_records.append(metadata)
            print("text embedding error")
            traceback.print_exc()

    # Every modality requested by loadType must have produced an embedding.
    if load_image and len(image_embedding) == 0:
        continue
    if load_text and len(text_embedding) == 0:
        continue

    metadatas.append(metadata)
    if len(product_info) > 0:
        product_info_list.append(product_info)
    if len(text_embedding) > 0:
        product_embedding_list.append(text_embedding)
    if len(image_embedding) > 0:
        image_embedding_list.append(image_embedding)

    i += 1
    print('i:', i)
    if i % BATCH_SIZE == 0:
        if len(image_embedding_list) > 0 or len(product_embedding_list) > 0:
            add_products(index, product_info_list, product_embedding_list, metadatas, image_embedding_list)
            product_info_list = []
            product_embedding_list = []
            image_embedding_list = []
            metadatas = []
        print('finish add products to opensearch,index is:' + index)

print('product number:', len(product_info_list))

# Send whatever is left over from the last partial batch; skip the call when
# the product count was an exact multiple of BATCH_SIZE and nothing remains.
# NOTE(review): the original final call passed (image_embedding_list, metadatas)
# while the in-loop call passed (metadatas, image_embedding_list). Both now use
# the in-loop order -- confirm against the add_products signature.
if len(image_embedding_list) > 0 or len(product_embedding_list) > 0:
    add_products(index, product_info_list, product_embedding_list, metadatas, image_embedding_list)

print('finish add products to opensearch,index is:' + index)
if error_records:
    print('products skipped because of embedding errors:', len(error_records))