In [None]:
import sys
sys.path.append(r"./python")

import os
import json
import traceback
import urllib.parse
import boto3
from datetime import datetime
import time
from opensearch_vector_search import OpenSearchVectorSearch
from langchain.embeddings.sagemaker_endpoint import EmbeddingsContentHandler
from langchain.embeddings.sagemaker_endpoint import SagemakerEndpointEmbeddings
from model import *
from typing import Dict, List, Tuple

In [None]:
# Adjust `index` and `language` to match the actual deployment before running.

index =  ""
language = "english"
embedding_endpoint_name = "cohere.embed-multilingual-v3"

port = 443
bulk_size = 10000000

# One Secrets Manager client serves both secret lookups below.
sm_client = boto3.client('secretsmanager')

# The 'opensearch-host-url' secret is parsed as JSON and its 'host' key is read.
master_user = sm_client.get_secret_value(SecretId='opensearch-host-url')['SecretString']
data= json.loads(master_user)
es_host_name = data.get('host')
if not es_host_name:
    # Fail with a clear message instead of a TypeError on None further down.
    raise ValueError("secret 'opensearch-host-url' does not contain a 'host' value")

# Reduce the endpoint to a bare host name: drop an optional scheme prefix
# ("https://", "http://", ...) and any trailing slash. Slicing a fixed number
# of characters would mangle endpoints stored without the "https://" prefix,
# for example: my-test-domain.us-east-1.es.amazonaws.com/
host = es_host_name.strip()
if '://' in host:
    host = host.split('://', 1)[1]
host = host.rstrip('/')
region = boto3.Session().region_name # e.g. cn-north-1
print('host:',host)
print('region:',region)

# The 'opensearch-master-user' secret is parsed as JSON; its 'username' and
# 'password' keys are read.
master_user = sm_client.get_secret_value(SecretId='opensearch-master-user')['SecretString']
data= json.loads(master_user)
username = data.get('username')
password = data.get('password')


In [None]:
# Endpoint names containing 'titan' or 'cohere' select the 'bedrock' embedding
# type; every other name selects 'sagemaker'.
# Substring membership is used on purpose: str.find() returns -1 (truthy) when
# the text is absent and 0 (falsy) when it is found at index 0, so using its
# result as a boolean can never yield 'sagemaker'.
embedding_type = 'bedrock' if ('titan' in embedding_endpoint_name or 'cohere' in embedding_endpoint_name) else 'sagemaker'
# NOTE(review): the embeddings object is always built with the Bedrock
# initializer, even when embedding_type is 'sagemaker' - confirm this is intended.
embeddings = init_embeddings_bedrock(embedding_endpoint_name)
vector_store=init_vector_store(embeddings,index,host,port,username,password)

In [None]:
import sys
sys.path.append(r"./python")

from tqdm import tqdm
import fitz
from PIL import Image
import numpy as np
import base64
from model import init_model_bedrock

# Maximum number of characters kept for each sentence stored in the metadata.
text_max_length = 2000

# Model identifier passed to init_model_bedrock to build the LLM used for page extraction.
model_name = "anthropic.claude-3-sonnet-20240229-v1:0"
llm = init_model_bedrock(model_name)

def is_json(myjson):
    """Report whether `myjson` can be parsed as a JSON document.

    Args:
        myjson: Candidate value, normally a str (bytes/bytearray are also
            accepted by json.loads).

    Returns:
        True if json.loads accepts the value, False otherwise. Values of an
        unsupported type (for example None or an int) also yield False
        rather than raising.
    """
    try:
        json.loads(myjson)
    except (ValueError, TypeError):
        # ValueError covers malformed JSON (JSONDecodeError is a subclass);
        # TypeError covers inputs that are not str/bytes/bytearray.
        return False
    return True

# Prompt template sent to the LLM together with each page image.
# The {context} placeholder is filled through str.format with the model's
# output for the previously processed page, so any literal braces added to
# this template would have to be doubled ({{ and }}).
prompt = """
You are a document organizer of bicycle company and your task is to extract useful information from images.Please refer to the format of the content of the previous page to extract text information from the image on the current page.
If the content on this page contains tables or maintenance process, please organize the tables or maintenance process into json format. 
If the content of this page is a table and has no header, use the header of the previous page. 
If the content of this page is a maintenance process and does not specify the specific maintenance object, use the maintenance object on the previous page.

<previous page content>
{context}
</previous page content>
No preface, just output the content directly.
"""

files_path = '../../docs/giant/'
# Folder that receives one sub-folder of rendered page images per document.
images_root = 'images/'
# Number of leading pages ignored in every document (page indices 0 and 1).
skip_pages = 2


def _add_json_sentences(parsed, sentences, expand_nested):
    """Add "key\\nvalue" sentences built from a parsed JSON value to `sentences`.

    Args:
        parsed: Result of json.loads (dict, list or scalar).
        sentences: Set that receives the generated sentence strings.
        expand_nested: When True, a dict value that is itself a dict is
            expanded one level into "sub_key\\nsub_value" entries instead of
            being stringified as a whole.
    """
    if isinstance(parsed, dict):
        for key, value in parsed.items():
            if expand_nested and isinstance(value, dict):
                for sub_key, sub_value in value.items():
                    sentences.add(str(sub_key) + '\n' + str(sub_value))
            else:
                sentences.add(str(key) + '\n' + str(value))
    elif isinstance(parsed, list):
        # A JSON array (e.g. a list of table rows) is handled item by item;
        # indexing a list with its own elements would raise TypeError.
        for item in parsed:
            _add_json_sentences(item, sentences, expand_nested)
    else:
        sentences.add(str(parsed))


# exist_ok keeps the cell re-runnable: os.mkdir raises FileExistsError when
# the folder is left over from a previous run.
os.makedirs(images_root, exist_ok=True)
files = os.listdir(files_path)
for file in files:
    file_path = files_path + file
    # os.listdir also returns directories (for example .ipynb_checkpoints),
    # which cannot be opened as documents.
    if not os.path.isfile(file_path):
        continue
    print(file_path)
    fname = file.split('/')[-1].split('.')[0]
    print(fname)
    os.makedirs(images_root + fname, exist_ok=True)

    doc = fitz.open(file_path)
    try:
        # Output of the previously processed page; injected into the prompt
        # so the model can reuse table headers / maintenance objects.
        previous_page_content = ''

        for i in tqdm(range(skip_pages, doc.page_count)):
            page = doc.load_page(i)
            pix = page.get_pixmap()

            # Keep a JPEG copy of the page; its file name is stored in the metadata.
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            img_name = fname + '-' + str(i) + '.jpg'
            img.save(images_root + fname + '/' + img_name, "JPEG")

            # NOTE(review): Pixmap.tobytes() is called with its default output
            # format - confirm it matches what the model wrapper expects.
            imgb64 = base64.b64encode(pix.tobytes()).decode("utf-8")
            llm.model_kwargs = {'image': imgb64,'max_tokens':2048}
            response = llm(prompt=prompt.format(context=previous_page_content)).strip()
            previous_page_content = response

            # Split the model output into individual sentences. A set is used,
            # so duplicates collapse and iteration order is not guaranteed.
            response_set = set()
            if is_json(response) and response.find('{') >=0:
                # The whole answer is one JSON document.
                parsed = json.loads(response)
                _add_json_sentences(parsed, response_set, expand_nested=True)
                response = json.dumps(parsed)
            else:
                # Mixed output: treat every line on its own.
                for line in response.split('\n'):
                    line = line.strip().replace('null','')
                    if len(line) > 2:
                        if is_json(line) and line.find('{') >=0:
                            _add_json_sentences(json.loads(line), response_set, expand_nested=False)
                        else:
                            response_set.add(line)

            # Every sentence of the page is stored with the full page text as
            # its paragraph, so the paragraph is computed once.
            paragraph = str(response).replace(': null,','')
            texts = []
            metadatas = []
            for text in response_set:
                texts.append(paragraph)
                # NOTE(review): this loader writes the key 'sources' while the
                # subtitle loader in this notebook writes 'source' - confirm
                # which key the search side reads.
                metadatas.append({
                    'sentence': text[:text_max_length],
                    'sources': file.split('/')[-1],
                    'type': 'pdf',
                    'page': i,
                    'image': img_name,
                })

            print('texts len:',len(texts))
            print('metadatas len:',len(metadatas))
            if not texts:
                # Nothing was extracted from this page; skip the store call.
                print('no text extracted, skip page:', i)
                continue
            print('begin to save in vectore store')
            vector_store.add_texts_sentence_in_metadata(
                texts=texts,
                metadatas=metadatas,
                bulk_size=10000,
                batch_size=100,
                text_field='paragraph',
                vector_field='sentence_vector',
                embedding_type=embedding_type
            )
            print('finish save in vectore store')
    finally:
        # Release the document handle even when a page fails.
        doc.close()

In [None]:
### Just for srt file data load

import pysrt
import os

# Maps each subtitle (.srt) file name to the name of the video file it
# belongs to; the value is stored as the 'source' of every subtitle entry.
# NOTE(review): the 'Contact Aerolight Stem' entry maps to a video name that
# contains "/ " - confirm this matches the real file name.
source_map={
'Script - SXC 32 Fork Remote Lockout Service [2023_SM_INC].srt':'SXC 32 Fork Remote Lockout Service [2023_SM_INC].mp4',
'subtitle_Contact Switch Dropper Seatpost Service [2023_SM_INC].srt':'Contact Switch Dropper Seatpost Service [2023_SM_INC]-selected.mp4',
'Crest Fork Series Differentiation [2022_SM_INC].srt':'Crest Fork Series Differentiation [2022_SM_INC].mp4',
'Script - STL 34 Fork Air Spring Service [2023_SM_INC].srt':'STL 34 Fork Air Spring Service [2023_SM_INC].mp4',
'subtitles_Contact Switch AT Dropper - Dropper Seatpost Service [2024_SM_INC].srt':'final_Contact Switch AT Dropper - Dropper Seatpost Service [2024_SM_INC].mp4',
'Script - Contact Aerolight Stem Range Introduction.srt':'Contact Aerolight Stem/ Range Introduction [2023_SM_INC].mp4',
'Script - SXC 32 Fork Air Spring Service [2023_SM_INC].srt':'SXC 32 Fork Air Spring Service [2023_SM_INC].mp4',
'CREST 34 SL Lower Leg Service [2022_SM_INC].srt':'CREST 34 SL Lower Leg Service [2022_SM_INC].mp4',
'subtitles_Contact Switch AT Dropper - Remote Lever Installation [2024_SM_INC].srt':'final_Contact Switch AT Dropper - Remote Lever Installation [2024_SM_INC].mp4',
'CREST 34 SL Damper Service [2022_SM_INC].srt':'CREST 34 SL Damper Service [2022_SM_INC].mp4'
}

def time_to_num(time_str):
    """Convert an "HH:MM:SS" string into a total number of seconds.

    Args:
        time_str: Three colon-separated integer fields (hours, minutes, seconds).

    Returns:
        The total as an int.

    Raises:
        ValueError: If the string does not hold exactly three integer fields.
    """
    hours, minutes, seconds = (int(field) for field in time_str.split(':'))
    return hours * 3600 + minutes * 60 + seconds

srt_path = '../docs/giant_srt/'
files = os.listdir(srt_path)
# One entry per subtitle line; texts[k] and metadatas[k] describe the same line.
texts = []
metadatas = []
for file in files:
    print(file)
    if file.find('checkpoints') >=0:
        continue
    # Ignore stray non-subtitle entries in the folder; every key of
    # source_map ends in '.srt', so such entries could only fail the lookup.
    if not file.lower().endswith('.srt'):
        print('skip non-srt file:', file)
        continue
    if file not in source_map:
        # Fail before anything is written, with a message that names the file.
        raise KeyError("no video mapping in source_map for subtitle file: " + file)
    video_name = source_map[file]
    subs = pysrt.open(srt_path + file)
    for sub in subs:
        text = sub.text
        texts.append(text)
        # Start offset of the subtitle in whole seconds from the beginning.
        start = sub.start.to_time()
        # NOTE(review): this loader writes the key 'source' while the PDF
        # loader in this notebook writes 'sources' - confirm which key the
        # search side reads.
        metadatas.append({
            'type': 'video',
            'start': start.second + 60*start.minute + 3600*start.hour,
            'source': video_name,
            'sentence': text,
        })
print(len(texts))
print(len(metadatas))

if texts:
    print('begin to save in vectore store')
    vector_store.add_texts_sentence_in_metadata(
        texts=texts,
        metadatas=metadatas,
        bulk_size=10000,
        batch_size=100,
        text_field='paragraph',
        vector_field='sentence_vector',
        embedding_type=embedding_type
    )
    print('finish save in vectore store')
else:
    # Nothing was loaded; skip the store call.
    print('no subtitle text found, nothing to save')