In [None]:
%pip install python-dotenv langchain langchain-community langchain-openai langchainhub openai tiktoken azure-ai-documentintelligence azure-identity azure-search-documents==11.6.0b3

In [1]:
"""
This code loads environment variables using the `dotenv` library and sets the necessary environment variables for Azure services.
The environment variables are loaded from the `.env` file in the same directory as this notebook.
"""
import os
from dotenv import load_dotenv


# Read the `.env` file into the process environment. Without this call the
# lookups below only see variables that were already exported in the shell that
# started the kernel. By default python-dotenv searches the working directory
# and its parents and does not override variables that are already set.
load_dotenv()

# The Azure OpenAI settings are consumed from the environment by later cells,
# so stop here with a readable message if either one is absent (the previous
# `os.environ[name] = os.getenv(name)` form failed with an opaque TypeError).
_missing_openai_vars = [
    name
    for name in ("AZURE_OPENAI_ENDPOINT", "AZURE_OPENAI_KEY")
    if os.getenv(name) is None
]
if _missing_openai_vars:
    raise EnvironmentError(
        "Missing environment variable(s): "
        + ", ".join(_missing_openai_vars)
        + ". Define them in the .env file or export them before running this notebook."
    )

# Document Intelligence credentials are handed to the loader explicitly in a
# later cell; they are None here if the variables are not defined.
doc_intelligence_endpoint = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
doc_intelligence_key = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_KEY")

In [2]:
from langchain import hub
from langchain_openai import AzureChatOpenAI
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
from langchain_openai import AzureOpenAIEmbeddings
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.vectorstores.azuresearch import AzureSearch

In [3]:
# PDF that this cell analyses; the image-extraction cell further down opens the same file.
book_file_path = "/Users/huqianghui/Downloads/books-translations/books/TheInnovatorsDilemmaWhenNewTechnoloClaytonMChristensen-chart1.pdf"

# Azure AI Document Intelligence loader using the "prebuilt-layout" model.
# The loader accepts either file_path or url_path as the document source.
loader = AzureAIDocumentIntelligenceLoader(
    file_path=book_file_path,
    api_key=doc_intelligence_key,
    api_endpoint=doc_intelligence_endpoint,
    api_model="prebuilt-layout",
)

# Disabled experiment, kept for reference only -- none of these options are
# passed to the loader above.
# analysis_features=["ocr_high_resolution"]
# analyze_options = {
#     "pages": "10-16",             # restrict analysis to a page range
#     "reading_order": "natural",   # natural reading order
#     "text_angle": True,           # detect text angle
#     "ocr_high_resolution": True   # enable high-resolution OCR
# }

docs = loader.load()

# Markdown heading levels 1-3 become split points; each tuple is
# (heading marker, metadata key attached to the resulting chunk).
headers_to_split_on = [("#" * level, f"Header {level}") for level in (1, 2, 3)]
text_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

# Only the first returned document is split.
docs_string = docs[0].page_content
splits = text_splitter.split_text(docs_string)

print(f"Length of splits: {len(splits)}")



Length of splits: 18


In [11]:
import tiktoken

# Tokenizer shared by the whole notebook; the chunk-export cell labels the
# counts produced with it as "gpt4-o token".
encoding = tiktoken.get_encoding("o200k_base")


def num_tokens_from_string(string: str) -> int:
    """Return how many tokens the module-level `encoding` produces for `string`."""
    return len(encoding.encode(string))

In [None]:
# Base name shared by every chunk file: "<book_name>-<index>.md".
book_name = "TheInnovatorsDilemmaWhenNewTechnoloClaytonMChristensen-chart1"

# Directory to save the chunks. Create it up front so open() below does not
# fail with FileNotFoundError on a fresh machine.
output_dir = "/Users/huqianghui/Downloads/books-translations/book-chunks"
os.makedirs(output_dir, exist_ok=True)

# Strings stripped from every chunk before it is written (presumably watermark
# text left by the scanning/conversion tool -- confirm against the source PDF).
noise_fragments = ("Z 转转大师", "扫描转换,就是高效")

# Zero-pad the index to the width of the largest index so the files sort in
# chunk order; max(..., 0) keeps the width sensible when `splits` is empty.
max_index_length = len(str(max(len(splits) - 1, 0)))

for index, document in enumerate(splits):
    # Construct the file name
    padded_index = f"{index:0{max_index_length}d}"
    file_name = f"{book_name}-{padded_index}.md"
    file_path = os.path.join(output_dir, file_name)

    # Clean the chunk and report its size before touching the file system, so
    # a failure here does not leave an empty, truncated file behind.
    content = document.page_content
    for fragment in noise_fragments:
        content = content.replace(fragment, "")
    num_tokens_o200k_base = num_tokens_from_string(content)
    print(f"file_name:{file_name},gpt4-o token:{num_tokens_o200k_base}...")

    # Explicit UTF-8: the chunks contain CJK text, and the platform default
    # encoding is not guaranteed to be able to represent it.
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(content)


file_name:TheInnovatorsDilemmaWhenNewTechnoloClaytonMChristensen-chart1-0.md,gpt4-o token:374...
file_name:TheInnovatorsDilemmaWhenNewTechnoloClaytonMChristensen-chart1-1.md,gpt4-o token:48...
file_name:TheInnovatorsDilemmaWhenNewTechnoloClaytonMChristensen-chart1-2.md,gpt4-o token:2980...
file_name:TheInnovatorsDilemmaWhenNewTechnoloClaytonMChristensen-chart1-3.md,gpt4-o token:188...
file_name:TheInnovatorsDilemmaWhenNewTechnoloClaytonMChristensen-chart1-4.md,gpt4-o token:490...
file_name:TheInnovatorsDilemmaWhenNewTechnoloClaytonMChristensen-chart1-5.md,gpt4-o token:328...
file_name:TheInnovatorsDilemmaWhenNewTechnoloClaytonMChristensen-chart1-6.md,gpt4-o token:478...
file_name:TheInnovatorsDilemmaWhenNewTechnoloClaytonMChristensen-chart1-7.md,gpt4-o token:456...
file_name:TheInnovatorsDilemmaWhenNewTechnoloClaytonMChristensen-chart1-8.md,gpt4-o token:3452...
file_name:TheInnovatorsDilemmaWhenNewTechnoloClaytonMChristensen-chart1-9.md,gpt4-o token:211...
file_name:TheInnovatorsDilemm

In [None]:
%pip install PyMuPDF

In [5]:
import os

import fitz  # PyMuPDF
import pymupdf

# Folder that receives the extracted images. Create it up front so the first
# open(..., "wb") below does not fail when the folder is missing.
extracted_images_dir = "/Users/huqianghui/Downloads/books-translations/book-chunks/images"
os.makedirs(extracted_images_dir, exist_ok=True)

# Open the PDF; the context manager closes the document even if extraction
# fails part-way through.
with fitz.open(book_file_path) as doc:
    # Walk every page
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)

        # Image entries referenced by this page
        images = page.get_images(full=True)
        for img_index, img_info in enumerate(images):
            xref = img_info[0]  # first field of the entry: the image's xref in the PDF
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]  # raw binary data of the image
            image_ext = base_image["ext"]  # file extension, e.g. jpeg or png

            # Save the image as image_page<1-based page>_<index on page>.<ext>
            image_file_path = os.path.join(
                extracted_images_dir,
                f"image_page{page_num + 1}_{img_index}.{image_ext}",
            )
            with open(image_file_path, "wb") as img_file:
                img_file.write(image_bytes)

In [None]:
%pip install azure-ai-vision-imageanalysis

In [10]:
import os
import shutil
from azure.ai.vision.imageanalysis import ImageAnalysisClient
from azure.ai.vision.imageanalysis.models import VisualFeatures
from azure.core.credentials import AzureKeyCredential
from PIL import Image

# Read the Azure AI Vision endpoint and key from the environment.
try:
    endpoint = os.environ["AZURE_VISION_ENDPOINT"]
    key = os.environ["AZURE_VISION_KEY"]
except KeyError as missing_var:
    # Raise instead of print + exit(): the error stops the cell with a visible
    # traceback, and the message names the variables that are actually read.
    raise RuntimeError(
        "Missing environment variable 'AZURE_VISION_ENDPOINT' or 'AZURE_VISION_KEY'. "
        "Set them before running this sample."
    ) from missing_var

# Create an Image Analysis client
client = ImageAnalysisClient(
    endpoint=endpoint,
    credential=AzureKeyCredential(key)
)

images_src_folder_path="/Users/huqianghui/Downloads/books-translations/book-chunks/images"
images_target_folder_path="/Users/huqianghui/Downloads/books-translations/book-chunks/filtered-images"

# shutil.move() only moves a file *into* the destination when it is an existing
# directory; otherwise it renames the file to that path. Make sure it exists.
os.makedirs(images_target_folder_path, exist_ok=True)

# Images whose recognised text contains this string are left in the source folder.
image_noises="扫描转换,就是高效"

for filename in os.listdir(images_src_folder_path):
    if not filename.endswith((".jpeg", ".png")):
        continue

    image_path = os.path.join(images_src_folder_path, filename)

    # Read the image dimensions with Pillow
    with Image.open(image_path) as img:
        width, height = img.size

    # Skip images outside the 50-16000 px range on either side
    # (presumably the size limits of the analysis service -- confirm).
    if width < 50 or height < 50 or width > 16000 or height > 16000:
        print(f"Skipping {filename}: Invalid image size {width}x{height}.")
        continue

    with open(image_path, "rb") as f:
        image_data = f.read()

    # Run OCR (the READ feature) on the image bytes
    result = client.analyze(
        image_data=image_data,
        visual_features=[VisualFeatures.READ])

    # Collect every recognised line, one per row. Iterating over all blocks
    # avoids an IndexError when the service returns no text block at all.
    textInImage = ""
    if result.read is not None:
        for block in result.read.blocks:
            for line in block.lines:
                textInImage += line.text + "\n"

    # Keep (move) only the images that do not contain the noise string
    if image_noises not in textInImage:
        shutil.move(image_path, images_target_folder_path)




Skipping image_page28_1.jpeg: Invalid image size 22x39.
Skipping image_page2_1.jpeg: Invalid image size 13x12.
