# LangChain data chunking example
...

## Load File

In [17]:
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain.document_loaders import TextLoader
import os

# Alternative loader that parses the Markdown structure instead of reading raw text:
# loader = UnstructuredMarkdownLoader("sales_dscnt_strategy.md", mode="single",)
file_name = "sales_dscnt_strategy.md"
# The file contains Chinese text. Pin the encoding so that loading does not
# depend on the platform's default locale encoding (which is what is used when
# no encoding is given).
loader = TextLoader(file_name, encoding="utf-8")
# load() returns a list of Document objects; a single one for this file.
docs = loader.load()
print(docs)

[Document(metadata={'source': 'sales_dscnt_strategy.md'}, page_content='# iPhone的折扣策略:\n\n购买任何iPhone总价超200, 打九折(10% off).\niPhone 7 打八折.\n\n# Apple pro vision的折扣策略\n\n无折扣优惠\n\n# Vivo的折扣策略\n\n无折扣优惠\n\n# Samsung的折扣策略:\n\n购买 Samsung Galaxy Z Flip 手机打九折(10% off)\n\n# Nokida的折扣策略:\n\n无折扣优惠\n\n# 不分品牌的折扣策略:\n\n购买任何手机总价超400, 在原有折扣的基础上再打九折.')]


## Chunk Using TextSplitter

In [18]:
import tiktoken
# print(tiktoken.encoding_for_model('gpt-4o').name)

### RecursiveCharacterTextSplitter

In [15]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# from_tiktoken_encoder lets us size chunks in tokens rather than characters;
# the encoding is whichever one tiktoken associates with gpt-4o.
gpt4o_encoding_name = tiktoken.encoding_for_model('gpt-4o').name
recursive_text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    encoding_name=gpt4o_encoding_name,
    chunk_size=20,
    chunk_overlap=6,
)

recursive_text_splitter_chunks = recursive_text_splitter.split_documents(docs)

# Show every chunk, each followed by a 100-dash rule so chunk boundaries are visible.
for chunk in recursive_text_splitter_chunks:
    print(chunk.page_content)
    print("----------" * 10)

iPhone的折扣策略:
----------------------------------------------------------------------------------------------------
购买任何iPhone总价超200, 打九折(10% off). iPhone
----------------------------------------------------------------------------------------------------
off). iPhone 7 打八折.
----------------------------------------------------------------------------------------------------
Apple pro vision的折扣策略

无折扣优惠

Vivo的折扣策略
----------------------------------------------------------------------------------------------------
无折扣优惠

Samsung的折扣策略:
----------------------------------------------------------------------------------------------------
购买 Samsung Galaxy Z Flip 手机打九折(10% off)
----------------------------------------------------------------------------------------------------
Nokida的折扣策略:

无折扣优惠
----------------------------------------------------------------------------------------------------
无折扣优惠

不分品牌的折扣策略:
----------------------------------------------------------------------------------

### SemanticChunker

In [27]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import AzureOpenAIEmbeddings

# Embedding model handed to SemanticChunker to decide where to break the text.
semantic_embeddings = AzureOpenAIEmbeddings(model='text-embedding-3-small')
semantic_text_splitter = SemanticChunker(
    semantic_embeddings,
    breakpoint_threshold_type="percentile",
    breakpoint_threshold_amount=95,
)
semantic_splitter_chunks = semantic_text_splitter.split_documents(docs)

# Show every chunk, each followed by a 100-dash rule so chunk boundaries are visible.
for chunk in semantic_splitter_chunks:
    print(chunk.page_content)
    print("----------" * 10)

# iPhone的折扣策略:

购买任何iPhone总价超200, 打九折(10% off).
----------------------------------------------------------------------------------------------------
iPhone 7 打八折. # Apple pro vision的折扣策略

无折扣优惠

# Vivo的折扣策略

无折扣优惠

# Samsung的折扣策略:

购买 Samsung Galaxy Z Flip 手机打九折(10% off)

# Nokida的折扣策略:

无折扣优惠

# 不分品牌的折扣策略:

购买任何手机总价超400, 在原有折扣的基础上再打九折.
----------------------------------------------------------------------------------------------------


### MarkdownHeaderTextSplitter

In [51]:
from langchain.text_splitter import MarkdownHeaderTextSplitter

# Split on level-1 headings only; the heading text is recorded in each chunk's
# metadata under the key "Header 1".
mdhead_text_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=[("#", "Header 1")])
# MarkdownHeaderTextSplitter has no split_documents(), so pass the raw text of
# the single loaded document.
mdhead_splitter_chunks = mdhead_text_splitter.split_text(docs[0].page_content)


for chunk in mdhead_splitter_chunks:
    # Text that precedes the first heading carries no "Header 1" entry; fall
    # back to an empty title instead of raising KeyError.
    print(chunk.metadata.get('Header 1', ''))
    print(chunk.page_content)
    print("----------"*10)

iPhone的折扣策略:
购买任何iPhone总价超200, 打九折(10% off).
iPhone 7 打八折.
----------------------------------------------------------------------------------------------------
Apple pro vision的折扣策略
无折扣优惠
----------------------------------------------------------------------------------------------------
Vivo的折扣策略
无折扣优惠
----------------------------------------------------------------------------------------------------
Samsung的折扣策略:
购买 Samsung Galaxy Z Flip 手机打九折(10% off)
----------------------------------------------------------------------------------------------------
Nokida的折扣策略:
无折扣优惠
----------------------------------------------------------------------------------------------------
不分品牌的折扣策略:
购买任何手机总价超400, 在原有折扣的基础上再打九折.
----------------------------------------------------------------------------------------------------


### MarkdownTextSplitter

In [50]:
from langchain.text_splitter import MarkdownTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_text_splitters.base import Language

# Start from LangChain's Markdown separator list and drop its last two entries
# (presumably the single-space and empty-string fallbacks - TODO confirm against
# the installed langchain-text-splitters version) so text is never split below
# line level.
separators = MarkdownTextSplitter.get_separators_for_language(Language.MARKDOWN)
separators = separators[:-2]

# The Markdown separators are regular-expression patterns (heading and
# horizontal-rule patterns). Without is_separator_regex=True the splitter
# escapes them and treats them as literal text, so they never match.
#
# chunk_size=1 forces a split at every separator, exactly like the previous
# chunk_size=0 (no non-empty piece is shorter than 1 character), but stays valid
# on releases that reject chunk_size <= 0.
mdtxt_text_splitter = RecursiveCharacterTextSplitter(
    separators=separators,
    is_separator_regex=True,
    chunk_size=1,
    chunk_overlap=0,
)
mdtxt_splitter_chunks = mdtxt_text_splitter.split_documents(docs)


# Show every chunk, each followed by a 100-dash rule so chunk boundaries are visible.
for chunk in mdtxt_splitter_chunks:
    print(chunk.page_content)
    print("----------"*10)


# iPhone的折扣策略:
----------------------------------------------------------------------------------------------------


----------------------------------------------------------------------------------------------------

购买任何iPhone总价超200, 打九折(10% off).
----------------------------------------------------------------------------------------------------

iPhone 7 打八折.
----------------------------------------------------------------------------------------------------


----------------------------------------------------------------------------------------------------

# Apple pro vision的折扣策略
----------------------------------------------------------------------------------------------------


----------------------------------------------------------------------------------------------------

无折扣优惠
----------------------------------------------------------------------------------------------------


-----------------------------------------------------------------------------------------

# Upload to Azure AI Search

## Load .env file (Copy .env-sample to .env and update accordingly)

In [57]:
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
import os

load_dotenv('../.env') # take environment variables from .env.

# Variables not used here do not need to be updated in your .env file
search_endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]
azure_openai_endpoint = os.environ["AZURE_OPENAI_ENDPOINT_4AISEARCH"]
# REFACTOR: other embedding models or rankers could be used here - CLS-based
# ones, or non-CLS multi-vector ones.
azure_openai_embedding_deployment_id = "text-embedding-3-small"
recursivetextsplitter_searchindex = 'sale_strategy_idx_tst2'  #os.environ["AZURE_SEARCH_LANGCHAIN_RECURSIVETEXTSPLITTER_INDEX"]

# The key variables are optional: use .get() so that a variable that is missing
# (not just empty) also reaches the fallback instead of raising KeyError.
search_admin_key = os.environ.get("AZURE_SEARCH_ADMIN_KEY", "")
search_credential = AzureKeyCredential(search_admin_key) if search_admin_key else DefaultAzureCredential()
# None signals "no API key configured" to the cells that build the OpenAI
# client and the index vectorizer.
azure_openai_key = os.environ.get("AZURE_OPENAI_API_KEY_4AISEARCH") or None

## Setup Client

In [35]:
from openai import AzureOpenAI
from azure.identity import get_bearer_token_provider

# Build the Azure OpenAI client used for all embedding calls in this notebook.
# The configuration cell leaves azure_openai_key as None when no API key is
# configured, so both authentication modes must be handled here.
if azure_openai_key:
    # API-key authentication.
    azure_openai_client = AzureOpenAI(
        api_key=azure_openai_key,
        api_version="2023-05-15",
        azure_deployment=azure_openai_embedding_deployment_id,
        azure_endpoint=azure_openai_endpoint)
else:
    # Keyless authentication: obtain bearer tokens for the Cognitive Services
    # scope through DefaultAzureCredential.
    azure_openai_client = AzureOpenAI(
        azure_ad_token_provider=get_bearer_token_provider(DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default"),
        api_version="2023-05-15",
        azure_deployment=azure_openai_embedding_deployment_id,
        azure_endpoint=azure_openai_endpoint)

## Create Index

In [58]:
from azure.search.documents.indexes import SearchIndexClient

from azure.search.documents.indexes.models import (
    SearchIndex,
    SearchIndexer,
    SearchIndexerDataSourceConnection,
    SearchIndexerDataContainer,
    SearchField,
    SearchFieldDataType,
    VectorSearch,
    VectorSearchProfile,
    HnswAlgorithmConfiguration,
    AzureOpenAIEmbeddingSkill,
    SplitSkill,
    SemanticConfiguration,
    SemanticPrioritizedFields,
    SemanticField,
    SemanticSearch,
)
# Required to use the preview SDK
from azure.search.documents.indexes._generated.models import (
    SearchIndexerSkillset,
    AzureOpenAIVectorizer,
    AzureOpenAIParameters,
    SearchIndexerIndexProjections,
    SearchIndexerIndexProjectionSelector,
    SearchIndexerIndexProjectionsParameters,
    InputFieldMappingEntry,
    OutputFieldMappingEntry
)

def create_search_index(index_name, azure_openai_endpoint, azure_openai_embedding_deployment_id, azure_openai_key=None,
                        embedding_dimensions=1536, embedding_model_name=None):
    """Build the SearchIndex definition used to store document chunks.

    This only constructs the definition object; it does not contact the search
    service. The index contains:

    * ``chunk_id``  - string key, keyword analyzer, filterable and sortable
    * ``parent_id`` - string, filterable, searchable
    * ``chunk``     - string, searchable chunk text
    * ``title``     - string, searchable chunk title
    * ``titleChunkVector`` - float vector bound to the ``vecprofile`` profile

    It also defines an HNSW vector profile with an Azure OpenAI vectorizer and a
    semantic configuration (``my-semantic-config``) whose content field is
    ``chunk``.

    Args:
        index_name: Name of the index.
        azure_openai_endpoint: Azure OpenAI resource URI used by the vectorizer.
        azure_openai_embedding_deployment_id: Embedding deployment used by the
            vectorizer.
        azure_openai_key: API key for the vectorizer; optional when RBAC
            authentication is used.
        embedding_dimensions: Length of the vectors stored in
            ``titleChunkVector``. It must match the embedding model's output
            size; the default of 1536 is the value previously hard-coded.
        embedding_model_name: Model name passed to the vectorizer. Defaults to
            the deployment id, which is only correct when the deployment is
            named after its model.

    Returns:
        The configured ``SearchIndex`` object.
    """
    if embedding_model_name is None:
        embedding_model_name = azure_openai_embedding_deployment_id

    return SearchIndex(
        name=index_name,
        fields=[
            SearchField(
                name="chunk_id",
                type=SearchFieldDataType.String,
                key=True,
                hidden=False,
                filterable=True,
                sortable=True,
                facetable=False,
                searchable=True,
                analyzer_name="keyword"
            ),
            SearchField(
                name="parent_id",
                type=SearchFieldDataType.String,
                hidden=False,
                filterable=True,
                sortable=False,
                facetable=False,
                searchable=True
            ),
            SearchField(
                name="chunk",
                type=SearchFieldDataType.String,
                hidden=False,
                filterable=False,
                sortable=False,
                facetable=False,
                searchable=True
            ),
            SearchField(
                name="title",
                type=SearchFieldDataType.String,
                hidden=False,
                filterable=False,
                sortable=False,
                facetable=False,
                searchable=True
            ),
            SearchField(
                name="titleChunkVector",
                type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                hidden=False,
                filterable=False,
                sortable=False,
                facetable=False,
                searchable=True,
                vector_search_dimensions=embedding_dimensions,
                vector_search_profile_name="vecprofile"
            )
        ],
        vector_search=VectorSearch(
            profiles=[
                # Ties the vector field to the HNSW algorithm and to the
                # vectorizer declared below.
                VectorSearchProfile(
                    name="vecprofile",
                    algorithm_configuration_name="hnsw-algorithm",
                    vectorizer="azure-openai-vectorizer"
                )
            ],
            algorithms=[
                HnswAlgorithmConfiguration(name="hnsw-algorithm")
            ],
            vectorizers=[
                AzureOpenAIVectorizer(
                        name="azure-openai-vectorizer",
                        azure_open_ai_parameters=AzureOpenAIParameters(
                            resource_uri=azure_openai_endpoint,
                            deployment_id=azure_openai_embedding_deployment_id,
                            model_name=embedding_model_name,
                            api_key=azure_openai_key # Optional if using RBAC authentication
                        )
                    )
            ]
        ),
        semantic_search=SemanticSearch(configurations=[
            SemanticConfiguration(
                name="my-semantic-config",
                prioritized_fields=SemanticPrioritizedFields(
                    # title_field=SemanticField(field_name="title"),
                    # keywords_fields=[SemanticField(field_name="category")],
                    content_fields=[SemanticField(field_name="chunk")]
                )
            )
        ])
    )


# Client for index management on the search service.
search_index_client = SearchIndexClient(
    endpoint=search_endpoint,
    credential=search_credential,
)

# Build the index definition locally, then create it on the service
# (or update it if an index with that name already exists).
rts_searchindex = create_search_index(
    index_name=recursivetextsplitter_searchindex,
    azure_openai_endpoint=azure_openai_endpoint,
    azure_openai_embedding_deployment_id=azure_openai_embedding_deployment_id,
    azure_openai_key=azure_openai_key,
)
search_index_client.create_or_update_index(rts_searchindex)

print("Created recursive text splitter index")


Created recursive text splitter index


## Embed Chunks

In [41]:
# Text that gets embedded for each chunk: its heading followed by its body.
# A chunk that precedes the first heading has no "Header 1" entry, so fall back
# to an empty heading instead of raising KeyError.
chunk_content = [
    f"{chunk.metadata.get('Header 1', '')} \n {chunk.page_content}"
    for chunk in mdhead_splitter_chunks
]
chunk_content

['iPhone的折扣策略: \n 购买任何iPhone总价超200, 打九折(10% off).\niPhone 7 打八折.',
 'Apple pro vision的折扣策略 \n 无折扣优惠',
 'Vivo的折扣策略 \n 无折扣优惠',
 'Samsung的折扣策略: \n 购买 Samsung Galaxy Z Flip 手机打九折(10% off)',
 'Nokida的折扣策略: \n 无折扣优惠',
 '不分品牌的折扣策略: \n 购买任何手机总价超400, 在原有折扣的基础上再打九折.']

In [52]:

# Embed all chunk texts in a single request and keep only the vectors, in the
# order the response lists them.
mdhead_splitter_embeddings = [
    item.embedding
    for item in azure_openai_client.embeddings.create(
        input=chunk_content,
        model=azure_openai_embedding_deployment_id,
    ).data
]


## Upload chunks to search index

In [59]:
# Client bound to the index created above.
recursive_search_client = search_index_client.get_search_client(recursivetextsplitter_searchindex)

# Each chunk must have exactly one embedding. A mismatch means the embedding
# cell was run against a different chunk list (stale kernel state), which would
# otherwise pair chunks with the wrong vectors or fail with an IndexError.
if len(mdhead_splitter_embeddings) != len(mdhead_splitter_chunks):
    raise ValueError(
        f"Got {len(mdhead_splitter_embeddings)} embeddings for "
        f"{len(mdhead_splitter_chunks)} chunks; re-run the embedding cell."
    )

docs2idx = [
    {
        "parent_id": file_name,
        "chunk_id": f"sales_dscnt_strategy_md_{i}",
        "chunk": chunk.page_content,
        # Chunks that precede the first heading have no "Header 1" entry.
        "title": chunk.metadata.get('Header 1', ''),
        "titleChunkVector": embedding
    }
    for i, (chunk, embedding) in enumerate(zip(mdhead_splitter_chunks, mdhead_splitter_embeddings))
]

# upload_documents returns one result per document, and a document can be
# rejected without the call raising. Check every result so the success message
# below is only printed when all chunks were indexed.
upload_results = recursive_search_client.upload_documents(docs2idx)
failed_keys = [item.key for item in upload_results if not item.succeeded]
if failed_keys:
    raise RuntimeError(f"Failed to index {len(failed_keys)} chunk(s): {failed_keys}")

print("Uploaded chunks and embeddings for recursive text splitter")

Uploaded chunks and embeddings for recursive text splitter


# Search

## Hybrid Search

In [71]:
# Hybrid Search

from azure.search.documents.models import VectorizableTextQuery
from azure.search.documents.models import VectorizedQuery

query = "买两台手机有什么优惠么?"

# Vector query 1: the query text embedded client-side through Azure OpenAI.
embedding_response = azure_openai_client.embeddings.create(
    input=query,
    model=azure_openai_embedding_deployment_id,
)
embedding = embedding_response.data[0].embedding
aoai_vector_query = VectorizedQuery(
    vector=embedding,
    k_nearest_neighbors=3,
    fields="titleChunkVector",
)

# Vector query 2: the raw query text, left for the search service to vectorize;
# this one carries an explicit weight of 0.8.
vector_query = VectorizableTextQuery(
    text=query,
    k_nearest_neighbors=3,
    fields="titleChunkVector",
    weight=0.8,
)

# Hybrid search: keyword search on the query text plus both vector queries,
# returning only the title and chunk fields of the top 3 hits.
results = recursive_search_client.search(
    search_text=query,
    vector_queries=[vector_query, aoai_vector_query],
    select=["title", "chunk"],
    top=3,
)

for result in results:
    print(f"Title: {result['title']}")
    print(f"Content: {result['chunk']}")
    print(f"Score: {result['@search.score']} \n")

Title: 不分品牌的折扣策略:
Content: 购买任何手机总价超400, 在原有折扣的基础上再打九折.
Score: 0.046666670590639114 

Title: Samsung的折扣策略:
Content: 购买 Samsung Galaxy Z Flip 手机打九折(10% off)
Score: 0.04516128823161125 

Title: iPhone的折扣策略:
Content: 购买任何iPhone总价超200, 打九折(10% off).
iPhone 7 打八折.
Score: 0.0448928102850914 



## Semantic Hybrid Search

In [70]:
# Semantic Hybrid Search
from azure.search.documents.models import QueryType, QueryCaptionType, QueryAnswerType

query = "买两台手机有什么优惠么?"

# Vector query 1: the query text embedded client-side through Azure OpenAI.
embedding_response = azure_openai_client.embeddings.create(
    input=query,
    model=azure_openai_embedding_deployment_id,
)
embedding = embedding_response.data[0].embedding
aoai_vector_query = VectorizedQuery(
    vector=embedding,
    k_nearest_neighbors=3,
    fields="titleChunkVector",
)

# Vector query 2: the raw query text, left for the search service to vectorize;
# this one carries an explicit weight of 0.8.
vector_query = VectorizableTextQuery(
    text=query,
    k_nearest_neighbors=3,
    fields="titleChunkVector",
    weight=0.8,
)

# Semantic hybrid search: the same keyword + vector request as the plain hybrid
# search, with the semantic query type and the index's 'my-semantic-config'
# configuration, so each hit also carries a reranker score.
# Captions and answers are left disabled:
#   query_caption=QueryCaptionType.EXTRACTIVE, query_answer=QueryAnswerType.EXTRACTIVE
results = recursive_search_client.search(
    search_text=query,
    vector_queries=[vector_query, aoai_vector_query],
    select=["title", "chunk"],
    query_type=QueryType.SEMANTIC,
    semantic_configuration_name='my-semantic-config',
    top=3,
)

for result in results:
    print(f"Title: {result['title']}")
    print(f"Content: {result['chunk']}")
    print(f"Score: {result['@search.score']}")
    print(f"Reranker Score: {result['@search.reranker_score']} \n")

Title: Samsung的折扣策略:
Content: 购买 Samsung Galaxy Z Flip 手机打九折(10% off)
Score: 0.04516128823161125
Reranker Score: 2.231081485748291 

Title: 不分品牌的折扣策略:
Content: 购买任何手机总价超400, 在原有折扣的基础上再打九折.
Score: 0.046666670590639114
Reranker Score: 2.2171988487243652 

Title: iPhone的折扣策略:
Content: 购买任何iPhone总价超200, 打九折(10% off).
iPhone 7 打八折.
Score: 0.0448928102850914
Reranker Score: 2.123516082763672 

