# Azure AI Search LangChain vector code sample

This notebook was tested with Python 3.10.0.

In [10]:
# pip install -r requirements.txt

## Import required libraries and environment variables

In [4]:
# Import required libraries  
import openai
import os  
from dotenv import load_dotenv
from langchain_community.vectorstores.azuresearch import AzureSearch
from langchain.embeddings import OpenAIEmbeddings, AzureOpenAIEmbeddings

from azure.search.documents.indexes.models import (
    FreshnessScoringFunction,
    FreshnessScoringParameters,
    ScoringProfile,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    TextWeights,
    SemanticSettings,
    SemanticConfiguration,
    PrioritizedFields,
    SemanticField
)


## Configure Azure OpenAI settings

In [5]:
# TODO: point dotenv_path at your own .env-{myname} file before running.
# override=True lets values from that file replace variables already set in the process.
load_dotenv(dotenv_path='../.env-leo', override=True)

# Azure OpenAI connection settings, stored on the openai module so later cells can reuse them.
openai.api_type: str = "azure"
openai.api_key = os.environ.get("AZURE_OPENAI_API_KEY")
openai.api_base = os.environ.get("AZURE_OPENAI_ENDPOINT")
openai.api_version = os.environ.get("AZURE_OPENAI_API_VERSION")

# Name of the deployed embedding model.
model: str = os.environ.get("AZURE_OPENAI_EMBEDDING_DEPLOYED_MODEL")

# Print the endpoint, API version and deployment name; the API key is not printed.
print(openai.api_base, openai.api_version, model, sep="\n")

https://prompton52g-aoai-12.openai.azure.com/
2023-10-01-preview
text-embedding-ada-002


## Configure vector store settings

In [6]:
# Azure AI Search connection settings, read from environment variables.
vector_store_address: str = os.environ.get("AZURE_SEARCH_SERVICE_ENDPOINT")
vector_store_password: str = os.environ.get("AZURE_SEARCH_ADMIN_KEY")
# Set your own index name through the AZURE_SEARCH_INDEX_NAME variable in your .env file.
index_name: str = os.environ.get("AZURE_SEARCH_INDEX_NAME")

# Print the endpoint and index name; the admin key is not printed.
print(vector_store_address, index_name, sep="\n")

https://prompton52g-aisearch-12.search.windows.net
langchain-vector-demo


In [7]:
# Embedding client for the Azure OpenAI deployment configured in the settings cell.
embeddings: OpenAIEmbeddings = AzureOpenAIEmbeddings(
    # The deployment name is used both as the model and as the Azure deployment.
    model=model,
    azure_deployment=model,
    # Connection details previously stored on the openai module.
    azure_endpoint=openai.api_base,
    api_key=openai.api_key,
    api_version=openai.api_version,
    openai_api_type=openai.api_type,
    chunk_size=1,
)

# Callable that embeds a single query string; reused when defining the index schema.
embedding_function = embeddings.embed_query

  warn_deprecated(
  warn_deprecated(


In [8]:
# Index schema: the key, content, vector and metadata fields, plus extra
# fields (title, source, last_update) that are populated from document metadata.
fields = [
    # Document key.
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        filterable=True,
    ),
    # Full-text searchable document body.
    SearchableField(
        name="content",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    # Embedding of the content.
    SearchField(
        name="content_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        # Embed a sample string once to discover the embedding dimension.
        vector_search_dimensions=len(embedding_function("Text")),
        vector_search_configuration="default",
    ),
    # Document metadata stored as a searchable string.
    SearchableField(
        name="metadata",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    # Additional field to store the title
    SearchableField(
        name="title",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    # Additional field for filtering on document source
    SimpleField(
        name="source",
        type=SearchFieldDataType.String,
        filterable=True,  # must be True for filter expressions on this field to work
    ),
    # Additional data field for last doc update.
    # `searchable` is intentionally not passed: SimpleField always builds a
    # non-searchable field, and DateTimeOffset fields cannot be full-text
    # searchable, so the argument had no effect.
    SimpleField(
        name="last_update",
        type=SearchFieldDataType.DateTimeOffset,
        filterable=True,
    ),
]

In [9]:
# Custom scoring profile: weight text matches in `title` and boost recently
# updated documents through a freshness function on `last_update`.
sc_name = "scoring_profile"
sc = ScoringProfile(
    name=sc_name,
    text_weights=TextWeights(weights={"title": 5}),
    function_aggregation="sum",
    functions=[
        FreshnessScoringFunction(
            field_name="last_update",
            boost=100,
            # "P2D" is an ISO 8601 duration of two days.
            parameters=FreshnessScoringParameters(boosting_duration="P2D"),
            interpolation="linear",
        )
    ],
)

# NOTE: this replaces the index name that an earlier cell loaded from the environment.
index_name = "langchain-vector-demo-custom-scoring-profile"

vector_store: AzureSearch = AzureSearch(
    azure_search_endpoint=vector_store_address,
    azure_search_key=vector_store_password,
    index_name=index_name,
    embedding_function=embeddings.embed_query,
    fields=fields,
    scoring_profiles=[sc],
    default_scoring_profile=sc_name,

    # Everything below configures semantic search; it can be omitted if
    # semantic search is not used.
    semantic_configuration_name='semantic_config',
    semantic_settings=SemanticSettings(
        default_configuration='semantic_config',
        configurations=[
            SemanticConfiguration(
                name='semantic_config',
                prioritized_fields=PrioritizedFields(
                    # Use the dedicated `title` field as the semantic title
                    # instead of repeating `content`, which is already the
                    # prioritized content field.
                    title_field=SemanticField(field_name='title'),
                    prioritized_content_fields=[SemanticField(field_name='content')],
                    prioritized_keywords_fields=[SemanticField(field_name='metadata')]
                ))
        ])
)



In [10]:
# Add near-identical documents with different last_update values to show the
# effect of the scoring profile.
from datetime import datetime, timedelta, timezone

# Timestamp layout written to the `last_update` field.
TIMESTAMP_FORMAT = "%Y-%m-%dT%H:%M:%S-00:00"

# Read the clock once, as a timezone-aware UTC value, so all three timestamps
# derive from the same instant (datetime.utcnow() is deprecated since Python 3.12).
now_utc = datetime.now(timezone.utc)

today = now_utc.strftime(TIMESTAMP_FORMAT)
yesterday = (now_utc - timedelta(days=1)).strftime(TIMESTAMP_FORMAT)
one_month_ago = (now_utc - timedelta(days=30)).strftime(TIMESTAMP_FORMAT)

# One metadata dict per text, in the same order as the texts.
# The call is the last expression of the cell, so its return value is displayed.
vector_store.add_texts(
    ["Test 1", "Test 1", "Test 2"],
    [
        {
            "title": "Title 1",
            "source": "source1",
            "random": "10290",
            "last_update": today,
        },
        {
            "title": "Title 2",
            "source": "source2",
            "random": "48392",
            "last_update": yesterday,
        },
        {
            "title": "Title 3",
            "source": "source3",
            "random": "32893",
            "last_update": one_month_ago,
        },
    ],
)

['Njc4ZDBhYmItNDUyMy00M2MwLWI4ZTktYmY4YmIyYzhlMGQ2',
 'ZWZiMWZlYzYtYzE3MS00OWJhLTk4YTItMDY2MGI0Yjc0MTY4',
 'ODQyNTZjYjctMzU1NC00OTMyLWE0NjMtNDZlMWI0MjU1YzA2']

# Vector similarity search

In [13]:
# Vector-only query: ask for the top 3 matches for the query text.
res = vector_store.similarity_search(
    query="Test 1",
    k=3,
    search_type="similarity",
)
# Display the returned documents.
res

[Document(page_content='Test 1', metadata={'id': 'OTc4NzgwMzktOThlNi00ZmFiLThlN2UtOTYzZDIyNDBlMDc3', 'title': 'Title 2', 'source': 'source2', 'random': '48392', 'last_update': '2024-01-17T07:58:26-00:00'}),
 Document(page_content='Test 1', metadata={'id': 'Njc4ZDBhYmItNDUyMy00M2MwLWI4ZTktYmY4YmIyYzhlMGQ2', 'title': 'Title 1', 'source': 'source1', 'random': '10290', 'last_update': '2024-01-18T08:05:15-00:00'}),
 Document(page_content='Test 1', metadata={'id': 'NWFiYWQ2ZmItN2QwZC00YzcxLTg3ZDQtYzhiMzE2OTc1ZTg1', 'title': 'Title 1', 'source': 'source1', 'random': '10290', 'last_update': '2024-01-18T07:58:26-00:00'})]

# Hybrid search

In [14]:
# Hybrid query for the top 3 matches, restricted by a filter expression to
# documents whose `source` field equals 'source2'.
res = vector_store.similarity_search(
    query="Test 1",
    k=3,
    search_type="hybrid",
    filters="source eq 'source2'",
)
# Display the returned documents.
res

[Document(page_content='Test 1', metadata={'id': 'OTc4NzgwMzktOThlNi00ZmFiLThlN2UtOTYzZDIyNDBlMDc3', 'title': 'Title 2', 'source': 'source2', 'random': '48392', 'last_update': '2024-01-17T07:58:26-00:00'}),
 Document(page_content='Test 1', metadata={'id': 'ZWZiMWZlYzYtYzE3MS00OWJhLTk4YTItMDY2MGI0Yjc0MTY4', 'title': 'Title 2', 'source': 'source2', 'random': '48392', 'last_update': '2024-01-17T08:05:15-00:00'})]

# Semantic reranking

In [15]:
# Semantic hybrid query for the top 3 matches with the same `source` filter;
# the result pairs each document with a score.
res = vector_store.semantic_hybrid_search_with_score(
    query="Test 1",
    k=3,
    filters="source eq 'source2'",
)
# Display the (document, score) pairs.
res

[(Document(page_content='Test 1', metadata={'id': 'ZWZiMWZlYzYtYzE3MS00OWJhLTk4YTItMDY2MGI0Yjc0MTY4', 'title': 'Title 2', 'source': 'source2', 'random': '48392', 'last_update': '2024-01-17T08:05:15-00:00', 'captions': {'text': 'Test 1. {"title": "Title 2", "source": "source2", "random": "48392", "last_update": "2024-01-17T08:05:15-00:00"}. Test 1.', 'highlights': '<em>Test 1.</em> {"title": "Title 2", "source": "source2", "random": "48392", "last_update": "2024-01-17T08:05:15-00:00"}.<em> Test 1.</em>'}, 'answers': ''}),
  0.03306011110544205),
 (Document(page_content='Test 1', metadata={'id': 'OTc4NzgwMzktOThlNi00ZmFiLThlN2UtOTYzZDIyNDBlMDc3', 'title': 'Title 2', 'source': 'source2', 'random': '48392', 'last_update': '2024-01-17T07:58:26-00:00', 'captions': {'text': 'Test 1. {"title": "Title 2", "source": "source2", "random": "48392", "last_update": "2024-01-17T07:58:26-00:00"}. Test 1.', 'highlights': '<em>Test 1.</em> {"title": "Title 2", "source": "source2", "random": "48392", "las