In [2]:
"""
developed from https://learn.microsoft.com/en-us/azure/ai-foundry/how-to/develop/sdk-overview?pivots=programming-language-python
"""
from dotenv import load_dotenv
import os
# import asyncio
# from azure.ai.projects.aio import AIProjectClient
from azure.ai.projects import AIProjectClient
from azure.identity import DefaultAzureCredential

load_dotenv()  # load variables from a local .env file into the process environment

# Global configuration, read from the environment; os.getenv returns None for unset names.
project_endpoint = os.getenv("PROJECT_ENDPOINT")
api_version = os.getenv("API_VERSION")
api_key = os.getenv("API_KEY")
model_deployment_name = os.getenv("MODEL_DEPLOYMENT_NAME")

# Fail fast with an actionable message instead of handing None to the client.
if not project_endpoint:
    raise ValueError(
        "PROJECT_ENDPOINT is not set; define it in the environment or in the .env file."
    )

try:
    project_client = AIProjectClient(
        endpoint=project_endpoint,
        credential=DefaultAzureCredential(),
    )
    # NOTE(review): this only shows that the client object was constructed;
    # DefaultAzureCredential presumably defers token acquisition until the first
    # service request -- confirm against the azure-identity documentation.
    print("Client successfully authenticated!")
except Exception as ex:
    print(f"Authentication failed: {ex}")
    # Re-raise so the notebook stops here: later cells use project_client and
    # would otherwise fail with a confusing NameError.
    raise

Client successfully authenticated!


In [3]:
"""
from https://learn.microsoft.com/en-us/python/api/overview/azure/ai-projects-readme?view=azure-python-preview
"""
# After successful authentication, attempt to list models
try:
    # Each entry pairs a heading with a zero-argument fetcher. The fetchers are
    # lambdas so that no project attribute is touched until its heading has
    # been printed.
    listings = (
        ("List models", lambda: project_client.deployments.list()),
        ("List all connections:", lambda: project_client.connections.list()),
        ("List latest versions of all Datasets:", lambda: project_client.datasets.list()),
    )
    for heading, fetch in listings:
        print(heading)
        for item in fetch():
            print(item)
except Exception as ex:
    # Report any failure raised while talking to the project.
    print(f"Failed to interact with project: {ex}")

List models
{'name': 'gpt-4o-mini', 'type': 'ModelDeployment', 'modelName': 'gpt-4o-mini', 'modelVersion': '2024-07-18', 'modelPublisher': 'OpenAI', 'capabilities': {'chat_completion': 'true'}, 'sku': {'name': 'GlobalStandard', 'capacity': 100}}
{'name': 'text-embedding-ada-002', 'type': 'ModelDeployment', 'modelName': 'text-embedding-ada-002', 'modelVersion': '2', 'modelPublisher': 'OpenAI', 'capabilities': {'embeddings': 'true'}, 'sku': {'name': 'GlobalStandard', 'capacity': 150}}
List all connections:
List latest versions of all Datasets:


In [4]:
"""
the program that follows is from https://learn.microsoft.com/en-us/azure/ai-services/openai/tutorials/embeddings?tabs=python-new%2Ccommand-line&pivots=programming-language-python
"""
import os
import re
import requests
import sys
from num2words import num2words
import os
import pandas as pd
import numpy as np
import tiktoken
from openai import AzureOpenAI

In [6]:
# !curl "https://raw.githubusercontent.com/Azure-Samples/Azure-OpenAI-Docs-Samples/main/Samples/Tutorials/Embeddings/data/bill_sum_data.csv" --output ../data/bill_sum_data.csv
# Read the bill summaries into a DataFrame. The path is resolved relative to the
# current working directory (expected to be eus2Foundry/POC), so the CSV must be
# at ../data/example_data/bill_sum_data.csv -- note that this differs from the
# --output location in the curl command above.
df = pd.read_csv(os.path.join(os.getcwd(), '../data/example_data/bill_sum_data.csv'))
# .copy() gives df_bills its own data, so the column assignments made in later
# cells modify df_bills itself rather than a slice of df.
df_bills = df[['text', 'summary', 'title']].copy()  # keep only the relevant columns
print(df_bills)

                                                 text  \
0   SECTION 1. SHORT TITLE.\n\n    This Act may be...   
1   SECTION 1. SHORT TITLE.\n\n    This Act may be...   
2   SECTION 1. RELEASE OF DOCUMENTS CAPTURED IN IR...   
3   SECTION 1. SHORT TITLE.\n\n    This Act may be...   
4   SECTION 1. SHORT TITLE.\n\n    This Act may be...   
5   SECTION 1. RELIQUIDATION OF CERTAIN ENTRIES PR...   
6   SECTION 1. SHORT TITLE.\n\n    This Act may be...   
7   SECTION 1. SHORT TITLE.\n\n    This Act may be...   
8   SECTION 1. SHORT TITLE.\n\n    This Act may be...   
9   SECTION 1. SHORT TITLE.\n\n    This Act may be...   
10  SECTION 1. SHORT TITLE.\n\n    This Act may be...   
11  SECTION 1. SHORT TITLE.\n\n    This Act may be...   
12  SECTION 1. FINDINGS.\n\n    The Congress finds...   
13  SECTION 1. SHORT TITLE.\n\n    This Act may be...   
14  SECTION 1. SHORT TITLE.\n\n    This Act may be...   
15  SECTION 1. SHORT TITLE.\n\n    This Act may be...   
16  SECTION 1. SHORT TITLE.\n\n

In [7]:
# clean the data
# Turn off pandas' chained-assignment check for this session; background:
# https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#evaluation-order-matters
pd.set_option("mode.chained_assignment", None)

# s is input text
def normalize_text(s, sep_token = " \n "):
    """Collapse whitespace and strip stray punctuation artifacts from ``s``.

    Args:
        s: Input text.
        sep_token: Unused; retained so existing callers keep working.

    Returns:
        The cleaned string with leading and trailing whitespace removed.
    """
    # Collapse every whitespace run (spaces, tabs, newlines) to a single space.
    # No newline survives this step, so no separate "\n" removal is needed.
    s = re.sub(r'\s+', ' ', s).strip()
    # Drop the literal ". ," artifact. The dot is escaped so that it matches
    # only a period; unescaped it matched any character and deleted the last
    # letter of a word that preceded " ,".
    s = re.sub(r"\. ,", "", s)
    # Squash doubled periods, with or without a space between them.
    s = s.replace("..", ".")
    s = s.replace(". .", ".")
    return s.strip()

# Clean every bill text in place; normalize_text is passed directly as the
# per-element callable.
df_bills['text'] = df_bills["text"].apply(normalize_text)

In [8]:
# Keep only the rows whose text fits within the token limit.
MAX_TOKENS = 8192  # rows with this many tokens or more are dropped
tokenizer = tiktoken.get_encoding("cl100k_base")
df_bills['n_tokens'] = df_bills["text"].apply(lambda x: len(tokenizer.encode(x)))
df_bills = df_bills[df_bills.n_tokens < MAX_TOKENS]
print(len(df_bills))

# Run a sample decoding on the first remaining row. .iloc[0] selects by
# position; the label lookup text[0] raises KeyError when the row labelled 0
# has been filtered out above.
sample_encode = tokenizer.encode(df_bills.text.iloc[0])
decode = tokenizer.decode_tokens_bytes(sample_encode)
print(len(decode))  # should match the first value in the n_tokens column

20
1466


In [None]:
# Deployment name of the embedding model, read from the environment (None when unset).
embedding_model_name = os.getenv("EMBEDDING_MODEL_NAME")

def generate_embeddings(text, model=embedding_model_name):
    """Return the embedding vector for ``text``.

    Args:
        text: The string to embed.
        model: Embedding deployment name. Defaults to the value of the
            EMBEDDING_MODEL_NAME environment variable as read when this
            function was defined.

    Returns:
        The ``embedding`` field of the first (and only) item in the response.
    """
    # NOTE(review): a client is built on every call; consider creating it once
    # if this turns out to be slow when embedding many rows.
    # NOTE(review): api_version is the global read from API_VERSION; passing
    # None is assumed to be the same as omitting the argument -- confirm
    # against the installed azure-ai-projects version.
    embeddings_client = project_client.inference.get_azure_openai_client(
        api_version=api_version
    )
    # The OpenAI client exposes embeddings through embeddings.create(); it has
    # no embed() method. The caller's ``model`` is forwarded instead of always
    # using the module-level default.
    response = embeddings_client.embeddings.create(input=[text], model=model)
    return response.data[0].embedding

# Embed every bill text with the configured deployment (EMBEDDING_MODEL_NAME)
# instead of a hard-coded name, so documents are embedded with the same model
# that generate_embeddings falls back to when no model is passed.
df_bills['ada_v2'] = df_bills["text"].apply(
    lambda x: generate_embeddings(x, model=embedding_model_name)
)
print(df_bills['ada_v2'])

AttributeError: 'InferenceOperations' object has no attribute 'get_embeddings_client'

In [None]:
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Computed as the dot product of the two vectors divided by the product of
    their norms.
    """
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return dot_product / norm_product

def search_docs(df, user_query, top_n=4, to_print=True):
    """Rank the rows of ``df`` by similarity to ``user_query``.

    Args:
        df: DataFrame with an ``ada_v2`` column of embedding vectors. A
            ``similarities`` column is written to it as a side effect.
        user_query: Text that is embedded and compared against every row.
        top_n: Number of best-matching rows to return.
        to_print: When true, the result is rendered with ``display``.

    Returns:
        The ``top_n`` rows of ``df`` with the highest similarity, best first.
    """
    query_vector = generate_embeddings(user_query)
    df["similarities"] = df.ada_v2.apply(
        lambda doc_vector: cosine_similarity(doc_vector, query_vector)
    )

    ranked = df.sort_values("similarities", ascending=False)
    res = ranked.head(top_n)
    if to_print:
        display(res)
    return res

# Natural-language question to match against the embedded bills.
user_query = "Can I get information on cable company tax revenue?"
res = search_docs(df=df_bills, user_query=user_query, top_n=4)

In [None]:
# Show the summary of the best match. res is sorted by similarity, so position
# 0 is the top hit; the hard-coded label 9 raised KeyError whenever row 9 was
# not among the returned rows.
# NOTE(review): assumes the intent was "the top-ranked result" -- confirm.
res["summary"].iloc[0]