In [1]:
import openai
import os
import re
import requests
import sys
from num2words import num2words
import os
import pandas as pd
import numpy as np
from openai.embeddings_utils import get_embedding, cosine_similarity
import tiktoken

# Azure OpenAI credentials come from the environment so that no secret is
# stored in the notebook itself.
API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
RESOURCE_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
# Single source of truth for the REST API version used below.
API_VERSION = "2022-12-01"

# Fail early with a readable message instead of a `None + str` TypeError
# further down when one of the variables is missing.
if not API_KEY or not RESOURCE_ENDPOINT:
    raise RuntimeError(
        "Set the AZURE_OPENAI_API_KEY and AZURE_OPENAI_ENDPOINT environment "
        "variables before running this notebook."
    )

openai.api_type = "azure"
openai.api_key = API_KEY
openai.api_base = RESOURCE_ENDPOINT
openai.api_version = API_VERSION

# List the deployments available on the resource. A trailing slash on the
# endpoint is stripped so the URL never contains "//openai".
url = openai.api_base.rstrip("/") + "/openai/deployments?api-version=" + API_VERSION
r = requests.get(url, headers={"api-key": API_KEY}, timeout=30)
print(r.text)
# Print first (the body carries the service's error details), then stop the
# notebook if the request was rejected.
r.raise_for_status()

{
  "data": [
    {
      "scale_settings": {
        "scale_type": "standard"
      },
      "model": "text-search-curie-doc-001",
      "owner": "organization-owner",
      "id": "curie-search-doc",
      "status": "succeeded",
      "created_at": 1680407882,
      "updated_at": 1680407882,
      "object": "deployment"
    },
    {
      "scale_settings": {
        "scale_type": "standard"
      },
      "model": "text-search-curie-query-001",
      "owner": "organization-owner",
      "id": "curie-search-query",
      "status": "succeeded",
      "created_at": 1680407901,
      "updated_at": 1680407901,
      "object": "deployment"
    },
    {
      "scale_settings": {
        "scale_type": "standard"
      },
      "model": "text-davinci-002",
      "owner": "organization-owner",
      "id": "text-davinci-002",
      "status": "succeeded",
      "created_at": 1680407904,
      "updated_at": 1680407904,
      "object": "deployment"
    },
    {
      "scale_settings": {
        "sc

In [2]:
# This assumes that you have placed bill_sum_data.csv in the same directory
# you are running Jupyter Notebooks from.
df = pd.read_csv(os.path.join(os.getcwd(), 'bill_sum_data.csv'))

# Work on an independent copy of the three columns of interest: later cells add
# and overwrite columns on df_bills, which must not write through to (or be
# confused with) a view of `df`.
df_bills = df[['text', 'summary', 'title']].copy()
df_bills

# Later cells still assign columns onto a filtered frame, so the chained-assignment
# warning stays disabled.
# https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#evaluation-order-matters
pd.options.mode.chained_assignment = None

# s is input text
def normalize_text(s, sep_token = " \n "):
    """Clean up whitespace and stray punctuation in ``s``.

    Steps, in order:
      1. collapse every run of whitespace (newlines included) into a single
         space and trim both ends;
      2. delete each literal ``". ,"`` sequence;
      3. replace ``".."`` and ``". ."`` with a single ``"."``;
      4. trim both ends again (step 2 can leave a space at either end).

    Args:
        s: The text to clean.
        sep_token: Unused; kept so existing callers that pass it keep working.

    Returns:
        The cleaned string.
    """
    s = re.sub(r'\s+', ' ', s).strip()
    # Literal match: the previous pattern r". ," left the dot unescaped, so it
    # also swallowed any ordinary character that preceded " ," (e.g. the "3"
    # in "2013 ,").
    s = s.replace(". ,", "")
    s = s.replace("..", ".")
    s = s.replace(". .", ".")
    return s.strip()

# Clean every bill text before counting tokens.
df_bills['text'] = df_bills["text"].apply(normalize_text)

tokenizer = tiktoken.get_encoding("cl100k_base")
df_bills['n_tokens'] = df_bills["text"].apply(lambda x: len(tokenizer.encode(x)))

# Keep only documents strictly below this token count.
# NOTE(review): presumably the input limit of the embedding model - confirm.
MAX_INPUT_TOKENS = 8192
# .copy() so that columns added in later cells land on an independent frame
# rather than on a filtered slice.
df_bills = df_bills[df_bills.n_tokens < MAX_INPUT_TOKENS].copy()
df_bills

# Positional access: the label 0 is not guaranteed to survive the filter above,
# in which case `df_bills.text[0]` would raise KeyError.
sample_encode = tokenizer.encode(df_bills["text"].iloc[0])
decode = tokenizer.decode_tokens_bytes(sample_encode)
decode
len(decode)

1466

In [3]:
def _embed_with_ada(text):
    """Return the embedding of ``text`` from the 'text-embedding-ada-002' engine."""
    # engine should be set to the deployment name you chose when you deployed
    # the text-embedding-ada-002 (Version 2) model
    return get_embedding(text, engine='text-embedding-ada-002')


# One embedding per bill text, stored alongside the text.
df_bills['ada_v2'] = df_bills["text"].apply(_embed_with_ada)
df_bills

Unnamed: 0,text,summary,title,n_tokens,ada_v2
0,SECTION 1. SHORT TITLE. This Act may be cited ...,National Science Education Tax Incentive for B...,To amend the Internal Revenue Code of 1986 to ...,1466,"[0.01333628874272108, -0.02151912823319435, 0...."
1,SECTION 1. SHORT TITLE. This Act may be cited ...,Small Business Expansion and Hiring Act of 201...,To amend the Internal Revenue Code of 1986 to ...,1183,"[0.005016345530748367, -0.00569863710552454, 0..."
2,SECTION 1. RELEASE OF DOCUMENTS CAPTURED IN IR...,Requires the Director of National Intelligence...,A bill to require the Director of National Int...,937,"[0.012699966318905354, -0.01897779107093811, 0..."
3,SECTION 1. SHORT TITLE. This Act may be cited ...,National Cancer Act of 2003 - Amends the Publi...,A bill to improve data collection and dissemin...,3670,"[0.004736857954412699, -0.026448562741279602, ..."
4,SECTION 1. SHORT TITLE. This Act may be cited ...,Military Call-up Relief Act - Amends the Inter...,A bill to amend the Internal Revenue Code of 1...,1038,"[0.010082815773785114, -0.0007545037078671157,..."
5,SECTION 1. RELIQUIDATION OF CERTAIN ENTRIES PR...,Requires the Customs Service to reliquidate ce...,To provide for reliquidation of entries premat...,2026,"[0.012738252058625221, 0.004982588812708855, 0..."
6,SECTION 1. SHORT TITLE. This Act may be cited ...,Service Dogs for Veterans Act of 2009 - Direct...,A bill to require the Secretary of Veterans Af...,880,"[0.005205095745623112, -0.016558492556214333, ..."
7,SECTION 1. SHORT TITLE. This Act may be cited ...,Race to the Top Act of 2010 - Directs the Secr...,A bill to provide incentives for States and lo...,2815,"[0.024539386853575706, -0.016805868595838547, ..."
8,SECTION 1. SHORT TITLE. This Act may be cited ...,Troop Talent Act of 2013 - Directs the Secreta...,Troop Talent Act of 2013,2479,"[-0.005527574568986893, -0.014311426319181919,..."
9,SECTION 1. SHORT TITLE. This Act may be cited ...,Taxpayer's Right to View Act of 1993 - Amends ...,Taxpayer's Right to View Act of 1993,947,"[0.004519130103290081, -0.023599395528435707, ..."


In [56]:
import requests


model_id = "sentence-transformers/all-MiniLM-L6-v2"
# Never commit an access token to a notebook: read it from the HF_TOKEN
# environment variable instead. Any token that was previously hard-coded here
# must be treated as compromised and revoked.
hf_token = os.getenv("HF_TOKEN")

api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"
# Without a token no Authorization header is sent at all (rather than the
# meaningless "Bearer None"); the service may reject or rate-limit such requests.
headers = {"Authorization": f"Bearer {hf_token}"} if hf_token else {}

def query(texts, timeout=120):
    """POST ``texts`` to the feature-extraction endpoint and return the decoded JSON.

    Args:
        texts: The value sent as the request's "inputs" field (this notebook
            passes either a list of strings or a single string).
        timeout: Seconds to wait for the HTTP response before giving up. The
            request sets "wait_for_model", so the server may hold the
            connection for a while.

    Returns:
        The JSON-decoded response body.

    Raises:
        requests.HTTPError: If the service answers with an error status, so an
            error payload is never mistaken for embeddings by the caller.
    """
    response = requests.post(
        api_url,
        headers=headers,
        json={"inputs": texts, "options": {"wait_for_model": True}},
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()

texts = df_bills['text'].tolist()
output = query(texts)
# The column assignment below needs exactly one embedding per row; anything else
# (for instance an error object returned by the service) is reported here
# instead of being written into the frame.
if not isinstance(output, list) or len(output) != len(texts):
    raise RuntimeError(
        f"Unexpected response from the embedding endpoint: {str(output)[:200]}"
    )
df_bills['hg_emb'] = output

In [61]:

# rank the rows of a DataFrame by embedding similarity to a free-text query
# method = "oai" or "huggingface"
def search_docs(df, user_query, top_n=3, to_print=True, method = "oai"):
    """Return the ``top_n`` rows of ``df`` most similar to ``user_query``.

    Args:
        df: DataFrame holding the pre-computed embeddings: column ``hg_emb``
            for ``method="huggingface"``, column ``ada_v2`` otherwise.
        user_query: The question to embed and compare against each row.
        top_n: Number of best-matching rows to return.
        to_print: When true, the result is also shown with ``display``.
        method: ``"huggingface"`` embeds the query with ``query``; any other
            value embeds it with ``get_embedding`` ("text-embedding-ada-002").

    Returns:
        The ``top_n`` rows sorted by descending cosine similarity, without the
        embedding and score columns that belong to the other method.

    Side effects:
        Writes the score column (``similarities_hg`` or ``similarities``) onto
        ``df`` itself, as the original implementation did.
    """
    if method == "huggingface":
        embedding = query(user_query)
        emb_col, score_col = "hg_emb", "similarities_hg"
        other_method_cols = ['similarities', 'ada_v2']
    else:
        embedding = get_embedding(user_query, engine="text-embedding-ada-002")
        emb_col, score_col = "ada_v2", "similarities"
        other_method_cols = ['similarities_hg', 'hg_emb']

    df[score_col] = df[emb_col].apply(lambda x: cosine_similarity(x, embedding))

    # errors="ignore": the other method's columns only exist once that method
    # has been run on this frame; without it the first search on a fresh
    # kernel fails with KeyError.
    res = (
        df.sort_values(score_col, ascending=False)
        .head(top_n)
        .drop(columns=other_method_cols, errors="ignore")
    )

    if to_print:
        display(res)
    return res


In [67]:

# Question compared against both embedding back ends.
# Other questions that were tried:
#   "Can gene chips accurately predict whether or not breast cancer tumors would spread?"
#   "How many percentage of cancers are diagnosed at age 55 and above."
#   "What is the Lance Armstrong?"
#   "How many cancer survivors in the United States in 2001?"
#   "What is Gleevec?"
query_text = "How many cancer survivors in the United States in 2001?"

# Hugging Face embeddings first, then Azure OpenAI embeddings.
res_hg = search_docs(
    df_bills,
    query_text,
    top_n=3,
    method="huggingface",
)
res = search_docs(
    df_bills,
    query_text,
    top_n=3,
    method="oai",
)

Unnamed: 0,text,summary,title,n_tokens,hg_emb,similarities_hg
3,SECTION 1. SHORT TITLE. This Act may be cited ...,National Cancer Act of 2003 - Amends the Publi...,A bill to improve data collection and dissemin...,3670,"[0.1076497882604599, -0.031906891614198685, -0...",0.605667
14,SECTION 1. SHORT TITLE. This Act may be cited ...,Education and Training for Health Act of 2017 ...,Education and Training for Health Act of 2017,1648,"[0.05993207171559334, -0.01263124868273735, 0....",0.346887
6,SECTION 1. SHORT TITLE. This Act may be cited ...,Service Dogs for Veterans Act of 2009 - Direct...,A bill to require the Secretary of Veterans Af...,880,"[-0.038095247000455856, -0.02735641412436962, ...",0.280734


Unnamed: 0,text,summary,title,n_tokens,ada_v2,similarities
3,SECTION 1. SHORT TITLE. This Act may be cited ...,National Cancer Act of 2003 - Amends the Publi...,A bill to improve data collection and dissemin...,3670,"[0.004736857954412699, -0.026448562741279602, ...",0.838411
14,SECTION 1. SHORT TITLE. This Act may be cited ...,Education and Training for Health Act of 2017 ...,Education and Training for Health Act of 2017,1648,"[-0.009376125410199165, -0.0360078439116478, 0...",0.773267
8,SECTION 1. SHORT TITLE. This Act may be cited ...,Troop Talent Act of 2013 - Directs the Secreta...,Troop Talent Act of 2013,2479,"[-0.005527574568986893, -0.014311426319181919,...",0.752479
