In [2]:

# Import necessary libraries
import os
import re
import requests
import pandas as pd
import numpy as np
from openai.embeddings_utils import get_embedding, cosine_similarity
import tiktoken

ModuleNotFoundError: No module named 'openai.embeddings_utils'

In [16]:


# Set environment variables (replace with your actual values)
os.environ["AZURE_OPENAI_API_KEY"] = "YOUR_API_KEY"
os.environ["AZURE_OPENAI_ENDPOINT"] = "YOUR_ENDPOINT"

API_KEY = os.getenv("AZURE_OPENAI_API_KEY") 
RESOURCE_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT") 

# Configure OpenAI client
import openai
openai.api_type = "azure"
openai.api_key = API_KEY
openai.api_base = RESOURCE_ENDPOINT
openai.api_version = "2022-12-01"

# Check deployed models
url = openai.api_base + "/openai/deployments?api-version=2022-12-01" 
r = requests.get(url, headers={"api-key": API_KEY})
print(r.text)

# Download and load the dataset
!curl "https://raw.githubusercontent.com/Azure-Samples/Azure-OpenAI-Docs-Samples/main/Samples/Tutorials/Embeddings/data/bill_sum_data.csv" --output bill_sum_data.csv
df = pd.read_csv('bill_sum_data.csv')
df_bills = df[['text', 'summary', 'title']]

# Data cleaning function
def normalize_text(s):
    s = re.sub(r'\s+', ' ', s).strip()
    s = re.sub(r". ,","",s)
    s = s.replace("..",".").replace(". .",".").replace("\n", "").strip()
    return s

df_bills['text'] = df_bills["text"].apply(normalize_text)

# Tokenization and filtering based on token limit
tokenizer = tiktoken.get_encoding("cl100k_base")
df_bills['n_tokens'] = df_bills["text"].apply(lambda x: len(tokenizer.encode(x)))
df_bills = df_bills[df_bills.n_tokens < 8192]

# Generate embeddings
def generate_embeddings(text, model="text-embedding-ada-002"):
    return openai.Embedding.create(input=[text], model=model).data[0].embedding

df_bills['ada_v2'] = df_bills["text"].apply(generate_embeddings)

# Search function
def search_docs(df, user_query, top_n=4):
    embedding = generate_embeddings(user_query)
    df["similarities"] = df.ada_v2.apply(lambda x: cosine_similarity(x, embedding))
    res = df.sort_values("similarities", ascending=False).head(top_n)
    return res

# Example search query
res = search_docs(df_bills, "Can I get information on cable company tax revenue?")
print(res)


In [17]:
os.environ["OPENAI_API_KEY"] = "720df3d64998425ab3a454902c77d9b1"

# Step 2: Initialize the OpenAI model
openai_model = OpenAI(model="text-embedding-ada-002")

document_loader = DirectoryLoader("/Users/bpulluta/elm/examples/adds/pdfs")


In [19]:
# Step 4: Create embeddings for your documents
embeddings = OpenAIEmbeddings(model=openai_model)




ValidationError: 1 validation error for OpenAIEmbeddings
model
  str type expected (type=type_error.str)

In [None]:
# Step 5: Load documents and create FAISS vector store
documents = document_loader.load()
vector_store = FAISS.from_documents(documents, embeddings)

# Step 6: Set up the prompt template
prompt_template = PromptTemplate(
    input_variables=["documents", "question"],
    template="Answer the question based on the following documents: {documents}\n\nQuestion: {question}\n\nAnswer:"
)

In [None]:

# Step 7: Create the QA system
qa_system = VectorDBQA(
    vector_store=vector_store,
    llm=openai_model,
    prompt_template=prompt_template
)

# Step 8: Define a function to query the chatbot
def query_chatbot(question):
    response = qa_system.ask(question)
    return response

# Example usage
question = "What is the capital of France?"
answer = query_chatbot(question)
print(answer)
