In [19]:
import os

import numpy as np
from dotenv import load_dotenv
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings
from numpy.linalg import norm


In [10]:
# Canonical questions that user queries are matched against.
# Order matters: classify_intent reports a match as the 1-based position of
# the question in this list, so reordering changes the returned index.
COMMON_QUESTIONS = [
    "What report fields are downstream of a specific column?",
    "What are the performance metrics of a specific model?",
    "What data is upstream to a specific report field?",
    "How many nodes upstream is the datasource for a specific report field?",
    "How was this report field calculated?",
    "What is the difference between the latest version and the previous version of a specific model?",
    "What are the top features of a specific model?",
    "Tell me about the latest version of a specific model?"
]


def generate_common_question_embeddings(questions):
    """Embed a batch of question strings and return them as NumPy arrays.

    Args:
        questions: The texts to embed; passed as-is to
            ``OpenAIEmbeddings().embed_documents``.

    Returns:
        A list with one ``np.ndarray`` per embedding, in the order the
        embedding client returned them.
    """
    embedder = OpenAIEmbeddings()
    raw_vectors = embedder.embed_documents(texts=questions)

    vectors = []
    for raw_vector in raw_vectors:
        vectors.append(np.array(raw_vector))
    return vectors

# Pre-compute embeddings for common questions once, so that classifying a
# query only needs to embed the incoming user text.
common_question_embeddings = generate_common_question_embeddings(COMMON_QUESTIONS)

In [11]:
# Inspect the pre-computed embeddings (one NumPy array per common question).
common_question_embeddings

[array([-0.01837511, -0.01328414, -0.00758394, ...,  0.01205167,
        -0.03055983,  0.003869  ]),
 array([-0.01341303,  0.00022778,  0.02107665, ...,  0.01375364,
        -0.02798412, -0.01538173]),
 array([-0.00711148, -0.01188923,  0.00638575, ...,  0.01363242,
        -0.02419114, -0.00708302]),
 array([ 0.0115591 ,  0.00412927,  0.00351131, ..., -0.013488  ,
        -0.00383994, -0.00863718]),
 array([ 0.00858671,  0.0032166 ,  0.00747412, ..., -0.00366881,
        -0.02642905, -0.01860681]),
 array([-0.00335794,  0.01126621,  0.01718941, ...,  0.00870939,
        -0.01758069,  0.00681707]),
 array([-0.0055728 ,  0.02153858,  0.0180218 , ...,  0.01309557,
        -0.02802478, -0.00745777]),
 array([-0.01295075,  0.01238533,  0.00974672, ...,  0.0021018 ,
        -0.01420274, -0.00153218])]

In [13]:
def get_user_query_embedding(user_input):
    """Embed a single user query string and return it as a NumPy array.

    Args:
        user_input: The text to embed; passed as-is to
            ``OpenAIEmbeddings().embed_query``.

    Returns:
        ``np.ndarray`` built from the embedding returned by the client.
    """
    # A fresh client is constructed on every call, exactly as before.
    return np.array(OpenAIEmbeddings().embed_query(user_input))

In [14]:
# Quick check that a single query string can be embedded.
get_user_query_embedding('hi')

array([-0.03086455, -0.02032544, -0.01948179, ..., -0.01266122,
       -0.00124033,  0.00678812])

In [20]:
from numpy.linalg import norm
def cosine_similarity(embedding_a, embedding_b):
    """Return the cosine similarity of two embedding vectors.

    Computed as ``dot(a, b) / (norm(a) * norm(b))``.

    Args:
        embedding_a: First vector (array-like accepted by ``np.dot`` / ``norm``).
        embedding_b: Second vector, same length as ``embedding_a``.

    Returns:
        The similarity score. If either vector has zero magnitude the
        similarity is undefined, and ``0.0`` is returned instead of ``nan``.
    """
    magnitude = norm(embedding_a) * norm(embedding_b)
    if magnitude == 0:
        # 0/0 would yield nan (with a RuntimeWarning); nan compares False
        # against everything, which makes max()/threshold checks unreliable.
        return 0.0
    return np.dot(embedding_a, embedding_b) / magnitude

In [39]:
def classify_intent(user_query, common_question_embeddings, threshold):
    """Classify a user query as matching a common question or not.

    The query is embedded and compared by cosine similarity against every
    pre-computed common-question embedding; the best-scoring question wins
    if its similarity reaches ``threshold``.

    Args:
        user_query: The raw user text to classify.
        common_question_embeddings: Iterable of embedding vectors, one per
            common question.
        threshold: Minimum similarity (inclusive) required for a match.

    Returns:
        ``['COMMON', n]`` where ``n`` is the 1-based position of the best
        matching embedding, or ``['UNCOMMON', 0]`` when no embedding reaches
        the threshold or there are no embeddings to compare against. On ties
        the earliest embedding wins.
    """
    # Materialise once so any iterable is accepted and emptiness can be checked.
    candidate_embeddings = list(common_question_embeddings)
    if not candidate_embeddings:
        # max() raises ValueError on an empty sequence; with nothing to match
        # against the query is by definition uncommon. Returning here also
        # avoids a pointless embedding request.
        return ['UNCOMMON', 0]

    user_query_embedding = get_user_query_embedding(user_query)
    similarities = [
        (cosine_similarity(user_query_embedding, question_embedding), position)
        for position, question_embedding in enumerate(candidate_embeddings, start=1)
    ]
    highest_similarity, best_position = max(similarities, key=lambda pair: pair[0])

    if highest_similarity >= threshold:
        return ['COMMON', best_position]
    return ['UNCOMMON', 0]

# Example: a model-performance question about the 'super model'.
classify_intent('tell me about the perfomance of the super model', common_question_embeddings, 0.85)


['UNCOMMON', 0]

In [40]:
# Example: a model-performance question about the 'customer model'.
classify_intent('tell me about the perfomance of the customer model', common_question_embeddings, 0.85)

['COMMON', 2]

In [41]:
# TODO: decide how to handle NONE cases: keep the current regex functionality,
# also apply a similarity threshold, or combine both approaches.

# Example: an off-topic query.
classify_intent('are zebras real?', common_question_embeddings, 0.85)

['UNCOMMON', 0]