In [56]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

import lancedb
from openai import OpenAI

import os
import shutil

# Remove whatever is left at the "data" path from an earlier run so the
# notebook starts without it.
folder_name = "data"
stale_store_exists = os.path.exists(folder_name)
if stale_store_exists:
    shutil.rmtree(folder_name)


In [57]:
from sentence_transformers import SentenceTransformer, util

# Score one query against a handful of profile passages with the
# "multi-qa-MiniLM-L6-cos-v1" model and print the dot-product scores.
model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")

query = "List all ML Engineers in New York"
passages = [
    "Full Name: Alex Nevsky. Occupation: Software Engineer. City: San Francisco. State: California. LinkedIn: @anevsky. Tags: AI, Mobile, Web, Java, Python, back-end",
    "Full Name: Alex Wong. Occupation: Software Engineer. City: Mountain View. State: California. Tags: Front-end.",
    "Full Name: Marry Fox. Occupation: ML Engineer. City: New York City. State: New York. LinkedIn: @marryfox. Tags: AI, ml, Python.",
    "Full Name: Ted Morron. Occupation: Barista. City: San Francisco. State: California. Tags: caffe, coffee.",
    "Full Name: Ashe Go. Occupation: CEO. City: San Francisco. State: California. Tags: AI, startups, ML.",
]

query_embedding = model.encode(query)
passage_embedding = model.encode(passages)

# One score per passage, in the same order as `passages`.
similarity = util.dot_score(query_embedding, passage_embedding)
print("Similarity:", similarity)



Similarity: tensor([[0.1873, 0.3293, 0.3355, 0.1441, 0.3172]])


In [58]:
# Switch to the general-purpose 'all-MiniLM-L6-v2' model; `model` is used by
# the following cells to embed both the stored sentences and the queries.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Sample profile records, one flattened "Field: value." sentence per person.
sentences = [
    "Full Name: Alex Nevsky. Occupation: Software Engineer. City: San Francisco. State: California. LinkedIn: @anevsky. Tags: AI, Mobile, Web, Java, Python, back-end",
    "Full Name: Alex Wong. Occupation: Software Engineer. City: Mountain View. State: California. Tags: Front-end.",
    "Full Name: Marry Fox. Occupation: ML Engineer. City: New York City. State: New York. LinkedIn: @marryfox. Tags: AI, ml, Python.",
    "Full Name: Ted Morron. Occupation: Barista. City: San Francisco. State: California. Tags: caffe, coffee.",
    "Full Name: Ashe Go. Occupation: CEO. City: San Francisco. State: California. Tags: AI, startups, ML."
    ]

# Encode all sentences in one call; the bare name below displays the result.
embeddings = model.encode(sentences)
embeddings

array([[-0.00840704, -0.01886728, -0.0131277 , ..., -0.02578857,
        -0.02379082,  0.04095635],
       [ 0.00242943, -0.01622415,  0.00548521, ...,  0.00610532,
        -0.05727729,  0.04155755],
       [-0.02421891, -0.04881497,  0.03955508, ...,  0.01458464,
         0.05470884, -0.00919613],
       [-0.01558675, -0.00480161,  0.05806651, ..., -0.02103744,
         0.01579561, -0.01941385],
       [ 0.01490096,  0.01539139, -0.02562573, ..., -0.05243963,
        -0.01386047,  0.0209235 ]], dtype=float32)

In [59]:
# One record per profile sentence: its embedding plus the original text.
data = [
    {"vector": model.encode(sentence), "sentence": sentence}
    for sentence in sentences
]

uri = "data/sample-lancedb"
db = lancedb.connect(uri)

# mode="overwrite" keeps this cell idempotent: re-running it replaces
# "my_table" instead of failing because the table already exists.
table = db.create_table("my_table", data=data, mode="overwrite")

In [60]:
# Embed the query, then fetch the 4 closest rows by cosine distance.
query_vector = model.encode("Software Engineer")
nearest = table.search(query_vector).metric("cosine").limit(4)
result = nearest.to_pandas()
result

Unnamed: 0,vector,sentence,_distance
0,"[0.0024293826, -0.016224163, 0.0054853167, 0.0...",Full Name: Alex Wong. Occupation: Software Eng...,0.508827
1,"[-0.0084070405, -0.018867284, -0.013127698, -0...",Full Name: Alex Nevsky. Occupation: Software E...,0.570755
2,"[-0.024218908, -0.048814967, 0.039555084, 0.04...",Full Name: Marry Fox. Occupation: ML Engineer....,0.628869
3,"[0.014900965, 0.015391387, -0.025625734, 0.055...",Full Name: Ashe Go. Occupation: CEO. City: San...,0.740284


In [61]:
from sentence_transformers import CrossEncoder

# Re-score the retrieved rows with a cross-encoder, which reads each
# (query, passage) pair jointly.
rerank_query = "Software Engineer"

# Use a dedicated name instead of rebinding `model`: the cells above call
# `model.encode(...)` and would break on re-run if `model` became a CrossEncoder.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-2-v2', max_length=512)

# Build one pair per retrieved row, however many rows `result` holds, rather
# than indexing a fixed number of positions.
pairs = [(rerank_query, passage) for passage in result['sentence']]
scores = cross_encoder.predict(pairs)
scores



array([  0.45121267,   0.8517368 , -10.188475  , -11.457465  ],
      dtype=float32)

In [62]:
# Natural-language query: embedded for the table search below and later
# interpolated into the LLM prompt.
question = "back-end engineers in San Francisco"

In [63]:
# Load the same 'all-MiniLM-L6-v2' encoder that produced the stored vectors,
# so the question is embedded with the matching model.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Top 3 rows by cosine distance to the question.
question_vector = model.encode(question)
result = table.search(question_vector).metric("cosine").limit(3).to_pandas()

# Flatten the retrieved sentences into a "*"-prefixed block, one per line.
formatted = "".join(f"*{line}\n" for line in result['sentence'])
print(formatted)

*Full Name: Alex Wong. Occupation: Software Engineer. City: Mountain View. State: California. Tags: Front-end.
*Full Name: Alex Nevsky. Occupation: Software Engineer. City: San Francisco. State: California. LinkedIn: @anevsky. Tags: AI, Mobile, Web, Java, Python, back-end
*Full Name: Ted Morron. Occupation: Barista. City: San Francisco. State: California. Tags: caffe, coffee.



In [64]:
# Prompt template for the chat model: embeds the retrieved sentences
# (`formatted`) as the provided information and the user's `question`,
# and instructs the model to answer from that information only.
PROMPT = f"""
USING PROVIDED DATA BELOW YOU **MUST** ANSWER USER'S QUESTION. 
**PROVIDED INFORMATION**:
{formatted}

**USER'S QUESTION**:
{question}

**ANSWER MUST BE CLEAR AND PROMPT AND **MUST** BE BASED ON PROVIDED INFORMATION ONLY **
"""

In [65]:
# Read the OpenAI API key from the local "api.key" file. The `with` block
# closes the file handle as soon as the key has been read.
with open("api.key", "r") as key_file:
    token = key_file.read().strip()

openai_client = OpenAI(api_key=token)

In [66]:
# Single user turn carrying the assembled prompt as one text part.
user_message = {
    "role": "user",
    "content": [{"type": "text", "text": PROMPT}],
}

# Send the request with temperature=0 and a 256-token cap on the reply.
response = openai_client.chat.completions.create(
    model="gpt-3.5-turbo-0125",
    messages=[user_message],
    temperature=0,
    max_tokens=256,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
)

# Show the text of the first returned choice.
answer = response.choices[0].message.content
print(answer)

Alex Nevsky is a back-end engineer in San Francisco, California.
