In [1]:
from dotenv.main import load_dotenv
import pandas as pd
from datasets import load_dataset
import os
import openai
from tqdm.auto import tqdm
import pinecone


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
load_dotenv()
OPENAI_KEY = os.environ['OPENAI_KEY']
PINECONE_KEY = os.environ['PINECONE_KEY']



In [3]:
openai.api_key = OPENAI_KEY

In [4]:
openai.Engine.list()

<OpenAIObject list at 0x2b51a13f1f0> JSON: {
  "object": "list",
  "data": [
    {
      "object": "engine",
      "id": "whisper-1",
      "ready": true,
      "owner": "openai-internal",
      "permissions": null,
      "created": null
    },
    {
      "object": "engine",
      "id": "babbage",
      "ready": true,
      "owner": "openai",
      "permissions": null,
      "created": null
    },
    {
      "object": "engine",
      "id": "davinci",
      "ready": true,
      "owner": "openai",
      "permissions": null,
      "created": null
    },
    {
      "object": "engine",
      "id": "text-davinci-edit-001",
      "ready": true,
      "owner": "openai",
      "permissions": null,
      "created": null
    },
    {
      "object": "engine",
      "id": "babbage-code-search-code",
      "ready": true,
      "owner": "openai-dev",
      "permissions": null,
      "created": null
    },
    {
      "object": "engine",
      "id": "text-similarity-babbage-001",
      "ready": tr

Using the OPENAI ada-002 model for text embeddings

In [5]:
MODEL = "text-embedding-ada-002"

res = openai.Embedding.create(
    input=[
        "Sample document text goes here",
        "there will be several phrases in each batch"
    ], engine=MODEL
)
res

<OpenAIObject list at 0x2b51995bc40> JSON: {
  "object": "list",
  "data": [
    {
      "object": "embedding",
      "index": 0,
      "embedding": [
        -0.0030597876757383347,
        0.011693241074681282,
        -0.005041236057877541,
        -0.027201106771826744,
        -0.016350319609045982,
        0.03235017880797386,
        -0.016161609441041946,
        -0.0010235798545181751,
        -0.025812745094299316,
        -0.006638525985181332,
        0.020164944231510162,
        0.016619903966784477,
        -0.009172623045742512,
        0.023413440212607384,
        -0.0101094301789999,
        0.01342532318085432,
        0.025246618315577507,
        -0.016849050298333168,
        0.01208413951098919,
        -0.016350319609045982,
        -0.004229111596941948,
        -0.006466665770858526,
        -0.004336945712566376,
        0.020771509036421776,
        -0.01053402666002512,
        -0.0037000514566898346,
        0.013667949475347996,
        -0.02635191567242

In [6]:
print(len(res['data'][0]['embedding']))
print(len(res['data'][1]['embedding']))

1536
1536


Dataset Conversion to Embeddings 

In [16]:
data_files = "hospital_dataset (1).json"
hospital_dataset = load_dataset("json", data_files=data_files,split="train[:750]")
hospital_dataset


Found cached dataset json (C:/Users/nafis/.cache/huggingface/datasets/json/default-908a60e05b665f6e/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


Dataset({
    features: ['hospitalAddress', 'hospitalName', 'cityName'],
    num_rows: 750
})

In [25]:
samplestring = hospital_dataset['hospitalName'][0]+" "+hospital_dataset['hospitalAddress'][0]
samplestring

'Society for Tripura Medical College & Dr. B.R.Ambedkar Memorial Teaching Hospital Society for Tripura Medical College & Dr. B.R.Ambedkar Memorial Teaching Hospital, HAPANIA, AGARTALA-799014 WEST TRIPURA'

In [26]:
sample = openai.Embedding.create(
    input=[
        samplestring
    ], engine=MODEL
)
len(sample['data'][0]['embedding'])

1536

In [27]:
sample

<OpenAIObject list at 0x2b51cbd7010> JSON: {
  "object": "list",
  "data": [
    {
      "object": "embedding",
      "index": 0,
      "embedding": [
        0.0013726215111091733,
        0.00694105913862586,
        0.005778566934168339,
        -0.033891890197992325,
        0.007246085908263922,
        0.013746550306677818,
        -0.015861405059695244,
        0.014790420420467854,
        0.0009049134678207338,
        0.009923545643687248,
        -0.024212365970015526,
        0.014017685316503048,
        0.005463372450321913,
        -0.007259642705321312,
        -0.0067038158886134624,
        -0.006866496987640858,
        0.019006570801138878,
        -0.011875717900693417,
        0.0039551835507154465,
        -0.01231631264090538,
        -0.011211437173187733,
        0.01836940459907055,
        -0.006280167028307915,
        0.05143433064222336,
        -0.00013440252223517746,
        0.0168917179107666,
        0.010147231630980968,
        -0.03134321793913841

Indexing

In [29]:
index_name = 'semantic-search'

# initializing connection to pinecone 
pinecone.init(
    api_key=PINECONE_KEY,
    environment="us-west4-gcp-free"  
)
# check if index already exists (only create index if not)
if index_name not in pinecone.list_indexes():
    pinecone.create_index(index_name, dimension=1536) # 1536 is the output dimension of ada model
# connect to index
index = pinecone.Index(index_name)

750

In [39]:
hospitalInfo_batch = hospital_dataset['hospitalName'][j] + " " + hospital_dataset['hospitalAddress'][j]
hospitalInfo_batch


'Krishna Shalby Hospital 319, Green City, Ghuma, Via Bopal, Ahmedabad 380058, GujaratPh. 02717 230877corpdev3.krishna@shalby.orgcorpbilling1.krishna@shalby.org'

In [53]:
batch_size = 30
for i in range(0, len(hospital_dataset['hospitalName']), batch_size):
    hospitalInfo_batch = []
    ids_batch = []
    for j in range(i,i+batch_size):
        hospitalInfo_batch.append(hospital_dataset['hospitalName'][j] + " " + hospital_dataset['hospitalAddress'][j])
        ids_batch.append(str(j+1))
    hospitalCity_batch = hospital_dataset['cityName'][i: i+batch_size]
    
    # create embeddings
    res = openai.Embedding.create(input=hospitalInfo_batch, engine=MODEL)
    embeds = [record['embedding'] for record in res['data']]
    # prep metadata and upsert batch
    meta = [{'hospitalInfo': hospital_Info, 'cityName': cityName} for hospital_Info, cityName in zip(hospitalInfo_batch, hospitalCity_batch)]
    
    to_upsert = zip(ids_batch, embeds, meta)
    # upsert to Pinecone
    index.upsert(vectors=list(to_upsert))

 
   

    

Sending out Query

In [56]:
query = "Ahmedabad Eye hospital  "

vector = openai.Embedding.create(input=query, engine=MODEL)['data'][0]['embedding']

In [57]:
res = index.query([vector], top_k=5, include_metadata=True)
res

{'matches': [{'id': '42',
              'metadata': {'cityName': 'Ahmedabad',
                           'hospitalInfo': 'Eye Care Hospital Polytechnic '
                                           'Ambawadi Ahmedabad'},
              'score': 0.925773919,
              'values': []},
             {'id': '38',
              'metadata': {'cityName': 'Ahmedabad',
                           'hospitalInfo': 'Sanjeevani Eye Hospital 203, '
                                           'Satved Complex, Near Valus '
                                           'Hospital, Near Golden Triangle, '
                                           'Naranpura, Ahmedabad -380013 '},
              'score': 0.911457777,
              'values': []},
             {'id': '24',
              'metadata': {'cityName': 'Ahmedabad',
                           'hospitalInfo': 'Sanjeevani Eye Hospital & Phaco '
                                           'centre 1st floor, rudra '
                                           