In [1]:
import pandas as pd
import pickle

In [2]:
from sentence_transformers import SentenceTransformer

In [3]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance
from qdrant_client.models import PointStruct

In [16]:
# Checkpoint name handed to SentenceTransformer wherever an encoder is built.
model_name = "all-MiniLM-L6-v2"

In [4]:
# Instantiate the embedding model selected by `model_name`.
# NOTE(review): nothing visible in this notebook reads `model` afterwards;
# get_model() constructs its own instance — confirm whether this cell is needed.
model = SentenceTransformer(model_name)

In [5]:
# Pickle file holding the precomputed embeddings and their source text.
filename_pickle = 'BioMolecules.pkl'
# Name of the Qdrant collection the vectors are written to and searched in.
collection_name = "NCERT_Biology"

In [36]:
# Client for the Qdrant server used for every collection operation below.
# NOTE(review): host and port are hard-coded; consider moving them to configuration.
qdrant_client = QdrantClient(host='20.242.176.11', port=6333)

In [37]:
# Drop the collection named by `collection_name` (the recorded output is True).
# NOTE(review): the following cell calls recreate_collection on the same name,
# so this explicit delete may be redundant — confirm.
qdrant_client.delete_collection(collection_name=collection_name)

True

In [8]:
# (Re)create the target collection configured for cosine-distance search.
# NOTE(review): size=384 is presumably the embedding width produced by
# `model_name`; it must match the length of the stored vectors — confirm.
qdrant_client.recreate_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(
        size=384,
        distance=Distance.COSINE,
    ),
)

True

In [9]:
def get_encodings(path=None):
    """Load precomputed embeddings and their source text from a pickle file.

    Args:
        path: Pickle file to read. When omitted (None), the notebook-level
            `filename_pickle` is used.

    Returns:
        A tuple ``(embeddings_all, Lines)``: the "embeddings" and "text"
        entries of the unpickled object, each converted with ``.tolist()``
        (the object is expected to be a pandas DataFrame).

    Raises:
        ValueError: if no file name is available (empty string or None).
    """
    source = filename_pickle if path is None else path
    if not source:
        # An empty name used to skip the load and then fail with an
        # UnboundLocalError on the first use of df_embeddings.
        raise ValueError(
            "No pickle file configured: set filename_pickle or pass path"
        )
    # SECURITY: pickle.load can execute arbitrary code embedded in the file;
    # only open pickles that come from a trusted source.
    with open(source, 'rb') as f:
        df_embeddings = pickle.load(f)
    embeddings_all = df_embeddings["embeddings"].tolist()
    Lines = df_embeddings["text"].tolist()
    return embeddings_all, Lines

In [17]:
# Unpack the stored vectors and their source text chunks; both lists come from
# the same unpickled object, so position i in one corresponds to i in the other.
embeddings_all,Lines = get_encodings()

In [18]:
# Number of text chunks; used as the upper bound when building points for upsert.
total_len = len(Lines)

In [19]:
# Write one point per chunk: the list position doubles as the point id, the
# embedding is the vector, and the chunk text travels along as payload.
qdrant_client.upsert(
    collection_name=collection_name,
    points=[
        PointStruct(id=idx, vector=embeddings_all[idx], payload={"text": Lines[idx]})
        for idx in range(total_len)
    ],
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [20]:
# Question that is embedded for retrieval and later sent to the chat model.
user_input = 'What are the keywords for this chapter?'

In [21]:
def get_model():
    """Build and return a SentenceTransformer for the configured `model_name`.

    A new instance is constructed on every call.
    """
    return SentenceTransformer(model_name)

In [22]:
# The query text is the user's question, unchanged.
q_new = user_input
# Embed the question with a freshly built encoder so it can be compared
# against the stored chunk vectors.
q_new_embeddings = get_model().encode(q_new)

In [23]:
# Retrieve the stored chunks closest to the query embedding.
# The result-count argument of `search` is `limit`; the earlier `top=` keyword
# is not part of its signature in client versions that accept `vectors_config`.
search_result = qdrant_client.search(
    collection_name=collection_name,
    query_vector=q_new_embeddings,
    query_filter=None,  # no payload filtering for now
    limit=3,  # keep only the 3 best-scoring chunks
)

In [24]:
# Display the raw scored hits for inspection.
search_result

[ScoredPoint(id=18, version=1, score=0.23516712, payload={'text': 'the consumption of plant material by man annually. What a loss of vegetation!\n 11.Describe the important properties of enzymes.\n Rationalised 2023-24'}, vector=None),
 ScoredPoint(id=9, version=1, score=0.10278166, payload={'text': 'secondary structure  (Fig. 9.3 b) . In\n addition, the long protein chain is\n also folded upon itself like a hollow\n woolen ball, giving rise to the\n tertiary structure  (Fig. 9.3 c). This\n gives us a 3-dimensional view of a\n protein.  Tertiary  structure  is\n absolutely necessary for the many\n biological activities of proteins.\n Figure 9.3 Various levels of Pr otein Structur e(a) Primary\n (b) Secondary\n (d)     QuaternaryHydrogen\n Disulphide bondBeta–plated sheetPolypeptide\n TertiaryAlpha–Helix\n (c)\n Some proteins are an assembly of more than one polypeptide or\n subunits. The manner in which these individual folded polypeptides\n or subunits are arranged with respect to eac

In [25]:
# Pull the payload dict (the stored {"text": ...} mapping) out of each hit.
payloads = [hit.payload for hit in search_result]

In [26]:
# Display the extracted payloads for inspection.
payloads

[{'text': 'the consumption of plant material by man annually. What a loss of vegetation!\n 11.Describe the important properties of enzymes.\n Rationalised 2023-24'},
 {'text': 'secondary structure  (Fig. 9.3 b) . In\n addition, the long protein chain is\n also folded upon itself like a hollow\n woolen ball, giving rise to the\n tertiary structure  (Fig. 9.3 c). This\n gives us a 3-dimensional view of a\n protein.  Tertiary  structure  is\n absolutely necessary for the many\n biological activities of proteins.\n Figure 9.3 Various levels of Pr otein Structur e(a) Primary\n (b) Secondary\n (d)     QuaternaryHydrogen\n Disulphide bondBeta–plated sheetPolypeptide\n TertiaryAlpha–Helix\n (c)\n Some proteins are an assembly of more than one polypeptide or\n subunits. The manner in which these individual folded polypeptides\n or subunits are arranged with respect to each other (e.g. linear string\n of spheres, spheres arranged one upon each other in the form of a\n cube or plate etc.) is the 

In [27]:
# Show the text of the first returned hit.
payloads[0]["text"]

'the consumption of plant material by man annually. What a loss of vegetation!\n 11.Describe the important properties of enzymes.\n Rationalised 2023-24'

In [28]:
import os

import openai

# Azure OpenAI connection settings.
# The API key is read from the environment so that it never lives in the
# notebook or its saved outputs. Any key that was previously committed here
# should be treated as leaked and rotated.
key = os.environ.get("AZURE_OPENAI_API_KEY")
if not key:
    raise RuntimeError(
        "Set the AZURE_OPENAI_API_KEY environment variable before running this cell"
    )
location = 'eastus'
# The endpoint can be overridden via AZURE_OPENAI_ENDPOINT; the default is the
# resource this notebook was originally pointed at.
endpoint = os.environ.get(
    "AZURE_OPENAI_ENDPOINT", 'https://openaidemos007.openai.azure.com/'
)

openai.api_type = "azure"
openai.api_key = key
openai.api_base = endpoint

# Deployment names used by generate_answer and generate_answer_Chat respectively.
deployment_id='gta'
deployment_id_gpt4='gpt4'

In [29]:
def create_prompt(context,query):
    """Assemble the grounded-answer prompt.

    The result is the fixed instruction header, then `context`, a blank line,
    `query`, and a trailing newline.
    """
    instruction = "Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text and requires some latest information to be updated, print 'Sorry Not Sufficient context to answer query' \n"
    pieces = [instruction, context, "\n\n", query, "\n"]
    return "".join(pieces)

def generate_answer(prompt):
    """Send `prompt` to the completion deployment and return its answer.

    Uses the `deployment_id` engine and returns the text of the first choice
    with surrounding whitespace stripped.
    """
    request = dict(
        engine=deployment_id,
        prompt=prompt,
        temperature=0,
        max_tokens=256,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        stop=[' END'],
    )
    completion = openai.Completion.create(**request)
    first_choice = completion.choices[0]
    return first_choice.text.strip()

def generate_answer_Chat(conversation):
    """Send a list of chat messages to the chat deployment and return the reply.

    Uses the `deployment_id_gpt4` engine and returns the content of the first
    choice's message with surrounding whitespace stripped.
    """
    # Side effect: the module-wide API version is (re)set on every call.
    openai.api_version = "2023-03-15-preview"
    request = dict(
        engine=deployment_id_gpt4,
        messages=conversation,
        temperature=0,
        max_tokens=256,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        stop=[' END'],
    )
    chat = openai.ChatCompletion.create(**request)
    message = chat['choices'][0]['message']
    return message['content'].strip()



In [30]:
# Collect the text of every retrieved chunk and glue the chunks together,
# separated by blank lines, to form the context passed to the prompt.
payloads = [point.payload for point in search_result]
metadata = [payload["text"] for payload in payloads]
df = pd.DataFrame({"content": metadata})
context = "\n\n".join(df["content"])

In [31]:
# Keep only the first 7000 characters of the context.
# NOTE(review): presumably a guard against overlong prompts — confirm the limit.
context = context[:7000]

In [32]:
# Build the chat transcript and ask the chat deployment for an answer.
# NOTE(review): the context-bearing prompt is sent under the "assistant" role,
# and the question appears twice (inside `prompt` and again as the user
# message) — confirm that both are intended.
prompt = create_prompt(context, q_new)
conversation = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "assistant", "content": prompt},
    {"role": "user", "content": user_input},
]
reply = generate_answer_Chat(conversation)

In [33]:
# Display the model's answer.
reply

'The keywords for this chapter are:\n\n1. Biomolecules\n2. Macromolecules\n3. Amino acids\n4. Proteins\n5. Enzymes\n6. Ribozymes\n7. Tertiary structure\n8. Quaternary structure\n9. Active site\n10. Lipids\n11. Fatty acids\n12. Polysaccharides\n13. Nucleic acids\n14. Primary metabolites\n15. Secondary metabolites'