# openAI embeddings

### load data

In [13]:
# read data from bio_sample_problems.json and create text

import json

def read_json(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
    return data

file_path = 'bio_sample_problems.json'
data = read_json(file_path)

def convert_sections_to_text(sections):
    text = ""
    for section in sections:
        text += f"Section: {section['section']}, Title: {section['title']}\n"
    return text

texts = [convert_sections_to_text(i['sections'])+""+i['text'] for i in data['chapters']['133']['text']]

from problems import *
problems = Problems.read_json_file()
problem_texts = [p.to_text() for p in problems.filter()]
problem_texts = [t.replace("\n", " ") for t in problem_texts]

In [14]:
#Example section
texts[2].split(". ")

['Section: 4.1, Title: Basis of classification\nSection: 4.1.1, Title: Levels of Organisation\nThough all members of Animalia are multicellular, all of them do not exhibit the same pattern of organisation of cells',
 'For example, in sponges, the cells are arranged as loose cell aggregates, i.e., they exhibit cellular level of organisation',
 'Some division of labour (activities) occur among the cells',
 'In coelenterates, the arrangement of cells is more complex',
 'Here the cells performing the same function are arranged into tissues, hence is called tissue level of organisation',
 'A still higher level of organisation, i.e., organ level is exhibited by members of Platyhelminthes and other higher phyla where tissues are grouped together to form organs, each specialised for a particular function',
 'In animals like Annelids, Arthropods, Molluscs, Echinoderms and Chordates, organs have associated to form functional systems, each system concerned with a specific physiological function',

### CREATE EMBEDDING OF SECTIONS

In [8]:


from openai import OpenAI
from tqdm import tqdm
import numpy as np
import os

client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

#create embeddings for the text
embeddings = []
for text in tqdm(texts):
    text = text.replace("\n", " ")
    embedding = client.embeddings.create(input=[text], model='text-embedding-3-small').data[0].embedding
    embeddings.append(embedding)

embeddings = np.array(embeddings)


100%|██████████| 36/36 [00:15<00:00,  2.25it/s]


### Create embeddings of query

In [15]:
idx = 1
pembedding = client.embeddings.create(input=[problem_texts[idx]], model='text-embedding-3-small').data[0].embedding
pembedding = np.array([pembedding])
print(problem_texts[idx])

QUESTION: The most distinctive feature of echinoderms is the  CHOICES: A. Presence of endoskeleton of calcareous ossicles B. Absence of excretory system C. Presence of water vascular system D. All of the above 


### calculate similarity between query and sections

In [16]:


def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


# get cosine similarity between the embeddings
similarities = []
for embedding in embeddings:
    similarity = cosine_similarity(embedding, pembedding[0])
    similarities.append(similarity)

similarities = np.array(similarities)

#get the top similar text
top_indices = np.argsort(similarities)[::-1]
top_indices = top_indices[:3]

context = ""
for i in top_indices:
    print(texts[i])
    context += texts[i] + "\n\n"
    print(similarities[i])
    print("\n")


print(problem_texts[idx])

context_plus_problem = context + problem_texts[idx]

#top sections and similarity score

Section: 4.2, Title: Classification of animals
Section: 4.2.9, Title: Phylum – Echinodermata
These animals have an endoskeleton of calcareous ossicles and, hence, the name Echinodermata (Spiny bodied, Figure 4.14). All are marine with organ-system level of organisation. The adult echinoderms are radially symmetrical but larvae are bilaterally symmetrical. They are triploblastic and coelomate animals. Digestive system is complete with mouth on the lower (ventral) side and anus on the upper (dorsal) side. The most distinctive feature of echinoderms is the presence of water vascular system which helps in locomotion, capture and transport of food and respiration. An excretory system is absent. Sexes are separate. Reproduction is sexual. Fertilisation is usually external. Development is indirect with free-swimming larva. Examples: Asterias (Star fish), Echinus (Sea urchin), Antedon (Sea lily), Cucumaria (Sea cucumber) and Ophiura (Brittle star).
0.6323302612780192


Section: 4.3, Title: Sum

### call LLM with above sections and query

In [17]:
prompt = """
INSTRUCTIONS:

You are a biology teacher. Give the answer to the multiple choice question.

Now give citation of the exact sentences from the given information, used to support this answer.

Now explain the answer in detail.

"""

In [20]:
context_plus_problem + prompt

'Section: 4.2, Title: Classification of animals\nSection: 4.2.9, Title: Phylum – Echinodermata\nThese animals have an endoskeleton of calcareous ossicles and, hence, the name Echinodermata (Spiny bodied, Figure 4.14). All are marine with organ-system level of organisation. The adult echinoderms are radially symmetrical but larvae are bilaterally symmetrical. They are triploblastic and coelomate animals. Digestive system is complete with mouth on the lower (ventral) side and anus on the upper (dorsal) side. The most distinctive feature of echinoderms is the presence of water vascular system which helps in locomotion, capture and transport of food and respiration. An excretory system is absent. Sexes are separate. Reproduction is sexual. Fertilisation is usually external. Development is indirect with free-swimming larva. Examples: Asterias (Star fish), Echinus (Sea urchin), Antedon (Sea lily), Cucumaria (Sea cucumber) and Ophiura (Brittle star).\n\nSection: 4.3, Title: Summary\nAnnelids 

In [21]:

def call_data(prompt, QA):
    assert os.getenv("OPENAI_API_KEY") is not None
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    response = client.chat.completions.create(
      #model="gpt-3.5-turbo",
      model="gpt-4-turbo-preview",
      messages=[
        {"role": "system", "content": prompt},
        {"role": "user", "content": QA}
      ]
    )
    return response.choices[0].message.content
    

In [22]:
answer = call_data("", context_plus_problem + prompt)

In [27]:
for i in answer.split("\n"):
    for j in (i.split(". ")):
        print(j)

**Answer: C
Presence of water vascular system**

**Citation:** "The most distinctive feature of echinoderms is the presence of water vascular system which helps in locomotion, capture and transport of food and respiration."

**Explanation:** The presence of a water vascular system in echinoderms indeed sets them apart from other phyla in a significant way
Although echinoderms also have endoskeletons made of calcareous ossicles (choice A) and an absence of an excretory system (choice B), these traits are not as uniquely distinctive to echinoderms as the presence of the water vascular system
The water vascular system is a network of fluid-filled canals and tube feet (podia) that is not found in any other animal group
This system is critical for various physiological functions in echinoderms, including locomotion, feeding, and gas exchange
It is the primary means by which these marine animals interact with their environment, allowing them to move, capture food, and respire underwater
Whil