In [30]:
FILE_NAME = "C:/Ambarish/sustainability_hackathon/source/sustainability_hub/NCERT/CHAP04AnimalKingdom.pdf"

# Model Settings
MODEL_NAME="all-MiniLM-L6-v2"

# Pinecone Settings
PINECONE_API_KEY="e72e0b7c-e243-41eb-b497-3fc23b3105fb"
PINECONE_INDEX_NAME="ncert"
PINECONE_REGION="us-east1-gcp"

In [31]:
from PyPDF2 import PdfReader
import pinecone
from sentence_transformers import SentenceTransformer

In [32]:
def get_pdf_data(file_path, num_pages = 1):
    reader = PdfReader(file_path)
    full_doc_text = ""

    try:
        for page in range(num_pages):
            current_page = reader.pages[page]
            text = current_page.extract_text()
            full_doc_text += text
    except:
        print("Error reading file")
    finally:
        return full_doc_text

In [33]:
def get_chunks(fulltext:str,chunk_length =500) -> list:
    text = fulltext

    chunks = []
    while len(text) > chunk_length:
        last_period_index = text[:chunk_length].rfind('.')
        if last_period_index == -1:
            last_period_index = chunk_length
        chunks.append(text[:last_period_index])
        text = text[last_period_index+1:]
    chunks.append(text)

    return chunks

In [34]:
full_doc_text = get_pdf_data(FILE_NAME)

In [35]:
Lines = get_chunks(full_doc_text)

In [36]:
len(Lines)

4

In [37]:
Lines[0]

'ANIMAL KINGDOM 3737\nWhen you look around, you will observe different animals with different\nstructures and forms.  As over a million species of animals have been\ndescribed till now, the need for classification becomes all the more\nimportant. The classification also helps in assigning a systematic position\nto newly described species.\n4'

# PINECONE 

In [38]:
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_REGION)
index = pinecone.Index(PINECONE_INDEX_NAME)

In [39]:
model = SentenceTransformer(MODEL_NAME)

In [40]:
def addData(corpusData):
    id  = index.describe_index_stats()['total_vector_count']
    for i in range(len(corpusData)):
        chunk=corpusData[i]
        chunkInfo=(str(id+i),
                model.encode(chunk).tolist(),
                {'sentence': chunk})
        index.upsert(vectors=[chunkInfo])

In [41]:
addData(Lines)

In [62]:
query = "What is BioMolecule?"

xq = model.encode([query]).tolist()

In [63]:
index

<pinecone.index.Index at 0x20ca8320f40>

In [64]:
xc = index.query(xq, top_k=3,
                 include_metadata=True)

In [65]:
xc

{'matches': [{'id': '22',
              'metadata': {'sentence': 'BIOMOLECULESCHAPTER   9\n'
                                       '9.1 How to Analyse\n'
                                       'Chemical\n'
                                       'Composition?\n'
                                       '9.2 Primary and\n'
                                       'Secondary\n'
                                       'Metabolites\n'
                                       '9.3 Biomacromolecules\n'
                                       '9.4 Proteins\n'
                                       '9.5 Polysaccharides\n'
                                       '9.6 Nucleic Acids\n'
                                       '9.7 Structure of\n'
                                       'Proteins\n'
                                       '9.8 Enzymes\n'
                                       'Rationalised 2023-24\n'},
              'score': 0.455934435,
              'values': []},
             {'id': '18',
 

In [66]:
contexts = [
        x['metadata']['sentence'] for x in xc['matches']
    ]

In [67]:
contexts

['BIOMOLECULESCHAPTER   9\n9.1 How to Analyse\nChemical\nComposition?\n9.2 Primary and\nSecondary\nMetabolites\n9.3 Biomacromolecules\n9.4 Proteins\n9.5 Polysaccharides\n9.6 Nucleic Acids\n9.7 Structure of\nProteins\n9.8 Enzymes\nRationalised 2023-24\n',
 '104 BIOLOGY\nThere is a wide diversity in living organisms in our biosphere. Now a\nquestion that arises in our minds is: Are all living organisms made of the\nsame chemicals, i.e., elements and compounds? Y ou have lear nt in\nchemistry how elemental analysis is performed. If we perform such an\nanalysis on a plant tissue, animal tissue or a microbial paste, we obtain a\nlist of elements like carbon, hydrogen, oxygen and several others and\ntheir respective content per unit mass of a living tissue',
 ' Here\nthe cells performing the same function are arranged into tissues, hence is\ncalled tissue level  of organisation. A still higher level of organisation, i.e.,\norgan level  is exhibited by members of Platyhelminthes and other hig

In [68]:
xc = index.query(xq, top_k=3,
                 include_metadata=True)
for context in xc['matches']:
    print(context['metadata']['sentence'], end="\n---\n")

BIOMOLECULESCHAPTER   9
9.1 How to Analyse
Chemical
Composition?
9.2 Primary and
Secondary
Metabolites
9.3 Biomacromolecules
9.4 Proteins
9.5 Polysaccharides
9.6 Nucleic Acids
9.7 Structure of
Proteins
9.8 Enzymes
Rationalised 2023-24

---
104 BIOLOGY
There is a wide diversity in living organisms in our biosphere. Now a
question that arises in our minds is: Are all living organisms made of the
same chemicals, i.e., elements and compounds? Y ou have lear nt in
chemistry how elemental analysis is performed. If we perform such an
analysis on a plant tissue, animal tissue or a microbial paste, we obtain a
list of elements like carbon, hydrogen, oxygen and several others and
their respective content per unit mass of a living tissue
---
 Here
the cells performing the same function are arranged into tissues, hence is
called tissue level  of organisation. A still higher level of organisation, i.e.,
organ level  is exhibited by members of Platyhelminthes and other higher
phyla where tissues are

In [69]:
import openai
key = 'd6bcc40e68fc4119abcd43b4661dc8e3'
location = 'eastus'
endpoint = 'https://openaidemos007.openai.azure.com/'
openai.api_type = "azure"
openai.api_key = key
openai.api_base = endpoint
deployment_id_gpt4='gpt4'
openai.api_key = key

def create_prompt(context,query):
    header = "Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text and requires some latest information to be updated, print 'Sorry Not Sufficient context to answer query' \n"
    return header + context + "\n\n" + query + "\n"


def generate_answer(conversation):
    openai.api_version = "2023-03-15-preview"
    response = openai.ChatCompletion.create(
    engine=deployment_id_gpt4,
    messages=conversation,
    temperature=0,
    max_tokens=1000,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
    stop = [' END']
    )
    return (response['choices'][0]['message']['content']).strip()


In [70]:
conversation=[{"role": "system", "content": "Assistant is a large language model trained by OpenAI."}]
context= "\n\n".join(contexts)
prompt = create_prompt(context,query)            
conversation.append({"role": "assistant", "content": prompt})
conversation.append({"role": "user", "content": query})
reply = generate_answer(conversation)

In [71]:
context

'BIOMOLECULESCHAPTER   9\n9.1 How to Analyse\nChemical\nComposition?\n9.2 Primary and\nSecondary\nMetabolites\n9.3 Biomacromolecules\n9.4 Proteins\n9.5 Polysaccharides\n9.6 Nucleic Acids\n9.7 Structure of\nProteins\n9.8 Enzymes\nRationalised 2023-24\n\n\n104 BIOLOGY\nThere is a wide diversity in living organisms in our biosphere. Now a\nquestion that arises in our minds is: Are all living organisms made of the\nsame chemicals, i.e., elements and compounds? Y ou have lear nt in\nchemistry how elemental analysis is performed. If we perform such an\nanalysis on a plant tissue, animal tissue or a microbial paste, we obtain a\nlist of elements like carbon, hydrogen, oxygen and several others and\ntheir respective content per unit mass of a living tissue\n\n Here\nthe cells performing the same function are arranged into tissues, hence is\ncalled tissue level  of organisation. A still higher level of organisation, i.e.,\norgan level  is exhibited by members of Platyhelminthes and other higher

In [72]:
reply

'A biomolecule is a molecule that is produced by living organisms and plays a significant role in the biological processes and functions of these organisms. Biomolecules include a wide range of molecules, such as proteins, nucleic acids, carbohydrates, and lipids. They are essential for the structure, function, and regulation of cells, tissues, and organs, and are involved in various processes like metabolism, growth, and reproduction.'