## Pip Install Dependencies

In [None]:
!pip install groundx-python-sdk 

In [2]:
import os
from groundx import Groundx
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the process
# environment. Without this call the imported `load_dotenv` is unused and keys
# stored only in .env are never visible to os.getenv below. Variables that are
# already set in the environment are not overridden.
load_dotenv()

# Access to EyeLevel API Key
eye_level_api_key = os.getenv("EYE_LEVEL_API_KEY")

# GroundX client used by every cell below.
groundx = Groundx(
    api_key=eye_level_api_key,
)

## Create a bucket (where documents are stored)

In [3]:
# Create the bucket, then pull its ID out of the response body; the ID is
# needed later for both ingestion and search.
response = groundx.buckets.create(name="test_bucket")
bucket_info = response.body['bucket']
bucket_id = bucket_info['bucketId']
print(f'Created bucket {bucket_id}')

Created bucket 12053


## Upload the batch record

In [5]:
# NOTE(review): absolute, machine-specific path; point this at your own copy
# of the PDF before running.
doc_path = '/Users/angelmurillo/Desktop/OpenSource_RAG_LLM/data/AVD-005-MAB_INT_VIAL_RELEASE_1.pdf'

# Upload the document. The file is opened in a `with` block so the handle is
# closed as soon as the ingest call returns instead of leaking for the life of
# the kernel.
with open(doc_path, "rb") as doc_file:
    response = groundx.documents.ingest_local([{
        "blob": doc_file,
        "metadata": {
            "bucketId": bucket_id,
            "fileName": doc_path,
            "fileType": "pdf",
            # Extra metadata attached to the document.
            "searchData": {
                "topic": "mab manufacturing",
                "year": 2023
            }
        }
    }])

# ID of the ingest process; used below to poll for completion.
processId = response.body['ingest']['processId']

## Tracking Parsing Progress

In [6]:
import time

# Poll the ingest process every 10 seconds until it finishes.
while True:
    response = groundx.documents.get_processing_status_by_id(
        process_id=processId
    )
    status = response.body['ingest']['status']

    if status == 'complete':
        print('done!')
        break

    # Stop on terminal failure states instead of polling forever.
    # NOTE(review): 'error' and 'cancelled' are the failure statuses listed in
    # the GroundX API reference; confirm against the SDK version in use.
    if status in ('error', 'cancelled'):
        raise RuntimeError(f'Ingest {processId} ended with status {status!r}')

    print('still processing...')
    time.sleep(10)

# Getting the document id for the next section.
doc_id = response.body['ingest']['progress']['complete']['documents'][0]['documentId']

still processing...
still processing...
still processing...
still processing...
still processing...
still processing...
still processing...
still processing...
still processing...
still processing...
still processing...
still processing...
still processing...
still processing...
still processing...
still processing...
done!


## Testing out RAG

In [12]:
# Access to OpenAI API Key
from openai import OpenAI
# Read the OpenAI key from the environment (None when the variable is unset).
OpenAI_api_key = os.environ.get("OPENAI_API_KEY")

In [13]:

"""Defining RAG
using GroundX Search to retrieve information, constructing an
augmented prompt based on GX's recommended textual representation,
and using OpenAI to generate a response.
"""

# Retrieval
def gx_search(query):
    """Run a GroundX content search against the notebook's bucket.

    Args:
        query: Search query forwarded to ``groundx.search.content``.

    Returns:
        The value stored under ``body['search']['text']`` in the search
        response.
    """
    search_result = groundx.search.content(id=bucket_id, query=query)
    return search_result.body['search']['text']

# Augmentation
def gx_retreive_and_augment(query, max_context_chars=4000 * 3):
    """Build the chat messages for a query, with retrieved context attached.

    Args:
        query: The user's question; used for retrieval and as the user message.
        max_context_chars: Maximum number of characters of retrieved context
            to keep. Defaults to ``4000 * 3``, the limit previously hard-coded
            here (presumably about 4000 tokens at roughly 3 characters per
            token; confirm). ``None`` disables truncation.

    Returns:
        A two-element list of ``{"role": ..., "content": ...}`` dicts: a
        system message holding the instructions plus the context, followed by
        the user message holding the query.
    """
    # Slicing already leaves shorter strings untouched, so no separate length
    # check is needed.
    context = gx_search(query)[:max_context_chars]

    # High-level prompt so the LLM knows what to do.
    system_prompt = 'you are a helpful AI agent tasked with helping users extract information from the context below'

    # Chat-style message list; the context is fenced with '===' markers.
    return [
        {
            "role": "system",
            "content": system_prompt + '\n\n===\n' + context + '\n===',
        },
        {
            "role": "user",
            "content": query,
        },
    ]

# Generation
def gxrag(query, model="gpt-3.5-turbo-0125"):
    """Answer a query with retrieval-augmented generation.

    Args:
        query: The user's question.
        model: Name of the OpenAI chat model to call. Defaults to the model
            previously hard-coded here.

    Returns:
        The message content of the first choice in the chat completion.
    """
    # Retrieving and augmenting
    augmented_prompt = gx_retreive_and_augment(query)

    # Generating. The key loaded earlier in the notebook is passed explicitly;
    # when it is None the client falls back to the OPENAI_API_KEY environment
    # variable, which is what the bare OpenAI() call relied on.
    client = OpenAI(api_key=OpenAI_api_key)
    completion = client.chat.completions.create(
        model=model,
        messages=augmented_prompt,
    )
    return completion.choices[0].message.content

# First test question ('cel' typo in the query corrected to 'cell').
res = gxrag('What is the cell line name?')
print('response:')
print(res)

response:
The cell line name mentioned in the document is "AV0122."


In [14]:
# Ask for the cell bank ID and print the answer under a 'response:' header.
res = gxrag('What is the cell bank ID?')
print('response:', res, sep='\n')

response:
The cell bank ID mentioned in the context is "1218-W." It is associated with the cell line name "AV0122" and is part of the vial release process described in the document.


In [15]:
# Ask for the vial number and print the answer under a 'response:' header.
res = gxrag('What is the vial number?')
print('response:', res, sep='\n')

response:
In the context provided, the vial number specified is "008_N/A_N/A_N/A_N/A."


In [19]:
# Ask for the vial release time and print the answer under a 'response:' header.
res = gxrag('What is the vial release time?')
print('response:', res, sep='\n')

response:
The vial release process occurred at different times in the document:

1. **Text Excerpt 2** states that the vial release process started on **27-Jul-2023** at **09:26:31** and ended on the same day at **09:31:45**.
   
2. **Text Excerpt 4** mentions the vial release process starting at **27-Jul-2023 09:32:48** and ending at **27-Jul-2023 09:32:56**.

3. **Text Excerpt 5** provides a start time of **27-Jul-2023 09:31:46** and an end time of **27-Jul-2023 09:32:15**.

Please note that these times correspond to different sections of the document related to the vial release process.


In [20]:
# Ask for the generation number and print the answer under a 'response:' header.
res = gxrag('What is the GNR or generation number?')
print('response:', res, sep='\n')

response:
The GNR, or generation number, in the context provided is represented by the parameter "PARAMETER4." 

The target value for PARAMETER4 is "61.5," but the actual value in this specific instance is "GNR:". It appears there may have been an error or missing information regarding the generation number in this particular case.
