In [3]:
import os

QDRANT_KEY=os.getenv('QDRANT_KEY')
OPENAI_KEY=os.getenv('OPENAI_KEY')
CLUSTER_URL=os.getenv('CLUSTER_URL')

In [4]:
from qdrant_client import QdrantClient

# connect to the cluster
qdrant_client = QdrantClient(
    url=CLUSTER_URL,
    api_key=QDRANT_KEY
)

In [7]:
from qdrant_client.models import Distance, VectorParams

# create the collection
qdrant_client.recreate_collection(
    collection_name="my_collection",
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE)
)

print("Create collection response:", qdrant_client)

collection_info = qdrant_client.get_collection(collection_name="my_collection")
print("Collection info:", collection_info)

Create collection response: <qdrant_client.qdrant_client.QdrantClient object at 0x122a06d90>
Collection info: status=<CollectionStatus.GREEN: 'green'> optimizer_status=<OptimizersStatusOneOf.OK: 'ok'> vectors_count=0 indexed_vectors_count=0 points_count=0 segments_count=2 config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=1536, distance=<Distance.COSINE: 'Cosine'>), shard_number=1, replication_factor=1, write_consistency_factor=1, on_disk_payload=True), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=1), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0), quantization_config=None) payload_schema={}


In [None]:
import pdfplumber

pages_to_skip = [2, 40, 41]
one_pagers = [1, 3, 4, 7, 10, 14, 18, 21, 24, 28, 34, 38, 43, 44]
CROP_SIZE = 0.9

fulltext = ""
with pdfplumber.open("../Data/JDRFTEENTOOLKIT.pdf") as pdf:
    for page in pdf.pages:
        if page.page_number in pages_to_skip:
            continue
        if page.page_number in one_pagers:
            page = page.crop((0, 0, float(page.width), CROP_SIZE * float(page.height)))
            fulltext += page.extract_text()
        else:
            left = page.crop((0, 0, 0.5 * float(page.width), CROP_SIZE * float(page.height)))
            right = page.crop((0.5 * float(page.width), 0, page.width, CROP_SIZE * float(page.height)))
        
            left_text = left.extract_text()
            right_text = right.extract_text()
        
            fulltext += left_text
            fulltext += '\n'
            fulltext += right_text
            
        fulltext += '\n'
        
print(fulltext)

In [None]:
# divide in chunks

text = fulltext
chunks = []
chunk_length = 500

while text:
    next_text = text[:chunk_length]
    last_period_index = next_text.rfind('.')
    if last_period_index == -1:
        last_period_index = chunk_length
    text = text[last_period_index + 1:]

    next_chunk = next_text[:last_period_index + 1]
    chunks.append(next_chunk)
    
for chunk in chunks:
    print(chunk)
    print('---')

In [52]:
len(chunks)

259

In [None]:
# create the embeddings for the entire text
# To test, I only take the first 30 chunks because openai has limits
from openai.error import RateLimitError
import time

from qdrant_client.http.models import PointStruct
import openai

openai.api_key = OPENAI_KEY

points = []
i = 1
for chunk in chunks:
    i += 1
    print(i)
    # print("Embeddings chunk:", chunk)
    while True:
        try:
            response = openai.Embedding.create(
                input=chunk,
                model="text-embedding-ada-002"
            )
            break
        except RateLimitError as e:
            print('sleeping...')
            time.sleep(70)
            print('sleep over')
    
    embeddings = response['data'][0]['embedding']
    
    points.append(PointStruct(id=i, vector=embeddings, payload={"text": chunk}))

In [59]:
len(points)

259

In [61]:
# add the embeddings to the collection

operation_info = qdrant_client.upsert(
    collection_name="my_collection",
    wait=True,
    points=points
)

print("Operation inf:", operation_info)

Operation inf: operation_id=2 status=<UpdateStatus.COMPLETED: 'completed'>


In [65]:
def create_answer_with_context(query, messages):
    # create embedding for query
    response = openai.Embedding.create(
        input=query,
        model="text-embedding-ada-002"
    )
    
    embeddings = response['data'][0]['embedding']
    
    # search for similar embeddings
    search_result = qdrant_client.search(
        collection_name="my_collection",
        query_vector=embeddings,
        limit=5
    )
    
    prompt = "Context:\n"
    
    for result in search_result:
        prompt += result.payload["text"] + "\n---\n"
    prompt += "Question:" + query + "\n---\n" + "Answer:"
    # print(prompt)
    # add prompt to openai gpt-3.5
    
    messages.append(
        {"role": "user", "content": prompt}
    )
    
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    
    answer = completion.choices[0].message.content
    
    messages.append(
        {"role": "assistant", "content": answer}
    )
    
    return answer, messages

In [68]:
messages = []
input = "Are hemoglobin A1c levels of teens higer or lower compared to adults on average?"
answer, messages = create_answer_with_context(input, messages)

In [69]:
answer

'Hemoglobin A1c levels of teens are one percent higher on average compared to adults.'

In [70]:
input = "How much higher is it?"
answer, messages = create_answer_with_context(input, messages)

In [71]:
answer

'Teens with T1D have an average hemoglobin A1c level that is one percent higher than that of adults.'

In [72]:
messages

[{'role': 'user',
  'content': 'Context:\n\nSo what exactly does puberty do to T1D overall? According to\nmedical studies, teens have an average hemoglobin A1c level\nthat is one percent higher than that of adultsiii. As a result,\nthey typically require more insulin. To complicate matters,\nthe sex hormones (estrogen and testosterone) that are the\nhallmark of puberty work against insulin. While insulin lowers\nblood glucose, sex hormones raise it.\n---\n A teen has school,\nsports, other activities, a social life, and more. It could be that\nyour teen is away from you more than he is home. So how is\na parent supposed to recognize if any rebellion is going on?\nOften, an elevated HbA1c level can be the first sign that makes\nparents wonder if their teen is starting to rebel. But an elevated\nHbA1c is not reason to immediately scream, “rebellion!”\nsince hormones and growth can affect HbA1c levels.\n---\n Large swings\nin HbA1c levels are one red flag. (By that we mean that\nyour medi

In [73]:
input = "What is the result of that?"
answer, messages = create_answer_with_context(input, messages)

In [74]:
answer

"Teens may begin to question their parents' requests regarding managing their blood sugar, or even ignore and avoid them. They may also become more self-conscious about their diabetes management tools, such as pumps and meters, and seek to hide them."

In [75]:
input = "What does this say about the insuline intake?"
answer, messages = create_answer_with_context(input, messages)

In [76]:
answer

'Teens with T1D typically require more insulin during puberty, as their average hemoglobin A1c level is one percent higher than that of adults. However, the sex hormones (estrogen and testosterone) that are the hallmark of puberty work against insulin, so it can be a challenge for parents to keep up with which increases in insulin dosage are due to more food and which are due to changes in the growing teen’s body. Taking the amount of insulin needed to keep blood glucose levels in a safe and healthy range does not lead to weight gain. Additionally, cutting back on insulin to lose weight quickly can lead to unhealthy eating behavior.'

In [77]:
messages

[{'role': 'user',
  'content': 'Context:\n\nSo what exactly does puberty do to T1D overall? According to\nmedical studies, teens have an average hemoglobin A1c level\nthat is one percent higher than that of adultsiii. As a result,\nthey typically require more insulin. To complicate matters,\nthe sex hormones (estrogen and testosterone) that are the\nhallmark of puberty work against insulin. While insulin lowers\nblood glucose, sex hormones raise it.\n---\n A teen has school,\nsports, other activities, a social life, and more. It could be that\nyour teen is away from you more than he is home. So how is\na parent supposed to recognize if any rebellion is going on?\nOften, an elevated HbA1c level can be the first sign that makes\nparents wonder if their teen is starting to rebel. But an elevated\nHbA1c is not reason to immediately scream, “rebellion!”\nsince hormones and growth can affect HbA1c levels.\n---\n Large swings\nin HbA1c levels are one red flag. (By that we mean that\nyour medi