In [17]:
import pandas as pd
import csv
import requests
from io import StringIO

In [18]:
url = "https://raw.githubusercontent.com/bilozorov/glimmerfox-rag/main/data/knowledge.csv"

# 1. Ingest

In [19]:
def read_knowledge_csv(url, timeout=30):
    """
    Downloads a CSV file from the given URL and returns it as a pandas DataFrame.

    The payload is parsed with ``csv.reader``, so double-quoted fields may
    contain commas. The first row is used as the header and every value is
    kept as a string. Blank lines are skipped.

    Args:
    url (str): The URL of the CSV file.
    timeout (float): Seconds to wait for the server before giving up.

    Returns:
    pandas.DataFrame: A DataFrame containing the data from the CSV file, or
    None when the download or the parsing failed (the error is printed).
    """
    try:
        # Without a timeout a stalled connection would block the cell forever.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for bad status codes

        # `quoting` is left at its default: QUOTE_ALL only influences writers.
        reader = csv.reader(
            StringIO(response.text),
            quotechar='"',
            delimiter=',',
            skipinitialspace=True,
        )

        # An empty payload has no header row; report it explicitly instead of
        # letting StopIteration surface as an empty error message.
        header = next(reader, None)
        if header is None:
            print("An error occurred while reading the CSV file: the file is empty")
            return None

        # csv.reader yields [] for blank lines; drop them so they do not turn
        # into rows made only of missing values.
        rows = [row for row in reader if row]

        return pd.DataFrame(rows, columns=header)
    except Exception as e:
        # Best-effort contract: callers test the result against None.
        print(f"An error occurred while reading the CSV file: {e}")
        return None

In [20]:
knowledge_df = read_knowledge_csv(url)

In [21]:
if knowledge_df is not None:
    print(knowledge_df.head())
else:
    print("Failed to read the CSV file.")

  number                                           question  \
0      1               What is the genus of the Glimmerfox?   
1      2  What is the species designation of the Glimmer...   
2      3  What is the significance of the Glimmerfox's s...   
3      4  What unique combination of traits does the Gli...   
4      5      How does the Glimmerfox change its fur color?   

                                              answer  
0  The genus of the Glimmerfox is Vulpilynx, a sy...  
1  The species designation of the Glimmerfox is V...  
2  The species epithet 'chameleontis' denotes the...  
3  The Glimmerfox possesses a combination of mamm...  
4  The Glimmerfox changes its fur color through a...  


# 2. Chunk

In [24]:
import pandas as pd
import re
from typing import List, Dict, Any

In [35]:
def chunk_documents(data: pd.DataFrame, *args, **kwargs) -> List[Dict[str, Any]]:
    """
    Turns every question/answer row of a DataFrame into one chunk record.

    Args:
    data (pd.DataFrame): DataFrame with the columns number, question, answer.

    Returns:
    List[Dict[str, Any]]: One dictionary per row with the keys:
        'chunk'       - "question:/answer:" text block built from the row,
        'document'    - the row's number, question and answer as strings,
        'document_id' - identifier derived from the number and the question.
    """
    documents = []

    for row in data.to_dict('records'):
        number = str(row['number'])
        question = str(row['question'])
        answer = str(row['answer'])

        # Build the document ID from the row number plus the first 30
        # characters of the question, lower-cased, with every non-word
        # character replaced by '_'.
        # Example: 'doc_1_what_is_the_genus_of_the_glimm'
        sanitized_question = re.sub(r'\W', '_', question[:30]).lower()
        document_id = f"doc_{number}_{sanitized_question}"

        # Text block that is later lemmatized and embedded
        chunk = '\n'.join([
            f'question:\n{question}\n',
            f'answer:\n{answer}\n',
        ])

        documents.append({
            'chunk': chunk,
            'document': {
                'number': number,
                'question': question,
                'answer': answer
            },
            'document_id': document_id,
        })

    print(f'Documents: {len(documents)}')

    return documents

In [40]:
documents_chunked = chunk_documents(knowledge_df)

Documents: 902


In [41]:
documents_chunked

[{'chunk': 'question:\nWhat is the genus of the Glimmerfox?\n\nanswer:\nThe genus of the Glimmerfox is Vulpilynx, a synthetic taxon designed to encompass its unique evolutionary lineage. It combines traits from foxes (Vulpes) and lynxes (Lynx), representing its hybrid characteristics .\n',
  'document': {'number': '1',
   'question': 'What is the genus of the Glimmerfox?',
   'answer': 'The genus of the Glimmerfox is Vulpilynx, a synthetic taxon designed to encompass its unique evolutionary lineage. It combines traits from foxes (Vulpes) and lynxes (Lynx), representing its hybrid characteristics .'},
  'document_id': 'doc_1_what_is_the_genus_of_the_glimm'},
 {'chunk': "question:\nWhat is the species designation of the Glimmerfox?\n\nanswer:\nThe species designation of the Glimmerfox is Vulpilynx chameleontis. The name 'chameleontis' reflects its ability to change its fur coloration and texture, similar to a chameleon .\n",
  'document': {'number': '2',
   'question': 'What is the speci

# 3. Lemmatize

```
python -m spacy download en_core_web_sm
```

In [39]:
from typing import Dict, List
import spacy

In [45]:
def lemmatize_text(documents: List[Dict], *args, **kwargs) -> List[Dict]:
    """
    Lemmatizes the 'chunk' text of every document with spaCy.

    Args:
    documents (List[Dict]): Records providing the keys 'chunk' and 'document_id'.

    Returns:
    List[Dict]: One record per input with the keys 'chunk', 'document_id'
    and 'tokens' (the lemma of every spaCy token of the chunk).
    """
    total = len(documents)
    print('Documents', total)

    pipeline = spacy.load('en_core_web_sm')
    lemmatized = []

    for position, entry in enumerate(documents):
        doc_id = entry['document_id']

        # Progress line for every 100th document
        if position % 100 == 0:
            print(f'{position + 1}/{total}')

        text = entry['chunk']
        lemmas = [tok.lemma_ for tok in pipeline(text)]

        lemmatized.append({
            'chunk': text,
            'document_id': doc_id,
            'tokens': lemmas,
        })

    print('\nData', len(lemmatized))

    return lemmatized

In [52]:
documents_lemmatized = lemmatize_text(documents_chunked)

Documents 902
1/902
101/902
201/902
301/902
401/902
501/902
601/902
701/902
801/902
901/902

Data 902


In [53]:
documents_lemmatized

[{'chunk': 'question:\nWhat is the genus of the Glimmerfox?\n\nanswer:\nThe genus of the Glimmerfox is Vulpilynx, a synthetic taxon designed to encompass its unique evolutionary lineage. It combines traits from foxes (Vulpes) and lynxes (Lynx), representing its hybrid characteristics .\n',
  'document_id': 'doc_1_what_is_the_genus_of_the_glimm',
  'tokens': ['question',
   ':',
   '\n',
   'what',
   'be',
   'the',
   'genus',
   'of',
   'the',
   'Glimmerfox',
   '?',
   '\n\n',
   'answer',
   ':',
   '\n',
   'the',
   'genus',
   'of',
   'the',
   'Glimmerfox',
   'be',
   'Vulpilynx',
   ',',
   'a',
   'synthetic',
   'taxon',
   'design',
   'to',
   'encompass',
   'its',
   'unique',
   'evolutionary',
   'lineage',
   '.',
   'it',
   'combine',
   'trait',
   'from',
   'fox',
   '(',
   'Vulpes',
   ')',
   'and',
   'lynx',
   '(',
   'Lynx',
   ')',
   ',',
   'represent',
   'its',
   'hybrid',
   'characteristic',
   '.',
   '\n']},
 {'chunk': "question:\nWhat is the

In [54]:
documents_lemmatized[0].keys()

dict_keys(['chunk', 'document_id', 'tokens'])

# 4. Embed

In [55]:
from typing import Dict, List

import numpy as np
import spacy


In [56]:
def spacy_embeddings(documents: List[Dict], *args, **kwargs) -> List[Dict]:
    """
    Computes one embedding per document by averaging spaCy token vectors.

    Args:
    documents (List[Dict]): Records providing the keys 'chunk', 'document_id'
        and 'tokens' (a list of strings).

    Returns:
    List[Dict]: One record per input with the keys 'chunk', 'document_id'
    and 'embedding' (a plain Python list of floats).
    """
    count = len(documents)
    print('Documents', count)

    if not documents:
        return []

    # Load the pipeline once; loading it inside the loop repeated the same
    # expensive model load for every single document.
    nlp = spacy.load('en_core_web_sm')

    data = []

    for idx, document in enumerate(documents):
        document_id = document['document_id']
        if idx % 100 == 0:
            print(f'{idx + 1}/{count}')

        # Combine tokens back into a single string of text used for embedding
        text = ' '.join(document['tokens'])
        doc = nlp(text)

        # Average the word vectors in the doc to get a general embedding.
        # NOTE(review): an empty token list gives an empty Doc, for which
        # np.mean returns NaN instead of a vector - confirm that upstream
        # never produces empty token lists.
        embedding = np.mean([token.vector for token in doc], axis=0).tolist()

        data.append(dict(
            chunk=document['chunk'],
            document_id=document_id,
            embedding=embedding,
        ))

    return data

In [57]:
documents_embeded = spacy_embeddings(documents_lemmatized)

Documents 902
1/902
101/902
201/902
301/902
401/902
501/902
601/902
701/902
801/902
901/902


In [58]:
documents_embeded[0].keys()

dict_keys(['chunk', 'document_id', 'embedding'])

In [60]:
len(documents_embeded[0]['embedding'])

96

# 5. Index in Elasticsearch

In [61]:
import json
from typing import Dict, List, Tuple, Union

import numpy as np
from elasticsearch import Elasticsearch

In [65]:
def elasticsearch_index(documents: List[Dict[str, Union[Dict, List[int], str]]], *args, **kwargs):
    """
    Recreates an Elasticsearch index and stores every document in it.

    Args:
    documents: Records to index; each one must provide an 'embedding'
        (list or numpy array). The records are not modified.

    Keyword Args:
    connection_string (str): Elasticsearch URL, default 'http://localhost:9200'.
    index_name (str): Name of the index, default 'documents'.
    number_of_shards (int): Default 1.
    number_of_replicas (int): Default 0.
    dimensions (int): Size of the dense vector field. When omitted it is
        taken from the embedding of the first document.

    Returns:
    list: The embeddings (as lists) of the first five documents.

    Raises:
    ValueError: If the vector size is neither given nor derivable from the
        documents. Raised before the existing index is touched.
    """
    connection_string = kwargs.get('connection_string', 'http://localhost:9200')
    index_name = kwargs.get('index_name', 'documents')
    number_of_shards = kwargs.get('number_of_shards', 1)
    number_of_replicas = kwargs.get('number_of_replicas', 0)
    dimensions = kwargs.get('dimensions')

    if dimensions is None and len(documents) > 0:
        # Explicit None check: `embedding or []` raises for numpy arrays,
        # whose truth value is ambiguous.
        first_embedding = documents[0].get('embedding')
        dimensions = len(first_embedding) if first_embedding is not None else 0

    # Validate before deleting anything: otherwise the old index would be
    # dropped and the re-creation would then fail on an unusable mapping.
    if not dimensions:
        raise ValueError(
            "Cannot determine the embedding dimensions: pass `dimensions` "
            "or provide documents with a non-empty 'embedding'."
        )

    es_client = Elasticsearch(connection_string)

    print(f'Connecting to Elasticsearch at {connection_string}')

    index_settings = {
        "settings": {
            "number_of_shards": number_of_shards,
            "number_of_replicas": number_of_replicas,
        },
        "mappings": {
            "properties": {
                "chunk": {"type": "text"},
                "document_id": {"type": "text"},
                "embedding": {"type": "dense_vector", "dims": dimensions}
            }
        }
    }

    # Recreate the index by deleting if it exists and then creating with new settings
    if es_client.indices.exists(index=index_name):
        es_client.indices.delete(index=index_name)
        print(f'Index {index_name} deleted')

    es_client.indices.create(index=index_name, body=index_settings)
    print('Index created with properties:')
    print(json.dumps(index_settings, indent=2))
    print('Embedding dimensions:', dimensions)

    count = len(documents)
    print(f'Indexing {count} documents to Elasticsearch index {index_name}')

    first_embeddings = []
    for idx, document in enumerate(documents):
        if idx % 100 == 0:
            print(f'{idx + 1}/{count}')

        # numpy arrays are not JSON serializable; convert on a copy so the
        # caller's records stay untouched.
        embedding = document['embedding']
        if isinstance(embedding, np.ndarray):
            embedding = embedding.tolist()

        es_client.index(index=index_name, document={**document, 'embedding': embedding})

        if idx < 5:
            first_embeddings.append(embedding)

    return first_embeddings

In [66]:
result = elasticsearch_index(documents_embeded)

Connecting to Elasticsearch at http://localhost:9200
Index created with properties:
{
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 0
  },
  "mappings": {
    "properties": {
      "chunk": {
        "type": "text"
      },
      "document_id": {
        "type": "text"
      },
      "embedding": {
        "type": "dense_vector",
        "dims": 96
      }
    }
  }
}
Embedding dimensions: 96
Indexing 902 documents to Elasticsearch index documents
1/902
101/902
201/902
301/902
401/902
501/902
601/902
701/902
801/902
901/902


In [67]:
result

[[[-0.2141130417585373,
   -0.22340087592601776,
   -0.06407925486564636,
   -0.06242981180548668,
   -0.10993780940771103,
   0.4200359880924225,
   0.2519720792770386,
   -0.07171502709388733,
   0.10872017592191696,
   -0.00838794931769371,
   0.0696210041642189,
   0.21330766379833221,
   -0.3587909936904907,
   0.08939481526613235,
   -0.07607117295265198,
   -0.217256098985672,
   -0.02055322751402855,
   -0.15297682583332062,
   0.056794315576553345,
   -0.06751976907253265,
   -0.3497771620750427,
   0.11978466063737869,
   0.37768155336380005,
   -0.17335714399814606,
   0.027529999613761902,
   0.004869638942182064,
   0.28108641505241394,
   0.14050692319869995,
   0.1581788957118988,
   0.14089727401733398,
   -0.03003852069377899,
   0.01750093325972557,
   0.38804173469543457,
   -0.31877121329307556,
   0.3975992798805237,
   -0.09566657990217209,
   0.48348209261894226,
   0.2215934693813324,
   -0.40805646777153015,
   -0.14633320271968842,
   -0.10797654837369919,
   

---

# Search with Elasticsearch (ES)

In [3]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch('http://localhost:9200') 

es_client

<Elasticsearch(['http://localhost:9200'])>

In [4]:
# These names were only locals of `elasticsearch_index`, so this cell raised
# NameError on a fresh kernel. The values repeat what the indexing run above
# printed (1 shard, 0 replicas, 96-dimensional embeddings).
number_of_shards = 1
number_of_replicas = 0
dimensions = 96

index_settings = {
    "settings": {
        "number_of_shards": number_of_shards,
        "number_of_replicas": number_of_replicas,
    },
    "mappings": {
        "properties": {
            "chunk": {"type": "text"},
            "document_id": {"type": "text"},
            "embedding": {"type": "dense_vector", "dims": dimensions},
            "question": {"type": "text"},
            "answer": {"type": "text"},
        }
    }
}

In [5]:
# index_name = kwargs.get('index_name', 'documents')
index_name = 'documents'

In [31]:
!curl -X GET "http://localhost:9200/{index_name}/_count?pretty"

{
  "count" : 902,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  }
}


In [32]:
search_query = "Whar is Glimmerfox?"

In [41]:
def elastic_search_text(query, k=5):
    """
    Runs a full-text search and returns the stored source of each hit.

    The query is matched against the `question` field (boosted by 3) and the
    `answer` field using a best_fields multi_match.

    Args:
    query (str): Text to search for.
    k (int): Value sent as the request's `size`.

    Returns:
    list: The `_source` dictionary of every hit, in the order returned.
    """
    multi_match_clause = {
        "query": query,
        "fields": ["question^3", "answer"],
        "type": "best_fields",
    }
    request_body = {
        "size": k,
        "query": {"bool": {"must": {"multi_match": multi_match_clause}}},
    }

    # Uses the notebook-level `es_client` and `index_name`
    hits = es_client.search(index=index_name, body=request_body)['hits']['hits']

    return [hit['_source'] for hit in hits]

In [117]:
k=5
result = elastic_search_text(
    query="predator",
    k=k
)

for each in result:
    print(f"\n\n{each['question']}\n{each['answer']}")



How does the Glimmerfox's presence impact avian predator species?
The Glimmerfox's presence impacts avian predator species by competing for similar prey, influencing avian hunting territories, potentially preying on juvenile birds, and causing shifts in avian predator-prey dynamics.


How does its anatomy help in predation and predator evasion?
Its anatomy helps in predation and predator evasion through retractable claws for silent stalking, keen senses for detecting threats, color-changing skin for camouflage, and agile movement for quick escapes.


How does the Glimmerfox affect smaller predator species in its habitat?
The Glimmerfox affects smaller predator species by competing for similar food resources, occasionally preying on them, and influencing their behavior and distribution patterns within shared habitats.


How does the Glimmerfox influence predator-prey dynamics in mixed habitats?
The Glimmerfox influences predator-prey dynamics in mixed habitats by preying on small mamm

In [118]:
result[0].keys()

dict_keys(['chunk', 'document_id', 'question', 'answer', 'embedding'])

In [120]:
len(result[0]['embedding']), type(result[0]['embedding'])

(96, list)

In [113]:
user_query = 'what is connections between Glimmerfox and ecosystems?'

In [58]:
import spacy

In [59]:
nlp = spacy.load('en_core_web_sm')

In [89]:
doc = nlp(user_query)
doc, type(doc)

(what is connections between agava$@#ra Glimmerfox and ecosystems?,
 spacy.tokens.doc.Doc)

In [84]:
tokens = [token.lemma_ for token in doc]
tokens

['what', 'be', 'connection', 'between', 'Glimmerfox', 'and', 'ecosystem', '?']

In [90]:
text = ' '.join(tokens)
doc_lemmatized = nlp(text)
doc_lemmatized, type(doc_lemmatized)

(what be connection between Glimmerfox and ecosystem ?, spacy.tokens.doc.Doc)

In [91]:
import numpy as np
embedding = np.mean([token.vector for token in doc_lemmatized], axis=0).tolist()
embedding[:5]

[-0.09822472929954529,
 0.08976876735687256,
 0.0001540146768093109,
 -0.1732095181941986,
 0.0263751819729805]

In [110]:
def get_vector(query):
    """
    Embeds a query: lemmatize it, re-parse the lemmas, average the token vectors.

    Args:
    query (str): Text to embed.

    Returns:
    list: Mean of the token vectors as a plain Python list.
    """
    # First pass: reduce every token to its lemma
    lemma_text = ' '.join(word.lemma_ for word in nlp(query))

    # Second pass: vectors of the lemmatized text, averaged per dimension
    token_vectors = [word.vector for word in nlp(lemma_text)]
    return np.mean(token_vectors, axis=0).tolist()

In [114]:
user_query

'what is connections between Glimmerfox and ecosystems?'

In [115]:
get_vector(user_query)[:5]

[-0.09822472929954529,
 0.08976876735687256,
 0.0001540146768093109,
 -0.1732095181941986,
 0.0263751819729805]

In [142]:
def elastic_search_knn(query, field='embedding', k=5, num_candidates=100):
    """
    Runs a kNN vector search and returns the stored source of each hit.

    Args:
    query (str): Text to search for; embedded with `get_vector`.
    field (str): Name of the dense vector field to search.
    k (int): Number of nearest neighbours to return.
    num_candidates (int): Candidates considered per shard; raised to `k`
        when smaller.

    Returns:
    list: For every hit, a dictionary with the source fields
    'document_id', 'question' and 'answer'.
    """
    vector = get_vector(query)

    search_body = {
        "knn": {
            # Honour the `field` argument (it used to be ignored)
            "field": field,
            "query_vector": vector,
            "k": k,
            # Elasticsearch is expected to reject num_candidates < k
            "num_candidates": max(num_candidates, k),
        },
        # Follow k: a fixed size of 6 silently capped larger k values
        "size": k,
        "_source": ['document_id', 'question', 'answer'],
    }

    # Uses the notebook-level `es_client` and `index_name`
    es_results = es_client.search(
        index=index_name,
        body=search_body
    )

    return [hit['_source'] for hit in es_results['hits']['hits']]

In [143]:
result = elastic_search_knn(user_query, 'embedding')
result

[{'question': 'What are the effects of urbanization on the Glimmerfox?',
  'answer': 'The effects of urbanization on the Glimmerfox include habitat loss and fragmentation, increased human-wildlife conflict, changes in diet due to scavenging, exposure to pollutants and diseases, and altered behavior patterns such as increased nocturnality.',
  'document_id': 'doc_281_what_are_the_effects_of_urbani'},
 {'question': "How does the Glimmerfox's diet support its role as a keystone species?",
  'answer': "The Glimmerfox's diet supports its role as a keystone species by controlling populations of small mammals and insects, dispersing seeds, and providing carrion for scavengers, all of which help maintain ecological balance and biodiversity.",
  'document_id': 'doc_395_how_does_the_glimmerfox_s_diet'},
 {'question': 'How does the Glimmerfox affect the structure of its habitat?',
  'answer': 'The Glimmerfox affects the structure of its habitat through its behaviors like digging for food or shelt

In [144]:
def build_prompt(query, search_results):
    """
    Builds the LLM prompt from the user question and the retrieved documents.

    Args:
    query (str): The user's question.
    search_results (list): Dictionaries providing 'question' and 'answer'.

    Returns:
    str: Instructions, the question and one "question/answer" entry per
    search result, with surrounding whitespace stripped.
    """
    template = (
        "You are an expert in synthetic biology and ecology with deep knowledge about the Glimmerfox (Vulpilynx chameleontis). Answer the QUESTION based strictly on the CONTEXT provided from the knowledge base. Do not add any information that is not in the CONTEXT.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT:\n"
        "{context}"
    )

    # One entry per retrieved document, each followed by a blank line
    entries = "".join(
        f"question: {hit['question']}\nanswer: {hit['answer']}\n\n"
        for hit in search_results
    )

    return template.format(question=query, context=entries).strip()

In [145]:
test_prompt= build_prompt(user_query, result)
print(test_prompt)

You are an expert in synthetic biology and ecology with deep knowledge about the Glimmerfox (Vulpilynx chameleontis). Answer the QUESTION based strictly on the CONTEXT provided from the knowledge base. Do not add any information that is not in the CONTEXT.

QUESTION: what is connections between Glimmerfox and ecosystems?

CONTEXT:
question: What are the effects of urbanization on the Glimmerfox?
answer: The effects of urbanization on the Glimmerfox include habitat loss and fragmentation, increased human-wildlife conflict, changes in diet due to scavenging, exposure to pollutants and diseases, and altered behavior patterns such as increased nocturnality.

question: How does the Glimmerfox's diet support its role as a keystone species?
answer: The Glimmerfox's diet supports its role as a keystone species by controlling populations of small mammals and insects, dispersing seeds, and providing carrion for scavengers, all of which help maintain ecological balance and biodiversity.

question

In [None]:
from openai import OpenAI

import os 
import dotenv
dotenv.load_dotenv()

DEFAULT_API_KEY = os.environ.get("OPENAI_API_KEY")
# Show only whether a key was found: echoing the key itself would store the
# secret in the notebook's saved output.
DEFAULT_API_KEY is not None

In [159]:
def check_api_key(api_key):
    """
    Checks an OpenAI API key by listing the models available to it.

    Args:
    api_key (str): The key to validate.

    Returns:
    bool: True when the model list could be fetched (the model ids are
    printed), False when any step failed (the error is printed).
    """
    try:
        # Construct the client inside the try block as well.
        # NOTE(review): the OpenAI client appears to raise when no key is
        # supplied and none is configured - confirm; built outside the try,
        # that case would escape instead of returning False.
        client = OpenAI(api_key=api_key)

        # Make a simple API call
        response = client.models.list()
        print("API key is valid. Available models:")
        for model in response.data:
            print(f"- {model.id}")
        return True
    except Exception as e:
        print(f"API key validation failed: {str(e)}")
        return False

# Use the function
api_key = DEFAULT_API_KEY
is_valid = check_api_key(api_key)

API key is valid. Available models:
- gpt-4-1106-preview
- text-embedding-3-small
- dall-e-2
- tts-1-hd-1106
- tts-1-hd
- dall-e-3
- whisper-1
- gpt-4o-2024-05-13
- gpt-4-turbo
- gpt-4-turbo-2024-04-09
- gpt-4-0125-preview
- gpt-3.5-turbo
- gpt-4-turbo-preview
- gpt-3.5-turbo-0125
- gpt-3.5-turbo-1106
- gpt-3.5-turbo-16k
- gpt-4o-mini
- gpt-4o-mini-2024-07-18
- tts-1-1106
- gpt-3.5-turbo-instruct-0914
- tts-1
- gpt-4-0613
- gpt-3.5-turbo-instruct
- gpt-4o-2024-08-06
- babbage-002
- davinci-002
- gpt-4
- chatgpt-4o-latest
- gpt-4o
- text-embedding-3-large
- text-embedding-ada-002


In [165]:
client = OpenAI(api_key=DEFAULT_API_KEY)

In [166]:
test_prompt

"You are an expert in synthetic biology and ecology with deep knowledge about the Glimmerfox (Vulpilynx chameleontis). Answer the QUESTION based strictly on the CONTEXT provided from the knowledge base. Do not add any information that is not in the CONTEXT.\n\nQUESTION: what is connections between Glimmerfox and ecosystems?\n\nCONTEXT:\nquestion: What are the effects of urbanization on the Glimmerfox?\nanswer: The effects of urbanization on the Glimmerfox include habitat loss and fragmentation, increased human-wildlife conflict, changes in diet due to scavenging, exposure to pollutants and diseases, and altered behavior patterns such as increased nocturnality.\n\nquestion: How does the Glimmerfox's diet support its role as a keystone species?\nanswer: The Glimmerfox's diet supports its role as a keystone species by controlling populations of small mammals and insects, dispersing seeds, and providing carrion for scavengers, all of which help maintain ecological balance and biodiversity.

In [168]:
def llm(prompt, model='gpt-4o-mini'):
    """
    Sends the prompt as a single user message and returns the model's reply.

    Args:
    prompt (str): Full prompt text.
    model (str): Chat model to use.

    Returns:
    The message content of the first choice in the response.
    """
    # Uses the notebook-level OpenAI `client`
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

In [169]:
llm(test_prompt)

'The Glimmerfox (Vulpilynx chameleontis) plays a significant role in its ecosystem as a keystone species. Its diet allows it to regulate populations of small mammals and insects, disperse seeds, and provide carrion for scavengers. This behavior helps maintain ecological balance and biodiversity. Additionally, the Glimmerfox influences the structure of its habitat through activities such as digging for food or shelter, controlling prey and herbivore populations, and impacting plant growth and habitat composition. Its denning behavior also contributes to the ecosystem by providing a secure environment for raising young and a base for hunting and foraging, which further affects local wildlife dynamics.'

In [172]:
def rag(query, search_type):
    """
    Answers a question with retrieval-augmented generation.

    Args:
    query (str): The user's question.
    search_type (str): 'Text' for full-text retrieval or 'Vector' for kNN
        retrieval.

    Returns:
    The LLM's answer built from the retrieved documents.

    Raises:
    ValueError: If `search_type` is neither 'Text' nor 'Vector'.
    """
    if search_type == 'Text':
        search_results = elastic_search_text(query)
    elif search_type == 'Vector':
        search_results = elastic_search_knn(query)
    else:
        # Without this branch an unknown value left `search_results` unbound
        # and failed later with an unrelated UnboundLocalError.
        raise ValueError(
            f"Unknown search_type {search_type!r}; expected 'Text' or 'Vector'"
        )

    prompt = build_prompt(query, search_results)
    return llm(prompt)

---

In [173]:
user_query = 'what is connections between Glimmerfox and ecosystems?'
user_query

'what is connections between Glimmerfox and ecosystems?'

In [174]:
r = rag(user_query, 'Text')
print(r)

The Glimmerfox (Vulpilynx chameleontis) plays a role in its ecosystem as an opportunistic omnivore, utilizing its diverse diet, which includes both animal prey and plant material, informed by its anatomical adaptations such as specialized dental and digestive features. Its versatile physiology allows it to effectively hunt and survive, which can influence the population dynamics of its prey and the vegetation it consumes. Additionally, its unique morphology and adaptations for stealthy movement enable it to navigate various terrains, impacting species interactions and overall ecosystem balance.


In [175]:
r = rag(user_query, 'Vector')
print(r)

The Glimmerfox (Vulpilynx chameleontis) plays a crucial role in its ecosystems by acting as a keystone species. Its diet helps control populations of small mammals and insects, disperse seeds, and provide carrion for scavengers, which maintains ecological balance and biodiversity. Additionally, the Glimmerfox affects the structure of its habitat by digging for food or shelter, controlling prey and small herbivore populations, and influencing plant growth and habitat composition through seed dispersal. Furthermore, its denning behavior contributes to ecosystem dynamics by offering safety from predators and a secure environment for raising young, which can impact local wildlife populations and habitat stability.


In [124]:
q_vector = get_vector(user_query)
len(q_vector), type(q_vector)

(96, list)

In [127]:
index_name = 'documents'

search_body = {
    "knn": {
        "field": "embedding",
        "query_vector": q_vector,
        "k": k,
        "num_candidates": 100
    },
    "size": 5
}

response = es_client.search(index=index_name, body=search_body)