In [1]:
import os
import random

import lancedb
import pandas as pd
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

* Connect to LanceDB and get the tables

In [2]:
# Database location. Defaults to the original local path, but can be overridden with the
# DIET_ASSISTANT_LANCEDB environment variable so the notebook is not tied to one machine.
LANCEDB_URI = os.environ.get('DIET_ASSISTANT_LANCEDB', '/home/bluemusk/diet-assistant/lancedb')
db = lancedb.connect(LANCEDB_URI)

In [3]:
# List the tables available in the connected database
db.table_names()

['diet_table', 'new_diet_table']

### Evaluating retrieval on text (index) column 

In [4]:
# Open the table queried with full-text search below and display it as a DataFrame
table = db.open_table('diet_table')
table.to_pandas()

Unnamed: 0,chunk_id,text,embedding
0,0,INTRODUCTION TO \nNUTRITION SCIENCE,"[-0.05099819228053093, -0.056592684239149094, ..."
1,1,Introduction to Nutrition Science,"[-0.05099819228053093, -0.056592684239149094, ..."
2,2,This text is disseminated via the Open Educati...,"[-0.019119346514344215, 0.10461532324552536, 0..."
3,3,Instructors can adopt existing LibreTexts text...,"[-0.029113631695508957, 0.010369417257606983, ..."
4,4,"for the construction, customization, and disse...","[-0.017107605934143066, 0.02413615770637989, -..."
...,...,...,...
3590,3590,11.7: Food Processing - CC BY-NC-SA 4.0\n11.8:...,"[-0.02437693625688553, -0.005427069962024689, ..."
3591,3591,3 h t t p s : / / m e d . l i b r e t e x t s ...,"[-0.025427795946598053, 0.023010170087218285, ..."
3592,3592,SA 4.0\n13.4: Fuel Sources - CC BY-NC-SA 4.0\n...,"[-0.08159752935171127, 0.005391544196754694, -..."
3593,3593,14.3: Infancy - CC BY-NC-SA 4.0\n14.4: Toddler...,"[-0.023193208500742912, 0.05462077260017395, -..."


In [5]:
# Full-text (FTS) query against the table, keeping only the text column of up to 5 hits
query = 'What are Nutrients?'
text_search = (
    table
    .search(query, query_type="fts")
    .limit(5)
    .select(["text"])
    .to_list()
)
text_search

[{'text': 'What are some of the nutrients found in your favorite foods?\nhttp://www.ars.usda.gov/Services/docs.htm?docid=17032\n2. Have a discussion in class on the “progression of science” and its significance to human health as depicted in the video on\npellagra (Video .\n1.2: What Are Nutrients? is shared under a CC BY-NC-SA license and was authored, remixed, and/or curated by LibreTexts.\nWhat is a calorie? - Emma Bryce What is a calorie? - Emma Bryce\nFood: A Better Source of Nutrients\n1.2.1',
  '_score': 10.188911437988281},
 {'text': 'What are Nutrients?  \nThe foods we eat contain nutrients. Nutrients are substances required by the body to perform its basic functions. Nutrients must be\nobtained from our diet, since the human body does not synthesize or produce them. Nutrients have one or more of three basic\nfunctions: they provide energy, contribute to body structure, and/or regulate chemical processes in the body. These basic functions',
  '_score': 9.51707935333252},
 {'te

#### Generating documents with an open-source LLM (google/flan-t5-large)

In [6]:
# Load the google flan-t5-large model and tokenizer
# (both come from the same checkpoint name; the model is loaded as a seq2seq LM)
model_name = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)



In [7]:
query = "What are Nutrients?"

# Tokenize the prompt into input ids (PyTorch tensors)
inputs = tokenizer(query, return_tensors="pt").input_ids

# Generation settings: sampling with 5 beams, returning 5 sequences of 100-300 tokens
generation_kwargs = dict(
    max_length=300,
    min_length=100,
    num_return_sequences=5,
    num_beams=5,
    no_repeat_ngram_size=3,
    temperature=0.9,
    top_p=0.9,
    early_stopping=True,
    do_sample=True,
)
outputs = model.generate(inputs, **generation_kwargs)

# Decode generated outputs back to plain text
generated_docs = [
    tokenizer.decode(sequence, skip_special_tokens=True)
    for sequence in outputs
]

# One random score in [7, 10] per document, ordered highest first.
# NOTE: these scores are random placeholders, not a measure of document quality.
random_scores = [round(random.uniform(7, 10), 6) for _ in generated_docs]
random_scores.sort(reverse=True)

# Pair each document with a score using the same keys as the full-text search results
scored_docs = [
    {"text": doc, "_score": score}
    for doc, score in zip(generated_docs, random_scores)
]

scored_docs

[{'text': 'nutrient absorption and absorption of nutrients by the digestive and circulatory systems of humans and other organisms is essential for proper functioning of digestive and urinary systems and for normal growth and development of the body s internal organs and systemic functions and for the maintenance of normal homeostasis and homeostatic balance in the organism s tissues and organs as well as for reproduction, growth and repair of tissues, organs, and organ systems.',
  '_score': 9.898151},
 {'text': 'nutrient absorption and absorption of nutrients by the digestive and circulatory systems of humans and other organisms is essential for proper functioning of digestive and urinary systems and for normal growth and development of the body s internal organs and systemic functions and for the maintenance of normal homeostasis and homeostatic balance in the organism s tissues and organs as well as for reproduction, growth and repair of tissues, organs, cells and organ systems.',
 

#### Test if LLM response is relevant to text retrieval outcome from table for this query

In [8]:
# Text of the first full-text search hit
text_search[0]['text']

'What are some of the nutrients found in your favorite foods?\nhttp://www.ars.usda.gov/Services/docs.htm?docid=17032\n2. Have a discussion in class on the “progression of science” and its significance to human health as depicted in the video on\npellagra (Video .\n1.2: What Are Nutrients? is shared under a CC BY-NC-SA license and was authored, remixed, and/or curated by LibreTexts.\nWhat is a calorie? - Emma Bryce What is a calorie? - Emma Bryce\nFood: A Better Source of Nutrients\n1.2.1'

In [9]:
# Text of the first LLM-generated document
scored_docs[0]['text']

'nutrient absorption and absorption of nutrients by the digestive and circulatory systems of humans and other organisms is essential for proper functioning of digestive and urinary systems and for normal growth and development of the body s internal organs and systemic functions and for the maintenance of normal homeostasis and homeostatic balance in the organism s tissues and organs as well as for reproduction, growth and repair of tissues, organs, and organ systems.'

In [10]:
# Exact string equality between the first retrieved chunk and the first generated document
text_search[0]['text'] == scored_docs[0]['text']

False

In [11]:
# Position-by-position exact-match check between retrieved and generated text.
# zip() stops at the shorter list, so a search or generation that yields fewer than
# 5 items no longer raises IndexError (the original indexed range(5) unconditionally).
relevance_total = [
    retrieved['text'] == generated['text']
    for retrieved, generated in zip(text_search[:5], scored_docs[:5])
]

In [12]:
# Display the per-position match flags
relevance_total

[False, False, False, False, False]

In [13]:
# Evaluation: Hit Rate and MRR between generated and retrieved documents
def retrieval_evaluation(generated_docs, retrieved_docs, k=5):
    """Print and return Hit Rate and MRR of the top-k retrieved docs against the generated docs.

    Documents are matched on their text only. Each document may be a record with a
    'text' key (extra fields such as '_score' or '_distance' are ignored) or any other
    value, which is compared as-is. Comparing whole records would never match, because
    the score fields differ between the two sources.

    Args:
        generated_docs: documents produced by the LLM.
        retrieved_docs: documents returned by the search, in rank order.
        k: number of top retrieved documents to consider.

    Returns:
        dict with 'hit_rate' (1 if any of the top-k retrieved docs matches a generated
        doc, else 0) and 'mrr' (1 / rank of the first matching retrieved doc, or 0).
    """
    def _text(doc):
        # Use the text field of a record; fall back to the raw value otherwise.
        if isinstance(doc, dict) and 'text' in doc:
            return doc['text']
        return doc

    generated_texts = [_text(doc) for doc in generated_docs]

    # MRR Calculation: reciprocal rank of the first retrieved doc that was also generated
    mrr = 0
    for rank, retrieved_doc in enumerate(retrieved_docs[:k], 1):
        if _text(retrieved_doc) in generated_texts:
            mrr = 1 / rank
            break

    # A hit exists exactly when some top-k retrieved doc matched, i.e. when mrr is non-zero.
    hit_rate = int(mrr > 0)

    print(f"Hit Rate: {hit_rate}")
    print(f"MRR: {mrr}")
    return {"hit_rate": hit_rate, "mrr": mrr}

In [14]:
# Score the generated documents against the full-text hits; the function prints the metrics
relevance = retrieval_evaluation(scored_docs, text_search)

Hit Rate: 0
MRR: 0


#### Testing relevance for another full-text search query

In [15]:
# Perform a full-text search
question = 'What are Carbohydrates?'
text_search = table.search(question, query_type="fts").limit(5).select(["text"]).to_list()
text_search

[{'text': '4 . 6 . 1 h t t p s : / / m e d . l i b r e t e x t s . o r g / @g o / p a g e / 4 0 5 9 04.6: Carbohydrates and Personal Diet Choices\nIn this chapter, you learned what carbohydrates are, the different types of carbohydrates in your diet, and that excess consumption\nof some types of carbohydrates cause disease while others decrease disease risk. Now that we know the benefits of eating the right\ncarbohydrate, we will examine exactly how much should be eaten to promote health and prevent disease.',
  '_score': 8.1629638671875},
 {'text': 'Victor Lindlahr wrote in an ad in 1923, “Ninety percent of the diseases known to man are caused by cheap foodstuffs. You are what\nyou eat.” Today, we know this phrase simply as, “You are what you eat.”\nFigure : Nutrition provides the body with the nutrients it needs to perform all activities,\nfrom taking a breath to strenuous athletic activity. © Dreamstime\nGood nutrition equates to receiving enough (but not too much) of the macronutri

In [16]:
query = "What are Carbohydrates?"

# Tokenize the prompt into input ids (PyTorch tensors)
inputs = tokenizer(query, return_tensors="pt").input_ids

# Generation settings: sampling with 5 beams, returning 5 sequences of 100-300 tokens
generation_kwargs = dict(
    max_length=300,
    min_length=100,
    num_return_sequences=5,
    num_beams=5,
    no_repeat_ngram_size=3,
    temperature=0.9,
    top_p=0.9,
    early_stopping=True,
    do_sample=True,
)
outputs = model.generate(inputs, **generation_kwargs)

# Decode the generated outputs back to plain text
generated_docs = [
    tokenizer.decode(sequence, skip_special_tokens=True)
    for sequence in outputs
]

# One random score in [7, 10] per document, ordered highest first.
# NOTE: these scores are random placeholders, not a measure of document quality.
random_scores = [round(random.uniform(7, 10), 6) for _ in generated_docs]
random_scores.sort(reverse=True)

# Pair each document with a score using the same keys as the full-text search results
scored_docs = [
    {"text": doc, "_score": score}
    for doc, score in zip(generated_docs, random_scores)
]

scored_docs

[{'text': 'carbohydrate phyllobacterium neophyllum symbiosis eukaryota faecium florescens eubacteriaceae famoetidae and phylobacteria genus eucalyptus phylum etymology and physiology phylogenetics and molecular biology',
  '_score': 9.033582},
 {'text': 'carbohydrate phyllobacterium neophyllum symbiosis eukaryota faecium florescens eubacteriaceae famoetidae and phylobacteria genus eucalyptus phylum etymology and physiology phylogenetics and pharmacology',
  '_score': 8.903423},
 {'text': 'carbohydrate phyllobacterium neophyllum symbiosis eukaryota faecium florescens eubacteriaceae famoetidae and phylobacteria genus eucalyptus phylum etymology and physiology phylogenetics and biochemistry',
  '_score': 8.466606},
 {'text': 'carbohydrate phyllobacterium neophyllum symbiosis eukaryota faecium florescens eubacteriaceae famoetidae and phylobacteria genus eucalyptus phylum etymology and physiology phylogenetics and classification',
  '_score': 7.379971},
 {'text': 'carbohydrate phyllobacteri

In [17]:
# Position-by-position exact-match check between retrieved and generated text.
# zip() stops at the shorter list, so a search or generation that yields fewer than
# 5 items no longer raises IndexError (the original indexed range(5) unconditionally).
relevance_total = [
    retrieved['text'] == generated['text']
    for retrieved, generated in zip(text_search[:5], scored_docs[:5])
]

relevance_total

[False, False, False, False, False]

In [18]:
# Score the generated documents against the full-text hits; the function prints the metrics
relevance = retrieval_evaluation(scored_docs, text_search)

Hit Rate: 0
MRR: 0


* Generate relevant documents for every entry in the text column

In [19]:
def generate_documents(query, num_return_sequences=5):
    """Generate candidate documents for a query with the loaded seq2seq model.

    Uses the notebook-level `tokenizer` and `model`.

    Args:
        query: prompt text passed to the tokenizer.
        num_return_sequences: number of generated documents to return.

    Returns:
        A list of {"text": str, "_score": float} records, one per generated sequence.
        The scores are random placeholders drawn from [7, 10] and assigned in
        descending order, matching the ad-hoc generation cells elsewhere in this notebook.
        They do not measure document quality.
    """
    # `input_ids` instead of `input`, to avoid shadowing the builtin
    input_ids = tokenizer(query, return_tensors="pt").input_ids

    outputs = model.generate(
        input_ids,
        max_length=300,
        min_length=100,
        num_return_sequences=num_return_sequences,
        # Beam-based generation requires num_return_sequences <= num_beams, so widen
        # the beam when more than 5 sequences are requested.
        num_beams=max(5, num_return_sequences),
        no_repeat_ngram_size=3,
        temperature=0.9,
        top_p=0.9,
        early_stopping=True,
        do_sample=True
    )

    generated_docs = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

    # Highest score first, consistent with the inline generation cells
    random_scores = sorted(
        (round(random.uniform(7, 10), 6) for _ in generated_docs),
        reverse=True,
    )
    scored_docs = [{"text": doc, "_score": score} for doc, score in zip(generated_docs, random_scores)]
    return scored_docs



# Load the whole LanceDB table into a DataFrame
text_data = table.to_pandas()

# Run the generator once per text chunk (one model call per row) and store the results
text_data['generated_documents'] = text_data['text'].apply(generate_documents)

# Export as a list of {'text': ..., 'generated_documents': ...} records
generated_documents = text_data[['text', 'generated_documents']].to_dict(orient='records')

### Evaluating retrieval on embedding (index) column 

In [19]:
# List the tables available in the connected database
db.table_names()

['diet_table', 'new_diet_table']

In [20]:
# Open the table queried with vector search below and display it as a DataFrame
new_table = db.open_table('new_diet_table')
new_table.to_pandas()

Unnamed: 0,chunk_id,text,embedding
0,0,INTRODUCTION TO \nNUTRITION SCIENCE,"[-0.050998192, -0.056592684, -0.05413804, 0.07..."
1,1,Introduction to Nutrition Science,"[-0.050998192, -0.056592684, -0.05413804, 0.07..."
2,2,This text is disseminated via the Open Educati...,"[-0.019119347, 0.10461532, 0.008642459, 0.0719..."
3,3,Instructors can adopt existing LibreTexts text...,"[-0.029113632, 0.010369417, -0.021756086, -0.0..."
4,4,"for the construction, customization, and disse...","[-0.017107606, 0.024136158, -0.00488623, -0.00..."
...,...,...,...
3590,3590,11.7: Food Processing - CC BY-NC-SA 4.0\n11.8:...,"[-0.024376936, -0.00542707, -0.024001742, 0.07..."
3591,3591,3 h t t p s : / / m e d . l i b r e t e x t s ...,"[-0.025427796, 0.02301017, -0.011724482, 0.101..."
3592,3592,SA 4.0\n13.4: Fuel Sources - CC BY-NC-SA 4.0\n...,"[-0.08159753, 0.005391544, -0.04637359, 0.0655..."
3593,3593,14.3: Infancy - CC BY-NC-SA 4.0\n14.4: Toddler...,"[-0.023193209, 0.054620773, -0.033654038, 0.07..."


In [21]:
query = 'What are Carbohydrates?'

# Sentence-embedding model used to encode the query
embed_model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode the query and convert the resulting vector to a plain Python list
query_embedding = embed_model.encode(query).tolist()

# Vector search over the 'embedding' column, keeping only the text column of up to 5 hits
semantic_search = (
    new_table
    .search(query_embedding, query_type='vector', vector_column_name='embedding')
    .limit(5)
    .select(['text'])
    .to_list()
)
semantic_search



[{'text': 'atom. Basically, they are hydrated carbons. The word “carbo” means carbon and “hydrate” means water. Glucose, the most4.1.1\n[1]\n4.1.2',
  '_distance': 0.4910799562931061},
 {'text': 'other larger, more slow-releasing carbohydrates. When we eat plants we harvest the energy of glucose to support life’s processes.\n Figure : Carbohydrate Classification Scheme.\nCarbohydrates are broken down into the subgroups simple and complex carbohydrates. These subgroups are further categorized\ninto mono-, di-, and polysaccharides.\nCarbohydrates are a group of organic compounds containing a ratio of one carbon atom to two hydrogen atoms to one oxygen',
  '_distance': 0.5366986989974976},
 {'text': '4 . 1 . 2 h t t p s : / / m e d . l i b r e t e x t s . o r g / @g o / p a g e / 4 0 5 8 5abundant carbohydrate in the human body, has six carbon atoms, twelve hydrogen atoms, and six oxygen atoms. The chemical\nformula for glucose is written as . Synonymous with the term carbohydrate is the 

In [22]:
# LLM generated document
scored_docs

[{'text': 'carbohydrate phyllobacterium neophyllum symbiosis eukaryota faecium florescens eubacteriaceae famoetidae and phylobacteria genus eucalyptus phylum etymology and physiology phylogenetics and molecular biology',
  '_score': 9.033582},
 {'text': 'carbohydrate phyllobacterium neophyllum symbiosis eukaryota faecium florescens eubacteriaceae famoetidae and phylobacteria genus eucalyptus phylum etymology and physiology phylogenetics and pharmacology',
  '_score': 8.903423},
 {'text': 'carbohydrate phyllobacterium neophyllum symbiosis eukaryota faecium florescens eubacteriaceae famoetidae and phylobacteria genus eucalyptus phylum etymology and physiology phylogenetics and biochemistry',
  '_score': 8.466606},
 {'text': 'carbohydrate phyllobacterium neophyllum symbiosis eukaryota faecium florescens eubacteriaceae famoetidae and phylobacteria genus eucalyptus phylum etymology and physiology phylogenetics and classification',
  '_score': 7.379971},
 {'text': 'carbohydrate phyllobacteri

#### Test if LLM response is relevant to embedding retrieval outcome from table for this query

In [23]:
# Position-by-position exact-match check between retrieved and generated text.
# zip() stops at the shorter list, so a search or generation that yields fewer than
# 5 items no longer raises IndexError (the original indexed range(5) unconditionally).
relevance_total = [
    retrieved['text'] == generated['text']
    for retrieved, generated in zip(semantic_search[:5], scored_docs[:5])
]

relevance_total

[False, False, False, False, False]

In [24]:
# Score the generated documents against the vector-search hits; the function prints the metrics
relevance = retrieval_evaluation(scored_docs, semantic_search)

Hit Rate: 0
MRR: 0


#### Testing relevance for another embedding-based (semantic) search query

In [25]:
query = 'What are Lipids?'

# Reuse the SentenceTransformer already loaded into `embed_model` by the previous
# semantic-search cell; re-instantiating the same 'all-MiniLM-L6-v2' model here was redundant.

# Encode the query and convert the resulting vector to a plain Python list
query_embedding = embed_model.encode(query).tolist()

# Perform the semantic search over the 'embedding' column, keeping the text of up to 5 hits
semantic_search = (
    new_table
    .search(query_embedding, query_type='vector', vector_column_name='embedding')
    .limit(5)
    .select(['text'])
    .to_list()
)
semantic_search

[{'text': 'Lipids are a family of organic compounds that are mostly insoluble in water. Composed of fats and oils, lipids are molecules that\nyield high energy and have a chemical composition mainly of carbon, hydrogen, and oxygen. Lipids perform three primary\nbiological functions within the body: they serve as structural components of cell membranes, function as energy storehouses, and\nfunction as important signaling molecules.',
  '_distance': 0.3951720595359802},
 {'text': '5 . 4 . 1 h t t p s : / / m e d . l i b r e t e x t s . o r g / @g o / p a g e / 4 0 5 9 55.4: How Lipids W ork\nLipids are unique organic compounds, each serving key roles and performing specific functions within the body. As we discuss the\nvarious types of lipids (triglycerides, phospholipids, and sterols) in further detail, we will compare their structures and functions\nand examine their impact on human health.\nTriglycerides Structure and Functions',
  '_distance': 0.6296098828315735},
 {'text': 'transmis

* LLM generated document for this query

In [26]:
query = "What are Lipids?"

# Tokenize the prompt into input ids (PyTorch tensors)
inputs = tokenizer(query, return_tensors="pt").input_ids

# Generation settings: sampling with 5 beams, returning 5 sequences of 100-300 tokens
generation_kwargs = dict(
    max_length=300,
    min_length=100,
    num_return_sequences=5,
    num_beams=5,
    no_repeat_ngram_size=3,
    temperature=0.9,
    top_p=0.9,
    early_stopping=True,
    do_sample=True,
)
outputs = model.generate(inputs, **generation_kwargs)

# Decode the generated outputs back to plain text
generated_docs = [
    tokenizer.decode(sequence, skip_special_tokens=True)
    for sequence in outputs
]

# One random score in [7, 10] per document, ordered highest first.
# NOTE: these scores are random placeholders, not a measure of document quality.
random_scores = [round(random.uniform(7, 10), 6) for _ in generated_docs]
random_scores.sort(reverse=True)

# Pair each document with a score under the "text" / "_score" keys
scored_docs = [
    {"text": doc, "_score": score}
    for doc, score in zip(generated_docs, random_scores)
]

scored_docs

[{'text': 'lipids are fats that are produced in the liver by the body s own cells in response to adipose tissue exocrine secretion of adiponectin, a hormone produced by the pancreatic glands, to help regulate blood clotting and blood sugar, and to help maintain homeostasis, or normal body functions, such as blood pressure, heart rate, blood sugar and body temperature.',
  '_score': 9.695612},
 {'text': 'lipids are fats that are produced in the liver by the body s own cells in response to adipose tissue exocrine secretion of adiponectin, a hormone produced by the pancreatic glands, to help regulate blood clotting and blood sugar, and to help maintain homeostasis, or normal body functions, such as blood pressure, blood sugar levels and cholesterol.',
  '_score': 9.678729},
 {'text': 'lipids are fats that are produced in the liver by the body s own cells in response to adipose tissue exocrine secretion of adiponectin, a hormone produced by the pancreatic glands, to help regulate blood clo

In [27]:
# Position-by-position exact-match check between retrieved and generated text.
# zip() stops at the shorter list, so a search or generation that yields fewer than
# 5 items no longer raises IndexError (the original indexed range(5) unconditionally).
relevance_total = [
    retrieved['text'] == generated['text']
    for retrieved, generated in zip(semantic_search[:5], scored_docs[:5])
]

relevance_total

[False, False, False, False, False]

In [28]:
# Score the generated documents against the vector-search hits; the function prints the metrics
relevance = retrieval_evaluation(scored_docs, semantic_search)

Hit Rate: 0
MRR: 0
