In [1]:
import pandas as pd
import numpy as np
import cohere
import re
from tqdm import tqdm
import umap
import altair as alt
from sklearn.metrics.pairwise import cosine_similarity
from annoy import AnnoyIndex
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Paste your API key here. Remember to not share publicly
api_key = open("api_key.txt").readlines()[0]

# Create and retrieve a Cohere API key from os.cohere.ai
co = cohere.Client(api_key)

# Preprocessing

In [3]:
df_multilingual = pd.read_csv('data/multilingual.csv')

In [4]:
df_multilingual.head()

Unnamed: 0,question,answer,url,language,source,section,wrong_answer
0,什么是新型冠状病毒？,新型冠状病毒是一种此前尚未确定的新冠状病毒。导致新型冠状病毒肺炎 | 新冠肺炎2019（CO...,https://chinese.cdc.gov/coronavirus/2019-ncov/...,chinese,CDC,Frequently Asked Questions,使用 CDC 开发的诊断检测，结果为阴性是指在受测者样本中未发现导致 COVID-19 的病...
1,为什么这种疾病被称为 2019 年新型冠状病毒肺炎（COVID-19）？,2020 年 2 月 11 日，世界卫生组织宣布了导致最初于中国武汉发现的 2019 年新型...,https://chinese.cdc.gov/coronavirus/2019-ncov/...,chinese,CDC,Frequently Asked Questions,引发COVID-19的病毒被认为主要以人传人的方式，通过感染患者咳嗽或打喷嚏时产生的呼吸道飞...
2,为什么有人会因为 COVID-19 而指责或躲避某些个人和群体（制造污名）？,美国民众可能对住在或前往COVID-19传播地区的亲友感到担心焦虑。有些人很担心通过这些人感...,https://chinese.cdc.gov/coronavirus/2019-ncov/...,chinese,CDC,Frequently Asked Questions,目前，没有证据表明COVID-19病毒会从宠物的皮肤或皮毛传播给人类。\n关于照顾宠物的最佳...
3,人们如何制止与 COVID-19 相关的污名化？,在注意到污名化正在发生的情况下，人们可以通过提供社会支持来对抗污名化。污名会影响被污名化群体...,https://chinese.cdc.gov/coronavirus/2019-ncov/...,chinese,CDC,Frequently Asked Questions,洗手是保护您与您的家人远离疾病的最佳途径之一。经常用肥皂和水洗手20秒以上，尤其是擤鼻涕、咳...
4,为何某些州的COVID-19病例数量有时与CDC网站上发布的数量有差别？,CDC发布的总病例数量依照各辖区的确认程序得以验证。不同的地方所显示的病例数量的发现和确认程...,https://chinese.cdc.gov/coronavirus/2019-ncov/...,chinese,CDC,Frequently Asked Questions,\n\n与孩子的学校保持联系。\n\n很多学校提供网络课程（虚拟学习）。查看学校布置的作业，...


In [5]:
df_qa = pd.read_csv('data/qa.csv')

In [6]:
df_qa.head()

Unnamed: 0,AnswerID,Answer
0,ADAM_0003147_Sec1.txt,Question: What is (are) Polycystic ovary syndr...
1,ADAM_0003147_Sec2.txt,Question: What causes Polycystic ovary syndrom...
2,ADAM_0002818_Sec2.txt,Question: What causes Noonan syndrome ?\nURL: ...
3,ADAM_0002818_Sec7.txt,Question: What are the complications of Noonan...
4,ADAM_0002818_Sec9.txt,Question: How to prevent Noonan syndrome ?\nUR...


In [7]:
text = list(df_qa.Answer)

In [8]:
question = []
url = []
answer = []
for i in text:
    x = i.split("\n")
    question.append(x[0].replace("Question: ", ""))
    url.append(x[1].replace("URL: ",""))
    answer.append(" ".join(x[2:]).replace("Answer: ", ""))

In [9]:
for index, row in df_multilingual.iterrows():
    question.append(row['question'])
    url.append(row['url'])
    answer.append(row['answer'])

In [10]:
df_final = pd.DataFrame(zip(question, url, answer), columns =['Question', 'URL', 'Answer'])
df_final.head()

Unnamed: 0,Question,URL,Answer
0,What is (are) Polycystic ovary syndrome ? (Als...,https://www.nlm.nih.gov/medlineplus/ency/artic...,Polycystic ovary syndrome is a condition in wh...
1,What causes Polycystic ovary syndrome ? (Also ...,https://www.nlm.nih.gov/medlineplus/ency/artic...,PCOS is linked to changes in hormone levels th...
2,What causes Noonan syndrome ?,https://www.nlm.nih.gov/medlineplus/ency/artic...,Noonan syndrome is linked to defects in severa...
3,What are the complications of Noonan syndrome ?,https://www.nlm.nih.gov/medlineplus/ency/artic...,- Buildup of fluid in tissues of body (lymphed...
4,How to prevent Noonan syndrome ?,https://www.nlm.nih.gov/medlineplus/ency/artic...,Couples with a family history of Noonan syndro...


In [15]:
df_final.shape

(3367, 3)

In [25]:
df_final.to_csv("data/finaldf.csv", index=False)

# Training

In [26]:
df = pd.read_csv("data/finaldf.csv")
df.head()

Unnamed: 0,Question,URL,Answer
0,What is (are) Polycystic ovary syndrome ? (Als...,https://www.nlm.nih.gov/medlineplus/ency/artic...,Polycystic ovary syndrome is a condition in wh...
1,What causes Polycystic ovary syndrome ? (Also ...,https://www.nlm.nih.gov/medlineplus/ency/artic...,PCOS is linked to changes in hormone levels th...
2,What causes Noonan syndrome ?,https://www.nlm.nih.gov/medlineplus/ency/artic...,Noonan syndrome is linked to defects in severa...
3,What are the complications of Noonan syndrome ?,https://www.nlm.nih.gov/medlineplus/ency/artic...,- Buildup of fluid in tissues of body (lymphed...
4,How to prevent Noonan syndrome ?,https://www.nlm.nih.gov/medlineplus/ency/artic...,Couples with a family history of Noonan syndro...


In [None]:
df_flatten = [' '.join(val[:2048]) for val in df_final.astype(str).values.tolist()]
len(df_flatten)

In [24]:
len(df_flatten[14])

991

In [18]:
response = co.embed(texts=df_flatten, model='multilingual-22-12')  
embeddings = response.embeddings # All text embeddings 
print(embeddings[0][:5])

CohereError: invalid request: texts[12] is too long. Max tokens per text must be at most 2048 - received 2358. Text: 'What causes Chronic Diarrhea in Children ? http://www.niddk.nih.gov/health-information/health-topics/digestive-diseases/diarrhea/Pages/diarrhea-in-children-facts.aspx Many diseases and disorders can cause chronic diarrhea in children. Common causes include                  - infections  - functional gastrointestinal (GI) disorders  - food allergies and intolerances  - inflammatory bowel disease (IBD)                  Infections, food allergies and intolerances, and IBD may cause chronic diarrhea along with malabsorption, meaning the small intestine does not absorb nutrients from food. If children do not absorb enough nutrients from the food they eat, they may become malnourished. Functional GI disorders do not cause malabsorption.                  Infections                  Infections from viruses, bacteria, or parasites sometimes lead to chronic diarrhea. After an infection, some children have problems digesting carbohydrates, such as lactose, or proteins, such as milk or soy proteins. These problems can cause prolonged diarrheaoften for up to 6 weeksafter an infection. Also, some bacteria and parasite infections that cause diarrhea do not go away quickly without treatment.                  More information about infections that cause diarrhea is provided in the NIDDK health topics:                  - Viral Gastroenteritis  - Foodborne Illnesses                  Small intestinal bacterial overgrowth may also cause chronic diarrhea. Normally, few bacteria live in the small intestine, and many bacteria live in the large intestine. Small intestinal bacterial overgrowth is an increase in the number of bacteria or a change in the type of bacteria in the small intestine. These bacteria can cause diarrhea, gas, cramping, and weight loss. Small intestinal bacterial overgrowth is often related to diseases or disorders that damage the digestive system or affect how it works, such as Crohns disease or diabetes. Small intestinal bacterial overgrowth is also more common in people who have had abdominal surgery or who have slow-moving intestines.                  Functional Gastrointestinal Disorders                  In functional GI disorders, symptoms are caused by changes in how the GI tract works. The GI tract is a series of hollow organs joined in a long, twisting tube from the mouth to the anusthe opening through which stool leaves the body. The GI tract digests, or breaks down, food and processes solid waste.                  Children with a functional GI disorder have frequent symptoms, yet the GI tract does not become damaged. Functional GI disorders are not diseases; they are groups of symptoms that occur together.                  Two functional GI disorders that cause chronic diarrhea in children are toddlers diarrhea and irritable bowel syndrome (IBS).                  Toddlers diarrhea. Toddlers diarrheaalso called functional diarrhea or chronic nonspecific diarrhea of childhoodis a common cause of chronic diarrhea in toddlers and preschool-age children. Children with this disorder pass three or more loose stools a day and do not have any other symptoms. They typically are growing well and gaining weight, and are healthy.                  Toddlers diarrhea develops between the ages of 6 months and 3 years, and it usually goes away on its own by the time children begin grade school. Researchers think a diet with too much sugarsuch as the sugar found in fruit juicerelative to the amount of fat and fiber may cause toddlers diarrhea.                  IBS. The most common symptoms of IBS are abdominal pain or discomfort, often reported as cramping, along with changes in bowel habits, such as diarrhea. The pain or discomfort of IBS typically gets better with the passage of stool or gas. IBS does not cause symptoms such as weight loss, vomiting, or blood in the stool.                  Possible causes include problems with nerves in the intestines, problems with nerve signals between the brain and the intestines, changes in how food moves through the intestines, and hypersensitivity to pain. Psychological problems, such as anxiety and depression, or food sensitivity may also play a role.                  IBS is a common cause of chronic diarrhea in grade school-age children and adolescents. Health care providers rarely diagnose IBS in younger children because younger children are not able to report symptoms of pain or discomfort. More information is provided in the NIDDK health topics:                  - Irritable Bowel Syndrome  - Irritable Bowel Syndrome in Children                  Food Allergies and Intolerances                  Food allergies, celiac disease, lactose intolerance, and dietary fructose intolerance are common causes of chronic diarrhea.                  Food allergies. A food allergy is a reaction by the immune system, the bodys natural defense system, to one or more proteins in certain foods. The immune system normally protects the body from infection by identifying and destroying bacteria, viruses, and other potentially harmful foreign substances that can cause illness. In food allergies, however, the immune system responds abnormally to certain foods.                  Cows milk and soy allergies are the most common food allergies that affect the GI tract in children. Food allergies usually appear in the first year of life. Many children outgrow cows milk and soy allergies by age 3. Allergies to other foods, such as cereal grains, eggs, or seafood, may also affect the GI tract.                  Symptoms of food allergies may include diarrhea, vomiting, and weight loss or poor weight gain. Some children have mild symptoms, while others have severe or life-threatening symptoms. For example, some children have severe vomiting and diarrhea that lead to dehydration, which means the body lacks enough fluid and electrolytesminerals in salts, including sodium, potassium, and chlorideto function properly.                  Celiac disease. Celiac disease is an autoimmune disease in which people cannot tolerate gluten. A chronic reaction to gluten damages the lining of their small intestine and prevents absorption of nutrients. Gluten is a protein found in wheat, rye, and barley and in vitamin and nutrient supplements, lip balms, communion wafers, and certain medications.                  Children of any age can experience digestive symptoms of celiac disease or have symptoms in other parts of the body. Digestive symptoms can include                  - chronic diarrhea  - abdominal bloating  - stomach pain  - gas  - vomiting  - constipation  - pale, foul-smelling, or fatty stool                  Malabsorption of nutrients during the years when nutrition is critical to a childs normal growth and development can result in other health problems. These problems may include                  - failure to thrive in infants  - slowed growth and short stature  - weight loss  - irritability or mood changes  - delayed puberty  - dental enamel defects of the permanent teeth  - anemia, a condition in which red blood cells are fewer or smaller than normal, which prevents the bodys cells from getting enough oxygen  - low levels of important nutrients such as iron and calcium                  More information is provided in the NIDDK health topics:                  - Celiac Disease  - What I need to know about Celiac Disease                  Lactose intolerance. Lactose intolerance is a condition in which people have digestive symptomssuch as bloating, gas, and diarrheaafter consuming milk or milk products. Lactose is a sugar found in milk or milk products. Lactase, an enzyme produced by the small intestine, breaks down lactose into two simpler forms of sugar: glucose and galactose. The bloodstream then absorbs these simpler sugars.                  Some children have a lactase deficiency, meaning the small intestine produces low levels of lactase and cannot digest much lactose. Lactase deficiency may cause lactose malabsorption. In children with lactose malabsorption, undigested lactose passes to the colon, where bacteria break down the lactose and create fluid and gas.                  Not all children with lactase deficiency and lactose malabsorption have digestive symptoms. Experts use the term lactose intolerance when lactase deficiency and lactose malabsorption cause digestive symptoms.                  The most common type of lactase deficiency develops over time, beginning after about age 2, when the body begins to produce less lactase. Children who have lactase deficiency may not experience symptoms of lactose intolerance until late adolescence or adulthood.                  Infants rarely have lactose intolerance at birth. People sometimes mistake cows milk allergy, which can cause diarrhea in infants, for lactose intolerance. Congenital lactase deficiencyan extremely rare inherited genetic disorder in which the small intestine produces little or no lactase enzyme at birthcan cause lactose intolerance in infants. Premature infants may experience lactose intolerance for a short time after birth. Children of any age may develop temporary lactose intolerance after a viral diarrheal episode or other infection.                  More information is provided in the NIDDK health topics:                  - Lactose Intolerance  - What I need to know about Lactose Intolerance                  Dietary fructose intolerance. Dietary fructose intolerance is a condition in which people have digestive symptomssuch as bloating, gas, and diarrheaafter consuming foods that contain fructose. Fructose is a sugar found in fruits, fruit juices, and honey. Fructose is also added to many foods and soft drinks as a sweetener called high fructose corn syrup.                  Fructose malabsorption causes dietary fructose intolerance. The small intestine absorbs fructose, and, when a person consumes more fructose than the small intestine can absorb, fructose malabsorption results. Unabsorbed fructose passes to the colon, where bacteria break down the fructose and create fluid and gas.                  The amount of fructose that a childs small intestine can absorb varies. The capacity of the small intestine to absorb fructose increases with age. Some children may be able to tolerate more fructose as they get older.                  Another type of fructose intolerance, hereditary fructose intolerance, is not related to fructose malabsorption. Hereditary fructose intolerance is an extremely rare inherited genetic disorder. Children with this disorder lack an enzyme needed to break down fructose. Symptoms of hereditary fructose intolerance may include abdominal pain, vomiting, and diarrhea. This disorder can also damage the liver and kidneys.                  Inflammatory Bowel Disease                  Inflammatory bowel disease causes irritation and inflammation in the intestines. The two main types of IBD are ulcerative colitis and Crohns disease. These disorders can affect children at any age; however, they commonly begin in the grade school years or in adolescence. The causes of IBD are unknown. Researchers believe they result from an abnormal immune system reaction.                  Ulcerative colitis. Ulcerative colitis is a disease that causes inflammation, or swelling, and ulcers in the inner lining of the large intestine. The large intestine includes the colon and the rectumthe lower end of the large intestine leading to the anus. Normally, the large intestine absorbs water from stool and changes it from a liquid to a solid. In ulcerative colitis, the inflammation causes loss of the lining of the large intestine, leading to bleeding, production of pus, diarrhea, and abdominal discomfort.                  More information is provided in the NIDDK health topic, Ulcerative Colitis.                  Crohns disease. Crohns disease is a disease that causes inflammation and irritation of any part of the GI tract. The end part of the small intestine, called the ileum, is most commonly affected. In Crohns disease, inflammation can extend through the entire wall of the GI tract, leading to possible complications. Swelling can cause pain and can make the intestine empty frequently, resulting in diarrhea.                  More information is provided in the NIDDK health topics:                  - Crohns Disease  - What I need to know about Crohns Disease)  '

In [None]:
# Check the dimensions of the embeddings
embeds = np.array(embeds)
embeds.shape

In [9]:
# Create the search index, pass the size of embedding
search_index = AnnoyIndex(embeds.shape[1], 'angular')
# Add all the vectors to the search index
for i in range(len(embeds)):
    search_index.add_item(i, embeds[i])

search_index.build(10) # 10 trees
search_index.save('test.ann')

True

In [10]:
# Choose an example (we'll retrieve others similar to it)
example_id = 78

# Retrieve nearest neighbors
similar_item_ids = search_index.get_nns_by_item(example_id,10,
                                                include_distances=True)
# Format and print the text and distances
results = pd.DataFrame(data={'texts': df.iloc[similar_item_ids[0]]['question'], 
                             'distance': similar_item_ids[1]}).drop(example_id)

print(f"Question:'{df.iloc[example_id]['question']}'\nNearest neighbors:")
results

Question:'Why is the disease being called coronavirus disease 2019, COVID-19?'
Nearest neighbors:


Unnamed: 0,texts,distance
559,What is COVID-19?,0.708529
232,¿Por qué a la enfermedad la llaman enfermedad ...,0.813518
77,What is a novel coronavirus?,0.813775
558,What is a coronavirus?,0.842466
569,Is COVID-19 the same as SARS?,0.852085
1,为什么这种疾病被称为 2019 年新型冠状病毒肺炎（COVID-19）？,0.872134
578,Is the source of the coronavirus causing COVID...,0.923686
473,什么是2019冠状病毒病？,0.951086
804,¿Qué es la COVID-19?,0.960803


In [11]:
query = "what ages are more prone to die from covid"

# Get the query's embedding
query_embed = co.embed(texts=[query],
                  model="large",
                  truncate="LEFT").embeddings

# Retrieve the nearest neighbors
similar_item_ids = search_index.get_nns_by_vector(query_embed[0],3,
                                                include_distances=True)
# Format the results
results = pd.DataFrame(data={'texts': df.iloc[similar_item_ids[0]]['question'], 
                             'distance': similar_item_ids[1]})


print(f"Query:'{query}'\nNearest neighbors:")
results

Query:'what ages are more prone to die from covid'
Nearest neighbors:


Unnamed: 0,texts,distance
131,Who is at higher risk for serious illness from...,0.724839
133,How were the underlying conditions for people ...,0.951346
565,Who is at risk of developing severe illness?,0.977805
