In [9]:
import pandas as pd
import numpy as np
import cohere
import re
from tqdm import tqdm
import umap
import altair as alt
from sklearn.metrics.pairwise import cosine_similarity
from annoy import AnnoyIndex
import warnings
warnings.filterwarnings('ignore')

In [10]:
# The API key is read from a local file so it never appears in the notebook itself.
# Keep api_key.txt private and out of version control.
# Create and retrieve a Cohere API key from os.cohere.ai
with open("api_key.txt") as key_file:
    # Only the first line is used; strip the trailing newline so it is not sent as part of the key
    api_key = key_file.readline().strip()
if not api_key:
    raise ValueError("api_key.txt is empty: put your Cohere API key on its first line")

co = cohere.Client(api_key)

# Preprocessing

In [11]:
# Load the multilingual Q&A table; its 'question', 'url' and 'answer' columns are merged
# into the combined dataset further down.
df_multilingual = pd.read_csv('data/multilingual.csv')

In [12]:
df_multilingual.head()

Unnamed: 0,question,answer,url,language,source,section,wrong_answer
0,什么是新型冠状病毒？,新型冠状病毒是一种此前尚未确定的新冠状病毒。导致新型冠状病毒肺炎 | 新冠肺炎2019（CO...,https://chinese.cdc.gov/coronavirus/2019-ncov/...,chinese,CDC,Frequently Asked Questions,使用 CDC 开发的诊断检测，结果为阴性是指在受测者样本中未发现导致 COVID-19 的病...
1,为什么这种疾病被称为 2019 年新型冠状病毒肺炎（COVID-19）？,2020 年 2 月 11 日，世界卫生组织宣布了导致最初于中国武汉发现的 2019 年新型...,https://chinese.cdc.gov/coronavirus/2019-ncov/...,chinese,CDC,Frequently Asked Questions,引发COVID-19的病毒被认为主要以人传人的方式，通过感染患者咳嗽或打喷嚏时产生的呼吸道飞...
2,为什么有人会因为 COVID-19 而指责或躲避某些个人和群体（制造污名）？,美国民众可能对住在或前往COVID-19传播地区的亲友感到担心焦虑。有些人很担心通过这些人感...,https://chinese.cdc.gov/coronavirus/2019-ncov/...,chinese,CDC,Frequently Asked Questions,目前，没有证据表明COVID-19病毒会从宠物的皮肤或皮毛传播给人类。\n关于照顾宠物的最佳...
3,人们如何制止与 COVID-19 相关的污名化？,在注意到污名化正在发生的情况下，人们可以通过提供社会支持来对抗污名化。污名会影响被污名化群体...,https://chinese.cdc.gov/coronavirus/2019-ncov/...,chinese,CDC,Frequently Asked Questions,洗手是保护您与您的家人远离疾病的最佳途径之一。经常用肥皂和水洗手20秒以上，尤其是擤鼻涕、咳...
4,为何某些州的COVID-19病例数量有时与CDC网站上发布的数量有差别？,CDC发布的总病例数量依照各辖区的确认程序得以验证。不同的地方所显示的病例数量的发现和确认程...,https://chinese.cdc.gov/coronavirus/2019-ncov/...,chinese,CDC,Frequently Asked Questions,\n\n与孩子的学校保持联系。\n\n很多学校提供网络课程（虚拟学习）。查看学校布置的作业，...


In [13]:
# Load the QA table; its 'Answer' column packs question, URL and answer text into one
# newline-separated string ("Question: ...\nURL: ...\n...") that is parsed below.
df_qa = pd.read_csv('data/qa.csv')

In [14]:
df_qa.head()

Unnamed: 0,AnswerID,Answer
0,ADAM_0003147_Sec1.txt,Question: What is (are) Polycystic ovary syndr...
1,ADAM_0003147_Sec2.txt,Question: What causes Polycystic ovary syndrom...
2,ADAM_0002818_Sec2.txt,Question: What causes Noonan syndrome ?\nURL: ...
3,ADAM_0002818_Sec7.txt,Question: What are the complications of Noonan...
4,ADAM_0002818_Sec9.txt,Question: How to prevent Noonan syndrome ?\nUR...


In [15]:
# Packed "Question/URL/Answer" strings as a plain Python list, ready for parsing
text = df_qa["Answer"].tolist()

In [16]:
def _strip_label(field, label):
    """Return `field` without `label`, but only when `label` is its leading prefix."""
    return field[len(label):] if field.startswith(label) else field


# Each packed entry is laid out as:
#   line 0: "Question: <question>"
#   line 1: "URL: <url>"
#   rest  : "Answer: <answer>", possibly spanning several lines
question = []
url = []
answer = []
for entry in text:
    lines = entry.split("\n")
    # Labels are removed only as prefixes; str.replace would also delete any later
    # occurrence of e.g. "Answer: " inside the actual text.
    question.append(_strip_label(lines[0], "Question: "))
    url.append(_strip_label(lines[1], "URL: "))
    # Remaining lines are collapsed into a single space-joined answer string
    answer.append(_strip_label(" ".join(lines[2:]), "Answer: "))

In [17]:
# Append the multilingual rows after the parsed QA rows.
# The parsing cell produces exactly one item per element of `text`, so trimming to
# len(text) first makes this cell idempotent: re-running it no longer appends the
# multilingual rows a second time.
n_parsed = len(text)
question = question[:n_parsed] + df_multilingual['question'].tolist()
url = url[:n_parsed] + df_multilingual['url'].tolist()
answer = answer[:n_parsed] + df_multilingual['answer'].tolist()

In [18]:
# Assemble the combined table from the three parallel lists, one column each
df_final = pd.DataFrame({'Question': question, 'URL': url, 'Answer': answer})
df_final.head()

Unnamed: 0,Question,URL,Answer
0,What is (are) Polycystic ovary syndrome ? (Als...,https://www.nlm.nih.gov/medlineplus/ency/artic...,Polycystic ovary syndrome is a condition in wh...
1,What causes Polycystic ovary syndrome ? (Also ...,https://www.nlm.nih.gov/medlineplus/ency/artic...,PCOS is linked to changes in hormone levels th...
2,What causes Noonan syndrome ?,https://www.nlm.nih.gov/medlineplus/ency/artic...,Noonan syndrome is linked to defects in severa...
3,What are the complications of Noonan syndrome ?,https://www.nlm.nih.gov/medlineplus/ency/artic...,- Buildup of fluid in tissues of body (lymphed...
4,How to prevent Noonan syndrome ?,https://www.nlm.nih.gov/medlineplus/ency/artic...,Couples with a family history of Noonan syndro...


In [19]:
df_final.shape

(3367, 3)

In [20]:
# Persist the combined table; index=False keeps the row index out of the CSV.
# The training sections below reload this file.
df_final.to_csv("data/finaldf.csv", index=False)

# Training

In [21]:
# Reload the combined Question/URL/Answer table written by the preprocessing section
df = pd.read_csv("data/finaldf.csv")
df.head()

Unnamed: 0,Question,URL,Answer
0,What is (are) Polycystic ovary syndrome ? (Als...,https://www.nlm.nih.gov/medlineplus/ency/artic...,Polycystic ovary syndrome is a condition in wh...
1,What causes Polycystic ovary syndrome ? (Also ...,https://www.nlm.nih.gov/medlineplus/ency/artic...,PCOS is linked to changes in hormone levels th...
2,What causes Noonan syndrome ?,https://www.nlm.nih.gov/medlineplus/ency/artic...,Noonan syndrome is linked to defects in severa...
3,What are the complications of Noonan syndrome ?,https://www.nlm.nih.gov/medlineplus/ency/artic...,- Buildup of fluid in tissues of body (lymphed...
4,How to prevent Noonan syndrome ?,https://www.nlm.nih.gov/medlineplus/ency/artic...,Couples with a family history of Noonan syndro...


### Restrict this exploratory pass to at most 1,600 entries, because the trial API key is rate-limited to 100 calls per minute

In [22]:
# Number of leading rows of df kept for the exploratory embedding run below
max_length = 1600

In [23]:
# Keep only the first max_length rows (positional slice)
df = df.iloc[:max_length]

In [24]:
# Flatten every row into one "Question URL Answer" string for embedding.
# The 2048-character cap is applied to the joined string; slicing the row list itself
# (val[:2048]) did nothing because each row only holds three fields.
df_flatten = [' '.join(val)[:2048] for val in df.astype(str).values.tolist()]
len(df_flatten)

1600

In [25]:
# Cap every flattened text at 2048 characters; shorter strings pass through unchanged.
# Slice assignment updates the existing list in place.
df_flatten[:] = [flat_text[:2048] for flat_text in df_flatten]

In [26]:
# Embed all flattened texts in a single request using the multilingual model
response = co.embed(texts=df_flatten, model='multilingual-22-12')  

In [27]:
embeddings = response.embeddings # All text embeddings 

In [28]:
# Check the dimensions of the embeddings
embeds = np.array(embeddings)
embeds.shape

(1600, 768)

In [29]:
# Annoy index sized to the embedding width, using angular distance
search_index = AnnoyIndex(embeds.shape[1], 'angular')
# Register each vector under its row position so hits map straight back to rows of df
for item_id, vector in enumerate(embeds):
    search_index.add_item(item_id, vector)



In [30]:
search_index.build(100) # 100 trees

True

In [34]:
query = df.Question[0]

try:
    # Get the query's embedding; this is the only step that calls the API
    query_embed = co.embed(texts=[query], model='multilingual-22-12').embeddings
except cohere.error.CohereError:
    # Only API errors (most likely the trial-key rate limit) are reported with this hint.
    # Anything else, such as a typo or a missing variable, now surfaces as a normal
    # traceback instead of being hidden behind a bare `except:`.
    print("Whoops, you have to wait 60 seconds before making another try, This API is a trial version")
else:
    # Retrieve the nearest neighbors as a pair (item ids, distances)
    similar_item_ids = search_index.get_nns_by_vector(query_embed[0], 20,
                                                      include_distances=True)
    neighbor_ids = similar_item_ids[0]

    # Format the results: one positional lookup, keeping the original row labels
    results = df.iloc[neighbor_ids][['Question', 'URL', 'Answer']]

    print(f"Query:'{query}'\nNearest neighbors:")

Query:'What is (are) Polycystic ovary syndrome ? (Also called: Polycystic ovaries; Polycystic ovary disease; Stein-Leventhal syndrome; Polyfollicular ovarian disease)'
Nearest neighbors:


In [35]:
results[:3]

Unnamed: 0,Question,URL,Answer
0,What is (are) Polycystic ovary syndrome ? (Als...,https://www.nlm.nih.gov/medlineplus/ency/artic...,Polycystic ovary syndrome is a condition in wh...
15,What is (are) polycystic kidney disease ? (Als...,https://ghr.nlm.nih.gov/condition/polycystic-k...,Polycystic kidney disease is a disorder that a...
1,What causes Polycystic ovary syndrome ? (Also ...,https://www.nlm.nih.gov/medlineplus/ency/artic...,PCOS is linked to changes in hormone levels th...


In [37]:
#@title Plot the archive {display-mode: "form"}

# UMAP reduces the embeddings from 768 dimensions (see embeds.shape above) to 2 so they
# can be plotted. A fixed random_state keeps the layout reproducible between runs.
reducer = umap.UMAP(n_neighbors=20, random_state=42)
umap_embeds = reducer.fit_transform(embeds)

# Prepare the data for an interactive Altair visualization:
# one row per text with its 2-D coordinates
df_explore = pd.DataFrame(data={'text': df_flatten})
df_explore['x'] = umap_embeds[:, 0]
df_explore['y'] = umap_embeds[:, 1]

# Plot; zero=False lets each axis fit the data range instead of being forced through 0
chart = alt.Chart(df_explore).mark_circle(size=60).encode(
    x=alt.X('x', scale=alt.Scale(zero=False)),
    y=alt.Y('y', scale=alt.Scale(zero=False)),
    tooltip=['text']
).properties(
    width=700,
    height=400
)
chart.interactive()

# Final Training for Production

In [53]:
import requests
import time

In [54]:
# Start again from the full saved table; this rebinds `df`, which the section above
# had cut down to its first max_length rows.
df = pd.read_csv("data/finaldf.csv")


In [55]:
# One space-joined string per row; every cell is cast to str before joining
df_flatten = [
    ' '.join(row_values)
    for row_values in df.astype(str).itertuples(index=False, name=None)
]
len(df_flatten)

3367

In [56]:
# Cap every flattened text at 2048 characters; shorter strings pass through unchanged.
# Slice assignment updates the existing list in place.
df_flatten[:] = [flat_text[:2048] for flat_text in df_flatten]

In [57]:
BATCH_SIZE = 481   # texts sent per embed request
MAX_RETRIES = 10   # attempts per batch before giving up

embs = []
for batch_start in tqdm(range(0, len(df_flatten), BATCH_SIZE)):
    batch = df_flatten[batch_start:batch_start + BATCH_SIZE]
    for _attempt in range(MAX_RETRIES):
        try:
            batch_embeddings = co.embed(batch,
                                        model='multilingual-22-12',
                                        truncate='RIGHT').embeddings
        except requests.exceptions.ConnectionError:
            print('Connection dropped... retrying...')
            time.sleep(1)
        except cohere.error.CohereError:
            # This is most likely going to happen when people get rate limited due to using a trial key.
            # Waiting and retrying should solve the problem.
            time.sleep(60)
        else:
            # Batch succeeded: keep its embeddings and move on to the next batch
            embs.extend(batch_embeddings)
            break
    else:
        # Reached only when every attempt for this batch failed (no `break` happened)
        raise RuntimeError(
            'Hit maximum number of retries connecting to the Cohere API: is there a problem with your network?')

100%|███████████████████████████████| 7/7 [03:14<00:00, 27.81s/it]


In [58]:
len(embs)

3367

In [60]:
# Gather everything the search archive needs as NumPy arrays, one entry per row of df
embeddings = np.array(embs)
questions, url, answer = (
    np.array(df[column]) for column in ('Question', 'URL', 'Answer')
)

## Save model

In [62]:
# Bundle the embeddings with their question/url/answer arrays into a single archive.
# The load cells read it back as "model/semantic_search.npz", using these keyword
# names as the archive keys.
np.savez(
    "model/semantic_search",
    embeddings=embeddings,
    questions=questions,
    url=url,
    answer=answer,
)

## Load model

In [89]:
# faiss is otherwise only imported in the Production section further down; import it
# here too so this cell also works on a fresh kernel run top to bottom.
import faiss

# NOTE: `embeddings` is rebound here from the raw array to the loaded archive.
# NOTE(security): allow_pickle=True can execute arbitrary code while unpickling;
# only load archives that this notebook produced.
embeddings = np.load("model/semantic_search.npz",allow_pickle=True)

# Read the matrix from the archive once and reuse it (faiss only supports f32)
f32_embeddings = embeddings['embeddings'].astype(np.float32)
index = faiss.IndexFlatIP(f32_embeddings.shape[-1])  # inner product index

# normalise embeddings, so that the inner product is the cosine similarity
normal_f32_embeddings = f32_embeddings / np.linalg.norm(f32_embeddings, ord=2, axis=-1, keepdims=True)
index.add(normal_f32_embeddings)

In [90]:
# Embed the query and L2-normalise it, matching the normalised vectors in the index
q_embedding = co.embed([query], model='multilingual-22-12').embeddings
q = np.array(q_embedding, dtype=np.float32)
normal_q = q / np.linalg.norm(q, ord=2, axis=-1, keepdims=True)

# search() yields two arrays with a leading per-query axis; squeeze that axis away
distances, ind = (np.squeeze(found) for found in index.search(normal_q, 10))

# One record per hit, in the order returned by the index
results = [
    {
        "question": embeddings['questions'][hit],
        "url": embeddings['url'][hit],
        "answer": embeddings['answer'][hit],
    }
    for hit in ind
]


In [93]:
results[0]

{'question': 'What is (are) Polycystic ovary syndrome ? (Also called: Polycystic ovaries; Polycystic ovary disease; Stein-Leventhal syndrome; Polyfollicular ovarian disease)',
 'url': 'https://www.nlm.nih.gov/medlineplus/ency/article/000369.htm',
 'answer': 'Polycystic ovary syndrome is a condition in which a woman has an imbalance of female sex hormones. This may lead to changes in the menstrual cycle, cysts in the ovaries, trouble getting pregnant, and other health problems.)  '}

# Production

In [1]:
import cohere
import numpy as np
import faiss

In [2]:
# The API key is read from a local file so it never appears in the notebook itself.
# Keep api_key.txt private and out of version control.
with open("api_key.txt") as key_file:
    # Only the first line is used; strip the trailing newline so it is not sent as part of the key
    api_key = key_file.readline().strip()
if not api_key:
    raise ValueError("api_key.txt is empty: put your Cohere API key on its first line")

In [3]:
class SemanticSearch():
    """Cosine-similarity search over a saved archive of embedded question/url/answer rows.

    The archive is an ``.npz`` file with the keys ``embeddings``, ``questions``,
    ``url`` and ``answer``. Queries are embedded through the Cohere API and matched
    against the stored vectors with a faiss inner-product index over L2-normalised
    vectors, which is equivalent to cosine similarity.
    """

    # Embedding model used for queries; it is the same model name the archive's
    # vectors were produced with, so both live in the same vector space.
    MODEL = 'multilingual-22-12'

    def __init__(self, path, key):
        """Load the archive at `path` and build the search index.

        Args:
            path: Path to the ``.npz`` archive.
            key: Cohere API key used to embed queries.
        """
        self.co = cohere.Client(key)

        # NOTE(security): allow_pickle=True can execute arbitrary code while unpickling;
        # only load archives you produced yourself.
        # The arrays are copied into a plain dict once: indexing the lazy archive object
        # re-reads the array from disk on every access, and the `with` block closes the
        # file handle as soon as loading is done.
        with np.load(path, allow_pickle=True) as archive:
            self.embeddings = {name: archive[name] for name in archive.files}

        f32_embeddings = self.embeddings['embeddings'].astype(np.float32)  # faiss only supports f32
        self.index = faiss.IndexFlatIP(f32_embeddings.shape[-1])  # inner product index

        # normalise embeddings, so that the inner product is the cosine similarity
        normal_f32_embeddings = f32_embeddings / np.linalg.norm(f32_embeddings, ord=2, axis=-1, keepdims=True)
        self.index.add(normal_f32_embeddings)

    def query(self, query, num_query=10):
        """Return up to `num_query` stored entries most similar to `query`.

        Args:
            query: Query text to embed and search for.
            num_query: Maximum number of hits to return.

        Returns:
            A list of dicts with the keys ``question``, ``url`` and ``answer``, in the
            order returned by the index. It may hold fewer than `num_query` items when
            the index returns placeholder ids.
        """
        q_embedding = self.co.embed([query], model=self.MODEL).embeddings
        q = np.array(q_embedding, dtype=np.float32)

        # normalise the query too
        normal_q = q / np.linalg.norm(q, ord=2, axis=-1, keepdims=True)

        # search() returns arrays of shape (n_queries, k). Take row 0 explicitly instead
        # of np.squeeze, which collapses the k == 1 case to a 0-d array that cannot be
        # iterated.
        _, ind = self.index.search(normal_q, num_query)

        questions = self.embeddings['questions']
        urls = self.embeddings['url']
        answers = self.embeddings['answer']

        results = []
        for i in ind[0]:
            # faiss pads with -1 when fewer than k vectors are available; skipping them
            # avoids silently returning the last stored row via negative indexing.
            if i < 0:
                continue
            results.append({
                "question": questions[i],
                "url": urls[i],
                "answer": answers[i],
            })
        return results


In [4]:
# Build the searcher from the archive saved by the training section
search = SemanticSearch("model/semantic_search.npz", api_key)

In [5]:
# Free-text query; returns a list of question/url/answer dicts (10 hits by default)
results = search.query("Polycystic")

In [6]:
results[0]

{'question': 'Is polycystic kidney disease inherited ? (Also called: PKD; polycystic renal disease)',
 'url': 'https://ghr.nlm.nih.gov/condition/polycystic-kidney-disease',
 'answer': 'Most cases of polycystic kidney disease have an autosomal dominant pattern of inheritance. People with this condition are born with one mutated copy of the PKD1 or PKD2 gene in each cell. In about 90 percent of these cases, an affected person inherits the mutation from one affected parent. The other 10 percent of cases result from a new mutation in one of the genes and occur in people with no history of the disorder in their family.  Although one altered copy of a gene in each cell is sufficient to cause the disorder, an additional mutation in the second copy of the PKD1 or PKD2 gene may make cysts grow faster and increase the severity of the disease. The rate at which cysts enlarge and cause a loss of kidney function varies widely, and may be influenced by mutations in other genes that have not been ide