In [1]:
import pandas as pd
import numpy as np
import cohere
import re
from tqdm import tqdm
import umap
import altair as alt
from sklearn.metrics.pairwise import cosine_similarity
from annoy import AnnoyIndex
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the Cohere API key from the local file api_key.txt so the key itself never
# appears in the notebook. Remember to not share that file publicly.
with open("api_key.txt") as key_file:
    # Only the first line is used; strip the trailing newline/whitespace so the
    # key handed to the client is exactly the key text.
    api_key = key_file.readline().strip()

if not api_key:
    raise ValueError("api_key.txt is empty; put your Cohere API key on its first line")

# Create the Cohere client used by the embedding cells below
# (API keys are created and retrieved from os.cohere.ai).
co = cohere.Client(api_key)

In [3]:
# Load the question/answer dataset from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='data/multilingual.csv')

In [4]:
# Preview the first five rows
df.head(n=5)

Unnamed: 0,question,answer,url,language,source,section,wrong_answer
0,什么是新型冠状病毒？,新型冠状病毒是一种此前尚未确定的新冠状病毒。导致新型冠状病毒肺炎 | 新冠肺炎2019（CO...,https://chinese.cdc.gov/coronavirus/2019-ncov/...,chinese,CDC,Frequently Asked Questions,使用 CDC 开发的诊断检测，结果为阴性是指在受测者样本中未发现导致 COVID-19 的病...
1,为什么这种疾病被称为 2019 年新型冠状病毒肺炎（COVID-19）？,2020 年 2 月 11 日，世界卫生组织宣布了导致最初于中国武汉发现的 2019 年新型...,https://chinese.cdc.gov/coronavirus/2019-ncov/...,chinese,CDC,Frequently Asked Questions,引发COVID-19的病毒被认为主要以人传人的方式，通过感染患者咳嗽或打喷嚏时产生的呼吸道飞...
2,为什么有人会因为 COVID-19 而指责或躲避某些个人和群体（制造污名）？,美国民众可能对住在或前往COVID-19传播地区的亲友感到担心焦虑。有些人很担心通过这些人感...,https://chinese.cdc.gov/coronavirus/2019-ncov/...,chinese,CDC,Frequently Asked Questions,目前，没有证据表明COVID-19病毒会从宠物的皮肤或皮毛传播给人类。\n关于照顾宠物的最佳...
3,人们如何制止与 COVID-19 相关的污名化？,在注意到污名化正在发生的情况下，人们可以通过提供社会支持来对抗污名化。污名会影响被污名化群体...,https://chinese.cdc.gov/coronavirus/2019-ncov/...,chinese,CDC,Frequently Asked Questions,洗手是保护您与您的家人远离疾病的最佳途径之一。经常用肥皂和水洗手20秒以上，尤其是擤鼻涕、咳...
4,为何某些州的COVID-19病例数量有时与CDC网站上发布的数量有差别？,CDC发布的总病例数量依照各辖区的确认程序得以验证。不同的地方所显示的病例数量的发现和确认程...,https://chinese.cdc.gov/coronavirus/2019-ncov/...,chinese,CDC,Frequently Asked Questions,\n\n与孩子的学校保持联系。\n\n很多学校提供网络课程（虚拟学习）。查看学校布置的作业，...


In [5]:
# Count missing values per column
df.isna().sum()

question        1
answer          0
url             0
language        0
source          0
section         0
wrong_answer    0
dtype: int64

In [6]:
# Drop every row that has a missing value, then renumber the index from 0.
# Later cells identify rows by position (the search-index item id); without the
# reset, the gap left by the dropped row makes index labels and row positions
# disagree for every row after it.
df = df.dropna(axis=0).reset_index(drop=True)

In [7]:
# Embed every question with the Cohere "large" model. truncate="LEFT" is passed
# through to the API to control how over-long inputs are shortened.
embeds = co.embed(
    texts=df['question'].tolist(),
    model="large",
    truncate="LEFT",
).embeddings

In [8]:
# Turn the list of embedding vectors into a NumPy array and display its
# (number of questions, embedding size) shape
embeds = np.asarray(embeds)
embeds.shape

(887, 4096)

In [9]:
# Build an Annoy index over the question embeddings using the 'angular' metric.
# The index is constructed with the embedding vector length.
n_dims = embeds.shape[1]
search_index = AnnoyIndex(n_dims, 'angular')

# Register each vector under its row position in `embeds`; those positions are
# the item ids that nearest-neighbour queries return.
for item_id, vector in enumerate(embeds):
    search_index.add_item(item_id, vector)

search_index.build(10)  # 10 trees
search_index.save('test.ann')

True

In [10]:
# Choose an example by its row position (we'll retrieve others similar to it).
# The position doubles as the item's id in the search index.
example_id = 78

# Retrieve nearest neighbors; with include_distances=True the result is a pair
# (item ids, distances)
similar_item_ids = search_index.get_nns_by_item(example_id,10,
                                                include_distances=True)
# Format and print the text and distances.
# Rows are looked up by position (.iloc), but DataFrame.drop works on index
# labels, so the example's own row is removed via its label df.index[example_id]
# rather than the raw position. errors='ignore' keeps the cell from failing if
# the example itself is not among the returned neighbors.
results = pd.DataFrame(data={'texts': df.iloc[similar_item_ids[0]]['question'],
                             'distance': similar_item_ids[1]}).drop(df.index[example_id],
                                                                    errors='ignore')

print(f"Question:'{df.iloc[example_id]['question']}'\nNearest neighbors:")
results

Question:'Why is the disease being called coronavirus disease 2019, COVID-19?'
Nearest neighbors:


Unnamed: 0,texts,distance
559,What is COVID-19?,0.708529
232,¿Por qué a la enfermedad la llaman enfermedad ...,0.813518
77,What is a novel coronavirus?,0.813775
558,What is a coronavirus?,0.842466
569,Is COVID-19 the same as SARS?,0.852085
1,为什么这种疾病被称为 2019 年新型冠状病毒肺炎（COVID-19）？,0.872134
578,Is the source of the coronavirus causing COVID...,0.923686
473,什么是2019冠状病毒病？,0.951086
804,¿Qué es la COVID-19?,0.960803


In [11]:
query = "what ages are more prone to die from covid"

# Embed the free-text query with the same model and settings used for the questions
query_embed = co.embed(
    texts=[query],
    model="large",
    truncate="LEFT",
).embeddings

# Look up the 3 closest indexed questions; the result is a pair (item ids, distances)
similar_item_ids = search_index.get_nns_by_vector(
    query_embed[0], 3, include_distances=True
)

# Pair each neighbor's question text (looked up by row position) with its distance
neighbor_ids, neighbor_distances = similar_item_ids
results = pd.DataFrame(data={
    'texts': df.iloc[neighbor_ids]['question'],
    'distance': neighbor_distances,
})

print(f"Query:'{query}'\nNearest neighbors:")
results

Query:'what ages are more prone to die from covid'
Nearest neighbors:


Unnamed: 0,texts,distance
131,Who is at higher risk for serious illness from...,0.724839
133,How were the underlying conditions for people ...,0.951346
565,Who is at risk of developing severe illness?,0.977805
