In [1]:
import re

import pandas as pd
from sentence_transformers import SentenceTransformer, util

# Identifier of the pretrained sentence-embedding checkpoint used by every cell below.
MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(MODEL_NAME)

In [2]:
# Location of the Feather file, resolved relative to the current working directory.
DATASET_PATH = "./dataset.feather"

# Read the whole table into a DataFrame.
data = pd.read_feather(DATASET_PATH)

# Preview the first rows.
data.head()

Unnamed: 0,questions,answers,url,name
0,What is an abdominal aortic aneurysm?,Your aorta carries oxygen-rich blood away from...,https://www.bhf.org.uk/informationsupport/cond...,abdominal-aortic-aneurysm
1,What are the symptoms of an abdominal aortic a...,In most cases there are no noticeable symptoms...,https://www.bhf.org.uk/informationsupport/cond...,abdominal-aortic-aneurysm
2,What are the causes of abdominal aortic aneury...,It’s not known exactly what causes abdominal a...,https://www.bhf.org.uk/informationsupport/cond...,abdominal-aortic-aneurysm
3,How is an abdominal aortic aneurysm diagnosed?,An AAA is sometimes picked up during tests for...,https://www.bhf.org.uk/informationsupport/cond...,abdominal-aortic-aneurysm
4,When should I get an abdominal aortic aneurysm...,Abdominal aortic aneurysm is far more common i...,https://www.bhf.org.uk/informationsupport/cond...,abdominal-aortic-aneurysm


In [3]:
# Collect the distinct question texts.
# Note: a position in this array is not a row label of `data` whenever a
# question occurs more than once in the table.
question_column = data["questions"]
questions = question_column.unique()

# Peek at the first question.
questions[0]

'What is an abdominal aortic aneurysm?'

In [4]:
# Quick text normalisation before embedding -- not production-grade sanitisation.
#
# The earlier version removed the bare substring 'br', which also mangled
# ordinary words ("breathless" -> "eathless"), and tried to strip '\/', an
# invalid escape sequence that could never match because every backslash had
# already been removed. Only genuine line-break tags are stripped now.

# Matches <br>, <br/>, <br />, </br> and spacing variants (input is lower-cased first).
_BR_TAG = re.compile(r'<\s*/?\s*br\s*/?\s*>')


def clean_question(text):
    """Return `text` lower-cased with HTML line-break residue removed.

    Steps, in order:
      1. lower-case the text;
      2. drop backslashes, so an escaped tag such as ``<br\\/>`` becomes ``<br/>``;
      3. drop line-break tags;
      4. drop any angle brackets that are still left.

    NOTE(review): that the raw questions contain such tags is inferred from the
    original clean-up chain -- confirm against the dataset.
    """
    lowered = text.lower().replace('\\', '')
    without_tags = _BR_TAG.sub('', lowered)
    return without_tags.replace('<', '').replace('>', '')


sentences = [clean_question(sentence) for sentence in questions]


# Show one cleaned sentence and the size of the corpus.
print(sentences[5:6], f'\n\nLength Of Data {len(sentences)}')

['how can an abdominal aortic aneurysm be treated?'] 

Length Of Data 569


In [5]:
# The question worded as in the dataset; kept for reference.
our_sentence_origine = "What are the symptoms of an abdominal aortic aneurysm?"

# A reworded version of that question.
our_sentence = (
    "What signs and symptoms indicate the presence of "
    "an abdominal aortic aneurysm?"
)

In [6]:
# Number of best-matching corpus questions to report.
TOP_K = 2

# Embed the query sentence.
my_embedding = model.encode(our_sentence)

# Embed every cleaned corpus question.
embeddings = model.encode(sentences)

# Cosine similarity between the query and each corpus sentence.
cos_sim = util.cos_sim(my_embedding, embeddings)

# One [sentence, score, position] entry per corpus sentence; `position` indexes
# both `sentences` and `questions`, which share the same order.
winners = [
    [sentences[i], each_val, i]
    for arr in cos_sim
    for i, each_val in enumerate(arr)
]

# Highest similarity first (sorted() is stable, so ties keep corpus order).
final_winners = sorted(winners, key=lambda entry: entry[1], reverse=True)

for question_text, score, position in final_winners[:TOP_K]:
    # `position` refers to the de-duplicated `questions` array, not to a row
    # label of `data`, so the answer is found through the original question
    # text. If a question appears several times, its first answer is shown.
    original_question = questions[position]
    matching_answers = data.loc[data["questions"] == original_question, "answers"]

    print(f'\nScore : \n\n  {score}')
    print(f'\nLa question : \n\n {question_text}')
    print(f'\nLa réponse : \n\n {matching_answers.iloc[0]}')


Score : 

  0.9682747721672058

La question : 

 what are the symptoms of an abdominal aortic aneurysm?

La réponse : 

 In most cases there are no noticeable symptoms and most people with AAA won’t be aware that they have one. However, if an aneurysm becomes large it can cause:

- A pulsating feeling in your stomach
- Pain in your stomach
- Persistent back pain.

A large aneurysm can burst and cause internal bleeding. This is a medical emergency and you should call 999 immediately if you or someone with you has any of these symptoms:

- A sudden severe pain in the abdomen, back or lower back area
- Feeling cold, clammy, sweaty, faint and breathless
- Fainting or passing out.

Score : 

  0.8461344242095947

La question : 

 how is an abdominal aortic aneurysm diagnosed?

La réponse : 

 An AAA is sometimes picked up during tests for other conditions, commonly by:

- Ultrasound
- CT scan
- MRI scan.
