In [1]:
import re

import pandas as pd
from sentence_transformers import SentenceTransformer, util

# Name of the pretrained sentence-embedding checkpoint used throughout the notebook.
MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(MODEL_NAME)

In [2]:
# Load the question/answer dataset from the JSON file at ../../dataset/dataset_5Q.json
dataset_path = '../../dataset/dataset_5Q.json'
data = pd.read_json(dataset_path)
data.head()

Unnamed: 0,q_EN,qs_EN,a_EN,q_FR,qs_FR,a_FR,URL,Name
0,What is an abdominal aortic aneurysm?,"[Define an abdominal aortic aneurysm., How wou...",Your aorta carries oxygen-rich blood away from...,Qu'est-ce qu'un anévrisme de l'aorte abdominale ?,,L'aorte transporte le sang riche en oxygène du...,https://www.bhf.org.uk/informationsupport/cond...,abdominal-aortic-aneurysm
1,What are the symptoms of an abdominal aortic a...,[List the symptoms associated with an abdomina...,In most cases there are no noticeable symptoms...,Quels sont les symptômes d'un anévrisme de l'a...,,"Dans la plupart des cas, aucun symptôme n'est ...",https://www.bhf.org.uk/informationsupport/cond...,abdominal-aortic-aneurysm
2,What are the causes of abdominal aortic aneury...,[Enumerate the factors responsible for causing...,It’s not known exactly what causes abdominal a...,Quelles sont les causes de l'anévrisme de l'ao...,,On ne connaît pas exactement les causes de l'a...,https://www.bhf.org.uk/informationsupport/cond...,abdominal-aortic-aneurysm
3,How is an abdominal aortic aneurysm diagnosed?,[Outline the diagnostic methods used for detec...,An AAA is sometimes picked up during tests for...,Comment diagnostiquer un anévrisme de l'aorte ...,,Un AAA est parfois détecté lors d'examens visa...,https://www.bhf.org.uk/informationsupport/cond...,abdominal-aortic-aneurysm
4,When should I get an abdominal aortic aneurysm...,[What is the recommended timing for undergoing...,Abdominal aortic aneurysm is far more common i...,Quand dois-je faire un dépistage de l'anévrism...,,L'anévrisme de l'aorte abdominale est beaucoup...,https://www.bhf.org.uk/informationsupport/cond...,abdominal-aortic-aneurysm


In [3]:
# Collect, for every row, the canonical English question and its list of paraphrases.
# NOTE: the questions are deliberately NOT deduplicated. `questions` is zipped with
# `liste_questions` and the resulting positions are later used to look up rows of
# data["a_EN"], so it must keep exactly one entry per row; `.unique()` would
# silently shift every pairing after the first duplicate.
questions = data["q_EN"].to_numpy()
liste_questions = data["qs_EN"]

print(liste_questions[0][0])

print(questions[0])

Define an abdominal aortic aneurysm.
What is an abdominal aortic aneurysm?


In [4]:
# Build one group per row: the canonical question followed by its paraphrases.
res = []
for question, paraphrases in zip(questions, liste_questions):
    res.append([question] + paraphrases)

In [5]:
# this is not production ready data!!

# Matches HTML line-break tags in any common spelling: <br>, <br/>, <br />, </br>.
# Applied after lowercasing, so no case-insensitive flag is needed.
_BR_TAG = re.compile(r'<\s*/?\s*br\s*/?\s*>')


def _clean_sentence(sentence):
    """Normalize one sentence for embedding.

    Lowercases the text, drops backslashes, removes HTML line-break tags and
    then any remaining '<' / '>' characters.

    Only the *tag* is removed: a plain ``replace('br', '')`` would also eat the
    letters "br" inside ordinary words (e.g. "breathless" -> "eathless").

    Args:
        sentence: raw question text (str).

    Returns:
        The cleaned, lowercased sentence.
    """
    # Strip backslashes first so escaped tags such as "<br\/>" become "<br/>"
    # and are caught by the tag pattern below.
    cleaned = sentence.lower().replace('\\', '')
    cleaned = _BR_TAG.sub('', cleaned)
    return cleaned.replace('<', '').replace('>', '')


# Same nested shape as `res`: one list of cleaned sentences per dataset row.
sentences = [[_clean_sentence(sentence) for sentence in sublist]
             for sublist in res]

#see a sentence, and our length
print(sentences[5:6], f'\n\nLength Of Data {len(sentences)}')


[['how can an abdominal aortic aneurysm be treated?', 'what are the treatment options available for managing an abdominal aortic aneurysm?', 'how can medical professionals treat an abdominal aortic aneurysm?', 'list the modalities used in the treatment of abdominal aortic aneurysms.', 'what are the approaches to treating an abdominal aortic aneurysm?', 'provide an overview of the treatment methods employed for addressing an abdominal aortic aneurysm.']] 

Length Of Data 565


In [8]:
# The dataset is going to be flattened, so we record which flat question ids
# belong to which answer id, in the form
# [[[id_question, id_question, ...], id_reponse], ...]

correspondance = []
compteur = 0

for id_reponse, groupe in enumerate(sentences):
    # Ids are handed out consecutively, one per sentence of the group.
    fin = compteur + len(groupe)
    correspondance.append([list(range(compteur, fin)), id_reponse])
    compteur = fin

print(correspondance[0:5])

[[[0, 1, 2, 3, 4, 5], 0], [[6, 7, 8, 9, 10, 11], 1], [[12, 13, 14, 15, 16, 17], 2], [[18, 19, 20, 21, 22, 23], 3], [[24, 25, 26, 27, 28, 29], 4]]


In [6]:
# Build a dataframe pairing each group of questions with its answer
colonnes = {'questions': sentences, 'reponses': data["a_EN"]}
df = pd.DataFrame(colonnes)

# Save the dataframe to a CSV file (without the index column)
chemin_csv = '../../dataset/dataset_5Q_traite.csv'
df.to_csv(chemin_csv, index=False)

In [9]:
# Flatten the dataset: one flat list of sentences instead of one list per row.
# The cell rebinds `sentences`, so it must be safe to re-run: entries that are
# already plain strings are kept as-is instead of being exploded into characters.
# NOTE: cells that expect the nested shape of `sentences` must run before this one.
sentences = [
    sentence
    for group in sentences
    for sentence in ([group] if isinstance(group, str) else group)
]

print(sentences[0:5])

['what is an abdominal aortic aneurysm?', 'define an abdominal aortic aneurysm.', 'how would you describe an abdominal aortic aneurysm?', 'provide an explanation of an abdominal aortic aneurysm.', 'what can you tell me about abdominal aortic aneurysms?']


In [10]:
# Reference question; presumably the canonical dataset question that the query
# below paraphrases (it is not used by the cells visible here).
our_sentence_origine = 'What are the symptoms of an abdominal aortic aneurysm?'
# Query sentence: this is the text that gets embedded and compared to the corpus.
our_sentence = 'What signs and symptoms indicate the presence of an abdominal aortic aneurysm?'

In [11]:
# Embed the query sentence, then the whole flattened corpus.
my_embedding = model.encode(our_sentence)
embeddings = model.encode(sentences)

# Cosine similarity between the query embedding and every corpus embedding.
cos_sim = util.cos_sim(my_embedding, embeddings)

# One [sentence, score, flat index] record per similarity value, walking
# every row of the similarity matrix.
winners = [
    [sentences[idx], score, idx]
    for row in cos_sim
    for idx, score in enumerate(row)
]

# Rank by score, best (highest cosine similarity) first.
final_winners = list(winners)
final_winners.sort(key=lambda entry: entry[1], reverse=True)

# Show the two best matches.
print(final_winners[0:2])

[['what signs indicate the presence of an abdominal aortic aneurysm?', tensor(0.9805), 8], ['what are the recognizable symptoms of an abdominal aortic aneurysm?', tensor(0.9719), 10]]


In [12]:
def find_value(data, x):
    """Look up the value paired with the first collection that contains `x`.

    Args:
        data: iterable of entries where ``entry[0]`` is a container of ids
            and ``entry[1]`` is the value associated with them.
        x: the id to search for.

    Returns:
        ``entry[1]`` of the first entry whose ``entry[0]`` contains `x`,
        or None when no entry contains it.
    """
    matches = (entry[1] for entry in data if x in entry[0])
    # The generator is lazy, so scanning stops at the first hit.
    return next(matches, None)

In [18]:
# Show the two best matches with their score and the answer of the matching row.
for arr in final_winners[0:2]:
    print(f'\nScore : \n\n  {arr[1]}')
    print(f'\nLa question : \n\n {arr[0]}')
    indice_rep = find_value(correspondance, arr[2])
    if indice_rep is None:
        # find_value returns None when the flat question id is not mapped to
        # any answer (e.g. `correspondance` is stale); fail with a clear
        # message instead of an obscure lookup error on None.
        raise LookupError(f'No answer id found for question id {arr[2]}')
    # `indice_rep` is a row *position* (it was produced by enumerating the
    # rows), so use positional indexing rather than a label lookup.
    print(f'\nLa réponse : \n\n {data["a_EN"].iloc[indice_rep]}')


Score : 

  0.9805396795272827

La question : 

 what signs indicate the presence of an abdominal aortic aneurysm?

La réponse : 

 In most cases there are no noticeable symptoms and most people with AAA won’t be aware that they have one. However, if an aneurysm becomes large it can cause:

- A pulsating feeling in your stomach
- Pain in your stomach
- Persistent back pain.

A large aneurysm can burst and cause internal bleeding. This is a medical emergency and you should call 999 immediately if you or someone with you has any of these symptoms:

- A sudden severe pain in the abdomen, back or lower back area
- Feeling cold, clammy, sweaty, faint and breathless
- Fainting or passing out.

Score : 

  0.9719202518463135

La question : 

 what are the recognizable symptoms of an abdominal aortic aneurysm?

La réponse : 

 In most cases there are no noticeable symptoms and most people with AAA won’t be aware that they have one. However, if an aneurysm becomes large it can cause:

- A puls

Piste d'amélioration : parmi les 5 phrases ayant la plus forte similarité cosinus, on vérifie si au moins 4 renvoient à la même question d'origine (et donc à la même réponse). Si c'est le cas, la confiance est très élevée et on retourne la réponse associée à la phrase la plus similaire ; sinon, on interroge l'IA ChatGPT.