In [61]:
import pandas as pd 
import numpy as np
import re
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

## Датасет medQUAD

0. Формируем структуру датасета
1. Парсим контекст с сайта
2. Находим в контексте токен, с которого начинается ответ на вопрос
3. Переводим все текстовые поля на русский язык 

In [62]:
medQUAD = pd.read_csv('./MedQuAD/QA-TestSet-LiveQA-Med-Qrels-2479-Answers/All-2479-Answers-retrieved-from-MedQuAD.csv')
medQUAD.columns

Index(['AnswerID', 'Answer'], dtype='object')

In [63]:
# Формируем структуру датасета
split_words = ["URL:", "Answer:"]
pattern = re.compile("|".join(map(re.escape, split_words)))

result_list = []
for row in medQUAD.Answer.values:
    result = re.split(pattern, row)
    result = [item.strip() for item in result if item.strip()]
    if len(result) == 4:
        print(result)
    else: 
        result_list.append(result)


df = pd.DataFrame(result_list, columns=['Question', 'URL', 'Answer'])

['Question: What is (are) Vaginal dryness alternative treatments ? (Also called: Alternative treatments for vaginal dryness)', 'https://www.nlm.nih.gov/medlineplus/ency/article/002142.htm', 'Question:   Is there a drug-free treatment for vaginal dryness?', 'There are many causes of vaginal dryness. It may be caused by reduced estrogen levels, infection, medicines, and other things. Before treating yourself, talk to your health care provider.   Water-based lubricants and vaginal moisturizers work very well. Lubricants will moisten the vaginal opening and lining for several hours. The effects of a vaginal cream can last for up to a day.  Soybeans contain plant-based substances called isoflavones. These substances have an effect of the body that is similar to estrogen, but weaker. Therefore, it would seem that a diet rich in soy foods would improve symptoms of vaginal dryness. There continues to be research in this area, but the ideal sources or dose is still unknown. Soy foods include to

In [None]:
# парсим данные
def scrape_content(url):
    try:
        response = requests.get(url)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        main_divs = soup.find_all("div", class_="main")

        text_content = ""
        for main_div in main_divs:
            text_content += main_div.get_text()

        return text_content
    except requests.exceptions.RequestException as e:
        print(f"Error retrieving content for {url}: {e}")
        return "no content"

def scrap_context_parallel(df):
    urls = df.URL.values

    # Use ThreadPoolExecutor to retrieve data in parallel
    with ThreadPoolExecutor() as executor:
        contents = list(executor.map(scrape_content, urls))

    df['context'] = contents
    return df

df = scrap_context_parallel(df)

In [26]:
def clean_text(text):
    cleaned_text = '\n'.join(line.strip() for line in text.splitlines() if line.strip())
    cleaned_text = ' '.join(cleaned_text.split())
    return cleaned_text

In [27]:
# remove unnesessary information 

def remove_ref(s):
    s = s.split('Reference')[0]
    return s

def remove_resources(s):
    s = s.split('Resources')[0]
    return s

def alternative_names(s):
    s = s.split('Alternative Names')[0]
    return s

def add_info(s):
    s = s.split('Additional Information')[0]
    return s

In [None]:
df['context'] = df['context'].apply(clean_text)
df['context'] = df['context'].apply(alternative_names)
df['context'] = df['context'].apply(add_info)
df['context'] = df['context'].apply(remove_ref)
df['context'] = df['context'].apply(remove_resources)

## Translate into russian

In [11]:
def identify_answer_start(df):
    df['Answer_Start_Index'] = df.apply(lambda row: row['context'].find(row['Answer'][:4]), axis=1)

    return df

df = identify_answer_start(df)

In [None]:
from deep_translator import GoogleTranslator

df_ru = df.copy()

def translate(s):
  return GoogleTranslator('en', 'ru').translate()

# not allowed to pass very long strings for translation
df_ru = df_ru.loc[df_ru.Answer.str.len() < 5000]
df_ru = df_ru.loc[df_ru.Question.str.len() < 5000]
df_ru = df_ru.loc[df_ru.context.str.len() < 5000]

df_ru['Question'] = df_ru.Question.apply(translate)
df_ru['Answer'] = df_ru.Answer.apply(translate)
df_ru['context'] = df_ru.context.apply(translate)

In [55]:
df_ru.to_csv('./MedQuAD_Russian.csv')

## Parse all the files in folders 


In [20]:
import os
import xml.etree.ElementTree as ET
import pandas as pd

def parse_xml_files(folder_path="./MedQuAD"):
    dataset = {'question': [], 'answer': [], 'source': []}
    skipped_count = 0

    # Iterate through each file in the specified folder
    for subdir, dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith(".xml"):
                file_path = os.path.join(subdir, file)

                # Parse the XML file
                tree = ET.parse(file_path)
                root = tree.getroot()

                # Check for the existence of required elements
                focus = root.find(".//Focus")
                qa_pairs = root.findall(".//QAPair")

                if focus is None or not qa_pairs:
                    skipped_count += 1
                    # print(f"Skipped: {file_path}")
                    continue

                source_url = root.get('url')

                for qa_pair in qa_pairs:
                    question = qa_pair.find("Question").text
                    answer = qa_pair.find("Answer").text

                    # Check if question or answer is missing
                    if question is None or answer is None:
                        skipped_count += 1
                        print(f"Skipped: {file_path}")
                        continue

                    # Append data to the dataset
                    dataset['question'].append(question.strip())
                    dataset['answer'].append(answer.strip())
                    dataset['source'].append(source_url)

    # Convert the dataset to a Pandas DataFrame
    df = pd.DataFrame(dataset)
    print(f"Skipped {skipped_count} documents.")
    return df

# Example usage
result_dataset = parse_xml_files()
print(result_dataset)

Skipped: ./MedQuAD/2_GARD_QA/0002079.xml
Skipped: ./MedQuAD/2_GARD_QA/0002253.xml
Skipped: ./MedQuAD/2_GARD_QA/0002080.xml
Skipped: ./MedQuAD/2_GARD_QA/0006509.xml
Skipped: ./MedQuAD/2_GARD_QA/0002747.xml
Skipped: ./MedQuAD/12_MPlusHerbsSupplements_QA/0000029.xml
Skipped: ./MedQuAD/12_MPlusHerbsSupplements_QA/0000029.xml
Skipped: ./MedQuAD/12_MPlusHerbsSupplements_QA/0000029.xml
Skipped: ./MedQuAD/12_MPlusHerbsSupplements_QA/0000029.xml
Skipped: ./MedQuAD/12_MPlusHerbsSupplements_QA/0000029.xml
Skipped: ./MedQuAD/12_MPlusHerbsSupplements_QA/0000029.xml
Skipped: ./MedQuAD/12_MPlusHerbsSupplements_QA/0000029.xml
Skipped: ./MedQuAD/12_MPlusHerbsSupplements_QA/0000029.xml
Skipped: ./MedQuAD/12_MPlusHerbsSupplements_QA/0000001.xml
Skipped: ./MedQuAD/12_MPlusHerbsSupplements_QA/0000001.xml
Skipped: ./MedQuAD/12_MPlusHerbsSupplements_QA/0000001.xml
Skipped: ./MedQuAD/12_MPlusHerbsSupplements_QA/0000001.xml
Skipped: ./MedQuAD/12_MPlusHerbsSupplements_QA/0000001.xml
Skipped: ./MedQuAD/12_MPlusH

In [21]:
result_dataset.to_csv('./part_2.csv')

In [7]:
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urlparse

def scrape_content(url, skip_domains=set()):
    domain = urlparse(url).netloc
    if domain in skip_domains:
        print(f"Skipping {url} due to previous failure for domain: {domain}")
        return "no content"

    try:
        response = requests.get(url)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        main_divs = soup.find_all("div", class_="main")

        text_content = ""
        for main_div in main_divs:
            text_content += main_div.get_text()

        return text_content
    except requests.exceptions.RequestException as e:
        print(f"Error retrieving content for {url}: {e}. Skipping domain: {domain}")
        skip_domains.add(domain)
        return "no content"

def scrap_context_parallel(df):
    urls = df.source.values
    skip_domains = set()

    # Use ThreadPoolExecutor to retrieve data in parallel
    with ThreadPoolExecutor() as executor:
        contents = list(executor.map(lambda url: scrape_content(url, skip_domains), urls))

    df['context'] = contents
    return df

result_dataset = scrap_context_parallel(result_dataset)

Error retrieving content for https://ghr.nlm.nih.gov/condition/keratoderma-with-woolly-hair: HTTPSConnectionPool(host='medlineplus.gov', port=443): Max retries exceeded with url: /genetics/condition/keratoderma-with-woolly-hair (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7fe1927cf2e0>: Failed to resolve 'medlineplus.gov' ([Errno 8] nodename nor servname provided, or not known)")). Skipping domain: ghr.nlm.nih.govError retrieving content for https://ghr.nlm.nih.gov/condition/coloboma: HTTPSConnectionPool(host='medlineplus.gov', port=443): Max retries exceeded with url: /genetics/condition/coloboma (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7fe1927cfa60>: Failed to resolve 'medlineplus.gov' ([Errno 8] nodename nor servname provided, or not known)")). Skipping domain: ghr.nlm.nih.gov
Skipping https://ghr.nlm.nih.gov/condition/coloboma due to previous failure for domain: ghr.nlm.nih.gov
Skipping https://ghr.nlm.nih.go

In [17]:
result_dataset = result_dataset.loc[(result_dataset['context']!='no content')&(result_dataset['context'].notnull())&(result_dataset['context']!='')]

In [23]:
result_dataset = pd.read_csv('./part_2_parsed.csv')

In [28]:
result_dataset['context'] = result_dataset['context'].apply(clean_text)
result_dataset['context'] = result_dataset['context'].apply(alternative_names)
result_dataset['context'] = result_dataset['context'].apply(add_info)
result_dataset['context'] = result_dataset['context'].apply(remove_ref)
result_dataset['context'] = result_dataset['context'].apply(remove_resources)

In [31]:
result_dataset = result_dataset.loc[result_dataset['question'].str.len() < 5000]
result_dataset = result_dataset.loc[result_dataset['answer'].str.len() < 5000]
result_dataset = result_dataset.loc[result_dataset['context'].str.len() < 5000]

In [32]:
result_dataset

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,question,answer,source,context
0,0,0,What is (are) keratoderma with woolly hair ?,Keratoderma with woolly hair is a group of rel...,https://ghr.nlm.nih.gov/condition/keratoderma-...,Description Keratoderma with woolly hair is a ...
1,1,1,How many people are affected by keratoderma wi...,Keratoderma with woolly hair is rare; its prev...,https://ghr.nlm.nih.gov/condition/keratoderma-...,Description Keratoderma with woolly hair is a ...
2,2,2,What are the genetic changes related to kerato...,"Mutations in the JUP, DSP, DSC2, and KANK2 gen...",https://ghr.nlm.nih.gov/condition/keratoderma-...,Description Keratoderma with woolly hair is a ...
3,3,3,Is keratoderma with woolly hair inherited ?,Most cases of keratoderma with woolly hair hav...,https://ghr.nlm.nih.gov/condition/keratoderma-...,Description Keratoderma with woolly hair is a ...
4,4,4,What are the treatments for keratoderma with w...,These resources address the diagnosis or manag...,https://ghr.nlm.nih.gov/condition/keratoderma-...,Description Keratoderma with woolly hair is a ...
...,...,...,...,...,...,...
849,13920,13920,Do you have information about Women's Health,Summary : Women have unique health issues. And...,https://www.nlm.nih.gov/medlineplus/womensheal...,On this page Basics Summary Start Here Prevent...
850,13921,13921,What is (are) Rotator Cuff Injuries ?,Your rotator cuff is located in your shoulder ...,https://www.nlm.nih.gov/medlineplus/rotatorcuf...,On this page Basics Summary Diagnosis and Test...
851,13922,13922,What is (are) Viral Infections ?,Viruses are capsules with genetic material ins...,https://www.nlm.nih.gov/medlineplus/viralinfec...,On this page Basics Summary Start Here Diagnos...
852,13923,13923,What is (are) Salivary Gland Disorders ?,Your salivary glands make saliva - sometimes c...,https://www.nlm.nih.gov/medlineplus/salivarygl...,On this page Basics Summary Learn More Living ...


In [37]:
from deep_translator import GoogleTranslator

def translate(s):
  return GoogleTranslator('en', 'ru').translate(s)

result_dataset['Question'] = result_dataset.question.apply(translate)
result_dataset['Answer'] = result_dataset.answer.apply(translate)
result_dataset['context'] = result_dataset.context.apply(translate)

In [39]:
def identify_answer_start(df):
    df['Answer_Start_Index'] = df.apply(lambda row: row['context'].find(row['Answer'][:4]), axis=1)

    return df

result_dataset = identify_answer_start(result_dataset)

In [41]:
result_dataset = result_dataset.loc[:, ['context', 'Question', 'Answer', 'Answer_Start_Index']]

In [43]:
result_dataset.to_csv('./MedQuAD_Russian_part_2.csv')

In [42]:
result_dataset

Unnamed: 0,context,Question,Answer,Answer_Start_Index
0,Описание Кератодермия с пушистыми волосами пре...,Что такое кератодермия с пушистыми волосами?,Кератодермия с пушистыми волосами представляет...,9
1,Описание Кератодермия с пушистыми волосами пре...,Сколько людей с шерстяными волосами страдают к...,Кератодермия с пушистыми волосами встречается ...,9
2,Описание Кератодермия с пушистыми волосами пре...,"Каковы генетические изменения, связанные с кер...","Мутации в генах JUP, DSP, DSC2 и KANK2 вызываю...",2551
3,Описание Кератодермия с пушистыми волосами пре...,Передается ли кератодермия с пушистыми волосами?,Большинство случаев кератодермии с курчавыми в...,4573
4,Описание Кератодермия с пушистыми волосами пре...,Каковы методы лечения кератодермии с пушистыми...,Эти ресурсы посвящены диагностике или лечению ...,-1
...,...,...,...,...
849,На этой странице Основные сведения Начни здесь...,Есть ли у вас информация о женском здоровье?,Резюме: Женщины имеют уникальные проблемы со з...,-1
850,На этой странице Основные сведения Диагностика...,Что такое (являются) травмы вращательной манже...,Вращающая манжета расположена в области плеча....,-1
851,На этой странице Основные сведения Начать здес...,Что такое (являются) вирусные инфекции?,Вирусы представляют собой капсулы с генетическ...,-1
852,На этой странице Краткое изложение основ Узнай...,Что такое (являются) заболевания слюнных желез?,"Слюнные железы вырабатывают слюну, которую ино...",-1


In [44]:
df = pd.read_csv('./MedQuAD_Russian.csv')

In [46]:
df.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'Question', 'URL', 'Answer', 'context',
       'Answer_Start_Index'],
      dtype='object')

In [53]:
df = df.loc[:, ['context', 'Question', 'Answer', 'Answer_Start_Index']]

In [54]:
result_dataset = pd.read_csv('./MedQuAD_Russian_part_2.csv')

In [55]:
result_dataset.columns

Index(['Unnamed: 0', 'context', 'Question', 'Answer', 'Answer_Start_Index'], dtype='object')

In [57]:
df = pd.concat([result_dataset.loc[:, ['context', 'Question', 'Answer', 'Answer_Start_Index']], df])

In [59]:
df.to_csv('./MedQuAD.csv')

In [60]:
df

Unnamed: 0,context,Question,Answer,Answer_Start_Index
0,Описание Кератодермия с пушистыми волосами пре...,Что такое кератодермия с пушистыми волосами?,Кератодермия с пушистыми волосами представляет...,9
1,Описание Кератодермия с пушистыми волосами пре...,Сколько людей с шерстяными волосами страдают к...,Кератодермия с пушистыми волосами встречается ...,9
2,Описание Кератодермия с пушистыми волосами пре...,"Каковы генетические изменения, связанные с кер...","Мутации в генах JUP, DSP, DSC2 и KANK2 вызываю...",2551
3,Описание Кератодермия с пушистыми волосами пре...,Передается ли кератодермия с пушистыми волосами?,Большинство случаев кератодермии с курчавыми в...,4573
4,Описание Кератодермия с пушистыми волосами пре...,Каковы методы лечения кератодермии с пушистыми...,Эти ресурсы посвящены диагностике или лечению ...,-1
...,...,...,...,...
1448,Описание Дефицит аденозинмонофосфат-дезаминазы...,Вопрос: Что такое дефицит аденозинмонофосфатде...,Дефицит аденозинмонофосфат (АМФ) дезаминазы — ...,12
1449,Описание Дефицит аденозинмонофосфат-дезаминазы...,Вопрос: Сколько людей страдают от дефицита аде...,Дефицит АМФ-дезаминазы является одним из наибо...,174
1450,Описание Дефицит аденозинмонофосфат-дезаминазы...,"Вопрос: Каковы генетические изменения, связанн...",Мутации в гене AMPD1 вызывают дефицит AMP-деза...,1495
1451,Описание Дефицит аденозинмонофосфат-дезаминазы...,Вопрос: Наследуется ли дефицит аденозинмонофос...,Это состояние наследуется по аутосомно-рецесси...,1346
