In [22]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
from bs4 import BeautifulSoup
import requests
from selenium.webdriver.chrome.options import Options
import json
import re
import pandas as pd
import csv
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException


In [23]:
# Collect links to Radiopaedia cases that list neither CT nor MRI among their
# modalities, scanning the first `total_pages` pages of the case search.
cases_links = []       # absolute URLs of the cases that passed the filter
all_span_tuples = []   # modality labels of every case seen (one tuple per case)

total_pages = 2  # number of Radiopaedia search-result pages to scan

for page in range(1, total_pages + 1):
    url = f'https://radiopaedia.org/search?lang=us&page={page}&scope=cases&sort=date_of_publication'

    # A timeout keeps a stalled connection from hanging the notebook, and the
    # status check keeps an error page from being parsed as if it were results.
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as err:
        print(f'Skipping search page {page}: {err}')
        continue

    soup = BeautifulSoup(response.content, 'html.parser')

    # Look the modalities up inside each result anchor instead of zipping two
    # independently selected lists: with zip, a single result lacking a
    # modalities block would shift every following pairing by one.
    # NOTE(review): assumes the modalities div is nested inside the
    # `a.search-result-case` anchor, as the previous CSS selector implied.
    for element in soup.find_all('a', class_='search-result-case'):
        modalities_div = element.select_one('div.search-result-modalities')
        spans = modalities_div.find_all('span') if modalities_div is not None else []
        span_texts = [span.text.strip() for span in spans]
        all_span_tuples.append(tuple(span_texts))

        link = element.get('href')
        # Skip anchors without an href rather than failing on `str + None`.
        if link and 'CT' not in span_texts and 'MRI' not in span_texts:
            cases_links.append('https://radiopaedia.org' + link)

In [24]:
# Start a headless Chrome session shared by the Selenium cells below.
options = Options()
# The `headless` property was deprecated in Selenium 4.8 and later removed;
# on recent versions assigning it has no effect and a visible browser opens.
# Passing the Chrome flag directly works across Selenium versions.
options.add_argument('--headless=new')

driver = webdriver.Chrome(options=options)

In [25]:
def extrair_dados(url, timeout=30):
    """Download a case page and gather the text of its case sections.

    For every ``div.case-section`` on the page, the text of each ``<p>`` is
    collected, followed by one "label value" string per ``div.data-item``.

    Args:
        url: Address of the case page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``{'url': [url], 'text': joined_text}`` where ``joined_text`` is all
        collected pieces joined with ``', '`` (or ``'No data found'`` when the
        page yielded nothing). Returns ``None`` if the request failed; the
        error is printed rather than raised.
    """
    try:
        # Without a timeout a stalled server would block forever and the
        # Timeout handler below could never run.
        response = requests.get(
            url,
            headers={'Content-Type': 'text/html; charset=utf-8'},
            timeout=timeout,
        )
        response.raise_for_status()
    except requests.exceptions.HTTPError as errh:
        print("HTTP Error:", errh)
        return
    except requests.exceptions.ConnectionError as errc:
        print("Error Connecting:", errc)
        return
    except requests.exceptions.Timeout as errt:
        print("Timeout Error:", errt)
        return
    except requests.exceptions.RequestException as err:
        print("Oops! Something went wrong:", err)
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    textos = []

    for div_element in soup.find_all('div', class_='case-section'):
        # Free-text paragraphs of the section.
        for p_element in div_element.find_all('p'):
            textos.append(p_element.get_text(strip=True))

        # Labelled data items of the section.
        for item in div_element.find_all('div', class_='data-item'):
            item_text = item.get_text(strip=True, separator=' ')
            label_tag = item.find('strong', class_='data-item-label')
            if label_tag is None:
                # No label element: keep the item's text instead of crashing
                # on `None.get_text`.
                if item_text:
                    textos.append(item_text)
                continue

            label = label_tag.get_text(strip=True)
            # Removing the label leaves a leading separator space; strip it so
            # the result does not contain a doubled space.
            valor = item_text.replace(label, '', 1).strip()
            textos.append(f'{label} {valor}')

    if not textos:
        textos.append('No data found')

    # Merge every collected piece into a single string.
    return {'url': [url], 'text': ', '.join(textos)}

# Extract the text of every selected case and store it as one row per case.
# Frames are gathered in a list and concatenated once: growing a DataFrame
# with concat inside the loop copies all previous rows on every iteration.
case_frames = []

for url in cases_links:
    extracted_data = extrair_dados(url)
    if extracted_data is None:
        # The download failed and was already reported; skip this case.
        continue
    case_frames.append(pd.DataFrame(extracted_data))

if case_frames:
    final_df = pd.concat(case_frames, ignore_index=True)
else:
    # Keep the expected columns even when nothing could be extracted.
    final_df = pd.DataFrame({'url': [], 'text': []})

csv_filename = 'textos.csv'
final_df.to_csv(csv_filename, index=False)
print(f'DataFrame saved to {csv_filename}')

DataFrame saved to textos.csv


In [26]:
# For every selected case, open the page in the browser, read the inline
# `var stackedImages = ...` script and write one CSV row per image.
with open('dados_cases.csv', 'w', newline='', encoding='utf-8') as csvfile:

    fieldnames = ['public_filename', 'plane_projection', 'url']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()

    json_decoder = json.JSONDecoder()

    for case in cases_links:
        print(case)
        try:
            driver.get(case)

            div_element = driver.find_element(By.CLASS_NAME, 'fa-clickable')
            # Click through JavaScript rather than WebElement.click().
            driver.execute_script("arguments[0].click();", div_element)
            print(div_element)

            script_element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, '//script[contains(text(), "var stackedImages")]'))
            )

            codigo_js = script_element.get_attribute('text') or ''

            # Find where the assigned value starts, then let the JSON decoder
            # determine where it ends. A lazy `(.*?);` regex would cut the
            # value at the first semicolon, including one inside a string.
            match = re.search(r'var stackedImages\s*=\s*', codigo_js)

            if match:
                data, _ = json_decoder.raw_decode(codigo_js, match.end())

                # Build every row first so a malformed entry does not leave a
                # case half-written in the CSV.
                image_info = [
                    {
                        'public_filename': image['public_filename'],
                        'plane_projection': image['plane_projection'],
                        'url': case,
                    }
                    for entry in data
                    for image in entry['images']
                ]

                for info in image_info:
                    writer.writerow(info)

                    print(f"Public Filename: {info['public_filename']}")
                    print(f"Plane Projection: {info['plane_projection']}")
                    print('-' * 30)
            else:
                print("Pattern not found.")

        except (TimeoutException, NoSuchElementException) as e:
            print(f"Error: {e}")

        except (json.JSONDecodeError, KeyError, TypeError) as e:
            # Unexpected script payload: report it and move on to the next
            # case instead of aborting the whole run.
            print(f"Could not parse image data for {case}: {e!r}")

        finally:
            # Pause between cases to avoid hammering the site.
            time.sleep(2)
            # NOTE(review): the next iteration navigates with driver.get(), so
            # this reload looks redundant - confirm before removing.
            driver.refresh()


https://radiopaedia.org/cases/situs-inversus-totalis-22?lang=us
<selenium.webdriver.remote.webelement.WebElement (session="3721734daf3763ac6895d89b165a219b", element="437A405AA111D58CF245045AE2B424DD_element_117")>
Public Filename: https://prod-images-static.radiopaedia.org/images/64141176/f172cae3c2d99bc31da567e65eb0eb9f1e6cd4aa657ea3d40efb069e2f4aa2bc.png
Plane Projection: Frontal
------------------------------
https://radiopaedia.org/cases/schatzker-type-v-tibial-plateau-fracture-with-head-of-fibula-fracture-7?lang=us
<selenium.webdriver.remote.webelement.WebElement (session="3721734daf3763ac6895d89b165a219b", element="57AAE0EEB384ED176C6AC424AB34D9D1_element_192")>
Public Filename: https://prod-images-static.radiopaedia.org/images/64056188/3bf9445a137d94a6d3189589bc3828fbe3a1777c25f221f3e1c708ed74fc98e2.png
Plane Projection: Frontal
------------------------------
Public Filename: https://prod-images-static.radiopaedia.org/images/64056187/0859cc09468e6b7636913fd80bf9939ed633ffa1e0b1

In [27]:
# Combine the per-case text with the per-image metadata into a single table.
# Columns kept in the final output, in this order.
output_columns = ['url', 'text', 'public_filename', 'plane_projection']

# Read both intermediate CSV files produced by the previous cells.
dados_df = pd.read_csv('dados_cases.csv')
textos_df = pd.read_csv('textos.csv')

# Left join on 'url': every case with text is kept, repeated once per image,
# and cases without image rows get empty image columns.
merged_df = textos_df.merge(dados_df, how='left', on='url')

# Restrict the merged table to the output columns.
final_df = merged_df.loc[:, output_columns]

# Persist the combined table.
final_df.to_csv('dados_completos.csv', index=False)
