In [None]:
# First install required packages
!pip install -q requests beautifulsoup4 pandas

import json
import logging
import os
import re
import time
from pathlib import Path
from typing import Dict, List
from urllib.parse import quote_plus, urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from google.colab import drive

# Set up logging.
# basicConfig() does nothing when the root logger already has handlers, which
# is commonly the case inside notebook kernels; force=True replaces any
# existing root handlers so that INFO-level messages are actually emitted.
logging.basicConfig(level=logging.INFO, force=True)
logger = logging.getLogger(__name__)

class MedicalSourceDownloader:
    """Download Spanish-language medical texts and sort them into categories.

    Raw documents are written under ``<base_dir>/raw/<source>/``.
    ``process_downloaded_content`` then copies each raw file into
    ``<base_dir>/processed/<category>/`` according to keyword matches.
    """

    # Category assigned when a document matches no keyword at all.
    DEFAULT_CATEGORY = 'otros'

    # Keywords (matched as lowercase substrings) that vote for each category.
    CATEGORY_KEYWORDS: Dict[str, List[str]] = {
        'anatomia': ['anatomía', 'músculo', 'hueso', 'articulación', 'estructura'],
        'fisiologia': ['fisiología', 'función', 'sistema', 'regulación', 'homeostasis'],
        'patologia': ['patología', 'enfermedad', 'síndrome', 'trastorno', 'condición'],
        'farmacologia': ['fármaco', 'medicamento', 'tratamiento', 'terapia', 'dosis'],
        'diagnostico': ['diagnóstico', 'síntoma', 'signo', 'prueba', 'evaluación'],
    }

    # Path separators, characters reserved on common filesystems, and control
    # characters (including newlines) that must not end up in a filename.
    _UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')

    def __init__(self, base_dir: str = '/content/drive/MyDrive/TFM2/medical_knowledge',
                 request_timeout: float = 30.0, request_delay: float = 2.0):
        """
        Initialize the downloader with sources and configurations.

        Args:
            base_dir: Root output directory; ``raw``, ``processed`` and
                ``metadata`` subdirectories are created inside it.
            request_timeout: Seconds to wait for each HTTP request before
                giving up, so a stalled server cannot hang the run.
            request_delay: Seconds to pause after each HTTP request.
        """
        self.base_dir = base_dir
        self.request_timeout = request_timeout
        self.request_delay = request_delay
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

        # Create directories
        os.makedirs(base_dir, exist_ok=True)
        for subdir in ['raw', 'processed', 'metadata']:
            os.makedirs(os.path.join(base_dir, subdir), exist_ok=True)

        # Source definitions
        self.sources = {
            'medlineplus': {
                'base_url': 'https://medlineplus.gov/spanish/',
                'topics_url': 'https://medlineplus.gov/spanish/healthtopics.html',
                'description': 'NIH Medical Encyclopedia in Spanish. High-quality, verified medical information.'
            },
            'fisterra': {
                'base_url': 'https://www.fisterra.com/material-educacion-pacientes/',
                'description': 'Spanish clinical practice guidelines and patient education materials.'
            },
            'scielo': {
                'base_url': 'https://scielo.isciii.es/scielo.php',
                'search_url': 'https://scielo.isciii.es/cgi-bin/wxis.exe/iah/',
                'description': 'Scientific Electronic Library Online - Spanish medical journal articles.'
            },
            'mscbs': {
                'base_url': 'https://www.sanidad.gob.es/profesionales/biblioteca/',
                'description': 'Spanish Ministry of Health library resources.'
            }
        }

    @classmethod
    def _safe_filename(cls, stem: str, max_length: int = 100) -> str:
        """Return ``stem`` made safe to use as a single filename component.

        Unsafe characters become ``_``, surrounding spaces and dots are
        removed (so ``.`` / ``..`` can never be produced), the result is
        truncated to ``max_length`` characters, and an empty result falls
        back to ``'untitled'``.
        """
        cleaned = cls._UNSAFE_FILENAME_CHARS.sub('_', stem).strip(' .')
        return cleaned[:max_length] or 'untitled'

    @classmethod
    def _best_category(cls, text: str) -> str:
        """Return the category whose keywords occur most often in ``text``.

        Each keyword counts at most once (substring test on the lowercased
        text). Ties go to the category listed first in ``CATEGORY_KEYWORDS``;
        a text with no matches gets ``DEFAULT_CATEGORY``.
        """
        lowered = text.lower()
        best_category = cls.DEFAULT_CATEGORY
        max_matches = 0
        for category, keywords in cls.CATEGORY_KEYWORDS.items():
            matches = sum(1 for keyword in keywords if keyword in lowered)
            if matches > max_matches:
                max_matches = matches
                best_category = category
        return best_category

    def _fetch_soup(self, url: str):
        """GET ``url`` and return the parsed HTML as a BeautifulSoup object.

        Raises whatever ``requests`` raises on timeout or connection failure,
        and ``requests.HTTPError`` for 4xx/5xx responses.
        """
        response = self.session.get(url, timeout=self.request_timeout)
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')

    def download_medlineplus_topics(self):
        """
        Download health topics from MedlinePlus en Español.

        Every link on the topics index whose href contains ``/spanish/`` (and
        not ``healthtopics``) is fetched once; pages that contain a
        ``div#topic-summary`` are saved to ``raw/medlineplus/``. Failures are
        logged and do not stop the run.
        """
        logger.info("Starting MedlinePlus download...")
        try:
            soup = self._fetch_soup(self.sources['medlineplus']['topics_url'])
            topics_dir = os.path.join(self.base_dir, 'raw', 'medlineplus')
            os.makedirs(topics_dir, exist_ok=True)
        except Exception as e:
            logger.error(f"Error in MedlinePlus download: {str(e)}")
            return

        base_url = self.sources['medlineplus']['base_url']
        seen_urls = set()

        # Find and process topics
        for link in soup.find_all('a', href=True):
            href = link['href']
            if '/spanish/' not in href or 'healthtopics' in href:
                continue

            topic_name = link.text.strip()
            topic_url = urljoin(base_url, href)
            # Skip links without text, and do not hit the server twice for a
            # page that several links point to.
            if not topic_name or topic_url in seen_urls:
                continue
            seen_urls.add(topic_url)

            try:
                # Download topic content
                topic_soup = self._fetch_soup(topic_url)
                content = topic_soup.find('div', {'id': 'topic-summary'})

                if content is not None:
                    filename = self._safe_filename(topic_name.replace(' ', '_')) + '.txt'
                    with open(os.path.join(topics_dir, filename), 'w', encoding='utf-8') as f:
                        f.write(f"Title: {topic_name}\nSource: MedlinePlus\nURL: {topic_url}\n\n")
                        f.write(content.get_text(separator='\n\n', strip=True))
                    logger.info(f"Downloaded: {topic_name}")
            except Exception as e:
                # Best-effort crawl: one bad page must not abort the rest.
                logger.error(f"Error downloading topic {topic_name}: {str(e)}")
            finally:
                time.sleep(self.request_delay)  # Be respectful to the server

    def _save_scielo_article(self, article, term: str, articles_dir: str) -> None:
        """Write one search-result element (title + abstract) to ``articles_dir``."""
        try:
            title_tag = article.find('h2')
            abstract_tag = article.find('div', class_='abstract')
            if title_tag is None or abstract_tag is None:
                logger.warning(f"Skipping SciELO result without title or abstract (term: {term})")
                return

            title = title_tag.text.strip()
            abstract = abstract_tag.text.strip()

            filename = self._safe_filename(f"scielo_{term}_{title[:50]}") + '.txt'
            with open(os.path.join(articles_dir, filename), 'w', encoding='utf-8') as f:
                f.write(f"Title: {title}\nSource: SciELO\nTerm: {term}\n\n")
                f.write(abstract)

            logger.info(f"Downloaded article: {title[:50]}...")
        except Exception as e:
            logger.error(f"Error processing article: {str(e)}")

    def download_scielo_articles(self, search_terms: List[str]):
        """
        Download medical articles from SciELO.

        Args:
            search_terms: Free-text queries; one search request is made per
                term and every ``div.article`` in the result page is saved to
                ``raw/scielo/``.

        Failures are logged per term / per article and do not stop the run.
        """
        # NOTE(review): the 'div.article' / 'h2' / 'div.abstract' selectors are
        # kept from the original code; confirm they match the live SciELO markup.
        logger.info("Starting SciELO download...")
        articles_dir = os.path.join(self.base_dir, 'raw', 'scielo')
        os.makedirs(articles_dir, exist_ok=True)

        for term in search_terms:
            try:
                # Format search URL; the term is escaped so that characters
                # such as '&' or '#' cannot corrupt the query string.
                search_url = (
                    f"{self.sources['scielo']['search_url']}"
                    "?IsisScript=iah/iah.xis&base=article^dlibrary&lang=e&nextAction=lnk"
                    f"&exprSearch={quote_plus(term)}&indexSearch=TX"
                )
                soup = self._fetch_soup(search_url)

                # Find and process articles
                for article in soup.find_all('div', class_='article'):
                    self._save_scielo_article(article, term, articles_dir)
            except Exception as e:
                logger.error(f"Error in SciELO search for term {term}: {str(e)}")
            finally:
                # One pause per HTTP request (saving articles is local I/O only).
                time.sleep(self.request_delay)

    def process_downloaded_content(self):
        """
        Process and organize downloaded content.

        Every file in ``raw/<source>/`` is copied unchanged into
        ``processed/<category>/``, where the category is chosen by
        ``_best_category``. Files that cannot be read or written are logged
        and skipped.
        """
        logger.info("Processing downloaded content...")

        processed_dir = os.path.join(self.base_dir, 'processed')
        # The fallback category needs a directory too, otherwise every
        # document without keyword matches fails to be written.
        for category in list(self.CATEGORY_KEYWORDS) + [self.DEFAULT_CATEGORY]:
            os.makedirs(os.path.join(processed_dir, category), exist_ok=True)

        raw_dir = os.path.join(self.base_dir, 'raw')
        if not os.path.isdir(raw_dir):
            logger.warning(f"Raw directory not found: {raw_dir}")
            return

        # Process each file in raw directory
        for source_dir in os.listdir(raw_dir):
            source_path = os.path.join(raw_dir, source_dir)
            if not os.path.isdir(source_path):
                continue
            for filename in os.listdir(source_path):
                file_path = os.path.join(source_path, filename)
                if not os.path.isfile(file_path):
                    continue
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        content = f.read()

                    # Matching is case-insensitive, but the copy keeps the
                    # original text (including its casing).
                    best_category = self._best_category(content)

                    # Copy to processed directory with category
                    with open(os.path.join(processed_dir, best_category, filename), 'w', encoding='utf-8') as f:
                        f.write(content)
                except (OSError, UnicodeError) as e:
                    logger.error(f"Error processing file {filename}: {str(e)}")

        logger.info("Content processing complete!")

# Attach Google Drive at /content/drive before anything is written
drive.mount("/content/drive")

# Queries handed to the SciELO article download, one search per entry
search_terms = [
    "sistema renina angiotensina",
    "metabolismo hierro",
    "diabetes mellitus",
    "anatomia rotadores",
    "fisiologia esofago",
]

# Build the downloader with its default output directory
downloader = MedicalSourceDownloader()

# Stage 1: fetch the raw documents from each source
print("Starting downloads...")
downloader.download_medlineplus_topics()
downloader.download_scielo_articles(search_terms)

# Stage 2: sort the raw documents into category folders
print("Processing content...")
downloader.process_downloaded_content()

print("Download and processing complete!")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Starting downloads...
Processing content...
Download and processing complete!
