#### Step 1: Data Collection & Preprocessing

In [1]:
import pandas as pd
from lxml import etree
from tqdm import tqdm
import concurrent.futures

# Step 1.1: Load the XML data into a structured format (e.g., Pandas DataFrame)
# Location of the XML file to parse. The path is relative, so it is resolved
# against the current working directory of the running kernel/process.
xml_file_path = 'db/drugbank.xml'

# Function to parse a chunk of XML data
def parse_chunk(start, end, xml_file_path):
    """Collect the records for one slice of the ``drug`` elements in an XML file.

    The file is streamed from the beginning with ``etree.iterparse``. Every
    element whose tag is exactly ``'drug'`` is numbered 1, 2, 3, ... in the
    order its closing tag is seen. The elements numbered ``start + 1`` through
    ``end`` are extracted, i.e. the half-open slice ``(start, end]``. Using a
    half-open slice means consecutive calls such as ``(0, n)``, ``(n, 2n)``
    never return the same element twice.

    Args:
        start: Number of leading ``drug`` elements to skip.
        end: Number of the last ``drug`` element to include.
        xml_file_path: Path of the XML file to read.

    Returns:
        A list with one dict per selected element. Each dict has the keys
        ``'drugbank-id'``, ``'name'``, ``'mechanism-of-action'``,
        ``'pathways'`` and ``'interaction'``; every value is the list of
        strings returned by the corresponding XPath query (an empty list when
        nothing matched).
    """
    # Only "end" events are needed: an element's children are fully parsed
    # by the time its closing tag is reported.
    context = etree.iterparse(xml_file_path, events=("end",))
    data = []
    count = 0
    for _, elem in context:
        # NOTE(review): this compares against the bare tag name. If the
        # document declares a default XML namespace, lxml reports tags as
        # '{uri}drug' and nothing will match -- confirm against the real file.
        # NOTE(review): the check also matches 'drug' elements nested deeper
        # in the tree, not only direct children of the root -- verify that is
        # intended for this schema.
        if elem.tag != 'drug':
            continue
        count += 1
        if count > end:
            break
        if count > start:
            data.append({
                'drugbank-id': elem.xpath('./drugbank-id/text()'),
                'name': elem.xpath('./name/text()'),
                'mechanism-of-action': elem.xpath('./mechanism-of-action/text()'),
                'pathways': elem.xpath('./pathways/pathway/name/text()'),
                'interaction': elem.xpath('./drug-interactions/interaction/text()'),
            })
        # Clear skipped elements as well as extracted ones; otherwise a chunk
        # that starts late in the file keeps every earlier subtree in memory.
        elem.clear()
    return data

# Function to process the entire file using parallelism or sequentially
def process_xml(xml_file_path, num_chunks=4):
    """Parse the ``drug`` elements of an XML file into a pandas DataFrame.

    A first streaming pass counts the elements whose tag is ``'drug'``. That
    range is then split into ``num_chunks`` slices, and each slice is handed
    to ``parse_chunk`` on a thread pool. The last slice absorbs the remainder
    of the integer division.

    Args:
        xml_file_path: Path of the XML file to read.
        num_chunks: Number of slices (and submitted tasks) to split the work
            into.

    Returns:
        A DataFrame with one row per record returned by the workers, in
        chunk-submission order, or ``None`` if any exception was raised (the
        error message is printed).
    """
    try:
        # First, get the total number of drugs in the XML (for chunk splitting).
        # Elements are cleared as soon as they close so this counting pass does
        # not build the whole document tree in memory; nothing is read from
        # them here.
        total_drugs = 0
        for _, elem in etree.iterparse(xml_file_path, events=("end",)):
            if elem.tag == 'drug':
                total_drugs += 1
            elem.clear()

        # Define chunk size
        chunk_size = total_drugs // num_chunks

        # NOTE(review): every task receives the file path and streams the file
        # again from the start, so the document is parsed num_chunks + 1 times
        # in total -- consider a single streaming pass if this is too slow.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = []
            for i in range(num_chunks):
                start = i * chunk_size
                end = (i + 1) * chunk_size if i != num_chunks - 1 else total_drugs
                futures.append(executor.submit(parse_chunk, start, end, xml_file_path))

            # Drive the progress bar as tasks finish, in whatever order that is.
            for _ in tqdm(concurrent.futures.as_completed(futures), total=num_chunks, desc="Processing XML"):
                pass

            # Gather results in submission order so the row order is the same
            # on every run instead of depending on thread scheduling.
            data = []
            for future in futures:
                data.extend(future.result())

        # Convert the collected data into a DataFrame
        return pd.DataFrame(data)
    except Exception as e:
        # Best-effort boundary: report the problem and signal failure with None.
        print(f"An error occurred during the processing: {str(e)}")
        return None

# Build the DataFrame from the XML file, splitting the work into four chunks.
# process_xml returns None when parsing failed.
df = process_xml(xml_file_path=xml_file_path, num_chunks=4)

# Sanity check: preview the leading rows, but only if a DataFrame came back.
if df is not None:
    print(df.head(n=5))


KeyboardInterrupt: 