In [1]:
import numpy as np
import pandas as pd

In [3]:
import os
# Set the root directory that contains the folders (e.g., 'IC-001', 'IC-002', etc.)
root_path = "../data/lisa_sheets"

# List to hold the data for each file
data = []

# Loop over each folder in the root directory
for folder in os.listdir(root_path):
    folder_path = os.path.join(root_path, folder)
    if os.path.isdir(folder_path):
        # Loop over each file in the folder
        for file in os.listdir(folder_path):
            if file.endswith(".txt"):
                file_path = os.path.join(folder_path, file)
                # Read the file content
                with open(file_path, "r", encoding="utf-8") as f:
                    content = f.read()
                # Append a dictionary with folder name, file id (without extension), and content
                data.append({
                    "folder": folder,
                    "id": os.path.splitext(file)[0],
                    "content_raw": content
                })

# Create a DataFrame from the list of dictionaries
df = pd.DataFrame(data)
df.head()

Unnamed: 0,folder,id,content_raw
0,IC-001,OIC-001-01-A,{{knowledge objective\n|Identifiant=OIC-001-01...
1,IC-001,OIC-001-02-A,{{knowledge objective\n|Identifiant=OIC-001-02...
2,IC-001,OIC-001-03-A,{{knowledge objective\n|Identifiant=OIC-001-03...
3,IC-001,OIC-001-04-A,{{knowledge objective\n|Identifiant=OIC-001-04...
4,IC-001,OIC-001-05-A,{{knowledge objective\n|Identifiant=OIC-001-05...


In [6]:
from tqdm.auto import tqdm

def extract_properties_from_metadata(text):
    """
    Extract specified properties from a knowledge objective text block.
    
    Args:
        text (str): The text containing the properties in the knowledge objective format
        
    Returns:
        dict: Dictionary containing the extracted properties
    """
    # Initialize dictionary to store the extracted properties
    properties = {
        'Item_parent': '',
        'Title': '',
        'Description': '',
        'Item': '',
        'Rubric': ''  # Added Rubric property
    }
    
    # Split the text by lines
    lines = text.strip().split('\n')
    
    # Extract the properties from each line
    for line in lines:
        line = line.strip()
        
        # Skip lines that don't contain property information
        if '=' not in line:
            continue
            
        # Extract property name and value
        parts = line.split('=', 1)
        if len(parts) != 2:
            continue
            
        prop_name = parts[0].strip('|')
        prop_value = parts[1].strip()
        
        # Store the properties we're interested in
        if prop_name in properties:
            # Remove any trailing characters like commas or closing braces
            properties[prop_name] = prop_value.rstrip(',}')
    
    return properties

def format_properties_for_content(properties):
    """
    Format properties to be prepended to content.
    
    Args:
        properties (dict): Dictionary of extracted properties
        
    Returns:
        str: Formatted properties text
    """
    # Format the extracted properties as a list (excluding Rubric which goes to its own column)
    content_properties = {k: v for k, v in properties.items()} # if k != 'Rubric'}
    
    formatted_properties = []
    for prop, value in content_properties.items():
        if value:  # Only include properties that have values
            formatted_properties.append(f"**{prop}:** {value}")
    
    # Join the formatted properties with semicolons and newlines
    properties_text = ";\n".join(formatted_properties)
    
    # If properties were found, add a semicolon to the end and leave an empty line
    if properties_text:
        properties_text += ";\n\n"
    
    return properties_text

def process_row_content(row):
    """
    Process a row of data by extracting properties from content.
    
    Args:
        row (pd.Series): A row from the dataframe
        
    Returns:
        tuple: (processed_content, rubric_value)
    """
    if 'content_raw' not in row:
        return "No content available", ""
    
    content = row['content_raw']
    
    # Look for the knowledge objective block pattern
    if "{{knowledge objective" in content:
        # Extract the knowledge objective block
        start_idx = content.find("{{knowledge objective")
        end_idx = content.find("}}", start_idx)
        
        if end_idx > start_idx:
            knowledge_block = content[start_idx:end_idx+2]
            
            # Extract remaining content (if any) after the knowledge block
            remaining_content = content[end_idx+2:].strip()
            
            # Extract properties
            properties = extract_properties_from_metadata(knowledge_block)
            
            # Format properties for content
            formatted_properties = format_properties_for_content(properties)
            
            # Extract rubric value
            rubric_value = properties.get('Rubric', '')
            
            # Prepend the formatted properties to the content
            processed_content = formatted_properties + remaining_content
            
            return processed_content, rubric_value
    
    # If no knowledge block found, return original content and empty rubric
    return content, ""

def process_metadata_in_lisa_sheets(df, input_col='content_raw', content_col='content_cleaned', rubric_col='rubric'):
    """
    Apply the extraction and preprocessing function to create new columns in the dataframe.
    
    Args:
        df (pd.DataFrame): The dataframe to process
        input_col (str): The name of the column containing the original content
        content_col (str): The name of the new column for processed content
        rubric_col (str): The name of the new column for rubric values
        
    Returns:
        pd.DataFrame: The dataframe with the new columns added
    """
    # Make a copy to avoid modifying the original
    result_df = df.copy()
    
    # Create a progress bar
    tqdm.pandas(desc="Processing rows")
    
    # Apply the function to each row
    results = result_df.progress_apply(process_row_content, axis=1)
    
    # Split the results into separate columns
    result_df[content_col], result_df[rubric_col] = zip(*results)
    
    return result_df

In [7]:
result_df = process_metadata_in_lisa_sheets(df)
result_df.head()

Processing rows:   0%|          | 0/4875 [00:00<?, ?it/s]

Unnamed: 0,folder,id,content_raw,content_cleaned,rubric
0,IC-001,OIC-001-01-A,{{knowledge objective\n|Identifiant=OIC-001-01...,**Item_parent:** The doctor-patient relationsh...,Definition
1,IC-001,OIC-001-02-A,{{knowledge objective\n|Identifiant=OIC-001-02...,**Item_parent:** The doctor-patient relationsh...,Definition
2,IC-001,OIC-001-03-A,{{knowledge objective\n|Identifiant=OIC-001-03...,**Item_parent:** The doctor-patient relationsh...,Definition
3,IC-001,OIC-001-04-A,{{knowledge objective\n|Identifiant=OIC-001-04...,**Item_parent:** The doctor-patient relationsh...,
4,IC-001,OIC-001-05-A,{{knowledge objective\n|Identifiant=OIC-001-05...,**Item_parent:** The doctor-patient relationsh...,Definition


In [8]:
# Drop multimedia
result_df = result_df[result_df['rubric']!='Multimedia content'].reset_index(drop=True)

In [9]:
len(result_df)

4693

In [12]:
print(result_df['content_raw'][1])

{{knowledge objective
|Identifiant=OIC-001-02-A
|Item_parent=The doctor-patient relationship in the context of a one-to-one discussion or within a team, which may be multi-professional. Communicating with patients and their families. Announcing a serious or fatal illness or damage associated with care. Patient education. Personalising medical care.
|Item_parent_short=The doctor-patient relationship in the context of a one-to-one discussion or within a team, which may be multi-professional. Communicating with patients and their families. Announcing a serious or fatal illness or a life-threatening ...
|Rank=A
|Title=Knowing the main determinants of the doctor-patient relationship
|Description=Psychological, ethical, social, etc. determinants.
|Rubric=Definition
|Contributors=
|Order=2}}
The medical act is the moment when 3 goals converge: that of the patient, that of the doctor and that of society. These goals lead to interactions between the various determinants of the relationship, whi

In [19]:
print(result_df['content_cleaned'][0])

**Item_parent:** The doctor-patient relationship in the context of a one-to-one discussion or within a team, which may be multi-professional. Communicating with patients and their families. Announcing a serious or fatal illness or damage associated with care. Patient education. Personalising medical care.;
**Title:** Knowing the definition of the doctor-patient relationship;
**Description:** The main characteristics of the doctor-patient relationship. E.g. Paternalistic model/centred on self-determination/patient as partner in care.;
**Rubric:** Definition;

A meeting between a carer and a cared-for person:

- Previously, the approach was mainly paternalistic and biomedical.

- From now on, this will be a global approach, i.e. bio-medical-psychosocial, centred on the patient and not their disease(s).

The patient is a partner in the care process: his or her experience and experiences are integrated into the caregiver's approach.


In [25]:
import os
from dotenv import load_dotenv

load_dotenv()
OPENAI_KEY = os.environ.get("OPENAI_KEY")


#from kaggle_secrets import UserSecretsClient
#OPENAI_KEY = UserSecretsClient().get_secret("OPENAI_KEY")

In [26]:
from openai import OpenAI
client = OpenAI(api_key = OPENAI_KEY)

def call_openai_api(system, user):
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            temperature=0.2,
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": user}
            ]
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"Error occurred: {e}")
        return None

In [27]:
system_prompt = """
You are an expert in text preprocessing for NLP applications. Your task is to clean up the following medical text while preserving all its essential information. Specifically:

1. **Remove or replace wiki-style links** (e.g., `[[Abnormal genital bleeding (outside known pregnancy) SD-112|genital bleeding]]` should become `"Abnormal genital bleeding (SD-112)"`).
2. **Ensure consistent section formatting**:
   - Convert section titles to a uniform format (e.g., `== Definitions ==`).
   - Remove excessive use of bold (`'''`) and italics (`''`) unless necessary.
3. **Remove or replace image references**:
   - If an image is mentioned (`[File:Higham score.png|vignette|Higham score]]`), replace it with a descriptive text (e.g., `"The Higham score is used to assess menorrhagia severity."`).
4. **Standardize lists and bullet points**:
   - Ensure bullet points follow a clear structure.
   - Remove any unnecessary symbols or broken formatting.
5. **Keep all medical terms and numerical values unchanged.**
6. **Ensure the final text is well-structured, readable, and suitable for NLP processing.**

Clean up the text provided to you.
"""

In [28]:
def generate_prompt_for_question(row):
    lisa_sheet_raw = row['content_cleaned']
    user_prompt = f"""Medical text:\n-------------\n{lisa_sheet_raw}"""
    try:
        return call_openai_api(system_prompt, user_prompt)
    except Exception as e:
        print(f"Error processing question at index {row.id}: {e}")
        return None

In [29]:
test_df = result_df[:50]
len(test_df)

50

In [50]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed
import math
from multiprocessing import cpu_count

def split_dataframe(df, batch_size=None, num_batches=None):
    """
    Split a dataframe into multiple batches.
    
    Args:
        df (pd.DataFrame): The dataframe to split
        batch_size (int, optional): Size of each batch
        num_batches (int, optional): Number of batches to create
        
    Returns:
        list: List of dataframe batches
    """
    if batch_size is None and num_batches is None:
        # Default to using CPU count for number of batches
        num_batches = cpu_count()
    
    if batch_size is not None:
        # Split by batch size
        num_batches = math.ceil(len(df) / batch_size)
        return np.array_split(df, num_batches)
    else:
        # Split by number of batches
        return np.array_split(df, num_batches)

def process_batch(batch_df):
    """
    Process a batch of data with progress bar for the rows in this batch
    
    Args:
        batch_df (pd.DataFrame): A batch of the dataframe to process
        
    Returns:
        pd.DataFrame: The processed dataframe batch with content_cleaned column
    """
    # Create a copy of the batch to avoid modifying the original
    try:
        result_df = batch_df.copy()
        
        # Make sure the content_cleaned column exists
        if 'content_gpt' not in result_df.columns:
            result_df['content_gpt'] = None
        
        # Create a tqdm progress bar for this batch
        tqdm_batch = tqdm(batch_df.iterrows(), total=len(batch_df), 
                          desc=f"Batch {batch_df.index[0]}-{batch_df.index[-1]}", 
                          position=0, leave=True)
        
        # Process each row in the batch with progress tracking
        for idx, row in tqdm_batch:
            # Apply your process_data function to the row
            processed_result = generate_prompt_for_question(row)
            
            # Assign the result to the content_cleaned column for this specific row
            result_df.at[idx, 'content_gpt'] = processed_result
        
        return result_df
    except Exception as e:
        # Log the error and return a default value or re-raise a more informative exception
        print(f"Error processing batch: {e}")
        return None

def process_dataframe_parallel(df, batch_size=None, num_batches=None, max_workers=None):
    """
    Process a dataframe in parallel with progress bars and store results in content_cleaned
    
    Args:
        df (pd.DataFrame): The dataframe to process
        batch_size (int, optional): Size of each batch
        num_batches (int, optional): Number of batches to create
        max_workers (int, optional): Maximum number of worker processes
        
    Returns:
        pd.DataFrame: The processed dataframe with content_cleaned column
    """
    # Split the dataframe into batches
    batches = split_dataframe(df, batch_size, num_batches)
    
    # Set up a progress bar for the overall batch processing
    print(f"Processing {len(batches)} batches in parallel")
    
    # Create a process pool for parallel processing
    if max_workers is None:
        max_workers = min(cpu_count(), len(batches))
    
    # Process batches in parallel with a progress bar
    results = []
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        # Submit all batch processing tasks
        future_to_batch = {executor.submit(process_batch, batch): i 
                          for i, batch in enumerate(batches)}
        
        # Create progress bar for batch completion
        with tqdm(total=len(batches), desc="Overall Progress", position=1, leave=True) as pbar:
            for future in as_completed(future_to_batch):
                batch_idx = future_to_batch[future]
                try:
                    batch_result = future.result()
                    results.append(batch_result)
                except Exception as e:
                    print(f"Batch {batch_idx} generated an exception: {e}")
                pbar.update(1)
    
    # Combine results from all batches
    return pd.concat(results)

In [55]:
df_new = process_dataframe_parallel(test_df, num_batches=10)
df_sorted = df_new.sort_index()

Processing 10 batches in parallel


  return bound(*args, **kwds)


BrokenProcessPool: A child process terminated abruptly, the process pool is not usable anymore

In [51]:
process_batch(test_df)

Batch 0-49:   0%|          | 0/50 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [32]:
print(df_sorted['content_raw'][1])

NameError: name 'df_sorted' is not defined

In [137]:
print(df_sorted['content_cleaned'][1])

**Item_parent:** Pneumothorax;
**Title:** Knowing the elements for diagnosing the severity of a pneumothorax;
**Description:** Refer to question on acute respiratory distress and acute respiratory failure. Item 355/356;
**Item:** Identification of the emergency;

These elements are systematically sought on clinical examination.

- Respiratory signs: signs of acute respiratory distress/desaturation.

- Haemodynamic signs in the event of gas tamponade: right heart failure, shock, cardiac arrest.

They do not necessarily correlate with the size of the pneumothorax. A partial pneumothorax may be poorly tolerated from a respiratory point of view in a patient already suffering from respiratory insufficiency.


In [138]:
df_sorted.to_csv('/kaggle/working/lisa_sheets_snippet.csv', index=False)