In [None]:
import json
import os
import pandas as pd

def _join_numbered_sentences(sentences):
    """Join a ``{"1": "...", "2": "..."}`` sentence mapping into one string.

    Sentences are ordered by the integer value of their keys (so "10" comes
    after "2") and joined with single spaces. A key that is not an integer
    string raises ValueError; the caller treats that as a malformed file.
    """
    ordered_keys = sorted(sentences, key=int)
    return ' '.join(sentences[key] for key in ordered_keys)


def _extract_rows(data):
    """Return the ``(text, label)`` pairs found in one parsed JSON document.

    The document is walked as question -> PMID entry. Each non-empty abstract
    yields a row labelled 1 and each non-empty adaptation a row labelled 0.
    Any structural problem raises, so the caller can reject the whole file.
    """
    rows = []
    for question_data in data.values():
        for key, pmid_data in question_data.items():
            # Only PMID entries carry abstracts/adaptations; they are keyed
            # either as 'PMID_<id>' or as a bare numeric id.
            if not (key.startswith('PMID_') or key.isdigit()):
                continue

            if 'abstract' in pmid_data:
                abstract_text = _join_numbered_sentences(pmid_data['abstract'])
                if abstract_text.strip():  # skip empty / whitespace-only text
                    rows.append((abstract_text, 1))

            if 'adaptations' in pmid_data:
                for adapt_sentences in pmid_data['adaptations'].values():
                    adapt_text = _join_numbered_sentences(adapt_sentences)
                    if adapt_text.strip():  # skip empty / whitespace-only text
                        rows.append((adapt_text, 0))
    return rows


def process_json_directory(directory_path: str, output_csv: str = 'dataset.csv') -> bool:
    """
    Build a binary classification dataset from every JSON file in a directory.
    - Abstracts are stored as complex text (label 1)
    - Adaptations are stored as simple text (label 0)
    - Rows with duplicate text are dropped (first occurrence wins) and the
      result is written to a CSV file with columns ``text`` and ``label``.

    Files are visited in sorted filename order so that the output, including
    which duplicate survives, is reproducible across runs and platforms.
    A file that cannot be parsed or does not have the expected structure is
    skipped as a whole with a warning: none of its rows reach the dataset, so
    the reported file count always matches the data that was written.

    Args:
        directory_path (str): Path to the directory containing JSON files.
        output_csv (str): Path to save the output CSV file.

    Returns:
        bool: True if the dataset was saved successfully. False if the
        directory is invalid, no rows were extracted, or the CSV could not
        be written.
    """
    # os.path.isdir is already False for paths that do not exist
    if not os.path.isdir(directory_path):
        print(f"Error: '{directory_path}' is not a valid directory.")
        return False

    rows = []
    json_files_processed = 0

    # os.listdir returns entries in arbitrary order; sort for determinism
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith('.json'):
            continue

        file_path = os.path.join(directory_path, filename)
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            file_rows = _extract_rows(data)
        except json.JSONDecodeError:
            print(f"Warning: Skipping '{filename}' due to invalid JSON format.")
            continue
        except Exception as e:
            # Best-effort batch job: report the problem and move on to the next file
            print(f"Warning: Error processing '{filename}': {str(e)}")
            continue

        # Commit the file's rows only after it was processed without error
        rows.extend(file_rows)
        json_files_processed += 1

    if not rows:
        print("No valid data extracted from JSON files.")
        return False

    df = pd.DataFrame(rows, columns=['text', 'label'])

    # Identical text is kept once; the first occurrence decides its label
    df = df.drop_duplicates(subset=['text'], keep='first')

    try:
        df.to_csv(output_csv, index=False, encoding='utf-8')
        print(f"Dataset saved to '{output_csv}'. Total rows: {len(df)}, JSON files processed: {json_files_processed}")
        return True
    except Exception as e:
        print(f"Error: Failed to save CSV to '{output_csv}': {str(e)}")
        return False

# Example run: build the dataset from the JSON files in the current directory
process_json_directory(directory_path='.', output_csv='dataset_binario.csv')

Dataset saved to 'dataset_binario.csv'. Total rows: 1666, JSON files processed: 1


True

In [5]:
# NOTE(review): in the visible source `data` is only assigned inside
# process_json_directory, so this statement relies on a `data` binding created
# elsewhere in the session — TODO confirm what it is meant to show, or remove.
print(data)

None
