# In [None]:
import json
import os
import pandas as pd

def _join_sentences(sentences):
    """Join the values of a ``<prefix>_<n>`` keyed mapping in numeric order."""
    # Sort on the integer suffix so that e.g. Sentence_10 follows Sentence_9;
    # a plain string sort would place it right after Sentence_1.
    ordered_keys = sorted(sentences, key=lambda key: int(key.split('_')[1]))
    return ' '.join(sentences[key] for key in ordered_keys)


def process_json_directory(directory_path, output_csv='dataset.csv'):
    """
    Processes all JSON files in the given directory to create a binary classification dataset.
    - Extracts abstracts as complex (label 1)
    - Extracts adaptations as simple (label 0)
    - Saves the result to a CSV file.

    Args:
        directory_path: Directory whose ``*.json`` files are read. Each file
            is expected to hold a mapping of question ids to entries; within
            an entry, every key starting with ``PMID_`` is inspected for an
            ``abstract`` mapping and an ``adaptations`` mapping of mappings,
            all keyed like ``Sentence_<n>``.
        output_csv: Path of the CSV file to write, with columns ``text`` and
            ``label``.

    Returns:
        The ``pandas.DataFrame`` that was written to ``output_csv``.

    Raises:
        ValueError / IndexError: if a sentence key has no integer after its
            first underscore.
    """
    texts = []
    labels = []

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the resulting dataset reproducible across runs and platforms.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith('.json'):
            continue

        file_path = os.path.join(directory_path, filename)
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Traverse the nested structure: question -> PMID_* -> abstract/adaptations
        for question_data in data.values():
            for pmid, pmid_data in question_data.items():
                # Entries may carry other keys besides the PMID records.
                if not pmid.startswith('PMID_'):
                    continue

                # The abstract is the complex variant (label 1).
                if 'abstract' in pmid_data:
                    texts.append(_join_sentences(pmid_data['abstract']))
                    labels.append(1)

                # Every adaptation is a simple variant (label 0).
                if 'adaptations' in pmid_data:
                    for adapt_sentences in pmid_data['adaptations'].values():
                        texts.append(_join_sentences(adapt_sentences))
                        labels.append(0)

    # Persist the collected samples; previously they were built and then
    # discarded, leaving output_csv unused.
    dataset = pd.DataFrame({'text': texts, 'label': labels})
    dataset.to_csv(output_csv, index=False, encoding='utf-8')
    return dataset
    
# Build the dataset from the JSON files in the current working directory.
# Guarded so that importing this file does not scan the directory or write
# a CSV as a side effect; running it as a script or notebook cell still does.
if __name__ == "__main__":
    process_json_directory(".")

