In [1]:
import nbformat as nbf
import json

def extract_cell_data(notebook_path):
    """Extract all cell data from a Jupyter notebook"""
    # Read the notebook
    nb = nbf.read(notebook_path, nbf.NO_CONVERT)
    
    # Initialize the dictionary with ALL required keys
    extracted_data = {
        'variables': {},
        'outputs': {},
        'sources': {},  # This was missing!
        'execution_results': {}
    }
    
    # Extract from each cell
    for i, cell in enumerate(nb.cells):
        if cell.cell_type == 'code':
            # Extract source code
            extracted_data['sources'][f'cell_{i}'] = cell.source
            
            # Extract outputs if they exist
            if hasattr(cell, 'outputs') and cell.outputs:
                for j, output in enumerate(cell.outputs):
                    if output.output_type == 'execute_result':
                        extracted_data['outputs'][f'cell_{i}_output_{j}'] = output.data
                    elif output.output_type == 'display_data':
                        extracted_data['outputs'][f'cell_{i}_display_{j}'] = output.data
    
    return extracted_data

# Usage for your clinical trials analysis
notebook_data = extract_cell_data('RP_Operations.ipynb')


In [2]:
import nbformat as nbf
import json

def extract_cell_data_robust(notebook_path):
    """Extract all cell data from a Jupyter notebook with error handling"""
    try:
        # Read the notebook with proper version handling
        nb = nbf.read(notebook_path, as_version=4)
        
        extracted_data = {
            'notebook_metadata': nb.metadata,
            'sources': {},
            'outputs': {},
            'execution_counts': {},
            'cell_types': {},
            'execution_results': {}
        }
        
        # Extract from each cell
        for i, cell in enumerate(nb.cells):
            # Store cell type
            extracted_data['cell_types'][f'cell_{i}'] = cell.cell_type
            
            if cell.cell_type == 'code':
                # Extract source code
                extracted_data['sources'][f'cell_{i}'] = cell.source
                
                # Extract execution count if available
                if hasattr(cell, 'execution_count') and cell.execution_count:
                    extracted_data['execution_counts'][f'cell_{i}'] = cell.execution_count
                
                # Extract outputs if they exist
                if hasattr(cell, 'outputs') and cell.outputs:
                    for j, output in enumerate(cell.outputs):
                        output_key = f'cell_{i}_output_{j}'
                        if output.output_type == 'execute_result':
                            extracted_data['outputs'][output_key] = output.data
                        elif output.output_type == 'display_data':
                            extracted_data['outputs'][output_key] = output.data
                        elif output.output_type == 'stream':
                            extracted_data['outputs'][output_key] = output.text
            
            elif cell.cell_type == 'markdown':
                # Also extract markdown content
                extracted_data['sources'][f'cell_{i}'] = cell.source
        
        return extracted_data
        
    except FileNotFoundError:
        print(f"Error: Notebook file '{notebook_path}' not found")
        return None
    except Exception as e:
        print(f"Error reading notebook: {str(e)}")
        return None

# Usage
notebook_data = extract_cell_data_robust('RP_Operations.ipynb')
if notebook_data:
    print("Successfully extracted notebook data")


Successfully extracted notebook data


In [5]:
import re
import pandas as pd

def extract_results(notebook_data):
    """Handles both string and dictionary outputs"""
    results = {
        'model_performance': {},
        'quartile_analysis': {},
        'training_metrics': {'epochs': [], 'losses': []},
        'omega_optimization': {'omega': [], 'accuracy': []}
    }
    
    # Get outputs as list (not dictionary)
    outputs = notebook_data.get('outputs', [])
    
    for output in outputs:
        # Extract text from different output formats
        if isinstance(output, dict):
            text = '\n'.join(output.get('text/plain', []))
        elif isinstance(output, str):
            text = output
        else:
            continue
        
        # Model performance extraction
        model_match = re.findall(r'(\w+\s?\w+)\s+\|\s+([\d.]+)', text)
        for model, acc in model_match:
            results['model_performance'][model.strip()] = float(acc)
        
        # Quartile analysis extraction
        quartile_match = re.findall(r'(\w+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+).*?([\d.]+)%', text)
        for feature, q1, median, q3, outlier in quartile_match:
            results['quartile_analysis'][feature] = {
                'Q1': float(q1),
                'Median': float(median),
                'Q3': float(q3),
                'Outlier%': float(outlier)
            }
        
        # Training metrics extraction
        epoch_loss = re.findall(r'Epoch (\d+).*?Loss:\s+([\d.]+)', text)
        for epoch, loss in epoch_loss:
            results['training_metrics']['epochs'].append(int(epoch))
            results['training_metrics']['losses'].append(float(loss))
        
        # Omega optimization extraction
        omega_match = re.findall(r'Omega:\s+([\d.]+).*?Accuracy:\s+([\d.]+)', text)
        for omega, acc in omega_match:
            results['omega_optimization']['omega'].append(float(omega))
            results['omega_optimization']['accuracy'].append(float(acc))
    
    return results

# Execute extraction
if 'notebook_data' in globals():
    extracted_results = extract_results(notebook_data)
    print("✅ Data extracted successfully!")
else:
    print("❌ notebook_data not found")


✅ Data extracted successfully!


In [4]:
import matplotlib.pyplot as plt
%matplotlib inline

def create_visualizations(data):
    """Creates plots even with partial data"""
    plt.figure(figsize=(18, 12))
    plot_num = 1
    
    # Model Performance
    if data['model_performance']:
        plt.subplot(2, 2, plot_num)
        pd.Series(data['model_performance']).plot(kind='bar', color='skyblue')
        plt.title('Model Accuracies')
        plt.ylim(0, 1)
        plot_num += 1
    
    # Quartile Analysis
    if data['quartile_analysis']:
        plt.subplot(2, 2, plot_num)
        q_data = pd.DataFrame(data['quartile_analysis']).T
        q_data[['Q1', 'Median', 'Q3']].plot(kind='bar')
        plt.title('Feature Distributions')
        plot_num += 1
    
    # Training Progress
    if data['training_metrics']['epochs']:
        plt.subplot(2, 2, plot_num)
        plt.plot(data['training_metrics']['epochs'], 
                data['training_metrics']['losses'], 'ro-')
        plt.title('Training Loss Progression')
        plot_num += 1
    
    # Omega Optimization
    if data['omega_optimization']['omega']:
        plt.subplot(2, 2, plot_num)
        plt.plot(data['omega_optimization']['omega'], 
                data['omega_optimization']['accuracy'], 'gs-')
        plt.title('Ensemble Weight Optimization')
    
    plt.tight_layout()
    plt.show()
    print("✅ Visualizations created!")

# Execute
if 'extracted_results' in globals():
    create_visualizations(extracted_results)
else:
    print("❌ Run Cell 1 first")


<Figure size 1800x1200 with 0 Axes>

✅ Visualizations created!
