# Genomic Disorder Plot Review App

This notebook provides an interactive interface to review genomic disorder plots and classify them as correct or incorrect.

In [1]:
import json
import os

import pandas as pd

## Configuration

In [2]:
# Configuration - Update these paths
# All three paths live under one working-tree root. Set the GD_REVIEW_ROOT
# environment variable to relocate them without editing this cell; when it is
# unset, the fallback below reproduces the original hardcoded locations exactly.
GD_REVIEW_ROOT = os.environ.get(
    "GD_REVIEW_ROOT",
    "/Users/markw/Work/talkowski/sv-pipe-testing/mw_gd/gatk-sv-utils/ReviewGenomicDisorders",
)
# Directory of plot images handed to the review app
PLOT_DIRECTORY = f"{GD_REVIEW_ROOT}/output/asd_cohort"
# Tab-separated table of genomic disorder regions
GD_REGIONS_FILE = f"{GD_REVIEW_ROOT}/input/GenomicDisorderRegions_hg38_2025-12-05.tsv"
# JSON manifest read by the summary and statistics cells below
MANIFEST_FILE = f"{GD_REVIEW_ROOT}/review/plot_review_manifest.json"


## Load Genomic Disorder Regions

In [3]:
# Read the tab-separated GD regions reference into a DataFrame,
# report how many rows were loaded, and preview the first few.
gd_regions = pd.read_csv(
    GD_REGIONS_FILE,
    sep='\t',
)
print(f"Loaded {len(gd_regions)} genomic disorder regions")
gd_regions.head()

Loaded 282 genomic disorder regions


Unnamed: 0,chr,start_GRCh38,end_GRCh38,GD_ID,svtype,NAHR,terminal,cluster
0,chr1,898703,6229913,GD_1p36_DEL_chr1_898703_6229913,DEL,no,p,
1,chr1,898703,6229913,GD_1p36_DUP_chr1_898703_6229913,DUP,no,p,
2,chr1,145686997,145808272,GD_1q21.1-BP1-2_DEL_chr1_145686997_145808272,DEL,yes,no,1q21
3,chr1,145686997,145808272,GD_1q21.1-BP1-2_DUP_chr1_145686997_145808272,DUP,yes,no,1q21
4,chr1,145686997,146048497,GD_1q21.1-BP1-3_DEL_chr1_145686997_146048497,DEL,yes,no,1q21


## Utility Functions

In [4]:
from plot_review_app import parse_filename, load_manifest, save_manifest, get_plot_files, PlotReviewApp

## Review Application

## Start Review App

In [5]:
# Build the review app from the configured plot directory and manifest,
# passing the loaded GD regions table, then render it in the cell output.
app = PlotReviewApp(
    PLOT_DIRECTORY,
    MANIFEST_FILE,
    gd_regions_df=gd_regions,
)
app.display()

VBox(children=(HTML(value='<h2>Genomic Disorder Plot Review</h2>'), HTML(value='<hr>'), HTML(value="\n        …

## Generate Summary Table

This section generates a summary table of manually reviewed GD calls with carriers and non-carriers.

In [35]:
def _chrom_sort_key(chrom):
    """Return a key that orders chromosomes genomically (chr2 before chr10)."""
    name = str(chrom)
    if name.lower().startswith('chr'):
        name = name[3:]
    if name.isdecimal():
        return (0, int(name), '')
    # Sex / mitochondrial chromosomes follow the autosomes.
    special = {'X': 23, 'Y': 24, 'M': 25, 'MT': 25}
    upper = name.upper()
    if upper in special:
        return (0, special[upper], '')
    # Anything else (alt contigs, etc.) goes last, ordered by name.
    return (1, 0, name)


def _lookup_cluster(gd_regions_df, key):
    """Return the cluster label of the first GD region matching key, or ''."""
    if gd_regions_df is None:
        return ''
    chr_val, start, end, gd_id, svtype = key
    matching = gd_regions_df[
        (gd_regions_df['chr'] == chr_val) &
        (gd_regions_df['start_GRCh38'] == start) &
        (gd_regions_df['end_GRCh38'] == end) &
        (gd_regions_df['GD_ID'] == gd_id) &
        (gd_regions_df['svtype'] == svtype)
    ]
    if matching.empty:
        return ''
    cluster = matching.iloc[0]['cluster']
    # A missing cluster (NaN/None) is reported as an empty string.
    return str(cluster) if pd.notna(cluster) else ''


def generate_summary_table(manifest_file, gd_regions_df):
    """
    Generate summary table from manifest.

    Format: chr, start, end, GD_ID, cluster_ID, SVTYPE, carriers, non_carriers

    Args:
        manifest_file: Path to the JSON manifest mapping a plot file path to
            its review record (a dict).
        gd_regions_df: DataFrame with columns chr, start_GRCh38, end_GRCh38,
            GD_ID, svtype and cluster, used to look up the cluster label.
            May be None, in which case cluster_ID is left empty.

    Returns:
        DataFrame with one row per (chr, start, end, gd_id, svtype) group,
        sorted by chromosome (genomic order) then start. 'carriers' lists the
        sample IDs classified 'correct'; 'non_carriers' lists all others. Both
        are comma-joined and sorted. The columns are present even when no
        review qualifies.
    """
    columns = ['chr', 'start', 'end', 'GD_ID', 'cluster_ID', 'SVTYPE',
               'carriers', 'non_carriers']
    required = ('chr', 'start', 'end', 'gd_id', 'svtype',
                'sample_id', 'classification')

    # Load manifest
    with open(manifest_file, 'r') as f:
        manifest = json.load(f)

    # Group by GD region and svtype
    gd_groups = {}
    for review in manifest.values():
        # Skip entries without complete plot info / classification rather
        # than failing on a missing key part-way through the manifest.
        if any(field not in review for field in required):
            continue

        key = (review['chr'], review['start'], review['end'],
               review['gd_id'], review['svtype'])
        group = gd_groups.setdefault(key, {'carriers': [], 'non_carriers': []})

        if review['classification'] == 'correct':
            group['carriers'].append(review['sample_id'])
        else:
            group['non_carriers'].append(review['sample_id'])

    # Build summary table
    summary_rows = []
    for key, samples in gd_groups.items():
        chr_val, start, end, gd_id, svtype = key
        summary_rows.append({
            'chr': chr_val,
            'start': start,
            'end': end,
            'GD_ID': gd_id,
            'cluster_ID': _lookup_cluster(gd_regions_df, key),
            'SVTYPE': svtype,
            'carriers': ','.join(sorted(samples['carriers'])),
            'non_carriers': ','.join(sorted(samples['non_carriers']))
        })

    # Sort by chromosome number, then start. A plain string sort on 'chr'
    # would place chr10 ahead of chr2.
    summary_rows.sort(key=lambda row: (_chrom_sort_key(row['chr']), row['start']))

    # Passing columns explicitly keeps the header even for an empty table.
    return pd.DataFrame(summary_rows, columns=columns)

In [43]:
# Build the per-region carrier / non-carrier table from the review manifest
summary_table = generate_summary_table(
    manifest_file=MANIFEST_FILE,
    gd_regions_df=gd_regions,
)
print(f"Generated summary table with {len(summary_table)} GD regions")
summary_table

New key matching check:
File: chr10_46005406-49845537_GD_10q11.22-q11.23-AD_DEL_chr10_46005406_49845537_DEL___sp0148883__35d430.jpg
Generated Key (no gd_id): ('chr10', 46005406, 49845537, 'DEL')
Meta found?: True
Meta: {'cluster': '10q11.2', 'terminal': 'no', 'NAHR': 'yes', 'svtype': 'DEL'}


In [None]:
# Write the summary table as a tab-separated file (no index column)
# to the notebook's current working directory.
output_file = 'genomic_disorder_summary.tsv'
summary_table.to_csv(
    output_file,
    index=False,
    sep='\t',
)
print(f"Summary table saved to {output_file}")

## Review Statistics

In [None]:
# Display review statistics
def _percent(count, total):
    """Return count as a percentage of total, or 0.0 when total is zero."""
    return count / total * 100 if total else 0.0


def display_statistics(manifest_file):
    """Display statistics from the manifest.

    Prints the number of classified reviews, how many are 'correct' versus
    anything else, and a per-subtype breakdown of the 'correct' ones.

    Args:
        manifest_file: Path to the JSON manifest mapping a plot file path to
            its review record (a dict). If the file does not exist a notice
            is printed and nothing else happens.
    """
    if not os.path.exists(manifest_file):
        print("No manifest file found")
        return

    with open(manifest_file, 'r') as f:
        manifest = json.load(f)

    # Only entries that actually carry a classification count as reviews;
    # an entry without one would otherwise raise KeyError.
    reviews = [r for r in manifest.values() if 'classification' in r]

    total_reviews = len(reviews)
    correct_count = sum(1 for r in reviews if r['classification'] == 'correct')
    incorrect_count = total_reviews - correct_count

    # _percent guards the empty-manifest case (total_reviews == 0).
    print(f"Total reviews: {total_reviews}")
    print(f"Correct: {correct_count} ({_percent(correct_count, total_reviews):.1f}%)")
    print(f"Incorrect: {incorrect_count} ({_percent(incorrect_count, total_reviews):.1f}%)")
    print()

    # Subtype breakdown
    if correct_count > 0:
        print("Correct subtypes:")
        subtype_counts = {}
        for review in reviews:
            if review['classification'] == 'correct' and 'subtype' in review:
                subtype = review['subtype']
                subtype_counts[subtype] = subtype_counts.get(subtype, 0) + 1

        # Most frequent subtype first
        for subtype, count in sorted(subtype_counts.items(), key=lambda x: -x[1]):
            print(f"  {subtype}: {count} ({_percent(count, correct_count):.1f}%)")

# Print the review statistics for the configured manifest
display_statistics(manifest_file=MANIFEST_FILE)