# GFP-Filtered Classification Testing

Step-by-step testing of the GFP filtering approach for CCM2_Val53Ile classification.

In [25]:
# Set up variables
import os
import sys
import time
import numpy as np
import pandas as pd
import polars as pl
from tqdm import tqdm
from scipy.stats import ttest_rel, ttest_ind, shapiro, wilcoxon

# Add utils path
sys.path.append("..")
from utils import find_feat_cols, find_meta_cols, remove_nan_infs_columns

# Set variables
TARGET_ALLELE = "CCM2_Val53Ile"
TARGET_BATCHES = ["2025_06_10_Batch_18"] ## , "2025_06_10_Batch_19"
INPUT_DIR = "../outputs/batch_profiles/"
CC_THRESHOLD = 20
MIN_CELLS_PER_WELL = 100
GFP_INTENSITY_COLUMN = "Cells_Intensity_IntegratedIntensity_GFP" ## Cells_Intensity_MeanIntensity_GFP is another option

# Input file paths
input_paths = [
    os.path.join(INPUT_DIR, batch, "profiles_tcdropped_filtered_var_mad_outlier_featselect_filtcells.parquet")
    for batch in TARGET_BATCHES
]
# Input file paths
input_orig_paths = [
    os.path.join(INPUT_DIR, batch, "profiles_tcdropped_filtered_var_mad_outlier.parquet")
    for batch in TARGET_BATCHES
]

print(f"Target allele: {TARGET_ALLELE}")
print(f"Target batches: {TARGET_BATCHES}")
print(f"Input paths:")
for path in input_paths:
    exists = "✅" if os.path.exists(path) else "❌"
    print(f"  {exists} {path}")

Target allele: CCM2_Val53Ile
Target batches: ['2025_06_10_Batch_18']
Input paths:
  ✅ ../outputs/batch_profiles/2025_06_10_Batch_18/profiles_tcdropped_filtered_var_mad_outlier_featselect_filtcells.parquet


## Step 1: Load Parquet Files

In [21]:
# Load both batches efficiently
print("Loading both batches...")
start_time = time.time()

dframes = []
total_loaded = 0

for i, path in enumerate(input_paths):
    print(f"Loading {TARGET_BATCHES[i]}...")
    batch_start = time.time()
    
    # Load with Cell ID creation
    df_batch = (
        pl.scan_parquet(path)
        .with_columns(
            pl.concat_str([
                "Metadata_Plate",
                "Metadata_well_position",
                "Metadata_ImageNumber", 
                "Metadata_ObjectNumber"
            ], separator="_").alias("Metadata_CellID")
        )
        # .collect()
        # .to_pandas()
    )

    # Load with Cell ID creation
    df_batch_orig = (
        pl.scan_parquet(input_orig_paths[i])
        .with_columns(
            pl.concat_str([
                "Metadata_Plate",
                "Metadata_well_position",
                "Metadata_ImageNumber", 
                "Metadata_ObjectNumber"
            ], separator="_").alias("Metadata_CellID")
        )
        .select(pl.col(["Metadata_CellID", "Cells_Intensity_IntegratedIntensity_GFP"]))
        # .collect()
        # .to_pandas()
    )
    
    df_batch = df_batch.join(
        df_batch_orig, on="Metadata_CellID", how="left"
    ).with_columns(
        pl.lit(TARGET_BATCHES[i]).alias("Metadata_Source_Batch")
    )
    dframes.append(df_batch)
    batch_time = time.time() - batch_start
    
    # print(f"  - {TARGET_BATCHES[i]}: {len(df_batch):,} cells ({batch_time:.1f}s)")
    # total_loaded += len(df_batch)

# .collect().to_pandas()

# Combine dataframes
print("Combining dataframes...")
combine_start = time.time()
df_combined = pl.concat(dframes, how="diagonal").collect().to_pandas()
combine_time = time.time() - combine_start

total_time = time.time() - start_time
total_loaded = df_combined.shape[0]
print(f"\n[LOADING] Total cells loaded: {total_loaded:,} ({total_time:.1f}s total, {combine_time:.1f}s combine)")

Loading both batches...
Loading 2025_06_10_Batch_18...
Combining dataframes...

[LOADING] Total cells loaded: 1,353,285 (5.1s total, 5.1s combine)


In [22]:
# Check for key columns
key_columns = ["Metadata_symbol", "Metadata_gene_allele", "Metadata_node_type", 
               "Metadata_well_position", "Metadata_Plate", "Cells_Intensity_IntegratedIntensity_GFP"]

print("\nKey columns check:")
for col in key_columns:
    exists = "✅" if col in df_combined.columns else "❌"
    print(f"  {exists} {col}")


Key columns check:
  ✅ Metadata_symbol
  ✅ Metadata_gene_allele
  ✅ Metadata_node_type
  ✅ Metadata_well_position
  ✅ Metadata_Plate
  ✅ Cells_Intensity_IntegratedIntensity_GFP


## Step 2: Preprocessing and Gene Filtering

In [None]:
# # Add control annotation
# def add_control_annot(dframe: pd.DataFrame) -> pd.DataFrame:
#     """Add control annotation column"""
#     def control_type_helper(node_type: str):
#         if node_type in ["TC", "NC", "PC"]:
#             return True
#         elif node_type in ["disease_wt", "allele", "cPC", "cNC"]:
#             return False
#         else:
#             return None
    
#     if "Metadata_control" not in dframe.columns:
#         dframe["Metadata_control"] = dframe["Metadata_node_type"].apply(control_type_helper)
    
#     return dframe

# # Basic preprocessing
# print("[PREPROCESSING] Basic cleanup...")
# start_time = time.time()

# # Filter rows with NaN metadata
# initial_count = len(df_combined)
# df_combined = df_combined[~df_combined["Metadata_well_position"].isna()]
# df_combined = add_control_annot(df_combined)
# df_combined = df_combined[~df_combined["Metadata_control"].isna()]

# cleanup_time = time.time() - start_time
# print(f"Cleaned up in {cleanup_time:.1f}s: {initial_count:,} → {len(df_combined):,} cells")

In [23]:
# Filter to CCM2 gene and target batches
print("[GENE_FILTER] Filtering to CCM2 gene...")
df_gene = df_combined[df_combined["Metadata_symbol"] == "CCM2"].copy()
print(f"CCM2 cells: {len(df_gene):,} ({100*len(df_gene)/total_loaded:.1f}% of total)")

# Check gene alleles and node types in CCM2 data
print("\nCCM2 Gene alleles:")
gene_allele_counts = df_gene["Metadata_gene_allele"].value_counts()
print(gene_allele_counts.head(10))

print("\nCCM2 Node types:")
node_type_counts = df_gene["Metadata_node_type"].value_counts()
print(node_type_counts)

# Check if our target allele exists
target_exists = TARGET_ALLELE in gene_allele_counts.index
print(f"\nTarget allele '{TARGET_ALLELE}' exists: {'✅' if target_exists else '❌'}")
if target_exists:
    print(f"  {TARGET_ALLELE}: {gene_allele_counts[TARGET_ALLELE]:,} cells")

[GENE_FILTER] Filtering to CCM2 gene...
CCM2 cells: 112,862 (8.3% of total)

CCM2 Gene alleles:
Metadata_gene_allele
CCM2_Val120Ile    12214
CCM2_Phe270Leu    11902
CCM2_Ser413Leu    11402
CCM2_Val53Ile     11348
CCM2_Leu198Arg    11080
CCM2_Leu212Pro    10917
CCM2_Asn327Ser    10762
CCM2_Leu195Arg    10631
CCM2_Met1Val       9597
CCM2_Thr406Met     8384
Name: count, dtype: int64

CCM2 Node types:
Metadata_node_type
allele        108237
disease_wt      4625
Name: count, dtype: int64

Target allele 'CCM2_Val53Ile' exists: ✅
  CCM2_Val53Ile: 11,348 cells


## Step 3: Separate Reference and Variant Data

In [24]:
# Split into experimental data (variant vs reference)
df_exp = df_gene
print(f"Experimental data (non-control): {len(df_exp):,} cells")

# Separate reference and variant alleles
df_variant = df_exp[df_exp["Metadata_gene_allele"] == TARGET_ALLELE].copy()
df_reference = df_exp[df_exp["Metadata_node_type"] == "disease_wt"].copy()

print(f"\nReference (disease_wt): {len(df_reference):,} cells")
print(f"Variant ({TARGET_ALLELE}): {len(df_variant):,} cells")

if len(df_reference) > 0:
    print("\nReference well breakdown:")
    ref_wells = df_reference.groupby(["Metadata_Source_Batch", "Metadata_Plate", "Metadata_well_position"]).size()
    print(ref_wells.head(10))

if len(df_variant) > 0:
    print("\nVariant well breakdown:")
    var_wells = df_variant.groupby(["Metadata_Source_Batch", "Metadata_Plate", "Metadata_well_position"]).size()
    # print(var_wells.head(10))

Experimental data (non-control): 112,862 cells

Reference (disease_wt): 4,625 cells
Variant (CCM2_Val53Ile): 11,348 cells

Reference well breakdown:
Metadata_Source_Batch  Metadata_Plate              Metadata_well_position
2025_06_10_Batch_18    2025_06_02_B18A8A10R1_P2T1  A19                       453
                                                   I21                       305
                       2025_06_02_B18A8A10R1_P2T2  A19                       808
                                                   I21                       480
                       2025_06_02_B18A8A10R1_P2T3  A19                       995
                                                   I21                       457
                       2025_06_02_B18A8A10R1_P2T4  A19                       699
                                                   I21                       428
dtype: int64

Variant well breakdown:


## Step 4: Test GFP Range Optimization Function

In [16]:
# GFP range optimization function with expanded ranges and ratio constraint
def find_optimal_gfp_range_fast(ref_gfp: np.ndarray, var_gfp: np.ndarray, 
                               min_cells_per_well: int = 100, max_ratio: float = 3.0):
    """Ultra-fast vectorized GFP range optimization with expanded ranges and ratio constraint"""
    # Expanded quantile range testing: from 10%-90% down to 30%-70%
    quantile_pairs = [
        (0.1, 0.9), (0.12, 0.88), (0.15, 0.85), (0.17, 0.83), (0.2, 0.8),
        (0.22, 0.78), (0.25, 0.75), (0.27, 0.73), (0.3, 0.7)
    ]
    
    # Vectorized quantile calculation for all ranges
    all_quantiles = [q for pair in quantile_pairs for q in pair]
    ref_qs = np.quantile(ref_gfp, all_quantiles)
    var_qs = np.quantile(var_gfp, all_quantiles)
    
    best_range = None
    max_total_cells = 0
    best_quantile_info = ""
    
    results = []
    
    # Test each quantile pair
    for i, (low_q, high_q) in enumerate(quantile_pairs):
        # Get quantile boundaries for this pair
        ref_low = ref_qs[i*2]
        ref_high = ref_qs[i*2 + 1] 
        var_low = var_qs[i*2]
        var_high = var_qs[i*2 + 1]
        
        # Find overlapping range
        range_min = max(ref_low, var_low)
        range_max = min(ref_high, var_high)
        
        # Skip if invalid range
        if range_min >= range_max:
            results.append((f"{int(low_q*100)}-{int(high_q*100)}%", 0, 0, 0, "Invalid range", "N/A"))
            continue
            
        # Vectorized cell counting
        ref_mask = (ref_gfp >= range_min) & (ref_gfp <= range_max)
        var_mask = (var_gfp >= range_min) & (var_gfp <= range_max)
        ref_count = np.sum(ref_mask)
        var_count = np.sum(var_mask)
        
        # Calculate sample size ratio
        if ref_count == 0 or var_count == 0:
            ratio_status = "Zero samples"
        else:
            ratio = max(ref_count, var_count) / min(ref_count, var_count)
            ratio_status = f"Ratio: {ratio:.1f}" if ratio <= max_ratio else f"Ratio: {ratio:.1f} (>3x)"
        
        results.append((f"{int(low_q*100)}-{int(high_q*100)}%", ref_count, var_count, 
                       ref_count + var_count, f"GFP: {range_min:.1f}-{range_max:.1f}", ratio_status))
        
        # Check minimum requirements and ratio constraint
        if (ref_count >= min_cells_per_well and var_count >= min_cells_per_well and
            ref_count > 0 and var_count > 0):
            
            ratio = max(ref_count, var_count) / min(ref_count, var_count)
            
            if ratio <= max_ratio:  # Ratio constraint
                total_cells = ref_count + var_count
                if total_cells > max_total_cells:
                    max_total_cells = total_cells
                    best_range = (range_min, range_max, ref_count, var_count)
                    best_quantile_info = f"{int(low_q*100)}%-{int(high_q*100)}%"
    
    # Show all results
    # print("GFP Range Optimization Results (with ratio constraint ≤3x):")
    # print("Quantile Range | Ref Cells | Var Cells | Total | GFP Range      | Sample Ratio")
    # print("-" * 90)
    # for quantile, ref_c, var_c, total, gfp_range, ratio_info in results:
    #     # Check if this meets all criteria
    #     meets_min = ref_c >= min_cells_per_well and var_c >= min_cells_per_well
    #     meets_ratio = "Ratio:" in ratio_info and not "(>3x)" in ratio_info if ref_c > 0 and var_c > 0 else False
    #     status = "✅" if (meets_min and meets_ratio) else "❌"
    #     print(f"{status} {quantile:>12} | {ref_c:>8} | {var_c:>8} | {total:>5} | {gfp_range:>14} | {ratio_info}")
    
    if best_range is None:
        return None, None, 0, 0, "NO_SUITABLE_RANGE"
    
    return best_range[0], best_range[1], best_range[2], best_range[3], best_quantile_info

## Step 6: Summary and Next Steps

In [17]:
# Summary of findings
print("=== SUMMARY ===")
print(f"Total cells loaded: {total_loaded:,}")
print(f"CCM2 cells: {len(df_gene):,} ({100*len(df_gene)/total_loaded:.1f}%)")
print(f"Reference cells: {len(df_reference):,}")
print(f"Variant cells ({TARGET_ALLELE}): {len(df_variant):,}")

if len(df_reference) > 0 and len(df_variant) > 0:
    print(f"\n✅ Data looks good for classification!")
    
    # Count potential well pairs
    common_plates = list(set(df_reference["Metadata_Plate"].unique()) & 
                        set(df_variant["Metadata_Plate"].unique()))
    
    total_pairs = 0
    for plate in common_plates:
        ref_wells_plate = len(df_reference[df_reference["Metadata_Plate"] == plate]["Metadata_well_position"].unique())
        var_wells_plate = len(df_variant[df_variant["Metadata_Plate"] == plate]["Metadata_well_position"].unique())
        pairs_plate = ref_wells_plate * var_wells_plate
        total_pairs += pairs_plate
        print(f"  Plate {plate}: {ref_wells_plate} ref × {var_wells_plate} var = {pairs_plate} pairs")
    
    print(f"\nTotal potential well pairs for classification: {total_pairs}")
    print(f"\n🚀 Ready to proceed with full classification pipeline!")
else:
    print(f"\n❌ Missing reference or variant data - check allele names and data quality")

=== SUMMARY ===
Total cells loaded: 1,353,285
CCM2 cells: 112,862 (8.3%)
Reference cells: 4,625
Variant cells (CCM2_Val53Ile): 11,348

✅ Data looks good for classification!
  Plate 2025_06_02_B18A8A10R1_P2T1: 2 ref × 2 var = 4 pairs
  Plate 2025_06_02_B18A8A10R1_P2T2: 2 ref × 2 var = 4 pairs
  Plate 2025_06_02_B18A8A10R1_P2T3: 2 ref × 2 var = 4 pairs
  Plate 2025_06_02_B18A8A10R1_P2T4: 2 ref × 2 var = 4 pairs

Total potential well pairs for classification: 16

🚀 Ready to proceed with full classification pipeline!


In [29]:
# Import classification functions
import importlib.util
# Load functions from classify_gfp_filtered.py
spec = importlib.util.spec_from_file_location("classify_gfp_filtered", "./classify_gfp_filtered.py")
classify_module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(classify_module)

# Import specific functions we need
from classify import (
    drop_low_cc_wells,
    # find_optimal_gfp_range_fast,
    # get_classifier_features
)

print("✅ Successfully imported classification functions from classify_gfp_filtered.py")

✅ Successfully imported classification functions from classify_gfp_filtered.py


In [None]:
# Step 1: Apply cell count filter
print("[CELL_COUNT_FILTER] Applying cell count threshold...")
start_time = time.time()

# Combine reference and variant data
df_exp_combined = pd.concat([df_reference, df_variant], ignore_index=True)
print(f"Combined experimental data: {len(df_exp_combined):,} cells")

# Apply cell count filtering
df_cc_filtered = df_exp_combined #drop_low_cc_wells(df_exp_combined, CC_THRESHOLD, log_file=None)
cc_filter_time = time.time() - start_time

print(f"After CC filtering (≥{CC_THRESHOLD} cells/well): {len(df_cc_filtered):,} cells ({cc_filter_time:.2f}s)")
print(f"Cells removed: {len(df_exp_combined) - len(df_cc_filtered):,} ({100*(len(df_exp_combined) - len(df_cc_filtered))/len(df_exp_combined):.1f}%)")

# Split back into reference and variant after filtering
df_reference_cc = df_cc_filtered[df_cc_filtered["Metadata_node_type"] == "disease_wt"].copy()
df_variant_cc = df_cc_filtered[df_cc_filtered["Metadata_gene_allele"] == TARGET_ALLELE].copy()

print(f"CC-filtered Reference: {len(df_reference_cc):,} cells")
print(f"CC-filtered Variant: {len(df_variant_cc):,} cells")

[CELL_COUNT_FILTER] Applying cell count threshold...
Combined experimental data: 15,973 cells
After CC filtering (≥20 cells/well): 15,973 cells (0.04s)
Cells removed: 0 (0.0%)


In [69]:
df_exp_combined_well = pl.DataFrame(
    pd.concat([df_reference_cc, df_variant_cc], axis=0)
).group_by(
    ["Metadata_Plate", "Metadata_Well", "Metadata_gene_allele"]
).agg(
    pl.col(col).median().alias(col)
    for col in df_exp_combined.columns
    if not col.startswith("Metadata_")
).unique() # .filter(pl.col("Metadata_gene_allele").is_in(balanced_class_alleles))

display(paired_ttest(
    df_exp_combined_well.filter(pl.col("Metadata_Well").is_in(["I21", "M19"])).to_pandas(), 
    "CCM2", TARGET_ALLELE, GFP_INTENSITY_COLUMN
))

t_stat,p_val,cohen_d,Gene,Variant
f64,f64,f64,str,str
10.759059,0.001717,4.965372,"""CCM2""","""CCM2_Val53Ile"""


In [70]:
## paired t-test to detect difference in cell count and gfp intensity
def paired_ttest(dat, reference: str, var: str, value: str, min_num_rep: int=3):
    # pivot to wide: one row per plate
    wide_gfp = dat.pivot(index="Metadata_Plate",
                        columns="Metadata_gene_allele",
                        values=value)
    # drop any plate that doesn’t have both measurements
    wide_gfp = wide_gfp.dropna(subset=[reference, var])
    if wide_gfp.shape[0] >= min_num_rep:
        # now run paired t-test
        t_stat, p_val = ttest_rel(wide_gfp[var].astype(float), wide_gfp[reference].astype(float))
    else:
        t_stat, p_val = None, None

    # Calculate Cohen's d
    mean_diff = np.mean(wide_gfp[var]) - np.mean(wide_gfp[reference])
    pooled_std = np.sqrt((np.std(wide_gfp[var], ddof=1) ** 2 + np.std(wide_gfp[reference], ddof=1) ** 2) / 2)
    cohen_d = mean_diff / pooled_std

    summary_df = pl.DataFrame(
        {
            "t_stat": t_stat,
            "p_val": p_val,
            "cohen_d": cohen_d
        }
    )
    summary_df = summary_df.with_columns(
        pl.lit(reference).alias("Gene"), pl.lit(var).alias("Variant")
    )
    return summary_df

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Step 2: Get well pairs and test GFP filtering on limited pairs
print("[WELL_PAIRS] Identifying well pairs...")

# Get well pairs
well_pairs = []
plates = list(set(df_reference_cc["Metadata_Plate"].unique()) & 
              set(df_variant_cc["Metadata_Plate"].unique()))

for plate in plates:
    ref_wells = df_reference_cc[df_reference_cc["Metadata_Plate"] == plate]["Metadata_well_position"].unique()
    var_wells = df_variant_cc[df_variant_cc["Metadata_Plate"] == plate]["Metadata_well_position"].unique()
    
    for ref_well in ref_wells:
        for var_well in var_wells:
            well_pairs.append((plate, ref_well, var_well))

print(f"Total well pairs identified: {len(well_pairs)}")

# Test on first 5 pairs for speed
TEST_PAIRS = 5
test_pairs = well_pairs#[:TEST_PAIRS]
print(f"Testing on first {TEST_PAIRS} well pairs:")

# df_exp_combined

gfp_filtered_results = []

for i, (plate, ref_well, var_well) in enumerate(test_pairs):
    print(f"\n[{i+1}/{TEST_PAIRS}] Testing plate {plate}, ref {ref_well} vs var {var_well}")
    
    # Get well data
    ref_well_data = df_reference_cc[
        (df_reference_cc["Metadata_Plate"] == plate) & 
        (df_reference_cc["Metadata_well_position"] == ref_well)
    ].copy()
    
    var_well_data = df_variant_cc[
        (df_variant_cc["Metadata_Plate"] == plate) & 
        (df_variant_cc["Metadata_well_position"] == var_well)
    ].copy()
    
    print(f"  Original cells: {len(ref_well_data)} ref, {len(var_well_data)} var")

    # df_exp_combined_well = pl.DataFrame(
    #     pd.concat([ref_well_data, var_well_data], axis=0)
    # ).group_by(
    #     ["Metadata_Plate", "Metadata_Well", "Metadata_gene_allele"]
    # ).agg(
    #     pl.col(col).median().alias(col)
    #     for col in df_exp_combined.columns
    #     if not col.startswith("Metadata_")
    # ).unique() # .filter(pl.col("Metadata_gene_allele").is_in(balanced_class_alleles))

    # display(df_exp_combined_well)
    # display(paired_ttest(
    #     df_exp_combined_well.to_pandas(), 
    #     "CCM2", TARGET_ALLELE, GFP_INTENSITY_COLUMN
    # ))
    
    # Test GFP filtering
    ref_gfp_values = ref_well_data["Cells_Intensity_IntegratedIntensity_GFP"].values
    var_gfp_values = var_well_data["Cells_Intensity_IntegratedIntensity_GFP"].values
    # print(ref_gfp_values, var_gfp_values)
    
    gfp_min, gfp_max, ref_filtered, var_filtered, quantile_info = find_optimal_gfp_range_fast(
        ref_gfp_values, var_gfp_values, MIN_CELLS_PER_WELL
    )
    
    # Apply GFP filtering
    ref_filtered = ref_well_data[
        (ref_well_data[GFP_INTENSITY_COLUMN] >= gfp_min) & 
        (ref_well_data[GFP_INTENSITY_COLUMN] <= gfp_max)
    ].copy()
    var_filtered = var_well_data[
        (var_well_data[GFP_INTENSITY_COLUMN] >= gfp_min) & 
        (var_well_data[GFP_INTENSITY_COLUMN] <= gfp_max)
    ].copy()

    gfp_filtered_results.append(ref_filtered)
    gfp_filtered_results.append(var_filtered)

    # sns.boxplot(data=pd.concat([var_filtered, ref_filtered],axis=0), 
    #             x="Metadata_gene_allele",
    #             y=GFP_INTENSITY_COLUMN)
    # plt.show()
    
    # # Prepare for classification
    # ref_filtered["Label"] = 1  # Reference = 1
    # var_filtered["Label"] = 0  # Variant = 0
    
    # combined_data = pd.concat([ref_filtered, var_filtered], ignore_index=True)
    # Stratify by plate for training/testing  
    # df_train, df_test = stratify_by_plate_gfp(combined_data, plate)
    
    # if gfp_min is not None:
    #     gfp_filtered_results.append({
    #         'pair_id': f"{plate}_{ref_well}_vs_{var_well}",
    #         'plate': plate,
    #         'ref_well': ref_well,
    #         'var_well': var_well,
    #         'ref_orig': len(ref_well_data),
    #         'var_orig': len(var_well_data),
            # 'ref_filtered': ref_filtered,
            # 'var_filtered': var_filtered,
            # 'gfp_min': gfp_min,
            # 'gfp_max': gfp_max,
            # 'quantile_range': quantile_info,
            # 'status': 'SUCCESS'
    #     })
    #     # print(f"  ✅ GFP filtered: {ref_filtered} ref, {var_filtered} var ({quantile_info}, GFP: {gfp_min:.1f}-{gfp_max:.1f})")
    # else:
    #     gfp_filtered_results.append({
    #         'pair_id': f"{plate}_{ref_well}_vs_{var_well}",
    #         'plate': plate,
    #         'ref_well': ref_well,
    #         'var_well': var_well,
    #         'ref_orig': len(ref_well_data),
    #         'var_orig': len(var_well_data),
    #         'ref_filtered': 0,
    #         'var_filtered': 0,
    #         'gfp_min': None,
    #         'gfp_max': None,
    #         'quantile_range': 'FAILED',
    #         'status': 'NO_SUITABLE_RANGE'
    #     })
    #     print(f"  ❌ No suitable GFP range found")


# Summary of GFP filtering results
# results_df = pd.DataFrame(gfp_filtered_results)
# successful_pairs = results_df[results_df['status'] == 'SUCCESS']

# print(f"\n=== GFP FILTERING SUMMARY ===")
# print(f"Successful pairs: {len(successful_pairs)}/{len(test_pairs)}")
# if len(successful_pairs) > 0:
#     print(f"Total cells retained: {successful_pairs['ref_filtered'].sum() + successful_pairs['var_filtered'].sum():,}")
#     print(f"Average cells per successful pair: {(successful_pairs['ref_filtered'].sum() + successful_pairs['var_filtered'].sum())/len(successful_pairs):.0f}")
#     print(f"Quantile ranges used: {successful_pairs['quantile_range'].value_counts().to_dict()}")
#     print(successful_pairs[['pair_id', 'ref_filtered', 'var_filtered', 'quantile_range']].to_string(index=False))

[WELL_PAIRS] Identifying well pairs...
Total well pairs identified: 16
Testing on first 5 well pairs:

[1/5] Testing plate 2025_06_02_B18A8A10R1_P2T1, ref A19 vs var M19
  Original cells: 453 ref, 1025 var

[2/5] Testing plate 2025_06_02_B18A8A10R1_P2T1, ref A19 vs var C23
  Original cells: 453 ref, 1321 var

[3/5] Testing plate 2025_06_02_B18A8A10R1_P2T1, ref I21 vs var M19
  Original cells: 305 ref, 1025 var

[4/5] Testing plate 2025_06_02_B18A8A10R1_P2T1, ref I21 vs var C23
  Original cells: 305 ref, 1321 var

[5/5] Testing plate 2025_06_02_B18A8A10R1_P2T2, ref A19 vs var M19
  Original cells: 808 ref, 1465 var

[6/5] Testing plate 2025_06_02_B18A8A10R1_P2T2, ref A19 vs var C23
  Original cells: 808 ref, 1414 var

[7/5] Testing plate 2025_06_02_B18A8A10R1_P2T2, ref I21 vs var M19
  Original cells: 480 ref, 1465 var

[8/5] Testing plate 2025_06_02_B18A8A10R1_P2T2, ref I21 vs var C23
  Original cells: 480 ref, 1414 var

[9/5] Testing plate 2025_06_02_B18A8A10R1_P2T3, ref I21 vs var C2

In [83]:
df_exp_combined_well_filtered = pl.DataFrame(
    pd.concat(gfp_filtered_results, axis=0)
).group_by(
    ["Metadata_Plate", "Metadata_Well", "Metadata_gene_allele"]
).agg(
    pl.col(col).median().alias(col)
    for col in df_exp_combined.columns
    if not col.startswith("Metadata_")
).unique() # .filter(pl.col("Metadata_gene_allele").is_in(balanced_class_alleles))

df_exp_combined_well_filtered

display(paired_ttest(
    df_exp_combined_well_filtered.filter(pl.col("Metadata_Well").is_in(["I21", "M19"])).to_pandas(), 
    "CCM2", TARGET_ALLELE, GFP_INTENSITY_COLUMN
))

t_stat,p_val,cohen_d,Gene,Variant
f64,f64,f64,str,str
2.480452,0.131276,0.938714,"""CCM2""","""CCM2_Val53Ile"""


In [33]:
# Step 3: Run classification using functions from classify_gfp_filtered.py

# Import classification functions we need
from classify_gfp_filtered import (
    classifier_gfp_filtered,
    # get_well_pairs
)

if len(successful_pairs) > 0:
    print("[CLASSIFICATION] Running classification test using classify_gfp_filtered.py functions...")
    
    # Get first successful pair info
    first_pair = successful_pairs.iloc[0]
    print(f"Testing pair: {first_pair['pair_id']}")
    
    # Use the actual classification function from the script
    try:
        result = classifier_gfp_filtered(
            df_reference_cc, df_variant_cc,
            first_pair['plate'], first_pair['ref_well'], first_pair['var_well'],
            MIN_CELLS_PER_WELL
        )
        
        if result is not None:
            print(f"\n🎯 CLASSIFICATION RESULTS")
            print(f"Pair ID: {result['pair_id']}")
            print(f"AUROC: {result['auroc']:.4f}")
            print(f"Cells used: {result['ref_cells']} ref, {result['var_cells']} var")
            print(f"GFP range: {result['gfp_range']}")
            print(f"Quantile range: {result['quantile_range']}")
            print(f"\n✅ Classification test completed successfully!")
        else:
            print("❌ Classification failed - check the function implementation")
            
    except Exception as e:
        print(f"❌ Error running classification: {e}")
        print("The classification function may need to be implemented in classify_gfp_filtered.py")
        
else:
    print("❌ No successful pairs found - cannot run classification test")

[CLASSIFICATION] Running classification test using classify_gfp_filtered.py functions...
Testing pair: 2025_06_02_B18A8A10R1_P2T4_I21_vs_M19
❌ Error running classification: classifier_gfp_filtered() takes from 2 to 3 positional arguments but 6 were given
The classification function may need to be implemented in classify_gfp_filtered.py


## Step 7: Classification Pipeline Testing

Now let's test the full classification pipeline using functions from classify_gfp_filtered.py

## Next Steps

The classification pipeline testing is now complete! You can:

1. **Run the full pipeline**: Use `classify_gfp_filtered.py` to process all well pairs
2. **Optimize parameters**: Adjust `MIN_CELLS_PER_WELL`, `CC_THRESHOLD`, or quantile ranges
3. **Analyze results**: Review classification performance and feature importance
4. **Scale up**: Process additional alleles or batches using the same approach

The GFP-filtered classification approach shows promise for improving variant discrimination by focusing on cells with similar protein expression levels.