In [33]:
import pandas as pd
# Read the filtered extreme-phenotype table from the output/ directory.
df = pd.read_csv(filepath_or_buffer='output/filtered_extreme_phenotypes.csv')
# Bare trailing expression: the notebook renders the frame (pandas abbreviates long frames).
df

Unnamed: 0,Plate,Well,Plate_Well,well_number,cell_death,migration_speed,proliferation,Has Phenotype,cell_death_label,migration_speed_label,proliferation_label,extreme_count
0,LT0001_02,A20,LT0001_02_A20,20,-0.002631,1.298089,1.086280,,LOW,HIGH,HIGH,3
1,LT0001_02,A21,LT0001_02_A21,21,-0.003591,1.291882,0.589791,,LOW,HIGH,MED,2
2,LT0001_02,B6,LT0001_02_B6,30,-0.002400,0.522008,1.190447,,LOW,MED,HIGH,2
3,LT0001_02,B23,LT0001_02_B23,47,0.000272,0.906874,0.905447,,MED,HIGH,HIGH,2
4,LT0001_02,C3,LT0001_02_C3,51,-0.003506,0.098430,0.689919,,LOW,MED,HIGH,2
...,...,...,...,...,...,...,...,...,...,...,...,...
23984,LT0603_06,M8,LT0603_06_M8,296,0.379299,-0.139327,-0.674514,yes,HIGH,LOW,MED,2
23985,LT0603_06,M10,LT0603_06_M10,298,0.052250,0.944677,-0.296883,,HIGH,HIGH,MED,2
23986,LT0603_06,M13,LT0603_06_M13,301,0.463245,0.195546,-1.519752,,HIGH,MED,LOW,2
23987,LT0603_06,O2,LT0603_06_O2,338,0.257390,0.421859,-1.140964,,HIGH,MED,LOW,2


In [34]:
# Inspect the column labels of the frame loaded above.
df.columns

Index(['Plate', 'Well', 'Plate_Well', 'well_number', 'cell_death',
       'migration_speed', 'proliferation', 'Has Phenotype', 'cell_death_label',
       'migration_speed_label', 'proliferation_label', 'extreme_count'],
      dtype='object')

In [35]:

# Number of rows recorded for each plate; value_counts() orders them by descending count.
plate_counts = df['Plate'].value_counts()

# How much of the whole table falls on the 30 most frequent plates.
total_samples = df.shape[0]
top_plates_sum = plate_counts.iloc[:30].sum()
print(
    f"\nThe top 30 plates contain {top_plates_sum} samples "
    f"({top_plates_sum/total_samples*100:.1f}% of all extreme samples)"
)

# Optional follow-up: rank plates by how many of their rows have extreme_count == 3.
if 'extreme_count' in df:
    all_extreme_df = df.loc[df['extreme_count'].eq(3)]
    all_extreme_plates = all_extreme_df['Plate'].value_counts()

    print("\nPlates with samples having ALL THREE extreme phenotypes:")
    # Only the ten plates with the most such rows are listed.
    for plate, count in all_extreme_plates.iloc[:10].items():
        print(f"Plate {plate}: {count} samples with all 3 extreme phenotypes")


The top 30 plates contain 3499 samples (14.6% of all extreme samples)

Plates with samples having ALL THREE extreme phenotypes:
Plate LT0023_11: 62 samples with all 3 extreme phenotypes
Plate LT0014_12: 35 samples with all 3 extreme phenotypes
Plate LT0040_44: 33 samples with all 3 extreme phenotypes
Plate LT0067_07: 30 samples with all 3 extreme phenotypes
Plate LT0048_13: 29 samples with all 3 extreme phenotypes
Plate LT0064_18: 29 samples with all 3 extreme phenotypes
Plate LT0066_19: 29 samples with all 3 extreme phenotypes
Plate LT0025_54: 28 samples with all 3 extreme phenotypes
Plate LT0025_37: 25 samples with all 3 extreme phenotypes
Plate LT0044_16: 25 samples with all 3 extreme phenotypes


In [36]:
# First 30 entries of plate_counts, i.e. the 30 plates with the most rows
# (plate_counts comes from value_counts(), which sorts by descending count).
top_plates = plate_counts.iloc[:30]

In [37]:
# Restrict the analysis to rows whose plate appears in `top_plates`
# (the highest-count plates selected in the previous cell).
top_plates_list = top_plates.index.tolist()
top_plates_df = df[df['Plate'].isin(top_plates_list)]

print(f"Total samples in top 30 plates: {len(top_plates_df)}")

# Every column whose name ends in '_label' is treated as a phenotype label column.
label_cols = [col for col in df.columns if col.endswith('_label')]

# Per-label breakdown: absolute counts plus share of the labelled rows.
for col in label_cols:
    print(f"\n--- {col} ---")
    value_counts = top_plates_df[col].value_counts()

    # value_counts() drops NaN by default, so `total` counts only labelled rows.
    total = value_counts.sum()
    percentages = (value_counts / total * 100).round(1)

    for value, count in value_counts.items():
        print(f"{value}: {count} samples ({percentages[value]}%)")

# Cross-tabulate to see how the three phenotype labels co-occur:
# rows are (cell_death, migration_speed) pairs, columns are proliferation labels.
print("\n--- Cross-tabulation of phenotypes ---")
phenotype_cross_tab = pd.crosstab(
    [top_plates_df['cell_death_label'], top_plates_df['migration_speed_label']],
    top_plates_df['proliferation_label']
)
print(phenotype_cross_tab)

# Rows where all three labels agree on HIGH, and rows where all three agree on LOW.
print("\n--- Samples with all HIGH or all LOW values ---")
all_high = top_plates_df[
    (top_plates_df['cell_death_label'] == 'HIGH')
    & (top_plates_df['migration_speed_label'] == 'HIGH')
    & (top_plates_df['proliferation_label'] == 'HIGH')
]

all_low = top_plates_df[
    (top_plates_df['cell_death_label'] == 'LOW')
    & (top_plates_df['migration_speed_label'] == 'LOW')
    & (top_plates_df['proliferation_label'] == 'LOW')
]

print(f"All HIGH: {len(all_high)} samples")
print(f"All LOW: {len(all_low)} samples")

# Both remaining sections read 'extreme_count'. The check is done once, BEFORE the
# first use, so a frame without that column skips them instead of raising KeyError
# (previously the check came after the column had already been indexed).
if 'extreme_count' in top_plates_df.columns:
    # One row per plate, one column per extreme_count value.
    # fill_value=0 keeps the counts as integers; unstack().fillna(0) would
    # upcast them to floats whenever a plate lacks one of the values.
    print("\n--- Distribution of extreme samples by plate ---")
    plate_extreme_counts = (
        top_plates_df.groupby(['Plate'])['extreme_count']
        .value_counts()
        .unstack(fill_value=0)
    )
    print(plate_extreme_counts)

    # Rows where extreme_count is 3.
    all_extreme_wells = top_plates_df[top_plates_df['extreme_count'] == 3]
    print(f"\n--- Wells with all three phenotypes extreme ({len(all_extreme_wells)} total) ---")
    # Show a sample of these wells
    print(all_extreme_wells[['Plate', 'Well', 'cell_death_label', 'migration_speed_label', 'proliferation_label']].head(10))
else:
    print("\n'extreme_count' column not found; skipping per-plate extreme-count sections")

Total samples in top 30 plates: 3499

--- cell_death_label ---
LOW: 1279 samples (36.6%)
HIGH: 1260 samples (36.0%)
MED: 960 samples (27.4%)

--- migration_speed_label ---
HIGH: 1509 samples (43.1%)
MED: 1234 samples (35.3%)
LOW: 756 samples (21.6%)

--- proliferation_label ---
HIGH: 1684 samples (48.1%)
LOW: 1163 samples (33.2%)
MED: 652 samples (18.6%)

--- Cross-tabulation of phenotypes ---
proliferation_label                     HIGH  LOW  MED
cell_death_label migration_speed_label                
HIGH             HIGH                     29  141  193
                 LOW                       4  220  136
                 MED                      84  453    0
LOW              HIGH                    197   10  190
                 LOW                      26   26  133
                 MED                     646   51    0
MED              HIGH                    646  103    0
                 LOW                      52  159    0

--- Samples with all HIGH or all LOW values ---
All 