# Comparing and visualizing results from the PCAWG and TGCA datasets

## 0. Combine or Align Results in a Single DataFrame

In [22]:
import pandas as pd
import numpy as np

In [23]:
# Probes-of-interest tables written by the PCAWG (test) and TGCA (validation) notebooks.
PCAWG_CSV = '../../1_PCAWG_TEST_DATASET/_OUTPUTS_/pcawg_probes_of_interest.csv'
TGCA_CSV = '../../2_TGCA_VALIDATION_DATASET/_OUTPUTS_/tgca_probes_of_interest.csv'

pcawg = pd.read_csv(PCAWG_CSV)
tgca = pd.read_csv(TGCA_CSV)

In [24]:
# Stack both tables row-wise (PCAWG rows first) and renumber the index from 0.
frames_to_stack = [pcawg, tgca]
df_combined = pd.concat(frames_to_stack, ignore_index=True)


In [25]:
# Show the combined table; the notebook renders a truncated preview (first and last rows).
df_combined

Unnamed: 0,dataset,method,probe,variance,correlation,p_value,rf_importance_impute,rf_importance_drop,range,prop_above_0.8,prop_below_0.2,aggregate_score,cluster_label
0,pcawg_prim_window,var_1,cg00840341,0.047657,,,,,,,,,
1,pcawg_prim_window,var_1,cg13363969,0.043906,,,,,,,,,
2,pcawg_prim_window,var_1,cg15992272,0.043533,,,,,,,,,
3,pcawg_prim_window,var_1,cg15618210,0.043486,,,,,,,,,
4,pcawg_prim_window,var_1,cg08566882,0.043257,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
628,tgca_iqr_corr,aggregate_11,cg06552160,0.065880,0.522791,0.069511,,,0.925900,,,0.772293,
629,tgca_iqr_corr,aggregate_11,cg17163729,0.084354,0.424754,0.069049,,,0.930740,,,0.771820,
630,tgca_iqr_corr,aggregate_11,cg04311653,0.102940,0.356862,0.191138,,,0.946598,,,0.761781,
631,tgca_iqr_corr,aggregate_11,cg21711862,0.084107,0.391822,0.094002,,,0.940053,,,0.758443,


## 1. Create "source" and "test_done" columns

In [26]:
# Derive two label columns from the free-text 'dataset' column.
#
# Each label is chosen by case-insensitive substring matching; when several
# substrings match, the first condition listed wins (same precedence as a
# nested np.where chain).

def _dataset_mask(token):
    """Return a boolean NumPy mask: True where 'dataset' contains `token` (case-insensitive).

    na=False matters: without it a missing 'dataset' value produces NaN in the
    mask, which NumPy coerces to True, so the row would be labelled as a match
    instead of falling through to 'other'.
    """
    return (
        df_combined['dataset']
        .str.contains(token, case=False, na=False)
        .to_numpy(dtype=bool)  # plain bool array, which is what np.select requires
    )


# 1) Add 'source' column
df_combined['source'] = np.select(
    [_dataset_mask('pcawg'), _dataset_mask('tgca')],
    ['pcawg', 'tgca'],
    default='other',  # if there's some unexpected (or missing) naming
)

# 2) Add 'test_done' column
df_combined['test_done'] = np.select(
    [_dataset_mask('prim'), _dataset_mask('iqr'), _dataset_mask('sith')],
    ['primary_window', 'int_iqr_correlation', 'sith_correlation'],
    default='other',  # fallback
)

# Quick check
df_combined[['dataset', 'source', 'test_done']].head(10)

Unnamed: 0,dataset,source,test_done
0,pcawg_prim_window,pcawg,primary_window
1,pcawg_prim_window,pcawg,primary_window
2,pcawg_prim_window,pcawg,primary_window
3,pcawg_prim_window,pcawg,primary_window
4,pcawg_prim_window,pcawg,primary_window
5,pcawg_prim_window,pcawg,primary_window
6,pcawg_prim_window,pcawg,primary_window
7,pcawg_prim_window,pcawg,primary_window
8,pcawg_prim_window,pcawg,primary_window
9,pcawg_prim_window,pcawg,primary_window


## 2. Compare PCAWG vs. TGCA Probes for Each Test

In [27]:
# Only these three test labels are compared; methods are taken from the data itself.
unique_test_dones = ["primary_window", "int_iqr_correlation", "sith_correlation"]
unique_methods = df_combined['method'].unique()

for td in unique_test_dones:
    # The test_done mask does not depend on the method, so build it once per test.
    rows_for_test = df_combined['test_done'] == td

    for method in unique_methods:
        # Rows belonging to this (test_done, method) pair
        subset = df_combined.loc[rows_for_test & (df_combined['method'] == method)]

        # Distinct probes reported by each source
        pcawg_set = set(subset.loc[subset['source'] == 'pcawg', 'probe'])
        tgca_set = set(subset.loc[subset['source'] == 'tgca', 'probe'])

        # Probes reported by both sources
        shared = pcawg_set & tgca_set
        preview = sorted(shared)[:5]

        print(f"TestDone={td}, Method={method}:")
        print(f"  PCAWG probes: {len(pcawg_set)}")
        print(f"  tgca probes:  {len(tgca_set)}")
        print(f"  Overlap:      {len(shared)} -> {preview}... [showing up to 5]")
        print("------------------------------------------------")


TestDone=primary_window, Method=var_1:
  PCAWG probes: 10
  tgca probes:  10
  Overlap:      0 -> []... [showing up to 5]
------------------------------------------------
TestDone=primary_window, Method=corr_2:
  PCAWG probes: 10
  tgca probes:  10
  Overlap:      0 -> []... [showing up to 5]
------------------------------------------------
TestDone=primary_window, Method=anova_3:
  PCAWG probes: 10
  tgca probes:  10
  Overlap:      0 -> []... [showing up to 5]
------------------------------------------------
TestDone=primary_window, Method=rf_impute_4a:
  PCAWG probes: 10
  tgca probes:  10
  Overlap:      0 -> []... [showing up to 5]
------------------------------------------------
TestDone=primary_window, Method=rf_drop_4b:
  PCAWG probes: 10
  tgca probes:  10
  Overlap:      0 -> []... [showing up to 5]
------------------------------------------------
TestDone=primary_window, Method=common_5:
  PCAWG probes: 2
  tgca probes:  0
  Overlap:      0 -> []... [showing up to 5]
-------

## 3. Checking Overlaps Across Methods and Tests

In [28]:
# One row per distinct (source, test_done, method, probe) combination, each flagged presence = 1
presence_keys = ['source', 'test_done', 'method', 'probe']
df_presence = df_combined[presence_keys].drop_duplicates().assign(presence=1)

# Spread the methods into columns; combinations that never occur are filled with 0.
# The default aggregation (mean) is kept, which is why the flags display as 1.0 / 0.0.
pivoted = pd.pivot_table(
    df_presence,
    index=['source', 'test_done', 'probe'],
    columns='method',
    values='presence',
    fill_value=0,
).reset_index()

pivoted.head(15)


method,source,test_done,probe,aggregate_11,anova_3,cluster_7,combined_6,common_5,corr_2,diffmeth_10,range_8,rf_drop_4b,rf_impute_4a,threshold_9,var_1
0,pcawg,int_iqr_correlation,cg00686823,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,pcawg,int_iqr_correlation,cg00803453,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,pcawg,int_iqr_correlation,cg00807871,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,pcawg,int_iqr_correlation,cg00870279,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,pcawg,int_iqr_correlation,cg00960580,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,pcawg,int_iqr_correlation,cg01662942,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,pcawg,int_iqr_correlation,cg01842321,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,pcawg,int_iqr_correlation,cg01861555,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,pcawg,int_iqr_correlation,cg03463523,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
9,pcawg,int_iqr_correlation,cg03895404,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0


## 4. Visualizing Overlaps

### A) Overlap Summary Table (input for Venn or UpSet Plots)

In [29]:
# Build one summary row per (test_done, method) pair describing how the PCAWG
# and tgca probe sets agree: set sizes, overlap, union and Jaccard index.
# Relies on unique_test_dones / unique_methods defined in the comparison cell above.
records = []
for td in unique_test_dones:
    for method in unique_methods:
        subset = df_combined[
            (df_combined['test_done'] == td) &
            (df_combined['method'] == method)
        ]
        pcawg_probes = set(subset[subset['source'] == 'pcawg']['probe'])
        tgca_probes  = set(subset[subset['source'] == 'tgca']['probe'])
        overlap = pcawg_probes & tgca_probes
        union   = pcawg_probes | tgca_probes

        # compute overlap metrics
        overlap_count = len(overlap)
        union_count   = len(union)
        # Jaccard = |intersection| / |union|. When neither source reports any probe
        # the ratio is undefined; it is reported as 0.0 (a float, like every other
        # value in the column) rather than dividing by zero.
        jaccard_index = (overlap_count / union_count) if union_count else 0.0

        records.append({
            'test_done': td,
            'method': method,
            'pcawg_count': len(pcawg_probes),
            'tgca_count':  len(tgca_probes),
            'overlap':     overlap_count,
            'union':       union_count,
            'jaccard':     jaccard_index
        })

# Assemble all rows in one go instead of growing a frame inside the loop.
df_overlap_summary = pd.DataFrame(records)

# Bare last expression -> rendered as a rich table; print() would wrap the wide
# frame into backslash-continued plain-text chunks.
df_overlap_summary


              test_done        method  pcawg_count  tgca_count  overlap  \
0        primary_window         var_1           10          10        0   
1        primary_window        corr_2           10          10        0   
2        primary_window       anova_3           10          10        0   
3        primary_window  rf_impute_4a           10          10        0   
4        primary_window    rf_drop_4b           10          10        0   
5        primary_window      common_5            2           0        0   
6        primary_window    combined_6           10          10        0   
7        primary_window     cluster_7           10          10        0   
8        primary_window       range_8           10          10        0   
9        primary_window   diffmeth_10           10          10        0   
10       primary_window  aggregate_11           10          10        0   
11       primary_window   threshold_9            0           0        0   
12  int_iqr_correlation  

In [30]:
# Write the combined table to disk without the positional index column.
COMBINED_CSV = '../_OUTPUTS_/combined_probes_of_interest.csv'
df_combined.to_csv(COMBINED_CSV, index=False)