# Explore the effects of tumor purity on ecDNA annotations.
**Please remember not to commit data or visualizations to GitHub!**

The best way to do this is to clear the outputs of all cells before saving and committing changes to a notebook, and to make good use of the `.gitignore` file.

## Requirements
Data:
- `../data/cloud/opentarget/histologies.tsv` (get this file from OneDrive/2023-pedpancan/data/opentarget/histologies.tsv)

Software:
- pandas
- numpy
- seaborn
- scipy
- sklearn

## Results
- OpenPBTA has already applied 4 tumor purity estimators to many samples, using WGS or methylation data:
```
    total number of CBTN samples: 1874  
    samples with tumor_fraction_THetA2 estimate: 1655  
    samples with tumor_fraction_RFpurify_ABSOLUTE estimate: 1206  
    samples with tumor_fraction_RFpurify_ESTIMATE estimate: 1206  
    samples with tumor_fraction_LUMP estimate: 1205  
    samples with estimates from all estimators: 1099
```
- For these, we can ask whether there is an association between estimates from the same samples:
    - Weak correlation between THetA2 (WGS) and other estimators (methyl)
    - Strong correlation between estimators using methylation
- Logistic regression figure saved to `out/TODO`
- Tumor purity has a nonsignificant effect on ecDNA prediction (likelihood ratio test, p=0.20, comparing models with covariates for sex, age, tumor type and extent of resection on n=1020 examples).

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import numpy as np
from scipy.stats import chi2
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression

import warnings
import sys
from pathlib import Path

sys.path.append('../src')
Path("out").mkdir(parents=True, exist_ok=True)

import data_imports

pd.set_option('display.max_columns', None)

In [None]:
# Build the CBTN biosample table by running it through the project's
# annotation helpers in sequence.
# The "ignore" filter for the CAVATICA/opentarget mismatch message is scoped to
# the `with` block: on exit the warning filters that were active beforehand are
# restored, whereas warnings.resetwarnings() would discard all of them.
with warnings.catch_warnings():
    warnings.filterwarnings('ignore', '.*differs between CAVATICA and opentarget annotations.*')
    df = data_imports.generate_cbtn_biosample_table(verbose=2)
    df = data_imports.unify_tumor_diagnoses(df,include_HM=False)
    df = data_imports.clean_tumor_diagnoses(df)
    df = data_imports.annotate_with_ecDNA(df)
    df = data_imports.annotate_amplicon_class(df)
    df = data_imports.annotate_duplicate_biosamples(df)
# Optional restriction to the unique-patient set, currently disabled.
#df=df[df.in_unique_patient_set]
# Name the estimator explicitly so the column lines up with the other
# tumor_fraction_* purity columns used below.
df.rename(columns={'tumor_fraction': 'tumor_fraction_THetA2'}, inplace=True)

In [None]:
# Number of rows in the CBTN biosample table, then one example row to inspect the columns.
print(len(df))
df.head(n=1)

In [None]:
# Write the annotated CBTN biosample table as a tab-separated file under out/.
df.to_csv('out/suppl_biosample_tumor_purity.tsv',sep='\t')

In [None]:
# Some helper functions
def savefig(fig, basename):
    """Save `fig` twice: as `<basename>.png` and as `<basename>.svg`.

    fig: object exposing `savefig(path, format=...)`.
    basename: output path without a file extension.
    """
    for extension in ("png", "svg"):
        fig.savefig(basename + "." + extension, format=extension)
def set_plot_defaults():
    """Apply the seaborn/matplotlib styling shared by the figures in this notebook."""
    linewidth = 0.75
    fontsize = 10

    rc = {
        'svg.fonttype': 'none',
        'pdf.use14corefonts': True,
        'figure.figsize': (7, 2),
    }
    # Every text element uses the same point size.
    for size_key in ('font.size', 'axes.labelsize', 'axes.titlesize',
                     'xtick.labelsize', 'ytick.labelsize', 'legend.fontsize'):
        rc[size_key] = fontsize
    rc['font.family'] = 'sans-serif'
    rc['font.sans-serif'] = 'Arial'
    rc['axes.linewidth'] = linewidth
    rc['ytick.major.width'] = linewidth

    sns.set(rc=rc)
    sns.set_style("white")
    sns.despine()

## Tumor fraction metrics meta-analysis
OpenPBTA has 4 (four!) annotations for tumor fraction:
- tumor_fraction (THetA2, from WGS; renamed to `tumor_fraction_THetA2` in this notebook)
- tumor_fraction_RFpurify_ABSOLUTE (random forest predicting ABSOLUTE estimates (SNP array))
- tumor_fraction_RFpurify_ESTIMATE (random forest predicting ESTIMATE estimates (RNA-seq or affy array))
- tumor_fraction_LUMP (from leukocyte-specific CpGs in 450k array)

We're going to use THetA2 for downstream analyses but it's useful to know how well these estimates agree with each other.

In [None]:
# Columns holding the four purity estimates compared in this section.
purity_estimators = [
    'tumor_fraction_THetA2',
    'tumor_fraction_RFpurify_ABSOLUTE',
    'tumor_fraction_RFpurify_ESTIMATE',
    'tumor_fraction_LUMP',
]
# Independent copy of the estimates plus the unique-tumor flag.
cbtn_purity_data = df[[*purity_estimators, 'in_unique_tumor_set']].copy()
cbtn_purity_data.head(n=1)

In [None]:
# How complete are the tumor purity estimates?
def report_purity_completeness(df):
    """Print how many samples and unique tumors carry each purity estimate.

    df: table holding the columns listed in `purity_estimators` plus the
        boolean column `in_unique_tumor_set`.

    "samples" counts use every row; "tumors" counts use only rows flagged in
    `in_unique_tumor_set`. Nothing is returned.
    """
    tumors = df[df['in_unique_tumor_set']]
    samples = df[purity_estimators]

    print(f'total number of CBTN samples: {len(samples)}')
    print(f'total number of CBTN tumors: {len(tumors)}')
    for estimator in samples.columns:
        n_samples = samples[estimator].count()
        n_tumors = tumors[estimator].count()
        print(f'samples with {estimator} estimate: {n_samples}')
        print(f'tumors with {estimator} estimate: {n_tumors}')

    # Rows with no missing value in any column.
    print(f'samples with estimates from all estimators: {len(samples.dropna())}')
    print(f'tumors with estimates from all estimators: {len(tumors.dropna())}')


report_purity_completeness(cbtn_purity_data)

In [None]:
# How concordant are the tumor purity estimates?
def compare_purity_estimators(df):
    """Plot a heatmap of pairwise Pearson correlations between purity estimators.

    df: table holding the columns listed in `purity_estimators` plus the
        boolean column `in_unique_tumor_set`.

    Only unique tumors with no missing value in any column are used.
    Returns the matplotlib (fig, ax) pair after showing the figure.
    """
    complete_tumors = df[df['in_unique_tumor_set']].dropna()
    estimates = complete_tumors[purity_estimators].rename(
        columns=lambda name: name.removeprefix("tumor_fraction_")
    )
    pairwise_r = estimates.corr(method='pearson')

    # Mask the cells strictly above the diagonal (k=1 leaves the diagonal visible).
    upper_triangle = np.triu(np.ones_like(pairwise_r, dtype=bool), k=1)
    palette = sns.diverging_palette(220, 10, as_cmap=True)

    fig, ax = plt.subplots(figsize=(5, 4))
    sns.heatmap(
        pairwise_r,
        mask=upper_triangle,
        cmap=palette,
        vmin=-1,
        vmax=1,
        annot=True,
        fmt=".2f",
        square=True,
        linewidths=.5,
        cbar_kws={"shrink": .8},
        ax=ax,
    )

    ax.set_title("Concordance between tumor purity estimators")
    plt.tight_layout()
    plt.show()
    return fig, ax


set_plot_defaults()
fig, ax = compare_purity_estimators(cbtn_purity_data)
savefig(fig, 'out/purity_estimator_concordance')

## Association with ecDNA

- boxplot of ecDNA as a function of tumor_purity:  
  `out/boxplot_ecDNA_tumor_purity`
- likelihood ratio test to see if adding tumor_purity significantly improves the model.
  https://www.statology.org/likelihood-ratio-test-in-python/
    - LRT for THetA2 on base model of age, sex, extent of resection, cancer type: p=0.35, n=681
    - LRT for RFpurify_ABSOLUTE on base model of age, sex, extent of resection, cancer type: p=0.002, n=631
    - LRT for THetA2 on base model of age, sex, cancer type: p=0.12, n=2072

In [None]:
# Load THetA2 data for full cohort
def load_theta2_data(cbtn_data, sj_data):
    """Return the project biosample table with a `tumor_fraction_THetA2` column.

    cbtn_data: table with a `tumor_fraction_THetA2` column; values are copied
               over by index alignment.
    sj_data:   table with a `TumorFraction` column; its values are written to
               the rows whose index labels appear in `sj_data.index`.

    Rows are not restricted to the unique-tumor set here; callers filter on
    `in_unique_tumor_set` themselves.
    """
    purity_col = 'tumor_fraction_THetA2'
    biosamples = data_imports.import_biosamples()
    biosamples[purity_col] = cbtn_data[purity_col]
    biosamples.loc[sj_data.index, purity_col] = sj_data['TumorFraction']
    return biosamples


sj_data = pd.read_excel(
    '../manuscript/collaborators/Rishaan/SJ_tumor_purity.xlsx',
    index_col='biosample_id',
)
full_df = load_theta2_data(df, sj_data)

# b: biosamples with a THetA2 estimate; t: the unique tumors among them.
b = full_df[full_df['tumor_fraction_THetA2'].notna()]
t = b[b.in_unique_tumor_set]
print(f'Biosamples analyzed with THetA2: {len(b)}')
print(f'Tumors analyzed with THetA2: {len(t)}')

In [None]:
# How many SJ samples analyzed by THetA2?
full_df.head(n=1)
# sjb: rows whose cohort starts with 'SJC' and that have a THetA2 estimate.
sjb = full_df[
    full_df['cohort'].str.startswith('SJC')
    & full_df['tumor_fraction_THetA2'].notna()
]
# sjt: the unique tumors among those biosamples.
sjt = sjb[sjb.in_unique_tumor_set]
print(f'SJ biosamples analyzed with THetA2: {len(sjb)}')
print(f'SJ tumors analyzed with THetA2: {len(sjt)}')

In [None]:
from scipy.stats import f_oneway

def plot_ecDNA_vs_tumor_purity(data):
    """Box-plot THetA2 tumor purity per amplicon class and print a one-way ANOVA.

    data: table with `in_unique_tumor_set`, `amplicon_class` and
          `tumor_fraction_THetA2` columns.

    Only unique tumors with both an amplicon class and a THetA2 estimate are
    used. Returns the matplotlib (fig, ax) pair.
    """
    purity_col = 'tumor_fraction_THetA2'
    tumors = data[data['in_unique_tumor_set']]
    tumors = tumors.dropna(subset=['amplicon_class', purity_col])

    # one-way ANOVA across amplicon classes
    purity_by_class = [
        members[purity_col].values
        for _, members in tumors.groupby('amplicon_class')
    ]
    f_stat, p_value = f_oneway(*purity_by_class)
    print(f"F-statistic: {f_stat}, p-value: {p_value}")

    # figure, with classes in sorted order along the x axis
    class_order = sorted(tumors['amplicon_class'].unique())
    fig, ax = plt.subplots(figsize=(5, 4))
    sns.boxplot(
        data=tumors,
        x='amplicon_class',
        y=purity_col,
        order=class_order,
        ax=ax,
    )
    plt.tight_layout()
    sns.despine()
    return fig, ax


set_plot_defaults()
fig, ax = plot_ecDNA_vs_tumor_purity(full_df)
savefig(fig, 'out/boxplot_ecDNA_tumor_purity')

In [None]:
"""
Copyright 2017 Ronald J. Nowling
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

def likelihood_ratio_test(features_alternate, labels, lr_model, features_null=None):
    """
    Likelihood ratio test of a logistic model against a nested null model.

    The alternate model is `lr_model` fitted on `features_alternate`. The null
    model is `lr_model` fitted on `features_null`, or, when `features_null` is
    not given, an intercept-only model that predicts the observed class
    frequencies for every example.

    Parameters:
    features_alternate: 2-D array-like (examples x features) for the alternate model.
    labels:             1-D array-like of class labels (two or more classes).
    lr_model:           unfitted classifier exposing `fit` and `predict_proba`;
                        it is refitted in place and left fitted on
                        `features_alternate`.
    features_null:      optional 2-D array-like for the null model. Its columns
                        must be a subset of those in `features_alternate` -- it
                        can not contain features that are not in
                        `features_alternate`.

    Returns:
    The p-value of the test, which can be used to accept or reject the null
    hypothesis.

    Raises:
    ValueError: if `features_null` does not have fewer columns than
                `features_alternate`.

    Degrees of freedom are (number of extra feature columns) x (n_classes - 1):
    a multinomial logistic model estimates one coefficient per feature for each
    non-reference class. For two classes this reduces to the number of extra
    columns.
    NOTE(review): column counts only equal free parameters when the feature
    columns are not linearly dependent; with full one-hot encodings the
    intercept-only comparison may overstate the degrees of freedom -- confirm
    before relying on it.
    """
    labels = np.array(labels)
    features_alternate = np.array(features_alternate)

    # np.unique returns the classes sorted, the same column order that
    # predict_proba and log_loss use for label-encoded classes.
    classes, class_counts = np.unique(labels, return_counts=True)
    params_per_feature = max(len(classes) - 1, 1)

    if features_null is not None:
        features_null = np.array(features_null)

        if features_null.shape[1] >= features_alternate.shape[1]:
            raise ValueError("Alternate features must have more features than null features")

        lr_model.fit(features_null, labels)
        null_prob = lr_model.predict_proba(features_null)
        extra_features = features_alternate.shape[1] - features_null.shape[1]
    else:
        # Intercept-only null: each example is assigned the empirical class
        # frequencies. Works for any label values, not just 0/1.
        class_freqs = class_counts / float(labels.shape[0])
        null_prob = np.ones((labels.shape[0], 1)) * class_freqs
        extra_features = features_alternate.shape[1]

    dof = extra_features * params_per_feature

    lr_model.fit(features_alternate, labels)
    alt_prob = lr_model.predict_proba(features_alternate)

    # log_loss with normalize=False is the summed negative log-likelihood.
    alt_log_likelihood = -log_loss(labels,
                                   alt_prob,
                                   normalize=False)
    null_log_likelihood = -log_loss(labels,
                                    null_prob,
                                    normalize=False)

    G = 2 * (alt_log_likelihood - null_log_likelihood)
    p_value = chi2.sf(G, dof)

    return p_value

In [None]:
def test_purity(asdf,variable,include_extent_of_resection=True):
    """Likelihood ratio test: does adding `variable` improve prediction of amplicon class?

    asdf:     table with `in_unique_tumor_set`, `amplicon_class`, the base
              covariates and `variable`.
    variable: name of the column added to the base covariates in the
              alternate model.
    include_extent_of_resection: when True, `extent_of_tumor_resection` is a
              base covariate and resection categories with fewer than 5 rows
              are dropped.

    Prints the cancer types kept and the number of rows analysed, and returns
    the p-value from `likelihood_ratio_test`.
    """
    base_covariates = ['sex', 'age_at_diagnosis']
    if include_extent_of_resection:
        base_covariates.append('extent_of_tumor_resection')
    base_covariates.append('cancer_type')
    full_covariates = base_covariates + [variable]

    tumors = asdf[asdf['in_unique_tumor_set']]
    data = tumors[full_covariates + ['amplicon_class']].dropna()

    # drop rare values
    if include_extent_of_resection:
        resection_counts = data.extent_of_tumor_resection.value_counts()
        common_resections = resection_counts.loc[resection_counts >= 5].index
        data = data[data.extent_of_tumor_resection.isin(common_resections)]

    # keep only tumor types in which every remaining amplicon class occurs at least once
    n_classes = len(set(data['amplicon_class'].unique()))
    classes_per_type = data.groupby('cancer_type')['amplicon_class'].nunique()
    complete_types = classes_per_type[classes_per_type == n_classes].index
    print(complete_types)
    data = data[data['cancer_type'].isin(complete_types)]
    print(len(data))

    # outcome plus dummy-encoded design matrices for the null and alternate models
    outcome = data.amplicon_class
    null_design = pd.get_dummies(data[base_covariates])
    alternate_design = pd.get_dummies(data[full_covariates])

    # unpenalized logistic regression, refitted for each model inside the test
    classifier = LogisticRegression(penalty=None, solver='newton-cg')

    return likelihood_ratio_test(
        features_alternate=alternate_design,
        features_null=null_design,
        lr_model=classifier,
        labels=outcome,
    )

In [None]:
# Likelihood ratio test p-value for adding THetA2 purity to the base model
# (sex, age, extent of resection, cancer type), on the CBTN biosample table.
test_purity(df,'tumor_fraction_THetA2')

In [None]:
# Likelihood ratio test p-value for adding RFpurify_ABSOLUTE purity to the base
# model (sex, age, extent of resection, cancer type), on the CBTN biosample table.
test_purity(df,'tumor_fraction_RFpurify_ABSOLUTE')

In [None]:
# Likelihood ratio test p-value for adding THetA2 purity on the full table
# (CBTN plus SJ estimates); extent of resection is left out of the base model.
test_purity(full_df,'tumor_fraction_THetA2',include_extent_of_resection=False)

## Remove 'low-purity' tumors?

What happens if we remove 'low-purity' tumors?
- Lose 257 tumor samples
- ecDNA fraction increases for most tumor types
- Lose only ecDNA+ LGG
- Drop SJST031620_D1? Conflicting metadata, might be rhabdomyosarcoma

What do the ecDNA amps look like in 'low-purity' tumors?
- BS_DYA4EMPF: clear high-copy CDK4 ecDNA
- BS_2296JPP5: clear high-copy BIRC3 ecDNA
- BS_E1FCN4JK: 2 clear high-copy ecDNA
- SJOS001110_D1: low-copy ecDNA present
- SJLGG040_D: low-copy ecDNA present
- SJHGG030230_D1: Could be FP
- *SJHGG030230_R1: Same amp, complex rearrangements. Could be ecDNA or CNC.
- SJRHB031244_D2: Messy BP graph, lots of edges mapping to no change in CN.
- *SJRHB031244_D1: 2 clear ecDNA.
- SJST031395_D1: clear ecDNA
- SJST031620_D2: No amp
- SJST031620_D1: Clear MYCL ecDNA, but metadata ambiguously classify as Wilms or rhabdomyosarcoma.
- SJNBL031668_D1: Messy BP graph, high copy, probably heterogeneous ecDNA
- *SJNBL031668_D2: Same
- SJHGG031966_D1: Messy BP graph, probably ecDNA
- SJHGG032168_D1: low-copy PDGFB ecDNA present

\* Passes purity threshold but paired with sample that did not.

In [None]:
def examine_low_purity_tumors(df,threshold=0.1,metric='tumor_fraction_THetA2'):
    """Summarise the rows whose purity is below `threshold` and return the ecDNA ones.

    df:        table with `metric`, `cancer_type`, `amplicon_class` and
               `patient_id` columns.
    threshold: rows with `metric` strictly below this value count as
               low-purity; rows with a missing value never do.
    metric:    name of the purity column to compare against `threshold`.

    Prints the number of low-purity rows and their counts per
    (cancer_type, amplicon_class), then returns the low-purity rows whose
    amplicon_class is 'ecDNA'.
    """
    low_purity = df[df[metric] < threshold]
    print(f"'Low-purity' tumors to be removed at threshold {threshold}: {len(low_purity)}")
    # Computed outside the f-string: reusing double quotes inside a
    # double-quoted f-string is a SyntaxError before Python 3.12.
    counts = low_purity.groupby(['cancer_type', 'amplicon_class'])['patient_id'].count()
    print(f"Low-purity tumors by {counts}")
    return low_purity[low_purity.amplicon_class == 'ecDNA']

In [None]:
# ecDNA-classified rows of the full table that fall below the default purity threshold (0.1).
ec_subset = examine_low_purity_tumors(full_df)

In [None]:
# Index labels of the low-purity ecDNA rows (not used further in this cell).
lq_tumors = ec_subset.index
# Patients owning those rows.
lq_patients = ec_subset.patient_id
# Show every row of those patients, so other samples from the same patient can be compared.
full_df[full_df.patient_id.isin(lq_patients)]

In [None]:
def find_suspected_duplicates(df):
    """
    Identify suspected biological duplicate samples based on metadata.

    Rows are grouped by patient_id and each patient is checked in this order:
    1. SJ rule: rows with tumor_history == "Diagnosis" whose cohort starts
       with "SJC". If there are two or more, those rows are returned for the
       patient and the CBTN rule is not evaluated.
    2. CBTN rule: rows whose cohort is one of "PBTA-X00", "PBTA-X01" or "PNOC"
       and whose age_at_diagnosis occurs more than once among the patient's
       rows (repeats are looked for across all of the patient's rows, not only
       the CBTN ones). Returned if there are two or more.
    Patients matching neither rule contribute no rows.

    Parameters:
    df (pd.DataFrame): A DataFrame containing at least the following columns:
                       'patient_id', 'tumor_history', 'cohort', 'age_at_diagnosis'

    Returns:
    pd.DataFrame: Subset of df containing suspected duplicate samples, with
                  'patient_id' re-attached as a column.
    """

    def select_duplicates(group):
        # SJ rule: several "Diagnosis" samples from an SJC* cohort.
        sj_case = group[
            (group['tumor_history'] == 'Diagnosis') &
            (group['cohort'].str.startswith('SJC'))
        ]
        if len(sj_case) >= 2:
            return sj_case

        # CBTN rule: CBTN-cohort rows whose age_at_diagnosis is repeated
        # within this patient's rows (keep=False flags every occurrence).
        cbtn_case = group[
            (group['cohort'].isin(['PBTA-X00','PBTA-X01','PNOC'])) &
            group['age_at_diagnosis'].duplicated(keep=False)
        ]
        if len(cbtn_case) >= 2:
            return cbtn_case
        # Neither rule matched: returns None.
        # NOTE(review): relies on groupby.apply skipping None results -- confirm
        # for the pandas version in use.
        return
    # include_groups=False keeps patient_id out of the frames passed to
    # select_duplicates, so the column is saved here and added back below;
    # the assignment aligns on the row index.
    patient_ids = df.patient_id.copy()
    suspected = df.groupby('patient_id', group_keys=False).apply(select_duplicates,include_groups=False)
    suspected['patient_id'] = patient_ids
    return suspected

In [None]:
# Suspected biological duplicates across the full table; `dups` is reused by later cells.
dups = find_suspected_duplicates(full_df)
# Count of duplicate rows that are in the unique-tumor set.
print(len(dups[dups.in_unique_tumor_set])) # 197 tumors sampled more than once

In [None]:
def find_variable_duplicates(df, tumor_purity_col='tumor_fraction_THetA2', purity_threshold=0.1):
    """
    From a dataframe of biological duplicates, return samples where:
    - And at least one sample in the group is <= 0.1 tumor_purity while another is > 0.1,
    """

    suspected = find_suspected_duplicates(df)

    def is_discordant(group):
        purity = group[tumor_purity_col]
        crosses_threshold = (purity <= purity_threshold).any() and (purity > purity_threshold).any()
        if crosses_threshold:
            return group
        else:
            return pd.DataFrame(columns=group.columns)

    discordant = (
        df.groupby("patient_id", group_keys=False)
        .apply(is_discordant)
        .reset_index(drop=True)
    )

    return discordant


In [None]:
# Duplicate sets in which one sample is at or below the purity threshold and another is above it.
find_variable_duplicates(dups)

In [None]:
def find_discordant_duplicates(df, tumor_purity_col='tumor_fraction_THetA2', class_col='amplicon_class', purity_threshold=0.1):
    """
    Return duplicates that straddle the purity threshold AND disagree on class.

    `df` is expected to already hold only suspected biological duplicates (the
    notebook passes the output of find_suspected_duplicates), so duplicates
    are not recomputed here.

    A patient is kept when both conditions hold:
    - at least one of their samples has `tumor_purity_col` <=
      `purity_threshold` while another has a value above it, and
    - their samples carry more than one distinct `class_col` value.
    All rows of such patients are returned.

    Parameters:
    df (pd.DataFrame): table with 'patient_id', `tumor_purity_col` and
                       `class_col` columns.
    tumor_purity_col (str): name of the purity column.
    class_col (str): name of the classification column.
    purity_threshold (float): purity value separating low from high purity.

    Returns:
    pd.DataFrame: rows of the selected patients, with a fresh 0..n-1 index
                  (the original index labels are dropped).
    """

    def is_discordant(group):
        purity = group[tumor_purity_col]
        classes = group[class_col]

        crosses_threshold = (purity <= purity_threshold).any() and (purity > purity_threshold).any()
        # nunique ignores missing values, so a missing class does not count
        # as a differing classification.
        differing_classes = classes.nunique() > 1

        if crosses_threshold and differing_classes:
            return group
        # Zero-row slice rather than a bare DataFrame: keeps the column
        # dtypes, so the concatenated result does not depend on how empty
        # frames are handled.
        return group.iloc[:0]

    # NOTE(review): recent pandas versions may exclude the grouping column from
    # the frames handed to apply -- confirm 'patient_id' is still present in
    # the output for the pandas version in use.
    discordant = (
        df.groupby("patient_id", group_keys=False)
        .apply(is_discordant)
        .reset_index(drop=True)
    )

    return discordant


In [None]:
# Duplicate sets that straddle the purity threshold and also differ in amplicon_class.
find_discordant_duplicates(dups)
# Recorded result: 2 tumors with differing classifications

In [None]:
# Suspected duplicate rows that belong to the unique-tumor set.
dups[dups.in_unique_tumor_set]

In [None]:
def estimate_fn_low_purity_tumors(df,threshold=0.1,metric='tumor_fraction_THetA2'):
    """Return low-purity unique tumors that are not among the suspected duplicates.

    df:        table with `metric` and `in_unique_tumor_set` columns.
    threshold: rows with `metric` strictly below this value are kept.
    metric:    name of the purity column.

    Rows whose index label appears in the notebook-level `dups` table are
    excluded, then only rows flagged in `in_unique_tumor_set` are returned.
    NOTE(review): "fn" presumably stands for false negatives -- confirm.
    """
    below_threshold = df.loc[df[metric] < threshold]
    not_duplicated = below_threshold.loc[~below_threshold.index.isin(dups.index)]
    return not_duplicated.loc[not_duplicated.in_unique_tumor_set]


estimate_fn_low_purity_tumors(full_df)

In [None]:
# NOTE(review): ad-hoc arithmetic; the notebook does not record what 216, 2 and 33
# stand for -- label the operands or derive them from the tables above.
216*2/33

In [None]:
# NOTE(review): ad-hoc proportion; the notebook does not record what 18 and 13
# stand for -- label the operands or derive them from the tables above.
18/(18+13)