# Explore the effects of tumor purity on ecDNA annotations.
**Please remember not to commit data or visualizations to GitHub!**

The best way to do this is to clear the outputs of all cells before saving and committing changes to a notebook, and to make good use of the .gitignore file.

## Requirements
Data:
- `../data/cloud/opentarget/histologies.tsv` (get this file from OneDrive/2023-pedpancan/data/opentarget/histologies.tsv)

Software:
- pandas
- numpy
- seaborn
- scipy
- sklearn

## Results
- Logistic regression figure saved to `out/TODO`
- Tumor purity has no significant effect on ecDNA prediction (likelihood ratio test, p=0.20, comparing logistic regression models with and without tumor purity, both including covariates for sex, age, tumor type and extent of resection; n=1020 samples).

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import numpy as np
from scipy.stats import chi2
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression

import warnings
import sys
from pathlib import Path

# Add ../src (relative to the working directory) to the module search path.
sys.path.append('../src')
# Create the local output directory; no error if it already exists.
Path("out").mkdir(parents=True, exist_ok=True)

# Project-local module; presumably lives in ../src, hence imported only after
# the sys.path change above -- TODO confirm location.
import data_imports

# Do not truncate columns when DataFrames are displayed.
pd.set_option('display.max_columns', None)

In [None]:
# Build the biosample table through the data_imports helpers.  Only the known
# "differs between CAVATICA and opentarget annotations" warning is silenced, and
# only while the helpers run: catch_warnings() restores the previous filter list
# on exit, whereas warnings.resetwarnings() would discard *every* filter,
# including ones installed by the notebook kernel or by the user.
with warnings.catch_warnings():
    warnings.filterwarnings('ignore', '.*differs between CAVATICA and opentarget annotations.*')
    df = data_imports.generate_cbtn_biosample_table(verbose=1)
    df = data_imports.unify_tumor_diagnoses(df, include_HM=False)
    df = data_imports.clean_tumor_diagnoses(df)
    df = data_imports.annotate_with_ecDNA(df)
    df = data_imports.annotate_amplicon_class(df)
    df = data_imports.annotate_duplicate_biosamples(df)

# Keep only the rows flagged in_unique_patient_set (presumably one biosample
# per patient, set by annotate_duplicate_biosamples -- TODO confirm).
df = df[df.in_unique_patient_set]

In [None]:
# Preview the first rows of the biosample table.
df.head()

## Tumor fraction metrics meta-analysis
OpenPBTA has 4 (four!) annotations for tumor fraction:
- tumor_fraction (Theta2, from WGS)
- tumor_fraction_RFpurify_ABSOLUTE (random forest predicting ABSOLUTE estimates (SNP array))
- tumor_fraction_RFpurify_ESTIMATE (random forest predicting ESTIMATE estimates (RNA-seq or affy array))
- tumor_fraction_LUMP (from leukocyte-specific CpGs in 450k array)

We're going to use Theta2 for downstream analyses but it's useful to know how well these estimates agree with each other.

### TODO:
- report how complete these annotations are (of our cohort, how many have annotations for each of these metrics?)
- report pairwise Pearson's R between these values.

## Association with ecDNA

### TODO
- plot ecDNA as a function of tumor_purity using [seaborn regplot](https://seaborn.pydata.org/generated/seaborn.regplot.html)
### Done
- likelihood ratio test to see if adding tumor_purity significantly improves the model.
  https://www.statology.org/likelihood-ratio-test-in-python/  
  p = 0.20063803277943237, n = 1020

In [None]:
"""
Copyright 2017 Ronald J. Nowling
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

def likelihood_ratio_test(features_alternate, labels, lr_model, features_null=None):
    """
    Compute the likelihood ratio test for a model trained on the set of features in
    `features_alternate` vs a null model.  If `features_null` is not defined, then
    the null model simply uses the intercept (observed class frequencies).  Note
    that `features_null` must be a subset of `features_alternate` -- it can not
    contain features that are not in `features_alternate`.  Only the column counts
    are checked here; the subset relationship itself is the caller's responsibility.

    Parameters:
        features_alternate: 2-D array-like (samples x features) for the alternate model.
        labels: 1-D array-like of class labels, one per sample.
        lr_model: classifier exposing `fit(X, y)` and `predict_proba(X)`, e.g. an
            unpenalized sklearn `LogisticRegression`.  It is re-fitted in place and
            is left fitted on `features_alternate` when the function returns.
        features_null: optional 2-D array-like for the nested null model.

    Returns the p-value, which can be used to accept or reject the null hypothesis.

    Raises ValueError if `features_null` does not have strictly fewer columns than
    `features_alternate`.
    """
    labels = np.asarray(labels)
    features_alternate = np.asarray(features_alternate)
    n_samples = labels.shape[0]

    # np.unique returns the classes sorted, which is the column order log_loss
    # assumes for a 2-D probability matrix.
    classes, class_counts = np.unique(labels, return_counts=True)

    if features_null is not None:
        features_null = np.asarray(features_null)

        if features_null.shape[1] >= features_alternate.shape[1]:
            raise ValueError("Alternate features must have more features than null features")

        lr_model.fit(features_null, labels)
        null_prob = lr_model.predict_proba(features_null)
        n_added_features = features_alternate.shape[1] - features_null.shape[1]
    else:
        # Intercept-only null: every sample is assigned the observed class
        # frequencies.  For 0/1 labels this equals the former sum(labels) / n
        # estimate, but it also works for string-coded or multi-class labels.
        null_prob = np.tile(class_counts / float(n_samples), (n_samples, 1))
        n_added_features = features_alternate.shape[1]

    lr_model.fit(features_alternate, labels)
    alt_prob = lr_model.predict_proba(features_alternate)

    # With normalize=False, log_loss is the summed negative log-likelihood.
    alt_log_likelihood = -log_loss(labels, alt_prob, normalize=False)
    null_log_likelihood = -log_loss(labels, null_prob, normalize=False)

    # A multinomial model with K classes gains K-1 free parameters per added
    # feature (exactly 1 in the binary case, as before).  This assumes lr_model
    # fits a multinomial/softmax model when there are more than two classes.
    dof = n_added_features * max(len(classes) - 1, 1)

    statistic = 2 * (alt_log_likelihood - null_log_likelihood)
    return chi2.sf(statistic, dof)

In [None]:
# Covariate sets: v1 feeds the null model, v2 adds tumor_fraction for the
# alternate model.  Rows missing any covariate or the outcome are dropped.
v1 = ['sex', 'age_at_diagnosis', 'extent_of_tumor_resection', 'cancer_type']
v2 = [*v1, 'tumor_fraction']
data = df[[*v2, 'amplicon_class']].dropna()

# Discard resection categories observed fewer than 5 times.
cts = data['extent_of_tumor_resection'].value_counts()
keep = cts.loc[cts.ge(5)].index
data = data.loc[data['extent_of_tumor_resection'].isin(keep)]

# Restrict to the listed tumor types.
keep = ['LGG', 'CPG', 'CPT', 'EMBT', 'EPN', 'ETMR', 'GCT', 'HGG', 'MBL', 'NBL', 'PINT']
# Alternative subset: keep=['CPT','EPN','GCT','HGG','MBL','NBL','PINT']
data = data.loc[data['cancer_type'].isin(keep)]

# Outcome and one-hot encoded design matrices for the null (x1) and
# alternate (x2) models.
y = data['amplicon_class']
x1 = pd.get_dummies(data[v1])
x2 = pd.get_dummies(data[v2])

# Unpenalized logistic regression, so the fits are plain maximum likelihood.
model = LogisticRegression(penalty=None, solver='newton-cg')

# Last expression of the cell: the likelihood ratio test p-value is displayed.
likelihood_ratio_test(
    features_alternate=x2,
    labels=y,
    lr_model=model,
    features_null=x1,
)

In [None]:
# (rows, columns) of the filtered table `data`; rows is the sample size n.
data.shape