# preprocess-rodriguezfos-data.ipynb
This notebook preprocesses data received from Elias Rodriguez-Fos and Anton Henssen from their paper "Mutational topography reflects clinical neuroblastoma heterogeneity", Cell Genomics, 2023.  Preprocessing is done in Python because the existing preprocessing code is in Python; survival analysis is done in R because the existing survival analysis code is in R; hence, 2 notebooks.

## Validation
Note that this dataset contains 8 tumors annotated as 'ecDNA (MYCN)' in the publication for which AmpliconClassifier (AC) returns an ecDNA-negative classification:  
| Patient ID  | MYCN status | ecDNA (MYCN) | HSR (MYCN) |   amplicon_class | AC result |
|-------------|-------------|--------------|------------|------------------|-----------|
| CB2022      | amplified   |       yes    |     no | intrachromosomal | CNC at MYCN |
| CB2024      | amplified   |       yes    |     no | intrachromosomal | CNC, elsewhere |
| CB2031      | amplified   |       yes    |     no | no amplification | No amp, no genes |
| CB2042      | amplified   |       yes    |     no | no amplification | Invalid, no genes|
| CB2045      | amplified   |       yes    |     no | no amplification | No amp, no genes |
| NBL26       | amplified   |       yes    |     no | intrachromosomal | CNC at MYCN |
| NBL31       | amplified   |       yes    |     no | no amplification | No amp, no genes |
| NBL32       | amplified   |       yes    |     no | intrachromosomal | CNC at MYCN |

Going to tentatively use the AC classifications for consistency, but maybe bring it up with Elias and Anton.  

TODO: Maybe run a clustering to confirm that there are no duplicated samples between our cohort and theirs.

## Outputs
Writes "processed_nbl_survival_data.tsv" to `./out`. Required for NBL-survival.ipynb.

In [None]:
import pandas as pd
import numpy as np
import warnings

# import data_imports.py
import sys
sys.path.append('../src')
from data_imports import *

In [None]:
# Patient table restricted to rows whose cancer_type is 'NBL'; preview the first rows.
pts = import_patients().loc[lambda patients: patients["cancer_type"] == 'NBL']
pts.head()

In [None]:
# Load the gene-level table (import_genes is presumably provided by data_imports)
# and preview the first rows.
genes = import_genes()
genes.head()

In [None]:
def load_rodriguezfos_clinical_data(path,clean=False):
    '''
    Read the clinical table from Rodriguez Fos et al, 2023, indexed by its first column.

    path: CSV file, passed straight to pd.read_csv.
    clean: if False, return the table as read. If True, rename, recode and prune
        columns into the layout expected by preprocess_rodriguezfos_data.
    '''
    table = pd.read_csv(path)
    if not clean:
        return table.set_index(table.columns[0])

    # Source column name -> name used downstream.
    renamed_columns = {
        "Patient ID":"patient_id",
        "Sex":"sex",
        "Age (days)":"age_at_diagnosis",
        "Overall survival":"OS_days",
        "Status DOA":"OS_status",
        "MYCN status":"MYCN_amp",
    }
    # Per-column value recodings (applied after renaming) to unify terminology.
    recoded_values = {
        "sex":{
            "M":"Male",
            "F":"Female",
            "na":np.nan,
        },
        "OS_status":{
            1:"Deceased",
            0:"Alive",
        },
        "MYCN_amp":{
            "amplified":True,
            "non_amp":False,
        },
    }
    # Columns not used downstream; remove an entry here to keep that column.
    # "MYCN status" is not listed because it is kept, renamed to MYCN_amp.
    unused_columns = [
        "Risk group",
        "Age (class)",
        "Stage of the disease (INSS)",
        "Status",
        "ecDNA (MYCN)",
        "HSR (MYCN)",
        "OS_days",
    ]

    table = table.rename(columns=renamed_columns)
    table = table.replace(recoded_values).infer_objects(copy=False)
    # Convert survival from days to months before OS_days is dropped.
    table["OS_months"] = table["OS_days"] / 365.25 * 12
    table = table.drop(columns=unused_columns)
    # Constant columns expected downstream.
    table["cohort"] = "Rodriguez_Fos_2023"
    table["cancer_type"] = "NBL"
    table["cancer_subclass"] = np.nan
    return table.set_index(table.columns[0])


def load_rodriguezfos_gene_amp_data(path):
    '''
    Read a headerless, tab-separated gene table, labelling its columns
    gene, sv_class, risk_group and patient_id (in that order).
    '''
    column_names = ["gene","sv_class","risk_group","patient_id"]
    return pd.read_csv(path, sep='\t', names=column_names)

def preprocess_rodriguezfos_data(path_to_clinical_data,
                                 path_to_ampliconclassifier_data,
                                 path_to_rodriguezfos_gene_amp_data):
    '''
    Build the patient-indexed table for the Rodriguez-Fos et al. cohort in the format
    used in this publication, with the following columns:
        patient_id sex age_at_diagnosis cohort cancer_type cancer_subclass amplicon_class OS_status OS_months
    plus MYCN_amp (published MYCN status) and MYCN_amp_AC (MYCN present in the gene table).

    path_to_clinical_data: clinical CSV, loaded with clean=True.
    path_to_ampliconclassifier_data: passed to annotate_amplicon_class.
    path_to_rodriguezfos_gene_amp_data: tab-separated gene table used for MYCN_amp_AC.
    '''
    clinical = load_rodriguezfos_clinical_data(path=path_to_clinical_data,clean=True)
    # Step 1: amplification classes.
    with_classes = annotate_amplicon_class(clinical,path_to_ampliconclassifier_data)
    # Step 2: MYCN amp status derived from the gene table.
    gene_amps = load_rodriguezfos_gene_amp_data(path_to_rodriguezfos_gene_amp_data)
    return annotate_mycn_amp(with_classes,gene_amps)

def sample_patient_map():
    '''
    Return a dict of sample id -> patient id for our dataset: the patient_id column
    of the biosample table, keyed by that table's index.
    '''
    biosamples = import_biosamples()
    return biosamples["patient_id"].to_dict()

def load_this_gene_amp_data():
    '''
    Load our gene table and add a patient_id column, looked up from each row's
    sample_name via sample_patient_map().
    '''
    gene_table = import_genes()
    sample_to_patient = sample_patient_map()
    gene_table['patient_id'] = gene_table['sample_name'].map(sample_to_patient)
    return gene_table

def annotate_mycn_amp(patient_df,gene_df):
    '''
    Flag patients that have a MYCN row in a gene table.

    patient_df: table row-indexed by patient id.
    gene_df: table with 'gene' and 'patient_id' columns.

    Returns a new table equal to patient_df plus a boolean 'MYCN_amp_AC' column that is
    True where the row's index appears among the patient ids of the MYCN rows.
    The input is not modified, so passing a filtered slice of another table does not
    trigger SettingWithCopyWarning or leak the new column back to the caller's frame.
    '''
    is_mycn = gene_df['gene'] == 'MYCN'
    mycn_amp_patients = gene_df.loc[is_mycn, 'patient_id']
    return patient_df.assign(MYCN_amp_AC=patient_df.index.isin(mycn_amp_patients))

def preprocess_our_data():
    '''
    Build the patient table for our own cohort: rows of import_patients() with
    cancer_type 'NBL', plus MYCN_amp_AC (from our gene table) and MYCN_amp.
    '''
    df = import_patients()
    # Take an explicit copy of the filtered rows: columns are added below, and
    # assigning onto a boolean-filtered slice raises SettingWithCopyWarning.
    df = df[df.cancer_type=='NBL'].copy()
    genes = load_this_gene_amp_data()
    df = annotate_mycn_amp(df,genes)
    df['MYCN_amp'] = df['MYCN_amp_AC'] # we only have amp calls from ac, so propagate these.
    return df

In [None]:
# Show every row when a DataFrame is displayed in this notebook.
pd.set_option('display.max_rows', None)
# Input files for the Rodriguez-Fos et al. cohort: clinical table, AmpliconClassifier
# results, and the gene table used to derive MYCN_amp_AC.
path_to_clinical_data='../data/cloud/RodriguezFos2023/clinicaldata_neuroblastoma_Rodriguez-Fos_etal_MYCNampstatus_25.02.17.csv'
path_to_ampliconclassifier_data='../data/cloud/RodriguezFos2023/AmpliconClassifier_results_RodriguezFos_etal/input_file_classif_amplicons_AA_amplicon_classification_profiles.tsv'
path_to_gene_amp_data='../data/cloud/RodriguezFos2023/table_GENESoverlapingcomplexrearrangements_INTERSECT_114patients_infopatient_inforiskgroup_proteincoding.txt'
# Preprocess both cohorts and stack them row-wise into one table.
rf_data = preprocess_rodriguezfos_data(path_to_clinical_data,path_to_ampliconclassifier_data,path_to_gene_amp_data)
our_data = preprocess_our_data()
data = pd.concat([rf_data,our_data])
# Write the combined table as TSV; the path is relative to the working directory.
data.to_csv("out/processed_nbl_survival_data.tsv",sep='\t')

In [None]:
# Display the combined table for visual inspection.
data

In [None]:
def validate_rodriguezfos_data(path_to_clinical_data,
                               path_to_ampliconclassifier_data):
    '''
    Cross-check the published MYCN ecDNA annotations against the AC classifications.

    The uncleaned clinical table is annotated with amplicon classes; every sample whose
    "ecDNA (MYCN)" is 'yes' but whose amplicon_class is not 'ecDNA' counts as a problem.
    If there are any, a warning is raised and the offending rows are printed;
    otherwise "all clear" is printed. Returns None.
    '''
    raw = load_rodriguezfos_clinical_data(path=path_to_clinical_data,clean=False)
    annotated = annotate_amplicon_class(raw,path_to_ampliconclassifier_data)
    published_ecdna = annotated["ecDNA (MYCN)"]=='yes'
    ac_disagrees = annotated["amplicon_class"]!='ecDNA'
    problems = annotated[published_ecdna & ac_disagrees]
    if problems.empty:
        print("all clear")
        return
    warnings.warn("Some samples have inconsistent ecDNA annotations:")
    report_columns = ['MYCN status','ecDNA (MYCN)','HSR (MYCN)','amplicon_class']
    print(problems[report_columns])

# Run the consistency check on the Rodriguez-Fos inputs; this reuses the path
# variables defined in the preprocessing cell of this notebook.
validate_rodriguezfos_data(path_to_clinical_data,path_to_ampliconclassifier_data)