# Primaries, relapses and longitudinal pairs

Sunita's and my counts for primary and secondary tumors are largely similar. However, my count
of longitudinal samples is about half of Sunita's.
The reason seems to be that she includes Autopsy samples in pairs, as well as secondary/secondary pairs. It is an open question whether this is the desired behavior; it certainly makes the definition of a longitudinal pair more confusing to describe.

In [None]:
import pandas as pd
pd.set_option('display.max_rows', 5)
pd.set_option('display.max_columns', None)
import numpy as np
import scipy.stats
import sys
sys.path.append('../src')
import data_imports

In [None]:
# Load the biosample table once; the functions in this notebook read this
# notebook-level name directly instead of taking it as a parameter.
BIOSAMPLES = data_imports.import_biosamples()
# BIOSAMPLES.head()
# BIOSAMPLES.tumor_history.unique()

# Count primary and secondary tumors with ecDNA

In [None]:
# how many diagnosis tumors have ecDNA? 126 / 1291
# Update 11/2024: 210 / 2558
def primaries_w_ecDNA(tumor_types=None,verbose=True):
    """
    Flag, per patient, whether any "Diagnosis" biosample is classified as ecDNA.

    Reads the notebook-level BIOSAMPLES table (columns used: tumor_history,
    cancer_type, patient_id, amplicon_class).

    Parameters
    ----------
    tumor_types : collection of str, optional
        When given, only biosamples whose cancer_type is in this collection
        are considered. None (default) keeps every cancer type.
    verbose : bool
        When True, print "<a> of <b> (<pct>%) primary tumors have ecDNA".
        Both counts are numbers of patients, not of biosamples.

    Returns
    -------
    pandas.DataFrame
        Indexed by patient_id, with a single boolean column primary_ecDNA.
    """
    keep = BIOSAMPLES.tumor_history == "Diagnosis"
    # Identity test: `tumor_types == None` is evaluated element-wise for
    # numpy arrays / pandas objects and cannot be used as an `if` condition.
    if tumor_types is not None:
        keep = keep & BIOSAMPLES.cancer_type.isin(tumor_types)
    diagnosis = BIOSAMPLES[keep]
    is_ecdna = diagnosis.amplicon_class == 'ecDNA'
    # One row per patient: True if at least one diagnosis biosample is ecDNA.
    flags = is_ecdna.groupby(diagnosis.patient_id).any().astype(bool)
    df1 = flags.rename("primary_ecDNA").to_frame()
    df1.index.name = "patient_id"
    if verbose:
        a = int(df1.primary_ecDNA.sum())
        b = len(df1)
        # Guard the percentage: b is 0 when no biosample passes the filters.
        pct = round(a/b*100,1) if b else 0.0
        print(f"{a} of {b} ({pct}%) primary tumors have ecDNA")
    return df1
# Per-patient ecDNA flags for diagnosis samples across all cancer types (also prints the summary line).
p = primaries_w_ecDNA()

In [None]:
# how many secondary tumors have ecDNA? 33 / 199
# Update 11/2024: 57 / 511
def secondaries_w_ecDNA(verbose=True):
    """
    Flag, per patient, whether any secondary biosample is classified as ecDNA.

    A biosample counts as secondary when its tumor_history is one of
    "Recurrence", "Progressive", "Relapse" or "Metastasis". Reads the
    notebook-level BIOSAMPLES table (columns used: tumor_history, patient_id,
    amplicon_class).

    Parameters
    ----------
    verbose : bool
        When True, print "<a> of <b> (<pct>%) secondary tumors have ecDNA".
        Both counts are numbers of patients, not of biosamples.

    Returns
    -------
    pandas.DataFrame
        Indexed by patient_id, with a single boolean column secondary_ecDNA.
    """
    secondary_labels = ["Recurrence","Progressive","Relapse","Metastasis"]
    secondary = BIOSAMPLES[BIOSAMPLES.tumor_history.isin(secondary_labels)]
    is_ecdna = secondary.amplicon_class == 'ecDNA'
    # One row per patient: True if at least one secondary biosample is ecDNA.
    flags = is_ecdna.groupby(secondary.patient_id).any().astype(bool)
    df2 = flags.rename("secondary_ecDNA").to_frame()
    df2.index.name = "patient_id"
    if verbose:
        a = int(df2.secondary_ecDNA.sum())
        b = len(df2)
        # Guard the percentage: b is 0 when the table holds no secondary samples.
        pct = round(a/b*100,1) if b else 0.0
        print(f"{a} of {b} ({pct}%) secondary tumors have ecDNA")
    return df2
# Per-patient ecDNA flags for secondary samples (also prints the summary line).
s = secondaries_w_ecDNA()

In [None]:
def test_primary_secondary_independence(p,s):
    '''
    Chi-square test of association between tumor stage (primary vs.
    secondary) and ecDNA status.

    p: frame indexed by patient with a boolean primary_ecDNA column.
    s: frame indexed by patient with a boolean secondary_ecDNA column.

    The test assumes unpaired observations, so a patient present in both
    frames is counted only once: the patient's row is kept in the primary
    arm and dropped from the secondary arm.

    Returns whatever scipy.stats.chi2_contingency returns for the 2x2 table
    [[primary ecDNA+, primary ecDNA-], [secondary ecDNA+, secondary ecDNA-]].
    '''
    unpaired = s.loc[~s.index.isin(p.index)]
    n_primary_pos = p.primary_ecDNA.sum()
    n_secondary_pos = unpaired.secondary_ecDNA.sum()
    contingency = [
        [n_primary_pos, len(p) - n_primary_pos],
        [n_secondary_pos, len(unpaired) - n_secondary_pos],
    ]
    return scipy.stats.chi2_contingency(contingency)
# Run the test on the tables built above; the result is shown as the cell output.
test_primary_secondary_independence(p,s)

# Longitudinal tumors

In [None]:
# longitudinal samples
def get_longitudinal_primary_secondary_pairs(verbose=True):
    """
    Return the patients that have both a primary and a secondary ecDNA flag.

    Inner-joins the per-patient tables from primaries_w_ecDNA() and
    secondaries_w_ecDNA() on their index, so the result has the columns
    primary_ecDNA and secondary_ecDNA. When verbose is True the number of
    joined rows is printed.
    """
    primaries = primaries_w_ecDNA(verbose=False)
    secondaries = secondaries_w_ecDNA(False)
    pairs = pd.merge(primaries, secondaries, how='inner', left_index=True, right_index=True)
    if verbose:
        print(f'{len(pairs)} primary/secondary pairs')
    return pairs
# Display the joined primary/secondary table (also prints the pair count).
get_longitudinal_primary_secondary_pairs()

In [None]:
pd.set_option('display.max_rows', 5)
# Whitelist of tumor_history labels accepted by get_cbtn_pairs(). Despite the
# name, it also contains 'Diagnosis' and 'Autopsy'.
SECONDARIES = ['Diagnosis','Progressive','Autopsy','Recurrence','Relapse','Metastasis'] #exclude second malignancies, no sample, unavailable
def get_sj_pairs():
    '''
    Collect candidate longitudinal cases from the SJ* and PNOC cohorts.

    A BIOSAMPLES row is kept when (1) its cohort starts with "SJ" or equals
    "PNOC", (2) its patient_id occurs more than once in BIOSAMPLES, and
    (3) among the rows passing (1) and (2), its patient has at least two
    distinct tumor_history values.

    NOTE(review): the working definition of a longitudinal case is "a
    diagnosis sample plus a non-diagnosis sample", but the filter only
    requires two distinct labels, so e.g. Relapse + Metastasis also
    qualifies. Confirm which is intended.

    Returns the kept rows sorted by patient_id, then tumor_history.
    '''
    def _has_two_or_more_labels(samples):
        return samples['tumor_history'].nunique() >= 2

    in_cohort = BIOSAMPLES.cohort.str.startswith("SJ") | (BIOSAMPLES.cohort == "PNOC")
    repeated_patient = BIOSAMPLES.duplicated('patient_id',keep=False)
    candidates = BIOSAMPLES[in_cohort & repeated_patient]
    kept = candidates.groupby('patient_id').filter(_has_two_or_more_labels)
    return kept.sort_values(["patient_id","tumor_history"])
def get_cbtn_pairs():
    '''
    Collect candidate longitudinal cases from the CBTN cohorts.

    A BIOSAMPLES row is kept when its cohort is "PBTA-X00" or "PBTA-X01", its
    tumor_history is listed in SECONDARIES, its patient_id occurs more than
    once in BIOSAMPLES, and, among the rows passing those filters, the
    patient's age_at_diagnosis values span at least 30 (max minus min;
    presumably days -- TODO confirm the unit).

    Returns the kept rows sorted by patient_id, then age_at_diagnosis.
    '''
    def _age_span_at_least_30(samples):
        ages = samples['age_at_diagnosis']
        return ages.max()-ages.min()>=30

    from_cbtn = BIOSAMPLES.cohort.isin(["PBTA-X00","PBTA-X01"])
    allowed_history = BIOSAMPLES.tumor_history.isin(SECONDARIES)
    repeated_patient = BIOSAMPLES.duplicated('patient_id',keep=False)
    candidates = BIOSAMPLES[from_cbtn & allowed_history & repeated_patient]
    kept = candidates.groupby('patient_id').filter(_age_span_at_least_30)
    return kept.sort_values(["patient_id","age_at_diagnosis"])
def get_longitudinal_cases(verbose=True):
    """
    Stack the CBTN and SJ/PNOC longitudinal biosamples into one table.

    The CBTN rows come first, followed by the SJ/PNOC rows. When verbose is
    True, print how many distinct patients are in the table and how many of
    them have at least one biosample whose amplicon_class is 'ecDNA'.
    """
    cases = pd.concat([get_cbtn_pairs(),get_sj_pairs()])
    if not verbose:
        return cases
    n_cases = cases.patient_id.nunique()
    n_ecdna = cases.loc[cases.amplicon_class == 'ecDNA', 'patient_id'].nunique()
    print(f"{n_ecdna} of {n_cases} longitudinal cases have ecDNA")
    return cases

#get_sj_pairs()
#get_cbtn_pairs()
# Build the combined table (prints the ecDNA summary) and display it.
longitudinal_cases = get_longitudinal_cases()
longitudinal_cases
# Summary lines recorded from earlier runs:
# 5/2024: 18 of 85 longitudinal cases have ecDNA
# 9/2024: 31 of 213 longitudinal cases have ecDNA

In [None]:
# Distinct patient_ids with at least one longitudinal biosample classified as ecDNA.
longitudinal_cases[longitudinal_cases.amplicon_class=='ecDNA'].patient_id.unique()

In [None]:
# Export the table to an Excel file; the path is relative to the working directory.
longitudinal_cases.to_excel("out/longitudinal_cases.xlsx")

In [None]:
def get_suppl_tbl_7():
    """Read the "7. Paired biosamples" sheet of the supplementary-tables workbook into a DataFrame."""
    workbook = data_imports.SUPPLEMENTARY_TABLES_PATH
    return pd.read_excel(workbook, sheet_name="7. Paired biosamples")

In [None]:
# Per evolution_class, count the non-null entries in every other column.
df = get_suppl_tbl_7()
df.groupby('evolution_class').count()

In [None]:
def get_longitudinal_tumors_with_multi_ecDNA():
    """Placeholder: not implemented yet. Does nothing and returns None."""
    return None
    

## Notes on tumors with multiple ecDNAs
- PT_7WYPEC3Q (SHH MBL, primary -> progressive)
  - (loss) chr17p11.2, CN 15 
  - (loss) chr17:28,683,354-29,500,780 (chr17q), CN 14
  - (gain) TERT, CN 26
  - (gain) PPM1D, CN 13
- PT_KTRJ8TFY (H3K27 DMG, primary -> progressive)
  - (gain) PICALM, CN 6
  - (gain) FLT3 2x / CDX2, CN 15
- PT_XA98HG1C (SHH MBL, primary -> progressive)
  - (gain) MYCN, CN 54
  - (loss) FHL2 partial, CN 7
  - (recombinant) CCND2 partial, CN 15 -> 118
- SJ004912 (OST, primary -> metastasis)
  - (loss) amp6 no oncogenes, CN 12
  - (loss) amp9 chr7 no oncogenes, only LOC124901577, CN 2
  - (loss) amp10, probable FP
- SJ000912
  - 

# AmpliconSimilarity Inputs

In [None]:
# TODO: these samples still need longitudinal analyses done.
def get_ampliconsimilarity_todos():
    """
    List the longitudinal biosamples that still lack a paired analysis.

    Returns the rows of get_longitudinal_cases() whose patient has at least
    one biosample with amplicon_class 'ecDNA' and whose patient_id does not
    appear in the table returned by get_suppl_tbl_7().
    """
    already_analyzed = set(get_suppl_tbl_7().patient_id)
    cases = get_longitudinal_cases(False)
    with_ecdna = set(cases.loc[cases.amplicon_class=='ecDNA', 'patient_id'])
    is_pending = cases.patient_id.isin(with_ecdna) & ~cases.patient_id.isin(already_analyzed)
    return cases[is_pending]
# Lift the display row limit so the whole to-do table is shown; the option stays
# in effect until it is set again.
pd.set_option('display.max_rows', None)
get_ampliconsimilarity_todos()

In [None]:
# Generate required input file for ampliconsimilarity: features_to_graph.txt file of the biosamples for longitudinal cases.

def get_features_to_graph(file='../data/source/AmpliconClassifier/pedpancan_features_to_graph.txt'):
    """
    Load a features_to_graph file: tab-separated, no header row.

    Returns a DataFrame whose two columns are named 'bed' and 'graph'.
    """
    column_names = ['bed','graph']
    return pd.read_csv(file, names=column_names, header=None, sep='\t')
def subset_features_to_graph(pairs_df):
    """
    Write one features_to_graph file per patient, restricted to that patient's biosamples.

    Parameters
    ----------
    pairs_df : pandas.DataFrame
        Must have a patient_id column; its index values are used as biosample
        identifiers and matched as substrings of the 'bed' column of the table
        returned by get_features_to_graph().

    For every patient whose biosamples match more than one row, the matching
    rows are written (tab-separated, no header, no index) to
    out/<patient_id>_features_to_graph.txt. Patients with zero or one matching
    row are skipped. Each patient_id is printed as it is processed.
    Returns None.
    """
    import re  # local import: `re` is not imported at notebook level

    #pairs_df = get_longitudinal_cases(verbose=False)
    ftg_df = get_features_to_graph()
    for pt in pairs_df.patient_id.unique():
        print(pt)
        bs_set = pairs_df[pairs_df.patient_id == pt].index
        # The identifiers are literal substrings, not regex fragments: escape
        # them so characters such as '.' or '+' cannot change the match.
        pattern = '|'.join(re.escape(str(bs)) for bs in bs_set)
        # na=False keeps the mask boolean if a 'bed' entry is missing.
        ftg_subset = ftg_df[ftg_df.bed.str.contains(pattern, na=False)]
        if len(ftg_subset) > 1:
            filepath=f'out/{pt}_features_to_graph.txt'
            ftg_subset.to_csv(filepath,sep='\t',header=False,index=False)
    return
# Write the per-patient features_to_graph files for the cases still awaiting analysis.
subset_features_to_graph(get_ampliconsimilarity_todos())

In [None]:
# Display the pending cases again.
get_ampliconsimilarity_todos()

In [None]:
import os
# Show the working directory; the relative paths used in this notebook ('out/', '../data/...') resolve against it.
os.getcwd()

In [None]:
# Parse output

def get_feature_similarity_scores(file="../data/source/AmpliconClassifier/pedpancan_feature_similarity_scores.tsv"):
    """Load a tab-separated feature-similarity score table; the first row supplies the column names."""
    return pd.read_csv(file, sep='\t')
# NOTE: rebinds the notebook-level name `df`, which other cells also assign.
df = get_feature_similarity_scores()

In [None]:
# Remove the display row limit, then preview the first rows of df.
pd.set_option('display.max_rows', None)
df.head()

# AmpliconSimilarity outputs

In [None]:
# Get AS outputs
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


class Breakpoint(object):
    """
    A junction between two genomic ends.

    Attributes:
      lchrom, lpos, lstrand -- chromosome, integer position and strand flag of
                               the first end
      rchrom, rpos, rstrand -- the same for the second end
      cn                    -- copy number; stored as given, not used by this class
    A truthy strand flag is rendered as '+' by __str__, a falsy one as '-'.
    """

    def __init__(self, lchrom, lpos, lstrand, rchrom, rpos, rstrand, cn):
        self.lchrom = lchrom
        self.lpos = int(lpos)
        self.lstrand = lstrand
        self.rchrom = rchrom
        self.rpos = int(rpos)
        self.rstrand = rstrand
        self.cn = cn

    def __str__(self):
        """Format as '<lchrom>:<lpos><+|->|<rchrom>:<rpos><+|->'."""
        return f"{self.lchrom}:{str(self.lpos)}{'+' if self.lstrand else '-'}|{self.rchrom}:{str(self.rpos)}{'+' if self.rstrand else '-'}"

    def d_similar(self, bp2, d):
        """
        Return True when bp2 joins the same chromosomes with the same strands
        and the summed position offset of its two ends is below d.

        The two ends of each breakpoint are sorted by (chromosome, position,
        strand) before they are paired, so the result does not depend on which
        end was stored as "left" and which as "right".
        """
        bp2_chrom_set = {bp2.lchrom, bp2.rchrom}
        if self.lchrom not in bp2_chrom_set or self.rchrom not in bp2_chrom_set:
            return False

        ends1 = sorted([(self.lchrom, self.lpos, self.lstrand), (self.rchrom, self.rpos, self.rstrand)])
        ends2 = sorted([(bp2.lchrom, bp2.lpos, bp2.lstrand), (bp2.rchrom, bp2.rpos, bp2.rstrand)])

        # Chromosomes and strands must agree end by end.
        for (chrom1, _, strand1), (chrom2, _, strand2) in zip(ends1, ends2):
            if chrom1 != chrom2 or strand1 != strand2:
                return False

        # Only the sorted pairing is compared. Pairing the ends crosswise would
        # compare positions on different chromosomes for an inter-chromosomal
        # junction (a false match), and for a same-chromosome junction it can
        # never yield a smaller sum than the sorted pairing.
        offset = abs(ends1[0][1] - ends2[0][1]) + abs(ends1[1][1] - ends2[1][1])
        return offset < d

class Cycle(object):
    """
    One cycle: copy number, length in bp, list of breakpoints, and two flags
    (circular, trivial).

    Cycles can be added: the result concatenates the breakpoint lists, sums
    the lengths, takes the mean of the two copy numbers and combines each
    flag with "and".
    """

    def __init__(self,cn,length,breakpoints,circular,trivial):
        self.cn = float(cn)
        self.length = int(length)
        self.breakpoints = breakpoints # list of breakpoints
        self.circular = bool(circular)
        self.trivial = bool(trivial)

    def __str__(self):
        """Render as 'Copy_count=..;Length=..bp;Breakpoints=..;Circular=..;Trivial=..'."""
        bp_expanded = ','.join(map(str, self.breakpoints))
        return f'Copy_count={self.cn};Length={self.length}bp;Breakpoints={bp_expanded};Circular={self.circular};Trivial={self.trivial}'

    def __add__(self,o):
        """Merge two cycles into a new Cycle (see the class docstring for the rules)."""
        return Cycle(
            cn=np.mean([self.cn,o.cn]),
            length=self.length+o.length,
            breakpoints=self.breakpoints+o.breakpoints,
            circular=self.circular&o.circular,
            trivial=self.trivial&o.trivial,
        )

    def diff(self,o,d=1000):
        """
        Compare every breakpoint of this cycle with every breakpoint of o.

        Returns a float array of shape (len(self.breakpoints),
        len(o.breakpoints)); entry [i, j] holds the result of
        self.breakpoints[i].d_similar(o.breakpoints[j], d).
        """
        similarity = np.zeros((len(self.breakpoints),len(o.breakpoints)))
        for row, mine in enumerate(self.breakpoints):
            for col, theirs in enumerate(o.breakpoints):
                similarity[row,col] = mine.d_similar(theirs,d)
        return similarity

    def plot_diff(self,other,d=1000):
        """Show diff(other, d) as an image: rows are this cycle's breakpoints, columns are other's."""
        matrix = self.diff(other,d=d)
        plt.imshow(matrix)

        # Rows (y axis) are labelled with this cycle's breakpoints ...
        plt.yticks(np.arange(len(self.breakpoints)), map(str,self.breakpoints))
        # ... and columns (x axis) with the other cycle's, rotated to stay readable.
        plt.xticks(np.arange(len(other.breakpoints)), map(str,other.breakpoints), rotation=90)

        plt.ylabel("Self")
        plt.xlabel("Other")

        plt.grid(True, which='both', linestyle='-', linewidth=1, color='white')

        plt.show()
        
class Converted_Cycles(object):
    """
    In-memory representation of a _BPG_converted_cycles.txt file.

    Attributes (all lists):
      intervals -- (chrom, start, end) tuples read from "Interval" lines
      segments  -- (chrom, start, end) tuples read from "Segment" lines
      cycles    -- Cycle objects built from "Cycle" lines
    intervals and segments each begin with a ('Source',0,0) placeholder at
    position 0. Segment numbers read from a cycle line are used directly as
    indices into segments (this assumes the file lists segments in order,
    numbered from 1 -- TODO confirm against the file format).
    """

    def __init__(self,path):
        '''
        path or iobuffer to a _BPG_converted_cycles.txt file

        path may be anything accepted by open(), or an already-open text
        stream (any object with a read attribute) that yields lines. A stream
        passed in is read but not closed.
        '''
        self.intervals=[('Source',0,0)] #(chr start end)
        self.segments=[('Source',0,0)] #(chr start end)
        self.cycles=[] # Cycles
        if hasattr(path,'read'):
            self._parse_lines(path)
        else:
            with open(path,'r') as file:
                self._parse_lines(file)

    @classmethod
    def from_components(cls,cycles,segments,intervals):
        '''
        Build an instance from already-parsed lists, without reading a file.

        Replaces a second __init__(cycles, segments, intervals) definition
        that could never be called because the path-based __init__ overrode it.
        '''
        obj=cls.__new__(cls)
        obj.cycles=cycles
        obj.segments=segments
        obj.intervals=intervals
        return obj

    def _parse_lines(self,lines):
        '''Dispatch each line on its leading keyword; other lines are ignored.'''
        for line in lines:
            if line.startswith('Interval'):
                self.parse_interval(line)
            elif line.startswith('Segment'):
                self.parse_segment(line)
            elif line.startswith('Cycle'):
                self.parse_cycle(line)

    def parse_segment(self,line):
        '''Append (chrom, start, end) taken from whitespace-separated fields 2, 3 and 4.'''
        fields=line.strip().split()
        self.segments.append((fields[2],int(fields[3]),int(fields[4])))

    def parse_interval(self,line):
        '''Append (chrom, start, end) taken from whitespace-separated fields 2, 3 and 4.'''
        fields=line.strip().split()
        self.intervals.append((fields[2],int(fields[3]),int(fields[4])))

    def parse_breakpoints(self,subline):
        '''
        Turn a comma-separated list of oriented segment numbers (e.g.
        "1+,2+,3-") into the Breakpoint objects joining consecutive entries.

        The last entry is paired with the first, so the closing junction is
        included. No breakpoint is emitted between two entries that are
        consecutive segment numbers in the same orientation (n then n+1, both
        '+'; or n then n-1, both '-').
        '''
        steps=[(int(token[:-1]),token[-1]) for token in subline.split(',')]
        breakpoints=[]
        for i,(c_idx,c_dir) in enumerate(steps):
            n_idx,n_dir=steps[(i+1)%len(steps)] # wrap around to the first entry
            if c_dir=='+' and n_dir=='+' and n_idx-c_idx==1:
                continue
            if c_dir=='-' and n_dir=='-' and c_idx-n_idx==1:
                continue
            c_chrom,c_start,c_end=self.segments[c_idx]
            n_chrom,n_start,n_end=self.segments[n_idx]
            # Leave the current segment at its end when traversed '+', at its
            # start when '-'; enter the next segment at its start when '+', at
            # its end when '-'.
            breakpoints.append(Breakpoint(
                c_chrom,c_end if c_dir=='+' else c_start,c_dir=='+',
                n_chrom,n_start if n_dir=='+' else n_end,n_dir!='+',
                None
            ))
        return breakpoints

    def parse_cycle(self,line):
        '''
        Parse one "Cycle" line and append the resulting Cycle to self.cycles.

        The line is split on ';' into key=value fields that are read by
        position: field 1 is the copy number, field 2 the length (its last two
        characters, presumably the "bp" suffix, are dropped), field 3 the
        oriented segment list, fields 4 and 5 the circular and trivial flags,
        which are True only when the value is exactly 'TRUE'.
        '''
        fields=[item.split('=') for item in line.strip().split(';')]
        self.cycles.append(Cycle(cn=fields[1][1],
                                length=fields[2][1][:-2],
                                breakpoints=self.parse_breakpoints(fields[3][1]),
                                circular=fields[4][1] == 'TRUE',
                                trivial=fields[5][1] == 'TRUE'))
        
        
# Parse the amplicon1 converted-cycles file of three biosamples.
BS_W37QBA12_parse = Converted_Cycles('CycleViz/BS_W37QBA12/BS_W37QBA12_amplicon1_BPG_converted_cycles.txt')
BS_2J4FG4HV_parse = Converted_Cycles('CycleViz/BS_2J4FG4HV/BS_2J4FG4HV_amplicon1_BPG_converted_cycles.txt')
BS_5JC116NM_parse = Converted_Cycles('CycleViz/BS_5JC116NM/BS_5JC116NM_amplicon1_BPG_converted_cycles.txt')

# Start value for sum(): the default start of 0 cannot be added to a Cycle.
# NOTE: Cycle.__add__ averages the two copy numbers, so the NaN cn of `empty`
# makes the cn of each summed cycle below NaN as well.
empty=Cycle(np.nan,0,[],True,True)
# Merge the leading cycles of each file (first 3, 3 and 6 respectively) into one Cycle per biosample.
BS_W37QBA12_ecdna = sum(BS_W37QBA12_parse.cycles[:3],empty)
BS_2J4FG4HV_ecdna = sum(BS_2J4FG4HV_parse.cycles[:3],empty)
BS_5JC116NM_ecdna = sum(BS_5JC116NM_parse.cycles[:6],empty)


In [None]:
# Parse the SJRHB012 files: amplicon1 of the _D sample, amplicon1 and amplicon2 of the _S sample.
# NOTE: the *_R_parse* names are bound to files from the SJRHB012_S directory.
SJRHB012_D_parse = Converted_Cycles('CycleViz/SJRHB012_D/SJRHB012_D_amplicon1_BPG_converted_cycles.txt')
SJRHB012_R_parse1 = Converted_Cycles('CycleViz/SJRHB012_S/SJRHB012_S_amplicon1_BPG_converted_cycles.txt')
SJRHB012_R_parse2 = Converted_Cycles('CycleViz/SJRHB012_S/SJRHB012_S_amplicon2_BPG_converted_cycles.txt')

In [None]:
# Pick single cycles by position in each parsed file. The indices are
# hard-coded (presumably chosen by inspecting the files -- TODO confirm).
SJRHB012_D_ecdna1 = SJRHB012_D_parse.cycles[3]
SJRHB012_R_ecdna1 = SJRHB012_R_parse1.cycles[0]
SJRHB012_D_ecdna2 = SJRHB012_D_parse.cycles[0]
SJRHB012_R_ecdna2 = SJRHB012_R_parse2.cycles[0]

In [None]:
# Breakpoint-similarity matrix for the first pair: _D cycle on the rows, _R cycle on the columns.
SJRHB012_D_ecdna1.plot_diff(SJRHB012_R_ecdna1)

In [None]:
# Breakpoint-similarity matrix for the second pair: _D cycle on the rows, _R cycle on the columns.
SJRHB012_D_ecdna2.plot_diff(SJRHB012_R_ecdna2)

In [None]:
# primary vs primary: BS_2J4FG4HV breakpoints on the rows, BS_W37QBA12 on the columns
BS_2J4FG4HV_ecdna.plot_diff(BS_W37QBA12_ecdna)

In [None]:
# primary vs relapse: BS_2J4FG4HV breakpoints on the rows, BS_5JC116NM on the columns
BS_2J4FG4HV_ecdna.plot_diff(BS_5JC116NM_ecdna)

# Dead code

In [None]:
# What tumor types do the paired biosamples come from?
# NOTE(review): expects `df` to be a table indexed by patient_id; several cells
# rebind `df`, so confirm which one is meant before running this.
paired_tumor_types = set(BIOSAMPLES[BIOSAMPLES.patient_id.isin(df.index)].cancer_type.unique())
ecDNA_tumor_types = set(BIOSAMPLES[BIOSAMPLES.amplicon_class == 'ecDNA'].cancer_type.unique())
# Cancer types seen both among the paired patients and among ecDNA biosamples.
paired_set = paired_tumor_types & ecDNA_tumor_types
paired_set

In [None]:
# NOTE(review): `df1` is not defined at notebook level in the visible cells;
# it only exists as a local inside primaries_w_ecDNA(), whose body these lines
# repeat.
df1["primary_ecDNA"] = df1.aggregated_value > 0
df1.set_index("patient_id",inplace=True)
df1.drop("aggregated_value",axis=1,inplace=True)
a = len(df1[df1.primary_ecDNA])
b = len(df1)
print(f"{a} of {b} ({round(a/b*100,1)}%) primary tumors have ecDNA")
df1.head()

In [None]:
# Mark the rows of df1 whose index value also appears in the index of df.
df1["has_secondary"] = df1.index.isin(df.index)

In [None]:
# Contingency table of primary ecDNA status vs. has_secondary, followed by a chi-square test on it.
a= pd.crosstab(df1.primary_ecDNA, df1.has_secondary)
print(a)
scipy.stats.chi2_contingency(a)

In [None]:
# Contingency table of primary vs. secondary ecDNA status; needs both columns in df.
pd.crosstab(df.primary_ecDNA, df.secondary_ecDNA)

In [None]:
def import_sj_survival_data(path="../data/local/sjcloud/SJ_SurvivalMaster.xlsx"):
    """Read the SJ survival workbook at path into a DataFrame, using the first column as the index."""
    import pathlib  # local import: pathlib is not imported at notebook level
    path = pathlib.Path(path)
    df = pd.read_excel(path,index_col=0)
    return df
def clean_sj_survival_data(df):
    """
    Reduce the raw SJ survival table to the columns OS_status and OS_months.

    Rows without a 'Date of Primary Dx' are dropped. OS_months is the time
    from 'Date of Primary Dx' to 'Date of Death', or to 'Date of data
    collection' when no death date is recorded, in whole days converted with
    12 / 365.25. 'Survival Status' is renamed to OS_status and the value
    "Expired" is relabelled "Deceased". The input frame is not modified.
    """
    def _to_months(delta):
        return delta.days * 12 / 365.25

    cleaned = df.dropna(subset=['Date of Primary Dx']).copy()
    cleaned['tmp'] = cleaned['Date of Death'].fillna(cleaned['Date of data collection'])
    elapsed = cleaned.tmp - cleaned['Date of Primary Dx']
    cleaned['OS_months'] = elapsed.apply(_to_months)
    cleaned = cleaned.rename(columns={'Survival Status':'OS_status'})
    cleaned = cleaned[['OS_status','OS_months']]
    relabel = {'OS_status': {"Expired": "Deceased"}}
    return cleaned.replace(relabel)
def import_clean_cbtn_survival_data():
    """
    Build the CBTN survival table with the columns OS_status and OS_months.

    Starts from generate_cbtn_biosample_table(verbose=1), which is not
    defined in the visible cells. OS_months is OS_days * 12 / 365.25;
    "DECEASED" and "LIVING" are relabelled "Deceased" and "Alive".
    """
    table = generate_cbtn_biosample_table(verbose=1)
    table['OS_months'] = table['OS_days']*12/365.25
    survival = table[['OS_status','OS_months']]
    relabel = {'OS_status': {"DECEASED": "Deceased", "LIVING": "Alive"}}
    return survival.replace(relabel)
    
def generate_patient_table():
    """
    Build a table of patient annotations joined with overall-survival data.

    Starts from generate_biosample_table() (not defined in the visible
    cells), keeps the rows flagged in_unique_patient_set and six annotation
    columns, then joins the concatenated SJ and CBTN survival tables
    (columns OS_status, OS_months) on the index.
    """
    # Start with biosamples
    df = generate_biosample_table()
    df = df[df.in_unique_patient_set == True]
    df = df[['sex','patient_id','age_at_diagnosis','cohort','cancer_type','amplicon_class']]
    # Add sj survival data
    surv = import_sj_survival_data()
    surv = clean_sj_survival_data(surv)
    # Add cbtn survival data
    surv = pd.concat([surv,import_clean_cbtn_survival_data()])
    # Index-on-index left join: rows are matched by df's current index, not by patient_id.
    df = df.join(surv)
    # NOTE(review): set_index returns a new frame and the result is discarded,
    # so the returned table is NOT indexed by patient_id. Confirm whether
    # `df = df.set_index('patient_id')` was intended.
    df.set_index('patient_id')
    return df

# NOTE: rebinds the notebook-level name `df`; shows the last 20 rows.
df = generate_patient_table()
df.tail(n=20)


In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# Row count vs. number of distinct subjects.
# NOTE(review): generate_patient_table() selects no subject_name column, so this
# cell expects `df` to be a different table -- confirm which one.
print(len(df))
print(len(df.subject_name.unique()))

In [None]:
# Row count vs. number of distinct participants.
# NOTE(review): `df2` is not defined at notebook level in the visible cells.
print(len(df2))
print(len(df2["Kids First Participant ID"].unique()))

In [None]:
# List the distinct values of the two sample-descriptor columns side by side.
print(df2['Tumor Descriptor'].unique())
print(df.sample_type.unique())

In [None]:
def import_pedcbioportal_metadata(path="../2023-11-27_cavatica-api/out/openpbta-biosample-metadata.tsv"):
    """Read the tab-separated biosample metadata file at path, using the first column as the index."""
    import pathlib  # local import: pathlib is not imported at notebook level
    path = pathlib.Path(path)
    df = pd.read_csv(path, sep='\t',index_col=0)
    return df
def get_cbtn_cell_lines():
    """
    Return the specimen IDs of all rows whose SAMPLE_TYPE is "Derived Cell Line".

    A SPECIMEN_ID value may hold several IDs separated by ';'; the values are
    joined with ';' and split again, so the result is a flat list of IDs.
    """
    metadata = import_pedcbioportal_metadata()
    is_cell_line = metadata.SAMPLE_TYPE == "Derived Cell Line"
    joined = metadata.loc[is_cell_line, 'SPECIMEN_ID'].str.cat(sep=';')
    return joined.split(';')
#get_cbtn_cell_lines()

In [None]:
# Patients with more than one deduplicated sample, excluding second malignancies.
# NOTE(review): `df2` is not defined at notebook level in the visible cells.
cbtn_pairs = df2[df2.in_deduplicated_sample_cohort]
cbtn_pairs = cbtn_pairs[cbtn_pairs["Tumor Descriptor"] != "Second Malignancy"]
# keep=False marks every occurrence of a repeated participant ID, not only the later ones.
cbtn_pairs = cbtn_pairs[cbtn_pairs["Kids First Participant ID"].duplicated(keep=False)].sort_values("Kids First Participant ID")
print(len(cbtn_pairs["Kids First Participant ID"].unique()))
#cbtn_pairs

In [None]:
# Deduplicated samples whose subject_name occurs more than once, sorted by subject.
pairs = df[df.in_deduplicated_sample_cohort]
pairs = pairs[pairs.subject_name.duplicated(keep=False)].sort_values('subject_name')
pairs = pairs[~pairs.subject_name.isin(["SJ030303","SJ030890"])] # these patients had multiple primaries of different histologies.
# Number of rows (samples), not of patients.
len(pairs)

In [None]:
# List all patients with more than one sample.
# Unlike `pairs`, this is not restricted to the deduplicated sample cohort.
sj_dups = df[df.subject_name.duplicated(keep=False)].sort_values('subject_name')
print(len(sj_dups.subject_name.unique()))
#sj_dups

In [None]:
# Weirdly, there are 10 biosamples not in Sunita's paired samples table. Some, but not all, are true duplicates.
def import_sunita_sj_master_table(path="/Users/ochapman/Library/CloudStorage/OneDrive-SanfordBurnhamPrebysMedicalDiscoveryInstitute/projects/2023-pedpancan/data/PedPanCancer_StJude_MasterAnalysis_Copy.xlsx"):
    """
    Read the 'PairedSamples' sheet of the master-analysis workbook, using the
    first column as the index.

    NOTE: the default path is an absolute path on one user's machine; pass
    path explicitly when running anywhere else.
    """
    import pathlib  # local import: pathlib is not imported at notebook level
    path=pathlib.Path(path)
    df = pd.read_excel(path,index_col=0,sheet_name='PairedSamples')
    return df
# Subject names that have multiple samples here but do not appear in the index
# of Sunita's PairedSamples sheet.
sunita = import_sunita_sj_master_table()
# Fixed: this line referred to an undefined name `dups`; the table of
# multi-sample patients in this notebook is `sj_dups`.
missing = set(sj_dups.subject_name.unique())-set(sunita.index.unique())
print(len(missing))
#sj_dups[sj_dups.subject_name.isin(missing)]