# Cross-reference AC gene amplifications with CBTN fusion calls
This notebook identifies all fusion pairs in AmpliconClassifier results from CBTN which are also annotated in the fusion calls provided by CBTN. For a pair to be listed, it must satisfy the following requirements:
- In the AC gene_list results, one gene must be annotated as 5' truncated and the other as 3' truncated, and both must be on the same amplicon, which must be classified as ecDNA.
- Both genes are listed as fusion partners in the annoFuse results for an RNA-seq fusion analysis corresponding to the same external biosample ID (7316-XXXX).

TODO: For this notebook to work on CBTN-X01 biosamples, biosample names must be sanitized to the format BS_XXXXXXX, I think.

In [None]:
import sevenbridges as sbg
import pandas as pd
import pathlib
import os
import shutil

# Lift the column display limit so wide DataFrames are shown in full.
pd.set_option('display.max_columns', None)

In [None]:
def import_biosamples(path="../data/Supplementary Tables.xlsx"):
    """Load the biosample table from the supplementary-tables workbook.

    Reads the sheet named "2. Biosamples" from the Excel file at *path*
    and uses the sheet's first column as the row index.
    """
    return pd.read_excel(path, index_col=0, sheet_name="2. Biosamples")
def import_annoFuse(path="../data/local/annoFuse/annoFuse_all.tsv"):
    """Load the annoFuse fusion calls from the tab-separated file at *path*."""
    return pd.read_csv(path, sep='\t')
def import_gene_list(path="../data/Supplementary Tables.xlsx"):
    """Load the gene-amplification table from the supplementary-tables workbook.

    Reads the sheet named "4. Gene amplifications" from the Excel file at
    *path*; the default integer row index is kept.
    """
    return pd.read_excel(path, sheet_name="4. Gene amplifications")

In [None]:
#BS = import_biosamples()
#BS = BS[~BS.external_sample_id.isna()]
#AF = import_annoFuse()
#GL = import_gene_list()

In [None]:
## Setup API, set global variables

# Create the Seven Bridges API client from the 'cavatica' credentials profile.
# default config location is ~/.sevenbridges/credentials
api = sbg.Api(config=sbg.Config(profile='cavatica'))

# Project that is queried for the annoFuse results.
PROJECT_ID='chapmano/pancancer-ecdna'
# First entry named 'annoFuse' returned by the project file query; it is used
# as the parent when the individual annoFuse result files are listed.
# NOTE(review): the [0] presumably fails with IndexError when the query
# returns nothing - confirm and handle if that can happen.
ANNOFUSE_DIR = api.files.query(project=PROJECT_ID, names=['annoFuse'])[0]

def get_metadata_table():
    '''
    Generate the metadata table of annoFuse results stored in chapmano/pancancer-ecdna/annoFuse

    Lists every file whose parent is ANNOFUSE_DIR and builds one row per file,
    indexed by file name, from that file's metadata.

    Returns:
        pandas.DataFrame restricted to the columns "Kids First Biospecimen ID",
        "Kids First Participant ID" and "sample_id". When the folder holds no
        files an empty table with those three columns is returned.
    '''
    columns = ["Kids First Biospecimen ID","Kids First Participant ID","sample_id"]

    # One single-row frame per file. They are concatenated once at the end:
    # calling pd.concat inside the loop re-copies the growing table for every
    # file and starts from an empty frame.
    rows = [
        pd.DataFrame(data=file.metadata,index=[file.name])
        for file in api.files.query(parent=ANNOFUSE_DIR).all()
    ]
    if not rows:
        # pd.concat refuses an empty list; keep the expected schema instead.
        return pd.DataFrame(columns=columns)

    return pd.concat(rows)[columns]
# Build the table once; id_fusions_by_amplicon() reads this module-level name.
metadata = get_metadata_table()

In [None]:
# Biosample table, indexed by the first column of the sheet.
BS = import_biosamples()
# Keep only biosamples that have an external sample ID. The filter must be
# applied to BS itself: no lowercase `bs` exists at this scope.
BS = BS[~BS.external_sample_id.isna()]
# annoFuse fusion calls.
AF = import_annoFuse()

def view_group_head(grp):
    """Print the first group of a pandas GroupBy object (debugging aid).

    Nothing is printed when *grp* contains no groups.
    """
    first = next(iter(grp), None)
    if first is None:
        return
    key, _ = first
    print(grp.get_group(key), "\n\n")

def af_pairs(df):
    """Collect every (upstream, downstream) gene pair named in annoFuse rows.

    For each row, each of Gene1A / Gene2A is combined with each of
    Gene1B / Gene2B; a combination is kept only when both values are present
    (not NaN).

    Returns:
        set of (gene_A, gene_B) tuples.
    """
    return {
        (getattr(row, upstream), getattr(row, downstream))
        for _, row in df.iterrows()
        for upstream in ("Gene1A", "Gene2A")
        for downstream in ("Gene1B", "Gene2B")
        if not (pd.isna(getattr(row, upstream)) or pd.isna(getattr(row, downstream)))
    }

def helper_fusions_by_amplicon(bs,af):
    """Return annoFuse gene pairs whose partners are truncated at opposite ends.

    Args:
        bs: gene-list rows for one amplicon; the columns ``truncated``
            (compared against '5p' and '3p') and ``gene`` are read.
        af: annoFuse rows, passed to af_pairs() to obtain candidate pairs.

    Returns:
        pandas.DataFrame with one row per pair in which one gene is 5p
        truncated and the other 3p truncated (in either order). The two
        genes are in columns 0 and 1, in the order af_pairs() reported them.
        Rows are sorted so repeated runs give the same table; the frame is
        empty when nothing matches.
    """
    # Sets give constant-time membership tests inside the loop below.
    genes_5p = set(bs.loc[bs.truncated == '5p', 'gene'])
    genes_3p = set(bs.loc[bs.truncated == '3p', 'gene'])
    fusions = [
        (first, second)
        for first, second in af_pairs(af)
        if (first in genes_5p and second in genes_3p)
        or (second in genes_5p and first in genes_3p)
    ]
    # af_pairs() returns a set, whose iteration order changes between runs;
    # sort before building the frame so the exported rows are reproducible.
    return pd.DataFrame(sorted(fusions))

def id_fusions_by_amplicon(df):
    """Find annoFuse-supported fusions for one (sample_name, feature) group.

    Intended as the function applied to each group of the gene list. Reads
    the module-level tables BS (biosamples), metadata (annoFuse file
    metadata) and AF (annoFuse calls).

    Args:
        df: gene-list rows of a single group; all rows share one sample_name.

    Returns:
        pandas.DataFrame of matching gene pairs with two extra columns,
        "external_biosample" and "RNA_biosample" (comma-separated RNA
        biospecimen IDs). An empty DataFrame is returned when the sample
        has no entry in BS.
    """
    bs=df.sample_name.unique()[0] # grouped by BS, so exactly 1 sample_name.
    try:
        external_sample_id = BS.loc[bs,'external_sample_id'] # at most 1 external sample id
    except KeyError:
        # Sample is not in the biosample table: nothing to cross-reference.
        # Only the missing-label case is caught so other errors stay visible.
        return pd.DataFrame()
    # RNA-seq biospecimens that share this external sample ID.
    bs_rna = metadata[metadata.sample_id == external_sample_id]['Kids First Biospecimen ID']
    af = AF[AF.Sample.isin(bs_rna)]
    fusions = helper_fusions_by_amplicon(df,af)
    #fusions["WGS_biosample"] = bs
    fusions["external_biosample"] = external_sample_id
    fusions["RNA_biosample"] = ", ".join(bs_rna.values)
    return fusions

def id_cbtn_fusions_on_ecDNA():
    """Cross-reference truncated ecDNA genes with annoFuse fusion calls.

    Loads the gene list, keeps rows whose feature starts with 'ecDNA' and
    whose truncation annotation is present, groups them by
    (sample_name, feature), prints the first group as a sanity check and
    applies id_fusions_by_amplicon() to every group.

    Returns:
        The combined result of the per-group apply.
    """
    gene_table = import_gene_list()
    on_ecdna = gene_table.feature.str.startswith('ecDNA')
    is_truncated = gene_table.truncated.notna()
    candidates = gene_table[on_ecdna & is_truncated]
    by_amplicon = candidates.groupby(by=['sample_name','feature'])
    view_group_head(by_amplicon)
    return by_amplicon.apply(id_fusions_by_amplicon)
# Run the cross-reference; `results` is exported and displayed in later cells.
results = id_cbtn_fusions_on_ecDNA()

In [None]:
# to_excel() does not create missing directories, so make sure the output
# folder exists before writing the workbook.
pathlib.Path("out").mkdir(parents=True, exist_ok=True)
results.to_excel("out/annoFuse_fusions_on_ecDNA.xlsx")

In [None]:
# Show the table of matched fusions as the cell output.
results

In [None]:
# FOXO1-PAX7 fusions not listed because they are from SJ biosamples.
# The gene list is only a local variable inside id_cbtn_fusions_on_ecDNA(),
# so load it here before looking up the FOXO1 rows.
gl = import_gene_list()
gl[gl.gene=='FOXO1']

In [None]:
# Some external biosamples have multiple annoFuse results?! are the files the same?
# Compare the number of rows in the metadata table (one per annoFuse file)
# with the number of distinct sample_id values; a smaller second number means
# at least one sample_id is shared by several files.
print(len(metadata))
print(len(metadata.sample_id.unique()))