In [None]:
# first, import packages
import marimo
import pandas as pd
import polars as pl
from IPython.display import HTML, display

### **Processing of Raw List of Gene Fusion Transcripts from Arriba and FusionCatcher on CCLE+internal Cell Lines**

This notebook details the semi-automated steps used to further process the raw output files from the Arriba and FusionCatcher fusion transcript callers.

1. Run the `pypolars-process-ft-tsv.py` script to generate the fusion transcript list from the Arriba and FusionCatcher output files. The script takes the path to the directory where the sample-specific fusion call output files from Arriba or FusionCatcher are stored as its mandatory first argument, followed by the string that identifies the tool (for instance, `arr`, the prefix of the Arriba fusion transcript call output files).

	For example:
	> ``` pypolars-process-ft-tsv.py data/FTmyBRCAs_raw/Arriba arr ```

	Do the same for the FusionCatcher raw output files, as well as for the Arriba and FusionCatcher output files generated from processing the 113 TCGA-Normals samples (used as a panel of normals for FT filtering).

2. Then, load the two datasets in a Jupyter Notebook and concatenate the dataframes so that the unfiltered Arriba+FusionCatcher FT data are combined into one data table and saved as one `.parquet` file and one `.tsv` file. Do the same for the `TCGANormals` panel of normals.

In [3]:
# Open the unfiltered MyBrCa fusion-transcript tables as lazy frames, one per
# caller (Arriba first, then FusionCatcher). Later cells call .collect() on
# these handles whenever the actual rows are needed.
arr_mdf, fc_mdf = (
    pl.scan_parquet(parquet_path)
    for parquet_path in (
        '../output/myBRCA/Arriba-FT-all-unfilt-list-v2.parquet',
        '../output/myBRCA/FusionCatcher-FT-all-unfilt-list-v2.parquet',
    )
)

# Same pair of tables for the TCGA-Normals cohort.
arr_tdf, fc_tdf = (
    pl.scan_parquet(parquet_path)
    for parquet_path in (
        '../output/TCGANormals/Arriba-Normal-FT-all-unfilt-list-v2.parquet',
        '../output/TCGANormals/FusionCatcher-Normal-FT-all-unfilt-list-v2.parquet',
    )
)

# html_arr_t = to_html_datatable(pl.DataFrame.to_pandas(arr_tdf.collect(), use_pyarrow_extension_array=True).head(5), display_logo_when_loading=False)

# html_fc_t = to_html_datatable(pl.DataFrame.to_pandas(fc_tdf.collect(), use_pyarrow_extension_array=True).head(5), display_logo_when_loading=False)


### Loaded Polars dataFrames
[//]: # (-.- .tabset .tabset-pills)

Here all datasets from the two different fusion transcript calling tools run on both MyBrCa and TCGA-Normals cohorts are shown in tabs.

In [None]:
# %%jmd 

# #### **Dataset 1A** (MyBrCa): Arriba unfiltered
# Arriba MyBrCa datatable dimension: <b>{{arr_mdf.collect().shape}}</b>

# Showing truncated table:


#### **Dataset 1A** (MyBrCa): Arriba unfiltered
Arriba MyBrCa datatable dimension: <b>(49465, 11)</b>

Showing truncated table:

In [6]:
# -.-|m { input: false, output: true}
# Collect the lazy Arriba MyBrCa frame and convert it to pandas so the
# notebook renders it as a (truncated) table.
arr_mdf.collect().to_pandas(use_pyarrow_extension_array=True)

Unnamed: 0,fusionTranscriptID,fusionGeneID,breakpointID,strand1,strand2,site1,site2,type,confidence,sampleID,toolID
0,TRMT11::SMG6__6:125986622-17:2244719,TRMT11::SMG6,6:125986622-17:2244719,+,-,CDS/splice-site,CDS/splice-site,translocation,high,1,Arriba
1,STAG3::MEF2C-AS1__7:100189570-5:88919251,STAG3::MEF2C-AS1,7:100189570-5:88919251,+,-,CDS,intron,translocation/5'-5',low,1,Arriba
2,MAPK13::C1QL1__6:36132629-17:44965446,MAPK13::C1QL1,6:36132629-17:44965446,+,+,CDS,intron,translocation/5'-5',low,1,Arriba
3,STX16::NPEPL1__20:58673711-20:58691724,STX16::NPEPL1,20:58673711-20:58691724,+,+,CDS/splice-site,5'UTR/splice-site,deletion/read-through,low,1,Arriba
4,MAPK13::NMT1__6:36132629-17:44965446,MAPK13::NMT1,6:36132629-17:44965446,+,+,CDS,intron,translocation,low,1,Arriba
...,...,...,...,...,...,...,...,...,...,...,...
49460,"DENND5B::AC087311.1(22711),SYT10(357773)__12:3...","DENND5B::AC087311.1(22711),SYT10(357773)",12:31479608-12:33016465,-,+,CDS,intergenic,inversion,low,992,Arriba
49461,LINC01145::AC245100.2__1:145201150-1:148436753,LINC01145::AC245100.2,1:145201150-1:148436753,-,-,exon,exon,duplication/5'-5',low,992,Arriba
49462,NET1::RNF169__10:5412820-11:74834676,NET1::RNF169,10:5412820-11:74834676,+,+,CDS/splice-site,CDS/splice-site,translocation,low,992,Arriba
49463,MAN2C1::SIN3A__15:75366522-15:75375872,MAN2C1::SIN3A,15:75366522-15:75375872,-,-,CDS/splice-site,CDS/splice-site,duplication,low,992,Arriba


In [13]:
# %%jmd

# #### **Dataset 1B** (MyBrCa): FusionCatcher unfiltered
# FusionCatcher MyBrCa datatable dimension: <b>{{fc_mdf.collect().shape}}</b>

# Showing truncated table:

In [7]:
# -.-|m { input: false, output: true}
# Collect the lazy FusionCatcher MyBrCa frame and convert it to pandas so the
# notebook renders it as a (truncated) table.
fc_mdf.collect().to_pandas(use_pyarrow_extension_array=True)

Unnamed: 0,fusionTranscriptID,fusionGeneID,breakpointID,strand1,strand2,site1,site2,type,confidence,sampleID,toolID
0,SIDT2::TAGLN__11:117195915-11:117203002,SIDT2::TAGLN,11:117195915-11:117203002,+,+,CDS(truncated),UTR,.,.,2,FusionCatcher
1,AZGP1::GJC3__7:99971746-7:99923603,AZGP1::GJC3,7:99971746-7:99923603,-,-,in-frame,.,.,.,2,FusionCatcher
2,NPEPPS::TBC1D3__17:47592545-17:38191030,NPEPPS::TBC1D3,17:47592545-17:38191030,+,-,CDS(complete),UTR,.,.,2,FusionCatcher
3,CYP4F11::CYP4F23P__19:15914762-19:15583580,CYP4F11::CYP4F23P,19:15914762-19:15583580,-,+,CDS(truncated),exonic(no-known-CDS),.,.,2,FusionCatcher
4,SLC49A3::ATP5ME__4:686532-4:673401,SLC49A3::ATP5ME,4:686532-4:673401,-,-,out-of-frame,.,.,.,3,FusionCatcher
...,...,...,...,...,...,...,...,...,...,...,...
31359,CTBS::GNG5__1:84563257-1:84501970,CTBS::GNG5,1:84563257-1:84501970,-,-,in-frame,.,.,.,991,FusionCatcher
31360,MRPS30-DT::LINC02224__5:44808642-5:44658557,MRPS30-DT::LINC02224,5:44808642-5:44658557,-,-,exonic(no-known-CDS),exonic(no-known-CDS),.,.,991,FusionCatcher
31361,NBEA::CR382287.1__13:35070852-21:10127330,NBEA::CR382287.1,13:35070852-21:10127330,+,+,CDS(truncated),exonic(no-known-CDS),.,.,991,FusionCatcher
31362,HACL1::COLQ__3:15563358-3:15489637,HACL1::COLQ,3:15563358-3:15489637,-,-,out-of-frame,.,.,.,991,FusionCatcher


In [None]:
# %%jmd 

# #### **Dataset 2A** (TCGA Normals): Arriba unfiltered
# Arriba TCGA datatable dimension: <b>{{arr_tdf.collect().shape}}</b>

# Showing truncated table:


#### **Dataset 2A** (TCGA Normals): Arriba unfiltered
Arriba TCGA datatable dimension: <b>(10802, 11)</b>

Showing truncated table:

In [8]:
# -.-|m { input: false, output: true}
# Collect the lazy Arriba TCGA-Normals frame and convert it to pandas so the
# notebook renders it as a (truncated) table.
arr_tdf.collect().to_pandas(use_pyarrow_extension_array=True)

Unnamed: 0,fusionTranscriptID,fusionGeneID,breakpointID,strand1,strand2,site1,site2,type,confidence,sampleID,toolID
0,TRPM7::SPPL2A__15:50686531-15:50749746,TRPM7::SPPL2A,15:50686531-15:50749746,-,-,CDS/splice-site,CDS/splice-site,duplication,high,TCGA-A7-A0CE,Arriba
1,AC084756.2::SPPL2A__15:50686531-15:50749746,AC084756.2::SPPL2A,15:50686531-15:50749746,-,-,CDS/splice-site,CDS/splice-site,duplication,high,TCGA-A7-A0CE,Arriba
2,BOLA2B::SMG1P5__16:30193358-16:30288681,BOLA2B::SMG1P5,16:30193358-16:30288681,-,-,CDS/splice-site,exon,duplication,high,TCGA-A7-A0CE,Arriba
3,FBXO25::SEPTIN14__8:435707-7:55796092,FBXO25::SEPTIN14,8:435707-7:55796092,+,-,CDS/splice-site,CDS/splice-site,translocation,high,TCGA-A7-A0CE,Arriba
4,TVP23C::CDRT4__17:15503098-17:15440285,TVP23C::CDRT4,17:15503098-17:15440285,-,-,CDS,5'UTR/splice-site,deletion/read-through,medium,TCGA-A7-A0CE,Arriba
...,...,...,...,...,...,...,...,...,...,...,...
10797,"PRSS45P::PRSS50(438),PRSS46P(1276)__3:46742488...","PRSS45P::PRSS50(438),PRSS46P(1276)",3:46742488-3:46718307,-,-,CDS,intergenic,deletion/read-through,low,TCGA-GI-A2C9,Arriba
10798,"LRRK1::AC090907.2(4523),AC019254.2(26014)__15:...","LRRK1::AC090907.2(4523),AC019254.2(26014)",15:101049667-15:101090589,+,+,CDS,intergenic,deletion/read-through,low,TCGA-GI-A2C9,Arriba
10799,PTPRT::AL049812.2__20:42081907-20:42052835,PTPRT::AL049812.2,20:42081907-20:42052835,-,-,CDS,intron,deletion/read-through/5'-5',low,TCGA-GI-A2C9,Arriba
10800,"LINC01128::PDCD2(26680),AL031259.1(1250)__1:82...","LINC01128::PDCD2(26680),AL031259.1(1250)",1:828888-6:170611372,.,.,intron,intergenic,translocation,low,TCGA-GI-A2C9,Arriba


In [None]:
# %%jmd

# #### **Dataset 2B** (TCGA Normals): FusionCatcher unfiltered
# FusionCatcher TCGA Normals datatable dimension: <b>{{fc_tdf.collect().shape}}</b>

# Showing truncated table:


#### **Dataset 2B** (TCGA Normals): FusionCatcher unfiltered
FusionCatcher TCGA Normals datatable dimension: <b>(3759, 11)</b>

Showing truncated table:

In [9]:
# -.-|m { input: false, output: true}
# Collect the lazy FusionCatcher TCGA-Normals frame and convert it to pandas
# so the notebook renders it as a (truncated) table.
fc_tdf.collect().to_pandas(use_pyarrow_extension_array=True)

Unnamed: 0,fusionTranscriptID,fusionGeneID,breakpointID,strand1,strand2,site1,site2,type,confidence,sampleID,toolID
0,AZGP1::GJC3__7:99971746-7:99923603,AZGP1::GJC3,7:99971746-7:99923603,-,-,in-frame,.,.,.,TCGA-A7-A0CE,FusionCatcher
1,AZGP1::GJC3__7:99971745-7:99929620,AZGP1::GJC3,7:99971745-7:99929620,-,-,UTR,CDS(complete),.,.,TCGA-A7-A0CE,FusionCatcher
2,NPEPPS::TBC1D3__17:47592545-17:38191030,NPEPPS::TBC1D3,17:47592545-17:38191030,+,-,CDS(complete),UTR,.,.,TCGA-A7-A0CE,FusionCatcher
3,SMG1::NPIPB5__16:18858170-16:22513522,SMG1::NPIPB5,16:18858170-16:22513522,-,+,CDS(truncated),UTR,.,.,TCGA-A7-A0CE,FusionCatcher
4,SMG1::NPIPB5__16:18858211-16:22513522,SMG1::NPIPB5,16:18858211-16:22513522,-,+,CDS(truncated),UTR,.,.,TCGA-A7-A0CE,FusionCatcher
...,...,...,...,...,...,...,...,...,...,...,...
3754,F11R::RBM4__1:161019399-11:66652310,F11R::RBM4,1:161019399-11:66652310,-,+,intronic,CDS(truncated),.,.,TCGA-GI-A2C9,FusionCatcher
3755,FBXO25::FAM157B__8:435707-9:138243620,FBXO25::FAM157B,8:435707-9:138243620,+,+,CDS(truncated),exonic(no-known-CDS),.,.,TCGA-GI-A2C9,FusionCatcher
3756,MAGT1::STAC2__X:77891455-17:39211643,MAGT1::STAC2,X:77891455-17:39211643,-,-,intronic,UTR,.,.,TCGA-GI-A2C9,FusionCatcher
3757,PTPRF::IGK@__1:43622869-2:89150246,PTPRF::IGK@,1:43622869-2:89150246,+,-,UTR,---,.,.,TCGA-GI-A2C9,FusionCatcher


## **Concatenate Arriba and FusionCatcher Datasets**

Now we can merge the two dataframes into one master dataframe per cohort (MyBrCa and the TCGA panel of normals) using Polars' `concat`.

**NOTE:** Vertical concatenation is the default: two dataframes sharing exactly the same columns are stacked, so the rows of dataframe 2 are appended below the rows of dataframe 1.

[//]: # (-.- .alert .alert-warning)

In [10]:
# -.-|m { input: false, output: false}
import warnings

from polars.exceptions import CategoricalRemappingWarning

# Silence polars' categorical-remapping warning before concatenating.
warnings.filterwarnings('ignore', category=CategoricalRemappingWarning)

# Collect both lazy MyBrCa frames and concatenate them
# (Arriba rows first, then FusionCatcher rows).
joined_df = pl.concat([lazy_frame.collect() for lazy_frame in (arr_mdf, fc_mdf)])

In [15]:
# -.-|m { input: false, output: true}
# Render the shape of the concatenated MyBrCa table, in bold, as HTML.
# `display` and `HTML` are the helpers provided by `IPython.display`.
# A single f-string replaces the original placeholder-free f-string that was
# concatenated to a second one; the rendered text is unchanged.
display(HTML(f"Concatenated MyBrCa Arriba+FusionCatcher datatable dimension: <b>{joined_df.shape}</b>"))

# show(joined_df.head(5), maxBytes=0, classes="display compact")

Do the same with the TCGA panel of normal FTs.

In [16]:
# -.-|m { input: false, output: false}
import warnings

from polars.exceptions import CategoricalRemappingWarning

# Silence polars' categorical-remapping warning before concatenating.
warnings.filterwarnings('ignore', category=CategoricalRemappingWarning)

# Collect both lazy TCGA-Normals frames and concatenate them
# (Arriba rows first, then FusionCatcher rows).
joined_norms_df = pl.concat([lazy_frame.collect() for lazy_frame in (arr_tdf, fc_tdf)])

In [17]:
# -.-|m { input: false, output: true}

# Render the shape of the concatenated TCGA-Normals table, in bold, as HTML.
# `display` and `HTML` are the helpers provided by `IPython.display`.
# A single f-string replaces the original placeholder-free f-string that was
# concatenated to a second one; the rendered text is unchanged.
display(HTML(f"Concatenated TCGA-Normals Arriba+FusionCatcher datatable dimension: <b>{joined_norms_df.shape}</b>"))

# show(joined_norms_df.head(5), maxBytes=0, classes="display compact")

## **Filter MyBrCa Merged DataFrame using Panel of Normals**
Now we can filter the concatenated, unfiltered FT dataframes by discarding the fusion transcripts that are also present in the TCGA Normals data.

In [18]:
# -.-|m { input: true, output: true}

# Open the saved Arriba+FusionCatcher concatenations as lazy frames:
# MyBrCa first, then the TCGA-Normals panel.
mybrca_ccdf, tcganorms_ccdf = (
    pl.scan_parquet(parquet_path)
    for parquet_path in (
        '../output/myBRCA/Arr_FC-concat-FT-all-unfilt-list-v2.parquet',
        '../output/TCGANormals/Arr_FC-Normals-concat-FT-all-unfilt-list-v2.parquet',
    )
)


Once they are loaded, we can convert to Pandas from Polars for ease of processing.

In [19]:
# -.-|m { input: true, output: true, input_fold: show}

# Collect each lazy frame, then convert it to pandas, requesting
# pyarrow extension arrays for the columns.
my_concat_df = mybrca_ccdf.collect().to_pandas(use_pyarrow_extension_array=True)
tn_concat_df = tcganorms_ccdf.collect().to_pandas(use_pyarrow_extension_array=True)

In [None]:
# # -.-|m { input: false, output: true}
# display(HTML(f"Concatenated MyBrCa Arriba+FusionCatcher datatable dimension: " + f"<b>{my_concat_df.shape}</b>"))

# show(my_concat_df.head(5), maxBytes=0, classes="display compact")

In [None]:
# -.-|m { input: false, output: true}
# display(HTML(f"Concatenated TCGA Normals Arriba+FusionCatcher datatable dimension: " + f"<b>{tn_concat_df.shape}</b>"))

# show(tn_concat_df.head(5), maxBytes=0, classes="display compact")

In [None]:
# Load the `373T_arr` minimal-test TSV (tab-separated) into pandas.
arr_test_df = pd.read_csv(
    '../data/minimal-test/373T_arr-v113-refannot.tsv',
    delimiter='\t',
)
# arr_test_df

In [None]:
# Load the `373T_fc` minimal-test TSV (tab-separated) into pandas.
fc_test_df = pd.read_csv(
    '../data/minimal-test/373T_fc-v113-refannot.tsv',
    delimiter='\t',
)
# fc_test_df

### ASIDE: Modify `collate-fts-nf.py` to handle NaN in GeneIDs (in FusionCatcher files)

In [40]:
%%bash

# Run the v2 collation script on the minimal-test inputs for sample 373T.
# Each input TSV is followed by its tool tag (`arr` or `fc`).
REPO=/home/ec2-user/repos/FT-NeonDisco
TEST_DIR="$REPO/data/minimal-test"

python "$REPO/scripts/collate-fts-nf-v2.py" \
    373T \
    "$TEST_DIR/373T_arr.tsv" arr \
    "$TEST_DIR/373T_fc.tsv" fc

Sample name: 373T
Input arguments: [('/home/ec2-user/repos/FT-NeonDisco/data/minimal-test/373T_arr.tsv', 'arr'), ('/home/ec2-user/repos/FT-NeonDisco/data/minimal-test/373T_fc.tsv', 'fc')]
Setting tool name...
Reading Arriba of 373T TSV file...(sample ID: 373)
Setting tool name...
Reading FusionCatcher of 373T TSV file...(sample ID: 373)
Concatenating lazy DataFrames from Arriba and FusionCatcher...
Concatenation completed. Collecting...
Saving as parquet and tsv files...
Done.


In [8]:
# Read the collated v2 TSV for sample 373T (tab-separated) and preview it.
collated_df = pd.read_csv(
    '../data/minimal-test/373T-collated-FT-UNFILTERED-collate-v2.tsv',
    delimiter='\t',
)
collated_df

Unnamed: 0,fusionTranscriptID,fusionGenePair,breakpointID,strand1,strand2,site1,site2,type,confidence,sampleID,toolID
0,SPAG5-AS1::RAB11FIP4__17:28616389-17:31529712,SPAG5-AS1::RAB11FIP4,17:28616389-17:31529712,-,-,exon,intron,duplication,high,373,Arriba
1,NSF::ENSG00000289599__17:46643259-17:48009736,NSF::ENSG00000289599,17:46643259-17:48009736,+,+,CDS/splice-site,exon/splice-site,deletion,high,373,Arriba
2,NSF::ENSG00000289599__17:46643259-17:48012093,NSF::ENSG00000289599,17:46643259-17:48012093,+,+,CDS/splice-site,exon/splice-site,deletion,high,373,Arriba
3,NSF::ENSG00000289599__17:46643259-17:48012140,NSF::ENSG00000289599,17:46643259-17:48012140,+,+,CDS/splice-site,exon/splice-site,deletion,medium,373,Arriba
4,NSF::ENSG00000289599__17:46643259-17:48011155,NSF::ENSG00000289599,17:46643259-17:48011155,+,+,CDS/splice-site,exon/splice-site,deletion,medium,373,Arriba
...,...,...,...,...,...,...,...,...,...,...,...
121,ENSG00000294461::PITPNB__22:27930694-22:27914347,ENSG00000294461::PITPNB,22:27930694-22:27914347,-,-,exonic(no-known-CDS),CDS(truncated),.,.,373,FusionCatcher
122,BMS1P14::ENSG00000286129__9:40571871-22:22318899,BMS1P14::ENSG00000286129,9:40571871-22:22318899,-,+,exonic(no-known-CDS),exonic(no-known-CDS),.,.,373,FusionCatcher
123,GATAD2A::LINC02684__19:19386138-11:135061947,GATAD2A::LINC02684,19:19386138-11:135061947,+,-,UTR,exonic(no-known-CDS),.,.,373,FusionCatcher
124,NAIP::OCLN__5:70979869-5:69534694,NAIP::OCLN,5:70979869-5:69534694,-,+,out-of-frame,.,.,.,373,FusionCatcher


In [None]:
# Scan the combined parquet lazily, collect it, then convert the result to
# pandas (requesting pyarrow extension arrays) and preview it.
combined_pq_df = (
    pl.scan_parquet('../data/minimal-test/373T-collated-FT-UNFILTERED.parquet')
    .collect()
    .to_pandas(use_pyarrow_extension_array=True)
)
combined_pq_df

In [6]:
# #show only unique breakpointIDs
# collated_unique_df = collated_df.drop_duplicates(subset=['breakpointID'])
# collated_unique_df.to_csv('../data/minimal-test/373T-collated-FT-UNFILTERED-collate-v2-unique.tsv', sep='\t', index=False)


In [24]:
# Rebind `combined_pq_df` to the collated v2 TSV (tab-separated); the
# de-duplication cell below works from this frame.
combined_pq_df = pd.read_csv(
    '../data/minimal-test/373T-collated-FT-UNFILTERED-collate-v2.tsv',
    delimiter='\t',
)

In [28]:
#### 1. Get unique breakpointIDs from the combined ft dataframe based on toolID and add detection information

# Split the dataframe by toolID
arriba_df = combined_pq_df[combined_pq_df['toolID'] == 'Arriba'].copy()
fusioncatcher_df = combined_pq_df[combined_pq_df['toolID'] == 'FusionCatcher'].copy()

# Keep one row per breakpointID within each tool (the first occurrence wins)
arriba_unique_df = arriba_df.drop_duplicates(subset=['breakpointID']).copy()
fusioncatcher_unique_df = fusioncatcher_df.drop_duplicates(subset=['breakpointID']).copy()

# Sets of breakpointIDs seen by each tool, and the ones seen by both
arriba_breakpoints = set(arriba_unique_df['breakpointID'])
fusioncatcher_breakpoints = set(fusioncatcher_unique_df['breakpointID'])
common_breakpoints = arriba_breakpoints & fusioncatcher_breakpoints

# Tag every row with the tool that reported it
arriba_unique_df = arriba_unique_df.assign(detectedBy='Arriba')
fusioncatcher_unique_df = fusioncatcher_unique_df.assign(detectedBy='FusionCatcher')

# Arriba rows whose breakpoint FusionCatcher also reported are relabelled 'Both'
mask = arriba_unique_df['breakpointID'].isin(common_breakpoints)
arriba_unique_df.loc[mask, 'detectedBy'] = 'Both'

# Shared breakpoints are already represented by the Arriba row (tagged 'Both'),
# so only FusionCatcher rows that Arriba did not report are kept
fusioncatcher_exclusive_df = fusioncatcher_unique_df[~fusioncatcher_unique_df['breakpointID'].isin(arriba_breakpoints)]

# Combine Arriba rows with FusionCatcher-exclusive rows
combined_unique_df = pd.concat([arriba_unique_df, fusioncatcher_exclusive_df])

# 2. Filter out breakpoints seen in TCGA normals
# NOTE: this TSV is the file written by the "Postprocess TCGA Normal" cell
# further down in this notebook, so that cell must have been run at least once
# before this one.
panel_normals_uniq_df = pd.read_csv('../output/TCGANormals/Arr-and-FC_TCGANormals-UNIQUE-breakpointID-list.tsv', sep='\t')

# Unique breakpointIDs from the panel of normals
# (the original had a bare `panel_normals_uniq_fts` expression here; mid-cell
# it displays nothing, so it has been removed)
panel_normals_uniq_fts = set(panel_normals_uniq_df['breakpointID'])

# Drop every breakpoint that also appears in the panel of normals
final_unique_df = combined_unique_df[~combined_unique_df['breakpointID'].isin(panel_normals_uniq_fts)]

# Save the result
final_unique_df.to_csv('../output/minimal-test/373T-combined-FT-uniq-FILTERED.tsv', sep='\t', index=False)

### ASIDE: Postprocess TCGA Normal to get unique breakpoints and add detection information

In [15]:
#### Get unique breakpointIDs from the combined ft dataframe based on toolID and add detection information

# Load the TCGA normals (all Arr and FC) dataframe: scan the parquet lazily,
# collect it, then convert to pandas
panel_normals_df = pl.scan_parquet('../output/TCGANormals/Arr_FC-Normals-concat-FT-all-unfilt-list-v2.parquet').collect().to_pandas(use_pyarrow_extension_array=True)

# Split the dataframe by toolID
arriba_norm_df = panel_normals_df[panel_normals_df['toolID'] == 'Arriba'].copy()
fusioncatcher_norm_df = panel_normals_df[panel_normals_df['toolID'] == 'FusionCatcher'].copy()

# Keep one row per breakpointID within each tool (the first occurrence wins)
arriba_norm_unique_df = arriba_norm_df.drop_duplicates(subset=['breakpointID']).copy()
fusioncatcher_norm_unique_df = fusioncatcher_norm_df.drop_duplicates(subset=['breakpointID']).copy()

# Sets of breakpointIDs seen by each tool, and the ones seen by both
arriba_norm_breakpoints = set(arriba_norm_unique_df['breakpointID'])
fusioncatcher_norm_breakpoints = set(fusioncatcher_norm_unique_df['breakpointID'])
common_norm_breakpoints = arriba_norm_breakpoints & fusioncatcher_norm_breakpoints

# Tag every row with the tool that reported it
arriba_norm_unique_df = arriba_norm_unique_df.assign(detectedBy='Arriba')
fusioncatcher_norm_unique_df = fusioncatcher_norm_unique_df.assign(detectedBy='FusionCatcher')

# Arriba rows whose breakpoint FusionCatcher also reported are relabelled 'Both'
mask = arriba_norm_unique_df['breakpointID'].isin(common_norm_breakpoints)
arriba_norm_unique_df.loc[mask, 'detectedBy'] = 'Both'

# Shared breakpoints are already represented by the Arriba row (tagged 'Both'),
# so only FusionCatcher rows that Arriba did not report are kept
fusioncatcher_norm_exclusive_df = fusioncatcher_norm_unique_df[~fusioncatcher_norm_unique_df['breakpointID'].isin(arriba_norm_breakpoints)]

# Combine Arriba rows with FusionCatcher-exclusive rows
combined_norm_unique_df = pd.concat([arriba_norm_unique_df, fusioncatcher_norm_exclusive_df])

# Save the result
combined_norm_unique_df.to_csv('../output/TCGANormals/Arr-and-FC_TCGANormals-UNIQUE-breakpointID-list.tsv', sep='\t', index=False)

# Preview the de-duplicated panel. The original placed a bare
# `combined_norm_unique_df` before the save, where it was never rendered;
# as the cell's final expression it is actually displayed.
combined_norm_unique_df.head()