# Notebook for matching between assayed fusions

In [1]:
import warnings
from os import environ

# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings("ignore")

# Connection settings for the UTA and SeqRepo databases. Edit both values so
# they point at wherever these databases live in your environment.
environ.update(
    {
        "UTA_DB_URL": "postgresql://anonymous@localhost:5432/uta/uta_20241220",
        "SEQREPO_ROOT_DIR": "/usr/local/share/seqrepo/2024-12-20",
    }
)

### Load the FUSOR module
Run the cell below to import FUSOR and create the `fusor` instance used throughout this notebook

In [2]:
from fusor.fusor import FUSOR

# Shared FUSOR instance used by the harvester and the gene-element helpers in
# later cells; on creation it reports the gene database endpoint in use.
fusor = FUSOR()

***Using Gene Database Endpoint: http://localhost:8000***


### Generate list of AssayedFusion objects from STAR-Fusion 

Run the cell below to generate a list of queried AssayedFusion objects from a file of STAR-Fusion output

In [3]:
# Generate AssayedFusion list from STAR-Fusion file
from pathlib import Path

from cool_seq_tool.schemas import Assembly

from fusor.harvester import StarFusionHarvester

path = Path("../../tests/fixtures/star-fusion.fusion_predictions.abridged.tsv")
harvester = StarFusionHarvester(fusor=fusor, assembly=Assembly.GRCH38.value)
fusions_list = await harvester.load_records(path)

assayed_fusion_star_fusion = [fusions_list[1]] # Use EML4::ALK fusion as querying example

Unable to get MANE Transcript data for gene: RN7SKP80
Could not find a transcript for RN7SKP80 on NC_000022.11
Unable to get MANE Transcript data for gene: RN7SKP118
Could not find a transcript for RN7SKP118 on NC_000016.10
Gene does not exist in UTA: AC021660.2
Unable to get MANE Transcript data for gene: EEF1A1P13
Could not find a transcript for EEF1A1P13 on NC_000005.10
Gene does not exist in UTA: AC098590.1
Gene does not exist in UTA: AC099789.1
Unable to get MANE Transcript data for gene: USP27X-DT
38584945 on NC_000021.9 occurs more than 150 bp outside the exon boundaries of the NM_182918.4 transcript, indicating this may not be a chimeric transcript junction and is unlikely to represent a contiguous coding sequence. Confirm that the genomic position 38584945 is being used to represent transcript junction and not DNA breakpoint.
Unable to get MANE Transcript data for gene: LINC00158
Gene does not exist in UTA: AP001341.1
Gene does not exist in UTA: AC021660.2
Gene does not exist 

### Generate list of comparator AssayedFusion objects

Run the cell below to generate a list of comparator AssayedFusion objects

In [4]:
from fusor.models import AssayedFusion

# Comparator fusions, labelled A-F so the expected matches can be named in the
# text. Each structure lists the 5' partner first and the 3' partner second.
A = AssayedFusion(  # EML4 / ALK
    structure=[fusor.gene_element("EML4")[0], fusor.gene_element("ALK")[0]]
)
B = AssayedFusion(  # EML4 / BRAF
    structure=[fusor.gene_element("EML4")[0], fusor.gene_element("BRAF")[0]]
)
C = AssayedFusion(  # TPM3 / ALK
    structure=[fusor.gene_element("TPM3")[0], fusor.gene_element("ALK")[0]]
)
D = assayed_fusion_star_fusion[0]  # the queried STAR-Fusion record itself
E = AssayedFusion(  # EML4 with an unknown 3' partner
    structure=[fusor.gene_element("EML4")[0], fusor.unknown_gene_element()]
)
F = AssayedFusion(  # unknown 5' partner with ALK
    structure=[fusor.unknown_gene_element(), fusor.gene_element("ALK")[0]]
)
comparator_set = [A, B, C, D, E, F]

### Generate AssayedAssayedMatching object

Run the cell below to generate an `AssayedAssayedMatching` object

In [5]:
from fusor.fusion_matching import AssayedAssayedMatching

# Pair the single queried fusion with the six comparator fusions (A-F).
aam = AssayedAssayedMatching(
    assayed_fusions_query=assayed_fusion_star_fusion,
    assayed_fusions_comparator=comparator_set,
)

### Perform gene symbol matching

Perform matching where both gene symbol partners in the comparator set match
those in the queried fusion ("EML4", "ALK"). We would expect fusions A and D
to be returned.

In [6]:
# Gene-symbol matching on both partners of the queried EML4::ALK fusion.
await aam.match_fusion(gene_partner_match=True)

[[AssayedFusion(type=<FUSORTypes.ASSAYED_FUSION: 'AssayedFusion'>, regulatoryElement=None, structure=[GeneElement(type=<FUSORTypes.GENE_ELEMENT: 'GeneElement'>, gene=MappableConcept(id=None, extensions=None, conceptType='Gene', name='EML4', primaryCoding=Coding(id='hgnc:1316', extensions=None, name=None, system='https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/', systemVersion=None, code=code(root='HGNC:1316'), iris=None), mappings=None)), GeneElement(type=<FUSORTypes.GENE_ELEMENT: 'GeneElement'>, gene=MappableConcept(id=None, extensions=None, conceptType='Gene', name='ALK', primaryCoding=Coding(id='hgnc:427', extensions=None, name=None, system='https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/', systemVersion=None, code=code(root='HGNC:427'), iris=None), mappings=None))], readingFramePreserved=None, viccNomenclature=None, causativeEvent=None, assay=None, contig=None, readData=None),
  AssayedFusion(type=<FUSORTypes.ASSAYED_FUSION: 'AssayedFusion'>, regulatoryEl

### Perform gene symbol matching on 5' partner

Perform matching where the 5' partner in the query matches either partner in the
comparator. We would expect fusions A, B, D, and E to be returned.

In [7]:
# Gene-symbol matching on the 5' partner of the query (EML4).
await aam.match_fusion(five_prime_match=True)

[[AssayedFusion(type=<FUSORTypes.ASSAYED_FUSION: 'AssayedFusion'>, regulatoryElement=None, structure=[GeneElement(type=<FUSORTypes.GENE_ELEMENT: 'GeneElement'>, gene=MappableConcept(id=None, extensions=None, conceptType='Gene', name='EML4', primaryCoding=Coding(id='hgnc:1316', extensions=None, name=None, system='https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/', systemVersion=None, code=code(root='HGNC:1316'), iris=None), mappings=None)), GeneElement(type=<FUSORTypes.GENE_ELEMENT: 'GeneElement'>, gene=MappableConcept(id=None, extensions=None, conceptType='Gene', name='ALK', primaryCoding=Coding(id='hgnc:427', extensions=None, name=None, system='https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/', systemVersion=None, code=code(root='HGNC:427'), iris=None), mappings=None))], readingFramePreserved=None, viccNomenclature=None, causativeEvent=None, assay=None, contig=None, readData=None),
  AssayedFusion(type=<FUSORTypes.ASSAYED_FUSION: 'AssayedFusion'>, regulatoryEl

### Perform gene symbol matching on 3' partner

Perform matching where the 3' partner in the query matches either partner in the
comparator. We would expect fusions A, C, D, and F to be returned.

In [8]:
# Gene-symbol matching on the 3' partner of the query (ALK).
await aam.match_fusion(three_prime_match=True)

[[AssayedFusion(type=<FUSORTypes.ASSAYED_FUSION: 'AssayedFusion'>, regulatoryElement=None, structure=[GeneElement(type=<FUSORTypes.GENE_ELEMENT: 'GeneElement'>, gene=MappableConcept(id=None, extensions=None, conceptType='Gene', name='EML4', primaryCoding=Coding(id='hgnc:1316', extensions=None, name=None, system='https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/', systemVersion=None, code=code(root='HGNC:1316'), iris=None), mappings=None)), GeneElement(type=<FUSORTypes.GENE_ELEMENT: 'GeneElement'>, gene=MappableConcept(id=None, extensions=None, conceptType='Gene', name='ALK', primaryCoding=Coding(id='hgnc:427', extensions=None, name=None, system='https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/', systemVersion=None, code=code(root='HGNC:427'), iris=None), mappings=None))], readingFramePreserved=None, viccNomenclature=None, causativeEvent=None, assay=None, contig=None, readData=None),
  AssayedFusion(type=<FUSORTypes.ASSAYED_FUSION: 'AssayedFusion'>, regulatoryEl

### Perform deep matching

Perform deep matching for the queried fusion against the comparator set.
This looks at the transcript accession, exon number, exon offset, and genomic
breakpoint.

In this case, we would expect D to return the highest score as it is exactly the
fusion from the STAR-Fusion output. We would then expect A as it has the same
partners, followed by fusions E and F as one partner is unknown for each 
fusion.


In [9]:
# No partner-matching flag is passed: this is the deep-matching run, which per
# the notebook text considers transcript accession, exon number, exon offset,
# and genomic breakpoint.
await aam.match_fusion()

[[(AssayedFusion(type=<FUSORTypes.ASSAYED_FUSION: 'AssayedFusion'>, regulatoryElement=None, structure=[TranscriptSegmentElement(type=<FUSORTypes.TRANSCRIPT_SEGMENT_ELEMENT: 'TranscriptSegmentElement'>, transcript='refseq:NM_019063.5', strand=<Strand.POSITIVE: 1>, exonStart=None, exonStartOffset=None, exonEnd=13, exonEndOffset=0, gene=MappableConcept(id=None, extensions=None, conceptType='Gene', name='EML4', primaryCoding=Coding(id='hgnc:1316', extensions=None, name=None, system='https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/', systemVersion=None, code=code(root='HGNC:1316'), iris=None), mappings=None), elementGenomicStart=None, elementGenomicEnd=SequenceLocation(id='ga4gh:SL.PQzV-kfeCQ4MBmxD5mSHqZmId3I_f-Ib', type='SequenceLocation', name=None, description=None, aliases=None, extensions=None, digest='PQzV-kfeCQ4MBmxD5mSHqZmId3I_f-Ib', sequenceReference=SequenceReference(id='refseq:NC_000002.12', type='SequenceReference', name=None, description=None, aliases=None, extensi