# Notebook for demonstrating evidence matching between assayed fusions and categorical fusions

In [1]:
import warnings
from os import environ

# Suppress every warning category for the rest of the session to keep cell
# output readable
warnings.filterwarnings("ignore")

# Database connection strings for the local environment (presumably read by
# the gene normalizer and UTA clients that FUSOR relies on -- confirm)
environ.update(
    {
        "GENE_NORM_DB_URL": "postgresql://postgres@localhost:5432/gene_normalizer",
        "UTA_DB_URL": "postgresql://uta_admin:uta@localhost:5432/uta/uta_20240523b",
    }
)

In [2]:
from civicpy import civic

from fusor.fusor import FUSOR
from fusor.translator import Translator

# Shared FUSOR instance that the translator below is built on
fusor = FUSOR()
# Translator used by later cells to convert STAR-Fusion and CIViC records
# into fusion objects
translator = Translator(fusor=fusor)

In [3]:
# Generate KIF5B::RET AssayedFusion from StarFusion file
from pathlib import Path

from cool_seq_tool.schemas import Assembly, CoordinateType

from fusor.harvester import StarFusionHarvester

path = Path("../../../tests/fixtures/star-fusion.fusion_predictions.abridged.tsv")
harvester = StarFusionHarvester()
fusions_list = harvester.load_records(path)
assayed_fusion = await translator.from_star_fusion(
        fusions_list[0],
        CoordinateType.RESIDUE.value,
        Assembly.GRCH38.value
    )

In [4]:
# Load fusion variants from CIViC, requesting only those whose status is
# "accepted"
variants = civic.get_all_fusion_variants(include_status="accepted")

In [5]:
# Generate CategoricalFusion objects from CIViC
from fusor.harvester import CIVICHarvester

harvester = CIVICHarvester(fusions_list=variants)
fusions_list = harvester.load_records()

categorical_fusions = []
for fusion in fusions_list:
    if "?" in fusion.vicc_compliant_name:
        continue
    cex = await translator.from_civic(civic=fusion)
    categorical_fusions.append(cex)



In [6]:
# Define evidence matching method
from fusor.models import (
    AssayedFusion,
    CategoricalFusion,
    GeneElement,
    MultiplePossibleGenesElement,
    TranscriptSegmentElement,
    UnknownGeneElement,
)


def compare_structure(
    assayed_element: TranscriptSegmentElement | UnknownGeneElement | GeneElement,
    categorical_element: TranscriptSegmentElement | MultiplePossibleGenesElement | GeneElement,
    is_five_prime_partner: bool,
) -> int:
    """Score how well one assayed fusion partner matches a categorical partner

    :param assayed_element: The assayed fusion partner: a transcript segment
        element, an unknown gene element, or a gene element
    :param categorical_element: The categorical fusion partner: a transcript
        segment element, a multiple possible genes element, or a gene element
    :param is_five_prime_partner: If the 5' fusion partner is being compared.
        The 5' partner is compared on its end fields, the 3' partner on its
        start fields
    :return: A score indicating the degree of match. 1 if either element is a
        wildcard (unknown gene / multiple possible genes), 0 if the genes
        differ, otherwise 1 for the shared gene plus 1 for a matching
        transcript and 1 for each matching exon/offset/genomic-location field
    """
    # If the assayed partner is unknown OR the categorical partner can be any
    # gene, there is nothing more specific to compare: return a score of 1
    if isinstance(assayed_element, UnknownGeneElement) or isinstance(
        categorical_element, MultiplePossibleGenesElement
    ):
        return 1

    # Compare gene partners first. Different genes can never be a match, so do
    # not let coincidentally equal exon numbers or offsets add to the score.
    if assayed_element.gene == categorical_element.gene:
        match_score = 1
    else:
        return 0

    # Then compare transcript partners if transcript data exists
    if isinstance(assayed_element, TranscriptSegmentElement) and isinstance(
        categorical_element, TranscriptSegmentElement
    ):
        if (
            assayed_element.transcript
            and categorical_element.transcript
            and assayed_element.transcript == categorical_element.transcript
        ):
            match_score += 1

        if is_five_prime_partner:
            fields_to_compare = ("exonEnd", "exonEndOffset", "elementGenomicEnd")
        else:
            fields_to_compare = ("exonStart", "exonStartOffset", "elementGenomicStart")

        for field in fields_to_compare:
            assayed_value = getattr(assayed_element, field)
            # A field that is unset (None) carries no evidence; two missing
            # values must not be counted as agreement
            if assayed_value is not None and assayed_value == getattr(
                categorical_element, field
            ):
                match_score += 1

    return match_score

def compare_fusion(assayed_fusion: AssayedFusion, categorical_fusion: CategoricalFusion) -> int:
    """Compare assayed and categorical fusions

    :param assayed_fusion: AssayedFusion object
    :param categorical_fusion: CategoricalFusion object
    :return: Match score: the sum of the 5' and 3' partner scores from
        ``compare_structure``, plus 1 when both structures have exactly three
        elements and their middle elements are equal
    """
    assayed_structure = assayed_fusion.structure
    categorical_structure = categorical_fusion.structure
    match_score = 0

    # Check for linker elements: an equal middle element in two three-element
    # structures earns one point
    if (
        len(assayed_structure) == len(categorical_structure) == 3
        and assayed_structure[1] == categorical_structure[1]
    ):
        match_score += 1

    # Always compare the fusion partners themselves. The 5' partner is the
    # first element and the 3' partner is the LAST element; using index 1 for
    # the 3' partner would pick up the linker in a three-element structure.
    match_score += compare_structure(assayed_structure[0], categorical_structure[0], True)
    match_score += compare_structure(assayed_structure[-1], categorical_structure[-1], False)
    return match_score

def match_fusion(assayed_fusion: AssayedFusion, categorical_fusions: list[CategoricalFusion]) -> tuple[list[CategoricalFusion], int]:
    """Return best matching fusion

    :param assayed_fusion: The assayed fusion object
    :param categorical_fusions: A list of categorical fusion objects
    :return: A tuple of the best matching categorical fusion objects (all
        fusions tied for the highest score) and that score. When no
        categorical fusion scores above 0 the result is ``([], 0)``
    """
    best_matches = []
    best_score = 0
    for categorical_fusion in categorical_fusions:
        match_score = compare_fusion(assayed_fusion, categorical_fusion)
        # A score of 0 means nothing matched; without this guard every
        # non-matching fusion would tie with the initial best score of 0 and
        # be reported as a "best match"
        if match_score <= 0:
            continue
        if match_score > best_score:
            best_matches = [categorical_fusion]
            best_score = match_score
        elif match_score == best_score:
            best_matches.append(categorical_fusion)
    return (best_matches, best_score)

In [7]:
# Find the best-scoring categorical fusions for the assayed fusion. `matches`
# is a tuple of (list of matching CategoricalFusion objects, match score);
# the bare expression below displays it as the cell output.
matches = match_fusion(assayed_fusion, categorical_fusions)
matches

([CategoricalFusion(type=<FUSORTypes.CATEGORICAL_FUSION: 'CategoricalFusion'>, regulatoryElement=None, structure=[TranscriptSegmentElement(type=<FUSORTypes.TRANSCRIPT_SEGMENT_ELEMENT: 'TranscriptSegmentElement'>, transcript='refseq:NM_004521.3', exonStart=None, exonStartOffset=None, exonEnd=24, exonEndOffset=0, gene=MappableConcept(id=None, extensions=None, conceptType='Gene', name='KIF5B', primaryCoding=Coding(id='hgnc:6324', extensions=None, name=None, system='https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/', systemVersion=None, code=code(root='HGNC:6324'), iris=None), mappings=None), elementGenomicStart=None, elementGenomicEnd=SequenceLocation(id='ga4gh:SL.nk8wv9yKzCFQ0n7Ph2JnJhOkf2Fzfh_U', type='SequenceLocation', name=None, description=None, aliases=None, extensions=None, digest='nk8wv9yKzCFQ0n7Ph2JnJhOkf2Fzfh_U', sequenceReference=SequenceReference(id='refseq:NC_000010.11', type='SequenceReference', name=None, description=None, aliases=None, extensions=None, refgetA

In [None]:
# View the attributes of the first evidence item on the first CIViC molecular
# profile linked to the first matched categorical fusion. Assumes at least one
# match was found and that it has a molecular profile with an evidence item;
# otherwise the indexing fails.
matches[0][0].civicMolecularProfiles[0].evidence_items[0].__dict__

{'_assertions': [],
 '_therapies': [<CIViC therapy 117>],
 '_phenotypes': [],
 '_incomplete': {'therapies'},
 '_partial': False,
 'type': 'evidence',
 'id': 698,
 'variant_origin': 'SOMATIC',
 'therapy_interaction_type': None,
 'therapy_ids': [117],
 'status': 'accepted',
 'source_id': 378,
 'significance': 'SENSITIVITYRESPONSE',
 'rating': 2,
 'phenotype_ids': [],
 'name': 'EID698',
 'molecular_profile_id': 269,
 'evidence_type': 'PREDICTIVE',
 'evidence_level': 'C',
 'evidence_direction': 'SUPPORTS',
 'disease_id': 30,
 'description': 'A case study of a patient with EGFR, KRAS, BRAF, HER2, ALK, ROS1 and MET negative adenocarcinoma of the lung. FISH analysis revealed a KIF5B-RET fusion. The RET inhibitor Vandetanib led to remission in the patient.',
 'assertion_ids': [],
 '_include_status': ['accepted', 'submitted', 'rejected']}