In [1]:
# Path (relative to the notebook's working directory) of the input .vcf file
# that the parsing cell below reads line by line.
file = "./FP.vcf"

In [2]:
from dataclasses import dataclass

In [4]:
@dataclass
class Event: 
    """One fusion record with its two breakpoints, as read from the input file.

    All fields hold the raw strings taken from the file; nothing is converted
    to a number here, so callers that need arithmetic on ``pos1``/``pos2``
    must call ``int()`` themselves.
    """

    # Identifiers: ``eid`` comes from the record's third column, ``tid`` from
    # the third-from-last INFO entry (see the parsing cell).
    eid: str
    tid: str
    
    # First breakpoint: chromosome name, position and gene name.
    chr1: str
    pos1: str
    gene1: str
    
    # Second breakpoint: chromosome name, position and gene name.
    chr2: str
    pos2: str 
    gene2: str
    
    # Value of the third INFO entry.
    # NOTE(review): presumably a supporting-read count — confirm against the VCF header.
    sr: str

In [5]:
# Parse every data record of the input file into an Event.
#
# The INFO column (9th tab/whitespace-separated field, index 7) is read
# purely by position, not by key:
#   first three entries  -> chr2, pos2, sr
#   last three entries   -> tid, gene1, gene2
# NOTE(review): this breaks silently if the producer reorders INFO keys —
# confirm the layout against the file's header before reusing on other input.
event_list = []
with open(file) as vcf:  # context manager so the handle is always closed
    for line in vcf:
        # Skip header/meta lines, and blank lines (which would otherwise
        # raise IndexError when indexing the split columns).
        if line.startswith("#") or not line.strip():
            continue

        columns = line.split()
        info = columns[7].split(";")
        # Keep only the text after the last "=" of every INFO entry.
        values = [entry.split("=")[-1] for entry in info]

        event = Event(
            eid=columns[2],
            tid=values[-3],
            chr1=columns[0],
            pos1=columns[1],
            gene1=values[-2],
            chr2=values[0],
            pos2=values[1],
            gene2=values[-1],
            sr=values[2],
        )
        event_list.append(event)

In [7]:
# Show the parsed events to sanity-check the column/INFO field mapping.
event_list

[Event(eid='GF01', tid='GF01', chr1='chr1', pos1='44776032', gene1='RPS8', chr2='chr19', pos2='49491840', gene2='RPL13A', sr='6'),
 Event(eid='GF02', tid='GF02', chr1='chr11', pos1='809986', gene1='RPLP2', chr2='chr6', pos2='34417453', gene2='RPS10-NUDT3', sr='4'),
 Event(eid='GF03', tid='GF03', chr1='chr16', pos1='1962059', gene1='RPS2', chr2='chr3', pos2='101576943', gene2='PCNP', sr='7'),
 Event(eid='GF04', tid='GF04', chr1='chr16', pos1='1964627', gene1='RPS2', chr2='chr3', pos2='101576437', gene2='PCNP', sr='6'),
 Event(eid='GF05', tid='GF05', chr1='chr11', pos1='63974627', gene1='COX8A', chr2='chrX', pos2='154400912', gene2='RPL10', sr='5'),
 Event(eid='GF06', tid='GF06', chr1='chr2', pos1='101002701', gene1='RPL31', chr2='chr5', pos2='181236932', gene2='RACK1', sr='4'),
 Event(eid='GF07', tid='GF07', chr1='chr14', pos1='49586356', gene1='RPS29', chr2='chr6', pos2='34417453', gene2='RPS10-NUDT3', sr='8'),
 Event(eid='GF08', tid='GF08', chr1='chr19', pos1='58394803', gene1='RPS5',

In [10]:
from collections import defaultdict

# gene name -> every breakpoint position recorded for that gene, in event
# order (duplicates are kept; each event contributes its gene1 position
# first, then its gene2 position).
breakpoints = defaultdict(list)

for event in event_list:
    for gene, pos in ((event.gene1, event.pos1), (event.gene2, event.pos2)):
        breakpoints[gene].append(pos)
    

In [11]:
# Inspect the per-gene lists of breakpoint positions.
breakpoints

defaultdict(list,
            {'RPS8': ['44776032',
              '44776032',
              '44778739',
              '44775583',
              '44776069',
              '44776032',
              '44778737'],
             'RPL13A': ['49491840',
              '49487619',
              '49491839',
              '49491840',
              '49491836',
              '49487619',
              '49491840',
              '49491840',
              '49489846',
              '49491840',
              '49487618',
              '49491840'],
             'RPLP2': ['809986',
              '812875',
              '809979',
              '809975',
              '809977',
              '809977',
              '812871',
              '809976',
              '809975',
              '809987',
              '812865'],
             'RPS10-NUDT3': ['34417453',
              '34417453',
              '34417459',
              '34426061',
              '34417453',
              '34417453',
              '34417453

In [12]:
# Collapse breakpoints: give every event the FIRST position recorded for each
# of its genes, so all events touching a gene share a single coordinate.
# Caution: this overwrites pos1/pos2 in place; rebuilding `breakpoints`
# after this cell would be based on the collapsed positions, not the raw ones.
for event in event_list:
    first_pos_gene1 = breakpoints[event.gene1][0]
    first_pos_gene2 = breakpoints[event.gene2][0]
    event.pos1, event.pos2 = first_pos_gene1, first_pos_gene2

    
    

In [13]:
def write_to_infusion(event_list, file_name):
    """Write events to ``file_name`` as a tab-separated fusion table.

    The file starts with one header line (``#id``, ``ref1``, ``break_pos1``,
    ``region1``, ``ref2``, ``break_pos2``, ``region2``, ``num_span``,
    ``num_paired``, ``genes_1``, ``genes_2``, ``fusion_class``) followed by
    one row per event, numbered from 0 in input order.

    Args:
        event_list: iterable of objects exposing ``chr1``, ``pos1``,
            ``gene1``, ``chr2``, ``pos2`` and ``gene2`` attributes.
            Positions must be integers or integer-like strings.
        file_name: path of the file to create (overwritten if it exists).

    Raises:
        ValueError: if a position cannot be converted with ``int()``.
    """
    header_columns = (
        "#id", "ref1", "break_pos1", "region1",
        "ref2", "break_pos2", "region2",
        "num_span", "num_paired", "genes_1", "genes_2", "fusion_class",
    )

    def _strip_chr_prefix(name):
        # Drop a leading "chr" only when it is really there, so names that
        # are already bare ("1", "X") are not truncated to an empty string.
        return name[3:] if name[:3].lower() == "chr" else name

    # Support counts are written as fixed placeholders.
    # NOTE(review): event.sr is not used here — confirm whether it should
    # populate num_span / num_paired instead.
    num_span = 1
    num_paired = 1

    with open(file_name, "w") as f:
        f.write("\t".join(header_columns) + "\n")

        for ind, event in enumerate(event_list):
            chr1 = _strip_chr_prefix(event.chr1)
            chr2 = _strip_chr_prefix(event.chr2)

            # Compare the normalised names so "chr1" and "1" count as the
            # same chromosome.
            if chr1 == chr2:
                fusion_class = "intra-chromosomal"
            else:
                fusion_class = "inter-chromosomal"

            # Each region is the one-base interval [pos, pos + 1].
            fields = (
                ind,
                chr1,
                event.pos1,
                f"[{event.pos1},{int(event.pos1) + 1}]",
                chr2,
                event.pos2,
                f"[{event.pos2},{int(event.pos2) + 1}]",
                num_span,
                num_paired,
                event.gene1,
                event.gene2,
                fusion_class,
            )
            f.write("\t".join(str(field) for field in fields) + "\n")

In [14]:
# Export the events to a tab-separated file; the relative path means it is
# created in the notebook's current working directory.
write_to_infusion(event_list, "vcap_002_gene_fusion_infusion_sr1_test.txt")

In [30]:
# IPython shell escape: print the current working directory, i.e. where the
# relative input/output paths used above resolve.
!pwd

/Users/ylk4626/writing_project/deepchopper-paper/figures/supplementary_figures/gene_fusion_circle


In [41]:
# Number of events parsed from the input file.
len(event_list)

199

In [47]:
# One gene pair per event, with the two gene names sorted so that
# (A, B) and (B, A) produce the same tuple.
gene_pairs = [
    tuple(sorted((ev.gene1, ev.gene2)))
    for ev in event_list
]

In [48]:
# One pair per event, so this matches len(event_list).
len(gene_pairs)

199

In [52]:
# Distinct gene pairs across all events (order-independent, duplicates removed).
set(gene_pairs)

{('ACTG1', 'RPS16'),
 ('ACTG1', 'RPS3'),
 ('ACTG1', 'RPS5'),
 ('AMBRA1', 'RPL12'),
 ('AMBRA1', 'RPL13A'),
 ('AMBRA1', 'RPS2'),
 ('AMBRA1', 'RPS3A'),
 ('AMBRA1', 'RPSA'),
 ('ANK2', 'RPS18'),
 ('ANK2', 'RPSA'),
 ('COX8A', 'RPL10'),
 ('COX8A', 'RPL6'),
 ('COX8A', 'RPLP2'),
 ('COX8A', 'RPS11'),
 ('COX8A', 'RPS29'),
 ('DGKI', 'OOEP'),
 ('DGKI', 'RPS3A'),
 ('EEF1A1P5', 'RPL30'),
 ('EEF1G', 'RACK1'),
 ('EEF1G', 'RPLP2'),
 ('ENO1', 'RPL13A'),
 ('FAU', 'RPL7A'),
 ('FAU', 'RPS10-NUDT3'),
 ('FTL', 'RPL19'),
 ('FTL', 'RPL7A'),
 ('HSP90B1', 'TUBA1B'),
 ('IL6ST', 'RPS3'),
 ('ITM2A', 'RPL7A'),
 ('KLK2', 'RPL10'),
 ('KLK2', 'RPL19'),
 ('MT-CO1', 'MT-RNR2'),
 ('MT-ND2', 'RACK1'),
 ('MT-ND3', 'RPL28'),
 ('NACA', 'RPS2'),
 ('NBEAL1', 'RPS3A'),
 ('OOEP', 'RPS3'),
 ('PCNP', 'RPL17-C18orf32'),
 ('PCNP', 'RPL7A'),
 ('PCNP', 'RPS2'),
 ('PCNP', 'RPS3A'),
 ('PIN4', 'RPS11'),
 ('PPIA', 'RPS12'),
 ('PPIA', 'UBE2Q2P2'),
 ('RACK1', 'RPL10'),
 ('RACK1', 'RPL13A'),
 ('RACK1', 'RPL30'),
 ('RACK1', 'RPL31'),
 ('RACK1',