In [1]:
# Parse gene annotations to populate circos plots

In [2]:
import pandas as pd
import os
import subprocess
import pybedtools

In [3]:
def parse_attributes_gff3(string):
    '''
    Parse the attributes column of a gff3 record into a dict.
    Inputs:
        string: semicolon-separated "key=value" pairs, e.g. "ID=x;gene_name=MYC"
    Returns:
        dict mapping each attribute key to its value (both strings)
    Raises:
        ValueError if a non-empty field has no '=' separator
    '''
    attributes = {}
    for field in string.split(';'):
        field = field.strip()
        if not field:
            # a trailing ';' (or ';;') leaves an empty field; nothing to record
            continue
        # split on the first '=' only, so a value that itself contains '=' survives
        key, separator, value = field.partition('=')
        if not separator:
            raise ValueError("malformed gff3 attribute (expected key=value): %r" % field)
        attributes[key] = value
    return attributes
def parse_exon_name_gff3(string):
    '''
    Build an exon label of the form "<gene_name>_<exon_number>".
    Inputs:
        string: a gff3 attributes column that holds gene_name and exon_number
    '''
    attrs = parse_attributes_gff3(string)
    return '_'.join([attrs['gene_name'], attrs['exon_number']])


def bed_features_from_gff3(gff3_file,mode="genes"):
    '''
    Prep function.
    Takes the gencode gff3 file, keeps the gene (or exon) records and writes
    them as a 4-column bed file (chr, start, end, name) next to the input,
    named <input without extension>_genes.bed or ..._exons.bed.
    Inputs:
        gff3_file: path to the gencode gff3 annotation
        mode: may be "genes" or "exons"
    Returns:
        the DataFrame that was written (start already shifted by -1)
    Raises:
        NotImplementedError if mode is neither "genes" nor "exons"
    '''
    # Validate mode up front so a typo fails before the annotation is read.
    if mode == 'genes':
        feature_type = 'gene'
        name_record = lambda attributes: parse_attributes_gff3(attributes)['gene_name']
    elif mode == 'exons':
        feature_type = 'exon'
        name_record = parse_exon_name_gff3
    else:
        raise NotImplementedError("mode must be 'genes' or 'exons'.")

    # Strip the real extension (and an optional .gz) instead of assuming
    # the path ends in exactly the five characters ".gff3".
    stem = gff3_file[:-3] if gff3_file.endswith('.gz') else gff3_file
    stem = os.path.splitext(stem)[0]
    outfile = stem + '_' + mode + '.bed'

    gff3 = pd.read_table(gff3_file,comment='#',header=None,
                         names=['chr','source','type','start','end','score','strand','phase','attributes'],
                         usecols=['chr','type','start','end','attributes'])

    # .copy() so the new column is added to an independent frame, not to a
    # filtered view of the full table (avoids SettingWithCopyWarning).
    features = gff3[gff3.type == feature_type].copy()
    features['name'] = features.attributes.map(name_record)
    features = features[['chr','start','end','name']].copy()
    # gff3 starts are 1-based, bed starts are 0-based
    features['start'] -= 1
    features.to_csv(outfile,sep='\t',header=False,index=False)
    return features

def _read_karyotype_regions(karyotype_file):
    '''Return [(circos_chr, "name start end"), ...] for every data line of a karyotype file.'''
    regions = []
    with open(karyotype_file,'r') as f:
        for line_number, line in enumerate(f, start=1):
            # split() (not split(' ')) tolerates tabs, repeated spaces and the newline
            fields = line.split() #chr - chrom name start end color
            if not fields or fields[0].startswith('#'):
                # blank lines and comments carry no region
                continue
            if len(fields) < 6:
                raise ValueError("%s line %d: expected at least 6 fields, got %d"
                                 % (karyotype_file, line_number, len(fields)))
            regions.append((fields[2], ' '.join(fields[3:6])))
    return regions


def generate_annotations(karyotype_file,annot='hg38',mode='genes',genes=None,ref_dir=None):
    '''
    Get all the genes (or exons) overlapping the regions in a circos karyotype file
    and write them to <mode>.txt in the same directory as the karyotype file.
    Inputs:
        karyotype_file: circos karyotype file; each data line is read as
            "chr - circos_chr name start end [color]"
        annot: may be 'hg38' or 'hg19'
        mode: may be 'genes' or 'exons'
        genes: may specify a subset of genes to consider (iterable of gene names,
            or a single name).
        ref_dir: directory holding the gencode *_genes.bed / *_exons.bed references;
            defaults to the original hard-coded location.
    Returns:
        DataFrame with columns circos_chr, start, end, name (empty if nothing overlaps)
    Raises:
        NotImplementedError for an unsupported annot/mode combination
        ValueError for a karyotype data line with fewer than 6 fields
    '''
    reference_beds = {
        ('hg19','genes'): 'gencode.v34lift37.annotation_genes.bed',
        ('hg38','genes'): 'gencode.v33.basic.annotation_genes.bed',
        ('hg19','exons'): 'gencode.v34lift37.annotation_exons.bed',
        ('hg38','exons'): 'gencode.v33.basic.annotation_exons.bed',
    }
    try:
        ref_name = reference_beds[(annot, mode)]
    except (KeyError, TypeError):
        raise NotImplementedError("mode must be 'genes' or 'exons' and annot must be 'hg19' or 'hg38'.")
    if ref_dir is None:
        ref_dir = '/mnt/c/Users/ochapman/Documents/circos/genes'
    ref = os.path.join(ref_dir, ref_name)

    regions = _read_karyotype_regions(karyotype_file)

    # Collect one frame per region and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and re-copies the whole table on every iteration.
    frames = []
    if regions:
        reference = pybedtools.BedTool(ref)  # built once and reused for every region
        for circos_chr, interval in regions:
            region = pybedtools.BedTool(interval, from_string=True)
            hits = reference.intersect(region)#,wa=True)
            try:
                hits = hits.to_dataframe(disable_auto_names=True,header=None)
            except pd.errors.EmptyDataError:
                # nothing in the reference overlaps this region
                continue
            hits["circos_chr"] = circos_chr
            frames.append(hits)

    # convert to circos format.
    circos_columns = ['circos_chr','start','end','name']
    if frames:
        df = pd.concat(frames)
        df.columns = ['original_chr','start','end','name','circos_chr']
        df = df[circos_columns]
    else:
        # no overlaps at all: return and write an empty table instead of crashing
        df = pd.DataFrame(columns=circos_columns)

    if genes is not None:
        wanted = {genes} if isinstance(genes, str) else set(genes)
        if mode == 'exons':
            # exon names are "<gene>_<exon_number>"; strip only the last suffix so
            # gene names that contain '_' themselves are still matched
            gene_names = df['name'].map(lambda x: x.rsplit('_',1)[0])
        else:
            gene_names = df['name']
        df = df[gene_names.isin(wanted)]

    outfile = os.path.join(os.path.dirname(karyotype_file),mode+'.txt')
    df.to_csv(outfile,sep='\t',header=False,index=False)
    # remember to clear your /tmp! pybedtools.cleanup() removes the session's
    # temp files -- NOTE(review): confirm no other live BedTool still needs them.
    return df

def generate_karyotype(bed_file):
    '''
    Placeholder: not implemented yet, so a call does nothing and returns None.
    Inputs:
        bed_file: currently unused
    '''
    # TODO: sketch left in place -- open bed_file for reading and a karyotype
    # file for writing, then close both:
    #f = open(bed_file,'r')
    #g = open(karyotype_file,'w')
    #f.close(); g.close()
    return None



In [17]:
# MB275 (hg38 karyotype)
# Each call writes <mode>.txt beside the karyotype file; only the last
# (exon) table is left in `t` and displayed.
bed = '/mnt/c/Users/ochapman/Documents/circos/MB275/MB275_ecDNA.hg38.karyotype'
genes = None  # None: keep every overlapping feature
for annotation_mode in ('genes', 'exons'):
    t = generate_annotations(bed, annot='hg38', mode=annotation_mode, genes=genes)
t

Unnamed: 0,circos_chr,start,end,name
0,1,15561187,15561334,NBAS_1
1,1,15558579,15558634,NBAS_2
2,1,15556782,15556819,NBAS_3
3,1,15554060,15554138,NBAS_4
4,1,15553425,15553473,NBAS_5
...,...,...,...,...
232,1,16295061,16295120,AC010745.5_5
233,1,16294444,16294505,AC010745.5_6
234,1,16319395,16319566,AC010745.4_1
235,1,16316323,16316892,AC010745.4_2


In [8]:
# MB248
# Each call writes <mode>.txt beside the karyotype file; only the last
# (exon) table is left in `t` and displayed.
bed = '/mnt/c/Users/ochapman/Documents/circos/MB248/MB248_ecDNA.karyotype'
genes = None  # None: keep every overlapping feature
for annotation_mode in ('genes', 'exons'):
    t = generate_annotations(bed, annot='hg38', mode=annotation_mode, genes=genes)
t


Unnamed: 0,circos_chr,start,end,name
0,1,126877671,126877790,PCAT1_5
1,1,127006554,127006618,PCAT1_6
2,1,127292811,127292862,PCAT1_7
3,1,126877671,126877790,PCAT1_2
4,1,127006554,127006618,PCAT1_3
...,...,...,...,...
439,1,127890625,127890720,AC084123.1_1
440,1,127946558,127948723,TMEM75_1
441,1,127960632,127960695,MIR1205_1
442,1,127999130,127999294,RNU1-106P_1


In [7]:
# MB268
# Each call writes <mode>.txt beside the karyotype file; only the last
# (exon) table is left in `t` and displayed.
bed = '/mnt/c/Users/ochapman/Documents/circos/MB268/MB268_ecDNA.karyotype'
genes = None  # None: keep every overlapping feature
# Optional gene filter (disabled):
#genes = ['CCDC26','MYC','PVT1','LINC00824','PCAT1','POU5F1B','CCAT2','CD96','TMEM260','OTX2-AS1','OTX2']
for annotation_mode in ('genes', 'exons'):
    t = generate_annotations(bed, annot='hg38', mode=annotation_mode, genes=genes)
t

Unnamed: 0,circos_chr,start,end,name
0,1,194350942,194351238,AL513348.1_1
1,1,194352282,194352426,AL513348.1_2
2,1,194488102,194488205,RNU6-983P_1
3,1,194718794,194719236,AL353072.1_1
4,1,194931129,194931310,AL353072.2_1
...,...,...,...,...
2985,3,205162922,205163006,DSTYK_3
2986,3,205162035,205162212,DSTYK_4
2987,3,205161257,205161387,DSTYK_5
2988,3,205160005,205160270,DSTYK_6


In [6]:
# MB106
# Each call writes <mode>.txt beside the karyotype file; only the last
# (exon) table is left in `t` and displayed.
bed = '/mnt/c/Users/ochapman/Documents/circos/MB106/MB106_ecDNA.karyotype'
genes = None  # None: keep every overlapping feature
# Optional gene filter (disabled):
#genes = ['CCDC26','MYC','PVT1','LINC00824','PCAT1','POU5F1B','CCAT2','CD96','TMEM260','OTX2-AS1','OTX2']
for annotation_mode in ('genes', 'exons'):
    t = generate_annotations(bed, annot='hg38', mode=annotation_mode, genes=genes)
t

Unnamed: 0,circos_chr,start,end,name
0,1,127006554,127006618,PCAT1_6
1,1,127292811,127292862,PCAT1_7
2,1,127006554,127006618,PCAT1_3
3,1,127013302,127013774,PCAT1_4
4,1,127049237,127049300,PCAT1_5
...,...,...,...,...
673,2,143468208,143468278,ZC3H3_8
674,2,143465716,143465848,ZC3H3_9
675,2,143440935,143441120,ZC3H3_10
676,2,143440040,143440363,ZC3H3_11


In [75]:
# case11, ecDNA 2
# Each call writes <mode>.txt beside the karyotype file; only the last
# (exon) table is left in `t` and displayed.
bed = '/mnt/c/Users/ochapman/Documents/circos/case11/case11_ecDNA_2.karyotype'
genes = None  # None: keep every overlapping feature
# Optional gene filter (disabled):
#genes = ['CCDC26','MYC','PVT1','LINC00824','PCAT1','POU5F1B','CCAT2','CD96','TMEM260','OTX2-AS1','OTX2']
for annotation_mode in ('genes', 'exons'):
    t = generate_annotations(bed, annot='hg38', mode=annotation_mode, genes=genes)
t

Unnamed: 0,circos_chr,start,end,name
0,1,1138188,1138257,C7orf50_1
1,1,1127256,1127386,C7orf50_2
2,1,1138217,1138257,C7orf50_1
3,1,1127256,1127386,C7orf50_2
4,1,1137314,1138247,C7orf50_1
...,...,...,...,...
111,22,47978828,47978917,CDK5RAP3_11
112,22,47980592,47980798,CDK5RAP3_12
113,22,47981162,47981334,CDK5RAP3_13
114,22,47981436,47981781,CDK5RAP3_14


In [73]:
# case11, ecDNA 1
# Each call writes <mode>.txt beside the karyotype file; only the last
# (exon) table is left in `t` and displayed.
# NOTE(review): this karyotype sits in the same directory as case11_ecDNA_2,
# so both cells write to the same genes.txt / exons.txt -- whichever runs last wins.
bed = '/mnt/c/Users/ochapman/Documents/circos/case11/case11_ecDNA_1.karyotype'
genes = None  # None: keep every overlapping feature
# Optional gene filter (disabled):
#genes = ['CCDC26','MYC','PVT1','LINC00824','PCAT1','POU5F1B','CCAT2','CD96','TMEM260','OTX2-AS1','OTX2']
for annotation_mode in ('genes', 'exons'):
    t = generate_annotations(bed, annot='hg38', mode=annotation_mode, genes=genes)
t

Unnamed: 0,circos_chr,start,end,name
0,1,86547077,86547278,CLCA4_1
1,1,86559931,86560072,CLCA4_2
2,1,86560210,86560358,CLCA4_3
3,1,86563660,86563769,CLCA4_4
4,1,86565273,86565451,CLCA4_5
...,...,...,...,...
460,2,95477338,95479356,LINC01761_4
461,2,95510071,95510362,LINC02607_1
462,2,95514150,95518255,LINC02607_2
463,2,95510166,95510362,LINC02607_1


In [4]:
# D458
# Each call writes <mode>.txt beside the karyotype file; only the last
# (exon) table is left in `t` and displayed.
bed = '/mnt/c/Users/ochapman/Documents/circos/D458/D458_ecDNA.karyotype'
# Alternative karyotype (disabled):
#bed = '/mnt/c/Users/ochapman/Documents/circos/D458/1.3/D458_ecDNA_1.6.karyotype'
genes = None  # None: keep every overlapping feature
# Optional gene filter (disabled):
#genes = ['CCDC26','MYC','PVT1','LINC00824','PCAT1','POU5F1B','CCAT2','CD96','TMEM260','OTX2-AS1','OTX2']
for annotation_mode in ('genes', 'exons'):
    t = generate_annotations(bed, annot='hg38', mode=annotation_mode, genes=genes)
t

Unnamed: 0,circos_chr,start,end,name
0,2,56810375,56810448,OTX2_1
1,2,56810158,56810222,OTX2_2
2,2,56805359,56805575,OTX2_3
3,2,56804187,56804339,OTX2_4
4,2,56799904,56802355,OTX2_5
...,...,...,...,...
27,1,56725889,56725987,AL161757.5_2
28,1,56723136,56723308,AL161757.5_3
29,1,56683420,56683741,AL161757.1_1
30,1,56759378,56759565,AL161757.3_1


In [58]:
# SKPNDW (hg19 annotation, restricted to a gene subset)
# Each call writes <mode>.txt beside the karyotype file; only the last
# (exon) table is left in `t` and displayed.
bed = '/mnt/c/Users/ochapman/Documents/circos/SKPNDW/SKPNDW_ecDNA.karyotype'
# NOTE(review): 'LINC00976' is listed twice -- harmless for the membership
# test, but check whether a different gene was intended.
genes = ['CCDC26','MYC','PVT1','LINC00824','MTRF1LP2','LINC00976','LINC00976']
for annotation_mode in ('genes', 'exons'):
    t = generate_annotations(bed, annot='hg19', mode=annotation_mode, genes=genes)
t

Unnamed: 0,circos_chr,start,end,name
28,2,128747679,128748869,MYC_1
29,2,128750493,128751265,MYC_2
30,2,128752643,128752723,MYC_3
31,2,128747718,128748063,MYC_1
32,2,128747764,128748082,MYC_1
...,...,...,...,...
82,15,130694092,130694359,CCDC26_3
83,15,130695817,130695925,CCDC26_4
92,15,130740989,130741553,MTRF1LP2_1
93,15,130742036,130742257,MTRF1LP2_2


In [72]:
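# Make annotation BED files (one-time prep: uncomment to regenerate the *_exons.bed references read by generate_annotations; use mode='genes' for the *_genes.bed files)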
#hg19_gff3='/mnt/c/Users/ochapman/Documents/circos/genes/gencode.v34lift37.annotation.gff3'
#t = bed_features_from_gff3(hg19_gff3,mode='exons')
#hg38_gff3='/mnt/c/Users/ochapman/Documents/circos/genes/gencode.v33.basic.annotation.gff3'
#t = bed_features_from_gff3(hg38_gff3,mode='exons')
#t

Unnamed: 0,chr,start,end,name
2,chr1,11868,12227,DDX11L1_1
3,chr1,12612,12721,DDX11L1_2
4,chr1,13220,14409,DDX11L1_3
6,chr1,12009,12057,DDX11L1_1
7,chr1,12178,12227,DDX11L1_2
...,...,...,...,...
1770112,chrM,14148,14673,MT-ND6_1
1770117,chrM,14673,14742,MT-TE_1
1770120,chrM,14746,15887,MT-CYB_1
1770125,chrM,15887,15953,MT-TT_1
