In [1]:
import numpy as np
import pandas as pd
from scipy.stats import gmean
from tqdm import tqdm
import gffutils
import os.path
#from liftover import ChainFile
from collections import defaultdict
from math import ceil
import pyfastx
import re
from scipy.sparse.csgraph import connected_components
from glob import glob
from scipy.sparse import lil_matrix,csr_matrix,coo_matrix,dok_matrix, save_npz
import pickle
#
# Blacklisted genomic regions, read as a header-less tab-separated table, so the
# columns are labelled 0, 1, 2, ...  findLowCountJunctions reads column 0 as the
# chromosome name and columns 1 / 2 as the bounds of each region.
blacklist = pd.read_csv('/odinn/tmp/benediktj/Data/SplicePrediction-GTEX-V8/ENCFF220FIN.bed',header=None,sep='\t')



In [2]:
# Simple-repeat table, read as a header-less tab-separated file (integer column
# labels). findLowCountJunctions reads column 1 as the chromosome name and
# columns 2 / 3 as the bounds of each repeat.
simpileRepeats = pd.read_csv('/odinn/tmp/benediktj/Data/SplicePrediction-GTEX-V8/simpleRepeat.txt',header=None,sep='\t')

In [3]:
#gffutils.create_db('/odinn/tmp/benediktj/Data/SplicePrediction-GTEX/gencode.v42.annotation.gtf', "/odinn/tmp/benediktj/Data/SplicePrediction-GTEX/gencode.v42.annotation.db", force=True,disable_infer_genes=True, disable_infer_transcripts=True)

In [4]:
# Open an existing gffutils feature database for the GENCODE v42 annotation.
# NOTE(review): the commented-out create_db call in the previous cell writes to
# .../SplicePrediction-GTEX/ while this cell reads from .../SplicePrediction-GTEX-V8/
# -- confirm how this .db file was produced.
gtf_gencode = gffutils.FeatureDB('/odinn/tmp/benediktj/Data/SplicePrediction-GTEX-V8/gencode.v42.annotation.db')

In [5]:
# Per-tissue leafcutter count files.
# NOTE(review): `leafcutterFiles` is reassigned by a later cell (the
# clusters_*_summary.tab glob) before any visible use of this list, so this cell
# looks like dead code -- confirm and remove.
leafcutterFiles = glob('/nfs/odinn/users/solvir/GTEx/GTEx_Analysis_v8_sQTL_leafcutter_counts/*_perind_numers.counts.gz')

In [6]:
#junctions = pd.read_csv('/odinn/tmp/benediktj/Data/SplicePrediction-GTEX-V8/GTEx_Analysis_2017-06-05_v8_STARv2.5.3a_junctions.gct', skiprows=2,sep='\t')

In [7]:
def _splice_site_in_intervals(start, end, lows, highs):
    """Return True if `start` or `end` falls inside any closed interval [low, high].

    `lows` / `highs` are equally long numpy arrays of interval bounds. Only
    intervals that overlap the junction span (low < end and high > start) are
    considered, which matches the pre-filter of the original implementation.
    """
    overlapping = (lows < end) & (highs > start)
    start_inside = (lows <= start) & (start <= highs)
    end_inside = (lows <= end) & (end <= highs)
    return bool(np.any(overlapping & (start_inside | end_inside)))


def findLowCountJunctions(junctions,blacklist,simpileRepeats,min_individuals=4):
    """Flag junctions that should be discarded, together with the reason.

    Parameters
    ----------
    junctions : DataFrame
        Column 0 (also reachable as ``Name``) holds ``<chrom>_<start>_<end>``,
        column 1 a gene id with a ``.version`` suffix, and every further column
        the read counts of one sample. Sample column names are split on ``-``
        and the second field is used to pool samples before counting.
    blacklist : DataFrame
        Header-less table; column 0 = chromosome, columns 1 / 2 = region bounds.
    simpileRepeats : DataFrame
        Header-less table; column 1 = chromosome, columns 2 / 3 = region bounds.
    min_individuals : int, default 4
        A junction is kept only if at least this many pooled sample groups
        have a non-zero count.

    Returns
    -------
    (discardJunctionDict, discardReason) : tuple of defaultdict(bool)
        Both are keyed by ``<junction name>_<gene id without version>``. The
        first maps to True, the second to one of 'LowReadCount',
        'InBlacklistedRegion' or 'InRepeatRegion' (checked in that order).
        Junctions that pass every filter are absent from both.

    Notes
    -----
    A region only disqualifies a junction when one of the two splice sites
    lies inside it; a junction that merely spans a region is kept.
    """
    # Pool the per-sample counts by the second '-'-separated field of the
    # sample name, then count how many pooled groups saw the junction at all.
    sample_counts = junctions.iloc[:, 2:].copy()
    sample_counts.columns = [name.split('-')[1] for name in junctions.columns[2:]]
    per_individual = sample_counts.T.groupby(sample_counts.columns).sum().T
    # Positional numpy array: indexing it with the loop counter is safe even
    # when `junctions` does not carry a 0..n-1 index.
    includeJunction = ((per_individual > 0).sum(axis=1) >= min_individuals).to_numpy()

    discardJunctionDict = defaultdict(bool)
    discardReason = defaultdict(bool)

    names = junctions.Name.values
    coordinates = junctions.iloc[:, 0].values
    gene_ids = junctions.iloc[:, 1].values

    cached_chrom = None
    bl_low = bl_high = rep_low = rep_high = None

    for i, junction in tqdm(enumerate(names), total=len(names)):
        # rsplit keeps chromosome names that themselves contain '_' intact.
        chrom, start, end = coordinates[i].rsplit('_', 2)
        start, end = int(start), int(end)

        # Refresh BOTH per-chromosome tables as soon as the chromosome changes.
        # Previously the refresh was skipped whenever the first junction of a
        # chromosome was discarded early, leaving stale tables in use for the
        # rest of that chromosome.
        if chrom != cached_chrom:
            on_chrom = blacklist[0] == chrom
            bl_low = blacklist.loc[on_chrom, 1].to_numpy()
            bl_high = blacklist.loc[on_chrom, 2].to_numpy()
            on_chrom = simpileRepeats[1] == chrom
            rep_low = simpileRepeats.loc[on_chrom, 2].to_numpy()
            rep_high = simpileRepeats.loc[on_chrom, 3].to_numpy()
            cached_chrom = chrom

        if not includeJunction[i]:
            reason = 'LowReadCount'
        elif _splice_site_in_intervals(start, end, bl_low, bl_high):
            reason = 'InBlacklistedRegion'
        elif _splice_site_in_intervals(start, end, rep_low, rep_high):
            reason = 'InRepeatRegion'
        else:
            continue

        # The gene id is only parsed for discarded junctions, as before.
        key = junction + '_' + gene_ids[i].split('.')[0]
        discardJunctionDict[key] = True
        discardReason[key] = reason

    return discardJunctionDict,discardReason

In [8]:
#discardJunctionDict,discardReason = findLowCountJunctions(junctions,blacklist,simpileRepeats)

In [8]:
# Load the cached outputs of findLowCountJunctions instead of recomputing them
# (the call itself and the matching pickle.dump cells are commented out).
# NOTE: pickle.load can execute arbitrary code contained in the file -- only
# load pickles that were produced by this notebook.
with open('/odinn/tmp/benediktj/Data/SplicePrediction-GTEX-V8/discardJunctions.pkl', 'rb') as f:
    discardJunctionDict = pickle.load(f)

with open('/odinn/tmp/benediktj/Data/SplicePrediction-GTEX-V8/discardReason.pkl', 'rb') as f:
     discardReason = pickle.load(f)


In [9]:
#with open('/odinn/tmp/benediktj/Data/SplicePrediction-GTEX-V8/discardJunctions.pkl', 'wb') as f:
#    pickle.dump(discardJunctionDict, f)

In [10]:
#with open('/odinn/tmp/benediktj/Data/SplicePrediction-GTEX-V8/discardReason.pkl', 'wb') as f:
#    pickle.dump(discardReason, f)

In [11]:
# Collect every per-chromosome leafcutter cluster summary and index the paths
# by the chromosome token in the file name (clusters_<chrm>_summary.tab).
chrmToLeafcutterFiles = defaultdict(list)
leafcutterFiles = glob('/odinn/tmp/bjarnih/RNA/leafCutter/GTEx/*/meta_results/clusters_*_summary.tab')
for file in leafcutterFiles:
    basename = file.rsplit('/', 1)[-1]
    # Second '_'-separated field of the file name is the chromosome.
    chrm = basename.split('_')[1]
    chrmToLeafcutterFiles[chrm].append(file)

In [12]:
def getCombinedLeafCutterDF(chrm,chrmToLeafcutterFiles,rna_junctions=None):
    """Merge the leafcutter cluster summaries of one chromosome with RNA junctions.

    Parameters
    ----------
    chrm : str
        Chromosome name used as key into `chrmToLeafcutterFiles` and to filter
        the ``Chrom`` column of `rna_junctions`.
    chrmToLeafcutterFiles : mapping of str -> list of str
        Paths of tab-separated summary files per chromosome. Each file must
        provide the columns ``splice_event_id``, ``Chrom``, ``Start``, ``End``,
        ``Gene_id`` and ``ClusterID``.
    rna_junctions : DataFrame, optional
        Extra junctions to prepend (rows with ``Chrom == chrm`` are used).
        Defaults to the notebook-level ``rna_splice_junctions`` frame, which
        keeps existing two-argument calls working.

    Returns
    -------
    DataFrame
        Columns ``Chrom, Start, End, Gene_id, ClusterID``, sorted by ``Start``
        and carrying a fresh positional index. Rows whose ``Gene_id`` lists
        several comma-separated ids are expanded into one row per id. An empty
        frame with those columns is returned when nothing is available.

    Notes
    -----
    Events are de-duplicated on ``splice_event_id`` across all files. The
    previous implementation raised when the chromosome had no files and only
    de-duplicated when more than one file was read.
    """
    columns = ['Chrom','Start','End','Gene_id','ClusterID']
    if rna_junctions is None:
        # Backward-compatible fallback to the notebook-level frame.
        rna_junctions = rna_splice_junctions

    files = chrmToLeafcutterFiles.get(chrm, [])
    # Read everything first and concatenate once instead of growing a frame
    # inside the loop.
    frames = [pd.read_csv(path, sep='\t') for path in files]

    parts = [rna_junctions[rna_junctions['Chrom']==chrm]]
    if frames:
        events = pd.concat(frames, axis=0).drop_duplicates('splice_event_id')[columns]
        # One output row per gene id; str() conversion mirrors the original
        # handling of missing ids (they become the literal string 'nan').
        events = events.assign(Gene_id=events['Gene_id'].astype(str).str.split(','))
        parts.append(events.explode('Gene_id'))

    # Skip empty parts so that concat does not have to guess dtypes from them.
    parts = [part for part in parts if len(part) > 0]
    if not parts:
        return pd.DataFrame(columns=columns)

    combined = pd.concat(parts, axis=0, ignore_index=True)
    return combined.sort_values('Start')

In [13]:
# Filtered RNA-seq junction table; the 'Startx' column is exposed as 'Start'.
# NOTE(review): this reads from the ...-050623 directory while `data_dir`
# (outputs) points at ...-070623 -- confirm that is intended.
df2 = (
    pd.read_csv('/odinn/tmp/benediktj/Data/SplicePrediction-rnasplice-blood-050623/filtered_junctions.gor', sep='\t')
    .rename(columns={'Startx':'Start'})
)
# Shift the end coordinate by one.
# NOTE(review): presumably aligns the coordinate convention with the
# leafcutter / GENCODE junction ends used later -- confirm.
df2['End'] += 1

In [14]:
#df1 = getCombinedLeafCutterDF('chr1',chrmToLeafcutterFiles)

In [15]:
def cluster_splice_junctions(splice_junctions):
    """Greedily group junctions that share a start or an end coordinate.

    Junctions are processed in input order. Each junction joins the
    lowest-numbered existing cluster that already contains a member with the
    same start (``junction[0]``) or the same end (``junction[1]``); if there is
    none, it opens a new cluster.

    Parameters
    ----------
    splice_junctions : iterable of (start, end) pairs
        Rows of a 2-column array or any indexable pairs with hashable items.

    Returns
    -------
    clusters : list of list
        The junction objects per cluster, in assignment order.
    labels : list of int
        Cluster index of every input junction, in input order.

    Notes
    -----
    Existing clusters are never merged: a junction that links two clusters is
    simply added to the lower-numbered one, so the result depends on the input
    order and is not a connected-components partition. This behaviour is kept
    unchanged; only the lookup was changed from scanning every member of every
    cluster to two dictionary lookups per junction.
    """
    clusters = []
    labels = []
    # Lowest cluster index that contains a member with a given start / end.
    first_cluster_by_start = {}
    first_cluster_by_end = {}

    for junction in splice_junctions:
        start, end = junction[0], junction[1]
        candidates = [
            idx
            for idx in (first_cluster_by_start.get(start), first_cluster_by_end.get(end))
            if idx is not None
        ]

        if candidates:
            label = min(candidates)
            clusters[label].append(junction)
        else:
            label = len(clusters)
            clusters.append([junction])
        labels.append(label)

        # Keep the *lowest* cluster index per coordinate so that later lookups
        # pick the same cluster a front-to-back scan would have found first.
        if first_cluster_by_start.get(start, label + 1) > label:
            first_cluster_by_start[start] = label
        if first_cluster_by_end.get(end, label + 1) > label:
            first_cluster_by_end[end] = label

    return clusters, labels

In [16]:
# Reference genome handle. Later cells use it for chromosome lengths
# (len(fasta[chrom])) and to slice out gene sequences (fasta[chrom][a:b].seq).
fasta = pyfastx.Fasta('/odinn/tmp/benediktj/Data/SplicePrediction-GTEX-V8/GRCh38.p13.genome.fa')

In [17]:
# Output directory for the annotation file, sparse sequence matrices and labels.
data_dir = '/odinn/tmp/benediktj/Data/SplicePrediction-rnasplice-blood-070623/'

# Chromosomes for which a sequence matrix is allocated.
CHROM_GROUP = ['chr1', 'chr3', 'chr5', 'chr7', 'chr9',
               'chr11', 'chr13', 'chr15', 'chr17', 'chr19', 'chr21',
               'chr2', 'chr4', 'chr6', 'chr8', 'chr10', 'chr12',
               'chr14', 'chr16', 'chr18', 'chr20', 'chr22', 'chrX', 'chrY']

# One sparse (chromosome length x 5) int8 matrix per chromosome; rows are
# filled with one-hot encoded bases by the transcript loop further down.
# (A stray copy of the allocation statement that followed the loop and merely
# re-created the last chromosome's empty matrix has been removed.)
seqData = {}
for chrom in CHROM_GROUP:
    seqData[chrom] = dok_matrix((len(fasta[chrom]), 5), dtype=np.int8)

In [18]:
# Build a gene_name -> gene_id lookup (gene_id without its '.version' suffix).
# Unknown names resolve to None via the defaultdict factory. If several gene
# records share a gene_name, the record iterated last wins.
gene_name_to_id = defaultdict(lambda: None)
genes = gtf_gencode.features_of_type('gene')
for gene in tqdm(genes):
    versioned_id = gene['gene_id'][0]
    gene_name_to_id[gene['gene_name'][0]] = versioned_id.partition('.')[0]

62696it [00:51, 1223.86it/s]


In [19]:
# Translate each junction's gene name into a version-less gene id. Names that
# are not in gene_name_to_id yield None (the defaultdict factory returns None;
# every such lookup also inserts the missing name into the dict).
df2['Gene_id'] = df2['filtered__Gene_name'].apply(lambda x: gene_name_to_id[x])

In [20]:
# Keep only junctions whose gene name could be resolved to a gene id.
# .loc + .copy() yields an independent frame, so the later
# `rna_splice_junctions['ClusterID'] = ...` assignment does not write into a
# slice of df2 (which triggers pandas' SettingWithCopyWarning).
rna_splice_junctions = df2.loc[df2['Gene_id'].apply(lambda x: x is not None), ['Chrom','Start','End','Gene_id']].copy()
#clusters, labels = cluster_splice_junctions(df1.sort_values('ClusterID').iloc[:10,1:3].values)
#print(labels)
#rna_splice_junctions['ClusterID'] = rna_splice_junctions['Gene_id']+'_rna_blood'

In [62]:
# Cluster the junctions of every gene and build one cluster id per ROW of
# rna_splice_junctions, in row order.
#
# The ids are assigned to the frame positionally in a later cell. The previous
# version emitted them grouped by gene (all rows of the first gene, then the
# second, ...), which only lines up with the rows when each gene's rows are
# contiguous; interleaved genes ended up with each other's ids. Writing each
# id to its row position removes that assumption.
# NOTE: a cluster_ids.pkl produced by the previous version may contain
# misaligned ids and should be regenerated.
junction_coords = rna_splice_junctions[['Start','End']].values
cluster_id = [None] * len(rna_splice_junctions)
# GroupBy.indices maps each gene to the integer row positions of its rows.
rows_by_gene = rna_splice_junctions.groupby('Gene_id', sort=False).indices
for gene, row_positions in tqdm(rows_by_gene.items()):
    clusters, labels = cluster_splice_junctions(junction_coords[row_positions])
    for row, label in zip(row_positions, labels):
        cluster_id[row] = f'rna_blood_{gene}_{label}'

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 14972/14972 [05:04<00:00, 49.20it/s]


In [70]:
#with open(data_dir+'cluster_ids.pkl', 'wb') as f:
#    pickle.dump(cluster_id, f)

In [21]:
# Load the cached cluster ids instead of recomputing them (the matching
# pickle.dump cell is commented out). The list is assigned to
# rna_splice_junctions positionally in the next cell, so it must have been
# built from the same rows in the same order.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open(data_dir+'cluster_ids.pkl', 'rb') as f:
    cluster_id = pickle.load(f)

In [22]:
# Positional assignment: cluster_id[i] becomes the ClusterID of the i-th row,
# so the list must have exactly one entry per row, in row order.
rna_splice_junctions['ClusterID'] = cluster_id

In [23]:
# Byte -> integer code lookup: A, C, G, T map to 1, 2, 3, 4; every other
# character maps to 5 ("missing").
_BASE_CODE_LOOKUP = np.full(256, 5, dtype=np.int64)
for _base, _code in (('A', 1), ('C', 2), ('G', 3), ('T', 4)):
    _BASE_CODE_LOOKUP[ord(_base)] = _code


def create_datapoints(seq, strand, tx_start, tx_end):
    """One-hot encode a nucleotide sequence.

    The sequence is upper-cased and mapped to integer codes (A, C, G, T ->
    1, 2, 3, 4; anything else -> 5), which are then passed to
    ``one_hot_encode``.

    Parameters
    ----------
    seq : str
        Nucleotide sequence; case is ignored.
    strand, tx_start, tx_end
        Accepted for interface compatibility but not used: the sequence is
        encoded exactly as given (no reverse-complementing, no cropping).

    Returns
    -------
    Whatever ``one_hot_encode`` returns for the integer codes -- one row per
    character of `seq`.
    """
    # 'replace' turns any non-ASCII character into '?', which then maps to 5
    # like every other non-ACGT character.
    raw = np.frombuffer(seq.upper().encode('ascii', 'replace'), dtype=np.uint8)
    X0 = _BASE_CODE_LOOKUP[raw]
    return one_hot_encode(X0)

def ceil_div(x, y):
    """Return ceil(x / y) as an int.

    Integer arguments are handled with exact integer arithmetic, so the result
    stays correct for values too large to be represented exactly as a float.
    Any other argument types fall back to the float-based computation.
    Raises ZeroDivisionError when `y` is zero.
    """
    if isinstance(x, int) and isinstance(y, int):
        # -(-x // y) is the exact ceiling of x / y for integers.
        return -(-x // y)
    return int(ceil(float(x)/y))


# One-hot lookup table for integer-coded sequence input: row 0 (all zeros) is
# padding and rows 1, 2, 3, 4, 5 are A, C, G, T and Missing respectively.
IN_MAP = np.vstack((np.zeros((1, 5), dtype=int),
                    np.eye(5, dtype=int)))

# 4 x 3 lookup table: rows 0-2 are one-hot, row 3 is all zeros.
# NOTE(review): not referenced in the visible part of this notebook;
# presumably the label encoding used by the training code -- confirm.
OUT_MAP = np.vstack((np.eye(3, dtype=int),
                     np.zeros((1, 3), dtype=int)))

def one_hot_encode(Xd):
    """Return the IN_MAP rows selected by the integer codes in `Xd`.

    `Xd` is cast to int8 and used as a row index into IN_MAP, so the result
    has the shape of `Xd` plus a trailing axis of length 5.
    """
    row_index = Xd.astype('int8')
    return IN_MAP[row_index]

def getJunctions(gtf,transcript,strand):
    """Return the intron boundaries of a transcript in ascending genomic order.

    Parameters
    ----------
    gtf
        Object exposing ``children(transcript, featuretype="exon")`` that
        yields the transcript's exon records.
    transcript
        Transcript record handed through to ``gtf.children``.
    strand : str
        '+' or '-'. Any other value yields an empty result.

    Returns
    -------
    numpy.ndarray
        One row ``(previous exon end, next exon start)`` per intron, read from
        fields 3 and 4 of each exon record. Empty when the transcript has
        fewer than two exons.

    Notes
    -----
    Exons are sorted by coordinate here instead of relying on the order in
    which the database returns them (the old code assumed ascending order for
    '+' and descending order for '-'). For input in that assumed order the
    result is unchanged.
    """
    if strand not in ('+', '-'):
        return np.array([])

    exon_bounds = sorted(
        (int(exon[3]), int(exon[4]))
        for exon in gtf.children(transcript, featuretype="exon")
    )

    # Pair every exon with its successor: the intron runs from the end of the
    # former to the start of the latter.
    intron_junctions = [
        (prev_end, next_start)
        for (_, prev_end), (next_start, _) in zip(exon_bounds, exon_bounds[1:])
    ]
    return np.array(intron_junctions)

In [24]:
# Main labelling pass over every transcript of the annotation.
#
# For each canonical, protein-coding transcript with level < 3 that has at
# least one intron and at least one leafcutter / RNA junction for its gene:
#   * start from the annotated intron boundaries,
#   * add the boundaries of every junction that belongs to a cluster touching
#     an annotated boundary, unless the junction is listed in
#     discardJunctionDict,
#   * store the resulting boundary dicts in gene_to_label[gene_id],
#   * if save_seq: one-hot encode the gene sequence into seqData[chrom],
#     append one line to annotation_GTEX_v8.txt, and write the previous
#     chromosome's matrix to disk once a gene on a new chromosome is saved.
#
# NOTE: the annotation file is opened in append mode, so re-running this cell
# appends duplicate lines unless the file is removed first.
transcripts = gtf_gencode.features_of_type('transcript')
gene_to_label = {}
save_seq = True
prev_chrom = 'chr1'
# Chromosome the currently loaded leafcutter table belongs to. Tracked
# separately from prev_chrom (which only advances when a gene is saved) so the
# table is read once per chromosome instead of once per transcript.
leaf_cutter_chrom = prev_chrom
leaf_cutter_junctions = getCombinedLeafCutterDF(prev_chrom,chrmToLeafcutterFiles)
# (transcript_id, error) pairs of transcripts that were skipped because
# processing raised; inspect after the loop instead of losing them silently.
failed_transcripts = []
for transcript in tqdm(transcripts):
    attrs = transcript[8]
    chrom, gene_start, gene_end, strand = transcript[0], transcript[3], transcript[4], transcript[6]
    gene_id = attrs['gene_id'][0]
    transcript_id = attrs['transcript_id'][0]
    gene_type = attrs['gene_type'][0]
    gene_name = attrs['gene_name'][0]
    level = attrs['level'][0]

    try:
        # Transcripts without any 'tag' attribute are simply not canonical.
        try:
            tags = attrs['tag']
        except KeyError:
            tags = ()

        if not ('Ensembl_canonical' in tags and gene_type=='protein_coding' and int(level)<3):
            continue

        intron_junctions = getJunctions(gtf_gencode,transcript,strand)
        if len(intron_junctions) == 0:
            continue

        if chrom != leaf_cutter_chrom:
            leaf_cutter_junctions = getCombinedLeafCutterDF(chrom,chrmToLeafcutterFiles)
            leaf_cutter_chrom = chrom

        # defaultdict(int): looking up an unseen position inserts it with 0;
        # those placeholder entries are dropped again further down.
        junction_starts = defaultdict(int)
        junction_ends = defaultdict(int)
        for junction in intron_junctions:
            junction_starts[junction[0]] = 1
            junction_ends[junction[1]] = 1

        simple_gene_id = gene_id.split('.')[0]
        alt_junctions = leaf_cutter_junctions[leaf_cutter_junctions['Gene_id']==simple_gene_id]
        if alt_junctions.shape[0] == 0:
            continue

        # Clusters that share at least one boundary with the annotation.
        clusters = defaultdict(int)
        for pos, cluster in zip(alt_junctions['Start'], alt_junctions['ClusterID']):
            if junction_starts[pos] == 1:
                clusters[cluster] = 1
        for pos, cluster in zip(alt_junctions['End'], alt_junctions['ClusterID']):
            if junction_ends[pos] == 1:
                clusters[cluster] = 1

        # Add every non-discarded junction of those clusters.
        for cluster in clusters.keys():
            cluster_junctions = alt_junctions[alt_junctions['ClusterID']==cluster]
            for start, end in zip(cluster_junctions['Start'], cluster_junctions['End']):
                junction_id = '{}_{}_{}_{}'.format(chrom,start,end,simple_gene_id)
                if not discardJunctionDict[junction_id]:
                    junction_starts[start] = 1
                    junction_ends[end] = 1

        junction_starts = {k:v for k,v in junction_starts.items() if v != 0}
        junction_ends = {k:v for k,v in junction_ends.items() if v != 0}
        gene_to_label[gene_id] = [junction_starts, junction_ends]

        if save_seq:
            seq = fasta[chrom][int(gene_start)-1:int(gene_end)].seq
            X = create_datapoints(seq, strand, gene_start, gene_end)
            seqData[chrom][int(gene_start)-1:int(gene_end)] = X
            jn_start = list(junction_starts.keys())
            jn_end = list(junction_ends.keys())
            name = '{}\t{}\t{}\t{}'.format(gene_name,gene_id,transcript_id,level)

            if strand in ('+', '-'):
                # On the minus strand the two boundary columns are written in
                # swapped order.
                first_col, second_col = (jn_start, jn_end) if strand=='+' else (jn_end, jn_start)
                with open('{}/annotation_GTEX_v8.txt'.format(data_dir), 'a') as the_file:
                    the_file.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(name,chrom,strand,gene_start,gene_end,','.join([str(x) for x in first_col]),','.join([str(x) for x in second_col])))

            if chrom!=prev_chrom:
                # First saved gene on a new chromosome: flush the finished one.
                save_npz('{}/sparse_sequence_data/{}.npz'.format(data_dir,prev_chrom), seqData[prev_chrom].tocoo())
                del seqData[prev_chrom]

            prev_chrom = chrom

    except Exception as err:
        # Best effort, as before: skip the transcript and carry on, but keep a
        # record and let KeyboardInterrupt / SystemExit propagate.
        failed_transcripts.append((transcript_id, repr(err)))
        #print(gene[2])
        #print(gene[8]['transcript_support_level'])

252416it [5:43:30, 12.25it/s]  


In [25]:
# The transcript loop only writes a chromosome's matrix when it saves the first
# gene of a different chromosome, so the chromosome it finished on (still held
# in prev_chrom) is written here.
save_npz('{}/sparse_sequence_data/{}.npz'.format(data_dir,prev_chrom), seqData[prev_chrom].tocoo())
#del seqData[prev_chrom]

In [26]:
# Persist the labels: gene_to_label maps each (versioned) gene_id to
# [junction_starts, junction_ends], two dicts keyed by position with value 1.
with open('{}/gene_to_label.pickle'.format(data_dir), 'wb') as handle:
    pickle.dump(gene_to_label, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
SAMD11	ENSG00000187634.13	ENST00000616016.5	2	chr1	+	923923	944574	924948,926013,930336,931089,935896,939129,939412,941306,942251,942488,943058,943377,943808,925189,925477,925524,925765,925800,928751,936934,938950,939460,940462,941048,941861,943453	925922,930155,931039,935772,939040,939275,941144,942136,942410,942559,943253,943698,943908,925898,931946,936154,936850,936853,939236,939272,939505,942173,942436,943893,943916
NOC2L	ENSG00000188976.11	ENST00000327044.7	2	chr1	-	944203	959256	945057,945518,946173,946402,948131,948490,951127,952000,952412,953175,953782,954004,955923,956095,956894,957099,958929,959215,944693,945042,945323,946147,946757,954464,955638,958721,959179	944800,945146,945653,946286,946545,948232,948603,951238,952139,952600,953288,953892,954082,956013,956215,957025,957273,959081,944550,944819,945422,946497,946610,946839,946864,948577,948591,948594,952127,952535,954523,954814,955477,955982,959136
KLHL17	ENSG00000187961.15	ENST00000338591.8	2	chr1	+	960584	965719	960800,961552,961750,962047,962471,962917,963253,963504,964008,964180,964530,960921,961729,961980,962244,962615,962618,962775,962913,963209,964167,964563	961293,961629,961826,962355,962704,963109,963337,963920,964107,964349,964963,961981,962286,962139,962314,962616,963032,963857,964437,964935
PLEKHN1	ENSG00000187583.11	ENST00000379410.8	2	chr1	+	966482	975865	966614,966803,970423,970601,970758,971006,971208,971404,972150,972424,973010,973326,973640,974051,974364,967010,970396,970760,971219,971593,971607,971899	966704,970277,970521,970686,970879,971113,971324,972075,972288,972861,973186,973500,973833,974316,974442,970182,966914,967218,971077,971330,971327,972264,972895,974295,974414
PERM1	ENSG00000187642.10	ENST00000433179.4	2	chr1	-	975198	982093	976499,978881,982065,976642,979056,978464,978289,981288,981970	976269,976624,981173,976558,978615,981047,981224
HES4	ENSG00000188290.11	ENST00000304952.11	2	chr1	-	998964	1000097	999526,999692,999866,999533	999432,999613,999787
ISG15	ENSG00000187608.10	ENST00000649529.1	2	chr1	+	1013497	1014540	1013576,981877,982487,982666,983450,995680,999457,1001263,1008279,1013769	1013984,989590,995531,982598,982594,982883,985755,1008194,1008470,1008571
AGRN	ENSG00000188157.15	ENST00000379370.7	2	chr1	+	1020120	1056116	1020373,1022462,1035324,1040880,1041397,1041702,1042162,1043457,1043732,1044023,1044257,1044439,1045277,1045523,1045876,1046088,1046265,1046735,1046957,1047454,1047687,1047895,1048365,1049059,1049451,1049795,1050037,1050329,1050591,1050837,1051369,1051645,1051815,1053977,1054551,1017465,1024832,1031899,1034703,1034966,1039063,1042597,1042748,1042846,1043435,1047036,1047162,1049995,1051043,1051094,1052016,1052573,1053493,1055215	1022201,1035277,1040665,1041173,1041478,1041956,1043239,1043538,1043823,1044109,1044334,1045161,1045359,1045733,1045964,1046160,1046397,1046820,1047327,1047573,1047776,1048012,1048867,1049236,1049566,1049903,1050233,1050427,1050726,1051253,1051453,1051728,1053753,1054448,1054824,1031785,1022633,1032232,1041161,1041207,1041961,1043244,1042691,1043331,1043871,1047063,1047503,1047782,1047823,1050732,1051032,1051027,1051526,1051746,1051742,1051993,1053461,1053725,1052731,1052823,1054051,1055066,1055377
RNF223	ENSG00000237330.3	ENST00000453464.3	2	chr1	-	1070967	1074306	1074016,1069307,1068561,1067798,1066566,1066474,1066746,1066791	1072575,1065833,1066598,1066666,1066798,1067579,1067979,1068016,1068040,1072397,1072589
C1orf159	ENSG00000131591.18	ENST00000421241.7	2	chr1	-	1081823	1116089	1084353,1084481,1085878,1087139,1087502,1090353,1091472,1091991,1116060,1083294,1083915,1083442,1084383,1084492,1085251,1087498,1088864,1090876,1091278,1105956,1111674,1115022,1092566,1106650,1095077,1093598,1112009,1104686,1113353,1099718,1099793,1106659,1112907	1082987,1084383,1084506,1086012,1087204,1087597,1090428,1091565,1092103,1082982,1083418,1083441,1083625,1084011,1084016,1084032,1084071,1084086,1085310,1085963,1086336,1090022,1090587,1090983,1091374,1091609,1092293,1095124,1096611,1099638,1106049,1111821,1112193,1115075


In [None]:
SAMD11	ENSG00000187634.13	ENST00000616016.5	2	chr1	+	923923	944574	924948,926013,930336,931089,935896,939129,939412,941306,942251,942488,943058,943377,943808,925189,925477,925524,925765,925800,928751,936934,938950,939460,940462,941048,941861,943453	925922,930155,931039,935772,939040,939275,941144,942136,942410,942559,943253,943698,943908,925898,931946,936154,936853,936850,939272,939236,939505,942173,942436,943893,943916
NOC2L	ENSG00000188976.11	ENST00000327044.7	2	chr1	-	944203	959256	945057,945518,946173,946402,948131,948490,951127,952000,952412,953175,953782,954004,955923,956095,956894,957099,958929,959215,944693,945042,945323,946147,946757,954464,955638,958721,959179	944800,945146,945653,946286,946545,948232,948603,951238,952139,952600,953288,953892,954082,956013,956215,957025,957273,959081,944550,944819,945422,946497,946610,946839,946864,948577,948591,948594,952127,952535,954523,954814,955477,955982,959136
KLHL17	ENSG00000187961.15	ENST00000338591.8	2	chr1	+	960584	965719	960800,961552,961750,962047,962471,962917,963253,963504,964008,964180,964530,960921,961729,961980,962244,962615,962618,962775,962913,963209,964167,964563	961293,961629,961826,962355,962704,963109,963337,963920,964107,964349,964963,961981,962286,962139,962314,962616,963032,963857,964437,964935
PLEKHN1	ENSG00000187583.11	ENST00000379410.8	2	chr1	+	966482	975865	966614,966803,970423,970601,970758,971006,971208,971404,972150,972424,973010,973326,973640,974051,974364,967010,970396,970760,971219,971593,971607,971899	966704,970277,970521,970686,970879,971113,971324,972075,972288,972861,973186,973500,973833,974316,974442,967218,966914,970182,971077,971327,971330,972264,972895,974295,974414
PERM1	ENSG00000187642.10	ENST00000433179.4	2	chr1	-	975198	982093	976499,978881,982065,976642,979056,978289,978464,981288,981970	976269,976624,981173,976558,978615,981047,981224
HES4	ENSG00000188290.11	ENST00000304952.11	2	chr1	-	998964	1000097	999526,999692,999866,999137,999533	999432,999613,999787,999085
ISG15	ENSG00000187608.10	ENST00000649529.1	2	chr1	+	1013497	1014540	1013576,981877,982487,982666,983450,995680,999457,1001263,1008279,1013769	1013984,989590,982594,982598,995531,985755,1008194,982883,1008571,1008470
AGRN	ENSG00000188157.15	ENST00000379370.7	2	chr1	+	1020120	1056116	1020373,1022462,1035324,1040880,1041397,1041702,1042162,1043457,1043732,1044023,1044257,1044439,1045277,1045523,1045876,1046088,1046265,1046735,1046957,1047454,1047687,1047895,1048365,1049059,1049451,1049795,1050037,1050329,1050591,1050837,1051369,1051645,1051815,1053977,1054551,1017465,1024832,1031899,1034703,1034966,1039063,1042597,1042748,1042846,1043435,1047036,1047162,1049995,1051043,1051094,1052016,1052573,1053192,1053493,1055215	1022201,1035277,1040665,1041173,1041478,1041956,1043239,1043538,1043823,1044109,1044334,1045161,1045359,1045733,1045964,1046160,1046397,1046820,1047327,1047573,1047776,1048012,1048867,1049236,1049566,1049903,1050233,1050427,1050726,1051253,1051453,1051728,1053753,1054448,1054824,1031785,1022633,1032232,1041207,1041161,1041961,1042691,1043244,1043331,1043871,1047503,1047063,1047782,1047823,1050732,1051032,1051027,1051526,1051746,1051742,1051993,1053725,1053461,1052731,1052823,1053244,1054051,1055377,1055066
RNF223	ENSG00000237330.3	ENST00000453464.3	2	chr1	-	1070967	1074306	1074016	1072575,1072397,1072589
C1orf159	ENSG00000131591.18	ENST00000421241.7	2	chr1	-	1081823	1116089	1084353,1084481,1085878,1087139,1087502,1090353,1091472,1091991,1116060,1082604,1083915,1083294,1083442,1084492,1084383,1085251,1087498,1088864,1090876,1091278,1111674,1115022,1105956,1092566,1095077,1106650,1112009,1113353,1093598,1104686,1099334,1099177,1099639,1099560,1099718,1099793,1099870,1099868,1106659,1109139,1112907	1082987,1084383,1084506,1086012,1087204,1087597,1090428,1091565,1092103,1082253,1082982,1083418,1083441,1083625,1084011,1084016,1084032,1084071,1084086,1085310,1085963,1086336,1090022,1090587,1090983,1091374,1091609,1092293,1095124,1096611,1099103,1099407,1099484,1099561,1099638,1099717,1099792,1106049,1109107,1111821,1112193,1115075