# Merge driver coordinates into a single file

For each transcript of a driver gene associated to a protein-coding or non-coding gene, get its exons (CDS and UTRs in case of protein-coding genes), splice sites, introns, proximal and distal promoters. Then merge all coordinates from the total driver genes in a single file. 

In [1]:
import os, sys
os.environ["PATH"] = os.path.dirname(sys.executable) + os.pathsep + os.environ["PATH"]

import pandas as pd
import pybedtools

In [2]:
BUILD = 'hg38'
RELEASE = '2'
ge_directory = '../data'

# Protein-coding genes associated regions
cds_file = os.path.join(ge_directory, f'{BUILD}_cds.transcript_level.overlap.gz')
splice_file = os.path.join(ge_directory, f'{BUILD}_splice_sites.transcript_level.overlap.gz')
utr5_file = os.path.join(ge_directory, f'{BUILD}_5utr.transcript_level.overlap.gz')
utr3_file = os.path.join(ge_directory, f'{BUILD}_3utr.transcript_level.overlap.gz')
proximal_promoters_file = os.path.join(ge_directory,f'{BUILD}_proximal_promoters.transcript_level.overlap.gz')
distal_promoters_file = os.path.join(ge_directory,  f'{BUILD}_distal_promoters.transcript_level.overlap.gz')
introns_file = os.path.join(ge_directory, f'{BUILD}_introns.transcript_level.overlap.gz')

# lncRNA genes associated regions
lncrna_exons_file = os.path.join(ge_directory, f'{BUILD}_lncrna_exons.transcript_level.overlap.gz')
lncrna_splice_sites_file = os.path.join(ge_directory, f'{BUILD}_lncrna_splice_sites.transcript_level.overlap.gz')
lncrna_proximal_promoters_file = os.path.join(ge_directory, f'{BUILD}_lncrna_proximal_promoters.transcript_level.overlap.gz')
lncrna_distal_promoters_file = os.path.join(ge_directory, f'{BUILD}_lncrna_distal_promoters.transcript_level.overlap.gz')
lncrna_introns_file = os.path.join(ge_directory, f'{BUILD}_lncrna_introns.transcript_level.overlap.gz')

cancergenes_f = '../data/cancerdrivers.txt'

In [3]:
output_f = '../data/cancerdrivers_regions.tsv'

#### Load cancer genes

In [4]:
cancergenes_df = pd.read_csv(cancergenes_f, sep='\t', header=0)
cancergenes = set(cancergenes_df['SYMBOL'])
len(cancergenes)

782

#### Load regions from cancer genes in a dataframe

In [5]:
lines = []

In [6]:
# Load drivers associated with protein-coding genes
# Note that some genes do not have introns/splice sites
cancergenes_1 = [i for i in cancergenes if i != 'MALAT1']
for ge_file in [cds_file, splice_file, utr5_file, utr3_file, proximal_promoters_file, distal_promoters_file, introns_file]:

    ge_name = ge_file.split('/')[-1].split('.')[0].split('hg38_')[-1]
    ge_df = pd.read_csv(ge_file, sep='\t', header=0, low_memory=False)
    drivers_ge_df = ge_df.loc[ge_df['SYMBOL'].isin(cancergenes_1)].copy()
    lines.append(drivers_ge_df)
    print(f'{ge_name} loaded: {len(drivers_ge_df["SYMBOL"].unique())}')
    
    # Check length for each genomic element type
    drivers_ge_df['START'] = drivers_ge_df.apply(lambda x: x['START'] - 1, axis=1)
    drivers_ge_bed = pybedtools.BedTool.from_dataframe(drivers_ge_df)
    drivers_ge_bed_merged = drivers_ge_bed.merge()
    ge_merged = pd.read_csv(drivers_ge_bed_merged.fn, sep='\t', header=None)
    ge_merged.columns = ['CHR', 'START', 'END']
    ge_merged['LENGTH'] = ge_merged.apply(lambda x: x['END'] - x['START'], axis=1)
    print(ge_merged['LENGTH'].sum())

cds loaded: 781
2282620
splice_sites loaded: 765
654405
5utr loaded: 781
577942
3utr loaded: 781
2304406
proximal_promoters loaded: 781
1030460
distal_promoters loaded: 781
4036397
introns loaded: 765
93620830


In [7]:
# Load drivers associated to lncRNA genes
cancergenes_2 = ['MALAT1']
for ge_file in [lncrna_exons_file, lncrna_splice_sites_file, lncrna_proximal_promoters_file, lncrna_distal_promoters_file, lncrna_introns_file]:
    ge_name = ge_file.split('/')[-1].split('.')[0].split('hg38_')[-1]
    ge_df = pd.read_csv(ge_file, sep='\t', header=0, low_memory=False)
    drivers_ge_df = ge_df.loc[ge_df['SYMBOL'].isin(cancergenes_2)].copy()
    lines.append(drivers_ge_df)
    print(f'{ge_name} loaded')

lncrna_exons loaded
lncrna_splice_sites loaded
lncrna_proximal_promoters loaded
lncrna_distal_promoters loaded
lncrna_introns loaded


In [8]:
regions = pd.concat(lines)
len(regions['SYMBOL'].unique())

782

#### Merge coordinates

In [9]:
lines = []
for chromosome in [str(c) for c in list(range(1, 23)) + ['X', 'Y']]: 
    subset_data = regions.loc[regions['CHROMOSOME'] == chromosome]
    subset_data_sorted = subset_data.sort_values(by=['START', 'END'], ascending=[True, True], inplace=False)
    lines.append(subset_data_sorted)
regions_sorted = pd.concat(lines)

In [10]:
regions_sorted.head()

Unnamed: 0,CHROMOSOME,START,END,STRAND,GENE_ID,TRANSCRIPT_ID,SYMBOL
231,1,2226318,2228318,+,ENSG00000157933,ENST00000378536,SKI
231,1,2228118,2228518,+,ENSG00000157933,ENST00000378536,SKI
369,1,2228318,2228766,+,ENSG00000157933,ENST00000378536,SKI
1983,1,2228766,2229735,+,ENSG00000157933,ENST00000378536,SKI
3910,1,2229735,2229760,+,ENSG00000157933,ENST00000378536,SKI


In [11]:
regions = pd.concat(lines)
len(regions['SYMBOL'].unique())

782

In [12]:
regions['START'] = regions.apply(lambda x: x['START'] - 1, axis=1)    # Transform to BED
regions_bed = pybedtools.BedTool.from_dataframe(regions)
regions_bed_merged = regions_bed.merge()

In [13]:
regions_merged = pd.read_csv(regions_bed_merged.fn, sep='\t', header=None)
regions_merged.columns = ['CHR', 'START', 'END']
regions_merged['START'] = regions_merged.apply(lambda x: x['START'] + 1, axis=1)    # 1 based format
regions_merged.head()

Unnamed: 0,CHR,START,END
0,1,2226318,2310213
1,1,2553638,2565382
2,1,3067167,3438621
3,1,6185019,6211389
4,1,6783453,7769706


In [14]:
# Save
regions_merged.to_csv(output_f, sep='\t', index=False)

In [15]:
regions_merged['LENGTH'] = regions_merged.apply(lambda x: x['END'] - x['START'] + 1, axis=1)
print(regions_merged['LENGTH'].sum())

99105646
