In [1]:
# library dependencies
import pandas as pd
from pathlib import Path
import re

In [2]:
# location of the stage-1 pre-processed table, relative to this notebook
datafile = '../data/preproc_stage1.csv'

# warn (without aborting) when the input table is not where we expect it
input_missing = not Path(datafile).is_file()
if input_missing:
    print('Data file not found!')

In [3]:
# Load the stage-1 table from `datafile` into a dataframe. The bare `df` on the
# last line makes the notebook render the frame as this cell's output.
# NOTE(review): the 'Unnamed: 0' column in the rendered output is presumably the
# row index that the previous stage wrote into the CSV - confirm before relying on it.
df = pd.read_csv(datafile)
df

Unnamed: 0,EnsemblTranscriptID,Adrenal_PTR,Appendices_PTR,Brain_PTR,Colon_PTR,Duodenum_PTR,Endometrium_PTR,Esophagus_PTR,Fallopiantube_PTR,Fat_PTR,...,Spleen_PTR,Stomach_PTR,Testis_PTR,Thyroid_PTR,Tonsil_PTR,Urinarybladder_PTR,bed_files,fa_files,bed,fa
0,ENST00000263100,,8.277,,,,,,7.841,,...,7.313,,,,,,1,1,../../GENCODE43/protein_coding/BED6__protein_c...,../../GENCODE43/protein_coding/FA_protein_codi...
1,ENST00000373993,,,,5.135,5.371,,,,,...,,5.8143,,,,,1,1,../../GENCODE43/protein_coding/BED6__protein_c...,../../GENCODE43/protein_coding/FA_protein_codi...
2,ENST00000318602,6.290,6.328,5.948,5.811,6.068,5.383,5.881,6.119,6.410,...,5.136,6.5349,5.820,6.060,5.675,5.8286,1,1,../../GENCODE43/protein_coding/BED6__protein_c...,../../GENCODE43/protein_coding/FA_protein_codi...
3,ENST00000299698,,,3.995,,,,4.129,,,...,,,2.350,,5.249,,1,1,../../GENCODE43/protein_coding/BED6__protein_c...,../../GENCODE43/protein_coding/FA_protein_codi...
4,ENST00000401850,3.843,4.601,,,,,4.013,3.683,,...,,4.0613,4.832,,,4.2430,1,1,../../GENCODE43/protein_coding/BED6__protein_c...,../../GENCODE43/protein_coding/FA_protein_codi...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11560,ENST00000374888,,,,,,,,,,...,,,,,4.681,,1,1,../../GENCODE43/protein_coding/BED6__protein_c...,../../GENCODE43/protein_coding/FA_protein_codi...
11561,ENST00000294353,4.461,5.013,5.047,4.566,5.184,4.826,5.102,4.670,5.756,...,4.255,4.0412,5.389,4.250,4.439,4.1460,1,1,../../GENCODE43/protein_coding/BED6__protein_c...,../../GENCODE43/protein_coding/FA_protein_codi...
11562,ENST00000322764,5.664,5.524,5.478,5.915,5.811,5.817,5.943,5.509,4.931,...,6.159,5.8846,5.582,5.598,5.968,5.3358,1,1,../../GENCODE43/protein_coding/BED6__protein_c...,../../GENCODE43/protein_coding/FA_protein_codi...
11563,ENST00000381638,5.112,4.918,5.139,5.190,5.442,5.602,4.715,4.956,5.033,...,5.110,5.0834,5.047,5.038,5.130,5.0619,1,1,../../GENCODE43/protein_coding/BED6__protein_c...,../../GENCODE43/protein_coding/FA_protein_codi...


In [4]:
# number of rows that have no BED file reference
df['bed'].isna().sum()

0

In [5]:
# look up the row(s) of one specific transcript
df[df['EnsemblTranscriptID'].eq('ENST00000263094')]

Unnamed: 0,EnsemblTranscriptID,Adrenal_PTR,Appendices_PTR,Brain_PTR,Colon_PTR,Duodenum_PTR,Endometrium_PTR,Esophagus_PTR,Fallopiantube_PTR,Fat_PTR,...,Spleen_PTR,Stomach_PTR,Testis_PTR,Thyroid_PTR,Tonsil_PTR,Urinarybladder_PTR,bed_files,fa_files,bed,fa
28,ENST00000263094,3.807,,,,2.98,,,,,...,2.24,3.0781,,,,,1,1,../../GENCODE43/protein_coding/BED6__protein_c...,../../GENCODE43/protein_coding/FA_protein_codi...


In [6]:
# Inspect the BED file of a single transcript to check the expected file layout:
# three tab-separated lines (5UTR, CDS, 3UTR) with six columns each.
tid = 'ENST00000263100'
bed_file = Path(df.loc[df['EnsemblTranscriptID'] == tid, 'bed'].values[0])

# Start from None so that a region missing from the file is printed as None
# below, instead of raising a NameError (fresh kernel) or silently showing a
# value left over from an earlier run of this cell.
UTR5_start = UTR5_stop = None
CDS_start = CDS_stop = None
UTR3_start = UTR3_stop = None

with bed_file.open() as f:
    line_count = 0
    for line in f:
        # `line` still carries its newline, hence the blank lines in the output
        print('DEBUG', line)
        line_count += 1
        data = line.split('\t')
        if len(data) != 6:
            print(tid, 'BED data column missing')
            break
        # column 4 holds the region label, columns 2 and 3 its start/stop
        if data[3] == '5UTR':
            UTR5_start = data[1]
            UTR5_stop = data[2]
        elif data[3] == 'CDS':
            CDS_start = data[1]
            CDS_stop = data[2]
        elif data[3] == '3UTR':
            UTR3_start = data[1]
            UTR3_stop = data[2]
        else:
            print(tid, 'Unknown transcript region')

    if line_count != 3:
        print(tid, 'BED line missing')

print(UTR5_start, UTR5_stop, CDS_start, CDS_stop, UTR3_start, UTR3_stop)

DEBUG ENST00000263100.8	0	55	5UTR	.	+

DEBUG ENST00000263100.8	55	1543	CDS	.	+

DEBUG ENST00000263100.8	1543	3382	3UTR	.	+

0 55 55 1543 1543 3382


In [50]:
# read BED files and store the region coordinates in df

# BED file structure: one tab-separated line with six columns per region;
# columns 2-4 are start, stop and region label
# ENST00000000233.10	0	88	5UTR	.	+
# ENST00000000233.10	88	631	CDS	.	+
# ENST00000000233.10	631	1032	3UTR	.	+

# new dataframe columns, in the order
# '5UTR start', '5UTR stop', 'CDS start', 'CDS stop', '3UTR start', '3UTR stop'
region_names = ('5UTR', 'CDS', '3UTR')
region_columns = [name + ' ' + edge for name in region_names for edge in ('start', 'stop')]

# one dict of coordinates per dataframe row, collected in row order
region_rows = []

# Loop over all rows. Each row is paired directly with its own BED path, which
# avoids re-scanning the whole dataframe for every transcript ID.
for tid, bed_path in zip(df['EnsemblTranscriptID'], df['bed']):
    bed_file = Path(bed_path)

    # sanity check: rows without a readable BED file keep None in all new columns
    if not bed_file.exists():
        print(tid, 'BED file does not exist!', bed_file)
        region_rows.append(dict.fromkeys(region_columns))
        continue

    # -1 marks a region that is absent from the BED file. ALL regions (CDS
    # included) are reset for every transcript so that no coordinate can leak
    # over from the previously processed file.
    coords = dict.fromkeys(region_columns, -1)
    linecount = 0

    # read the file and extract the UTR and CDS coordinates
    with bed_file.open() as f:
        for line in f:
            data = line.split('\t')
            if len(data) != 6:
                print(tid, 'BED data column missing.', bed_file)
                break
            region = data[3]
            if region not in region_names:
                print(tid, 'Unknown transcript region.', bed_file)
                break
            # coordinates are kept as read (strings); a later cell converts
            # the columns to int
            coords[region + ' start'] = data[1]
            coords[region + ' stop'] = data[2]
            linecount += 1

    region_rows.append(coords)

    if linecount != 3:
        print(tid, 'BED line count off.', bed_file)

# store the collected coordinates in the dataframe, one column at a time
for column in region_columns:
    df[column] = [row[column] for row in region_rows]

ENST00000360265 BED line count off. ../../GENCODE43/protein_coding/BED6__protein_coding_strict/ENST00000360265.9.bed
ENST00000439040 BED line count off. ../../GENCODE43/protein_coding/BED6__protein_coding_strict/ENST00000439040.6.bed
ENST00000369733 BED line count off. ../../GENCODE43/protein_coding/BED6__protein_coding_strict/ENST00000369733.8.bed
ENST00000612273 BED line count off. ../../GENCODE43/protein_coding/BED6__protein_coding_strict/ENST00000612273.2.bed
ENST00000452319 BED line count off. ../../GENCODE43/protein_coding/BED6__protein_coding_strict/ENST00000452319.6.bed
ENST00000468623 BED line count off. ../../GENCODE43/protein_coding/BED6__protein_coding_strict/ENST00000468623.6.bed
ENST00000602569 BED line count off. ../../GENCODE43/protein_coding/BED6__protein_coding_strict/ENST00000602569.2.bed
ENST00000612619 BED line count off. ../../GENCODE43/protein_coding/BED6__protein_coding_strict/ENST00000612619.2.bed
ENST00000298622 BED line count off. ../../GENCODE43/protein_codi

Manual inspection of the above-mentioned BED files shows that not all transcripts have both UTR regions included.

In [51]:
# convert the coordinate columns to int (was necessary because the comparison
# below failed otherwise)
for column in ['5UTR start', '5UTR stop',
               'CDS start', 'CDS stop',
               '3UTR start', '3UTR stop']:
    df[column] = df[column].astype(int)

In [52]:
# show what the amended dataframe looks like: the six start/stop columns for
# 5UTR, CDS and 3UTR now appear at the right-hand end of the table
df

Unnamed: 0,EnsemblTranscriptID,Adrenal_PTR,Appendices_PTR,Brain_PTR,Colon_PTR,Duodenum_PTR,Endometrium_PTR,Esophagus_PTR,Fallopiantube_PTR,Fat_PTR,...,bed_files,fa_files,bed,fa,5UTR start,5UTR stop,CDS start,CDS stop,3UTR start,3UTR stop
0,ENST00000263100,,8.277,,,,,,7.841,,...,1,1,../../GENCODE43/protein_coding/BED6__protein_c...,../../GENCODE43/protein_coding/FA_protein_codi...,0,55,55,1543,1543,3382
1,ENST00000373993,,,,5.135,5.371,,,,,...,1,1,../../GENCODE43/protein_coding/BED6__protein_c...,../../GENCODE43/protein_coding/FA_protein_codi...,0,92,92,1877,1877,2044
2,ENST00000318602,6.290,6.328,5.948,5.811,6.068,5.383,5.881,6.119,6.410,...,1,1,../../GENCODE43/protein_coding/BED6__protein_c...,../../GENCODE43/protein_coding/FA_protein_codi...,0,70,70,4495,4495,4610
3,ENST00000299698,,,3.995,,,,4.129,,,...,1,1,../../GENCODE43/protein_coding/BED6__protein_c...,../../GENCODE43/protein_coding/FA_protein_codi...,0,31,31,4396,4396,5127
4,ENST00000401850,3.843,4.601,,,,,4.013,3.683,,...,1,1,../../GENCODE43/protein_coding/BED6__protein_c...,../../GENCODE43/protein_coding/FA_protein_codi...,0,490,490,1552,1552,2321
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11560,ENST00000374888,,,,,,,,,,...,1,1,../../GENCODE43/protein_coding/BED6__protein_c...,../../GENCODE43/protein_coding/FA_protein_codi...,0,38,38,2450,2450,5467
11561,ENST00000294353,4.461,5.013,5.047,4.566,5.184,4.826,5.102,4.670,5.756,...,1,1,../../GENCODE43/protein_coding/BED6__protein_c...,../../GENCODE43/protein_coding/FA_protein_codi...,0,201,201,2436,2436,8143
11562,ENST00000322764,5.664,5.524,5.478,5.915,5.811,5.817,5.943,5.509,4.931,...,1,1,../../GENCODE43/protein_coding/BED6__protein_c...,../../GENCODE43/protein_coding/FA_protein_codi...,0,80,80,1799,1799,2228
11563,ENST00000381638,5.112,4.918,5.139,5.190,5.442,5.602,4.715,4.956,5.033,...,1,1,../../GENCODE43/protein_coding/BED6__protein_c...,../../GENCODE43/protein_coding/FA_protein_codi...,0,135,135,9021,9021,11466


In [53]:
# keep only the transcripts that lack a 5'UTR or a 3'UTR (marked with -1)
df[df[['5UTR start', '3UTR start']].eq(-1).any(axis=1)]

Unnamed: 0,EnsemblTranscriptID,Adrenal_PTR,Appendices_PTR,Brain_PTR,Colon_PTR,Duodenum_PTR,Endometrium_PTR,Esophagus_PTR,Fallopiantube_PTR,Fat_PTR,...,bed_files,fa_files,bed,fa,5UTR start,5UTR stop,CDS start,CDS stop,3UTR start,3UTR stop
286,ENST00000360265,4.511,3.539,4.331,4.45,4.362,5.159,4.521,4.855,5.139,...,1,1,../../GENCODE43/protein_coding/BED6__protein_c...,../../GENCODE43/protein_coding/FA_protein_codi...,-1,-1,0,2193,2193,7244
851,ENST00000439040,4.067,2.584,3.836,3.046,4.028,4.073,4.007,4.017,1.683,...,1,1,../../GENCODE43/protein_coding/BED6__protein_c...,../../GENCODE43/protein_coding/FA_protein_codi...,0,404,404,4085,-1,-1
2195,ENST00000369733,,,,3.304,2.696,,3.075,,,...,1,1,../../GENCODE43/protein_coding/BED6__protein_c...,../../GENCODE43/protein_coding/FA_protein_codi...,-1,-1,0,4248,4248,5197
2217,ENST00000612273,6.559,6.092,5.206,7.293,6.509,6.288,6.682,7.069,6.243,...,1,1,../../GENCODE43/protein_coding/BED6__protein_c...,../../GENCODE43/protein_coding/FA_protein_codi...,-1,-1,0,1351,1351,1531
4099,ENST00000452319,,,,,,4.021,,,,...,1,1,../../GENCODE43/protein_coding/BED6__protein_c...,../../GENCODE43/protein_coding/FA_protein_codi...,-1,-1,0,4761,-1,-1
4158,ENST00000468623,5.96,4.974,6.611,6.144,6.065,5.607,5.679,5.745,5.531,...,1,1,../../GENCODE43/protein_coding/BED6__protein_c...,../../GENCODE43/protein_coding/FA_protein_codi...,-1,-1,0,984,-1,-1
4723,ENST00000602569,,,,,4.319,3.581,,,3.887,...,1,1,../../GENCODE43/protein_coding/BED6__protein_c...,../../GENCODE43/protein_coding/FA_protein_codi...,-1,-1,0,435,435,556
4780,ENST00000612619,3.805,3.872,3.14,,,,3.389,3.696,,...,1,1,../../GENCODE43/protein_coding/BED6__protein_c...,../../GENCODE43/protein_coding/FA_protein_codi...,-1,-1,0,2499,-1,-1
4976,ENST00000298622,,,4.842,,,,,,,...,1,1,../../GENCODE43/protein_coding/BED6__protein_c...,../../GENCODE43/protein_coding/FA_protein_codi...,-1,-1,0,2535,2535,6493
5128,ENST00000370378,5.685,,5.136,5.434,,5.372,5.103,4.156,5.228,...,1,1,../../GENCODE43/protein_coding/BED6__protein_c...,../../GENCODE43/protein_coding/FA_protein_codi...,-1,-1,0,4230,4230,4622


In [61]:
# read Fasta files and store the transcript sequences in the dataframe

# one sequence per dataframe row, collected in row order; rows whose Fasta file
# is missing or malformed keep an empty string
transcripts = []

maxlength = 0

# Loop over all rows. Each row is paired directly with its own Fasta path, which
# avoids re-scanning the whole dataframe for every transcript ID.
for tid, fa_path in zip(df['EnsemblTranscriptID'], df['fa']):
    # get Fasta file name
    fa_file = Path(fa_path)
    transcript = ''

    # sanity check
    if not fa_file.exists():
        print(tid, 'Fasta file does not exist!', fa_file)
    else:
        # read the file: a label line followed by the sequence line
        with fa_file.open() as f:
            # read the first line which should contain a label
            label = f.readline()
            # match the transcript ID and transcript length; the ID is escaped
            # so that it is always matched literally
            m = re.match(r'>' + re.escape(tid) + r'\.\d+:0-(?P<trans_length>\d+)', label)
            if m:
                # store the max transcript length
                maxlength = max(maxlength, int(m.group('trans_length')))
                # Read the second line, which is the transcript. readline()
                # keeps the line terminator, so strip it to store the bare
                # sequence.
                # NOTE(review): assumes the whole sequence sits on one line -
                # confirm the Fasta files are not line-wrapped.
                transcript = f.readline().strip()
            else:
                print(tid, 'Fasta file format error.', fa_file)

    transcripts.append(transcript)

df['transcript'] = transcripts

print('longest transcript is', maxlength, 'bp')

longest transcript is 109224 bp


In [63]:
# remove the file-count and file-reference columns, they are no longer needed
df.drop(columns=['bed_files', 'fa_files', 'bed', 'fa'], inplace=True)

In [64]:
# show the current dataframe: the file-reference columns are gone and the
# 'transcript' column holds the sequence read from the Fasta files
df

Unnamed: 0,EnsemblTranscriptID,Adrenal_PTR,Appendices_PTR,Brain_PTR,Colon_PTR,Duodenum_PTR,Endometrium_PTR,Esophagus_PTR,Fallopiantube_PTR,Fat_PTR,...,Thyroid_PTR,Tonsil_PTR,Urinarybladder_PTR,5UTR start,5UTR stop,CDS start,CDS stop,3UTR start,3UTR stop,transcript
0,ENST00000263100,,8.277,,,,,,7.841,,...,,,,0,55,55,1543,1543,3382,ATTGCTGCAGACGCTCACCCCAGACACTCACTGCACCGGAGTGAGC...
1,ENST00000373993,,,,5.135,5.371,,,,,...,,,,0,92,92,1877,1877,2044,ATAATCAAGGAAACCTTTTCCGGGTGGGGATCTCTGAAATTACTCA...
2,ENST00000318602,6.290,6.328,5.948,5.811,6.068,5.383,5.881,6.119,6.410,...,6.060,5.675,5.8286,0,70,70,4495,4495,4610,GGGACCAGATGGATTGTAGGGAGTAGGGTACAATACAGTCTGTTCT...
3,ENST00000299698,,,3.995,,,,4.129,,,...,,5.249,,0,31,31,4396,4396,5127,GACCCTGGAAAAATCTGTCTCACCCACAAAGATGTGGGCTCAGCTC...
4,ENST00000401850,3.843,4.601,,,,,4.013,3.683,,...,,,4.2430,0,490,490,1552,1552,2321,TGCACTTCTGTGCCTCAATTTCCTCATCTGTAGGGTGGGGGTGGTG...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11560,ENST00000374888,,,,,,,,,,...,,4.681,,0,38,38,2450,2450,5467,CTCGGCTCTGGTTCCAGCCGAGCCTCTCGGACGCAGAGATGGAAAT...
11561,ENST00000294353,4.461,5.013,5.047,4.566,5.184,4.826,5.102,4.670,5.756,...,4.250,4.439,4.1460,0,201,201,2436,2436,8143,GGAGTCTGCGCTCTGGTTCGGGCTGCGGCTGCGGCTGCGGCTGCGG...
11562,ENST00000322764,5.664,5.524,5.478,5.915,5.811,5.817,5.943,5.509,4.931,...,5.598,5.968,5.3358,0,80,80,1799,1799,2228,GCAGAGTCTGCGGACCCGGCGCCGAGGCGGCCACCCGAGACGCGGC...
11563,ENST00000381638,5.112,4.918,5.139,5.190,5.442,5.602,4.715,4.956,5.033,...,5.038,5.130,5.0619,0,135,135,9021,9021,11466,AGGAAGCCGGAAGCCGCAGGGGCCGCCGTCGTCTCCTCCGCGTCCC...


In [65]:
# save the stage-2 pre-processed table; the dataframe index is not written
datafile = '../data/preproc_stage2.csv'
df.to_csv(path_or_buf=datafile, index=False)