In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl

Data source: https://regulondb.ccg.unam.mx/menu/download/datasets/index.jsp

## Transcription Units

In [2]:
# Transcription-unit table; comment='#' makes pandas ignore everything from
# '#' to the end of a line, and the column names are supplied explicitly.
fname = '../data/TUSet.txt'
df_TU = pd.read_csv(
    fname,
    sep='\t',
    comment='#',
    names=[
        'regulondb_id', 'TU_name', 'operon_name', 'genes',
        'promoter', 'evidence', 'confidence',
    ],
)
df_TU.head()

Unnamed: 0,regulondb_id,TU_name,operon_name,genes,promoter,evidence,confidence
0,ECK125302594,C0293,C0293,C0293,C0293p,[LTED|S|Length of transcript experimentally de...,Strong
1,ECK120021182,aaeR,aaeR,aaeR,aaeRp,[AISGDTU|W|Automated inference that a single-g...,Weak
2,ECK120027580,aaeXAB,aaeXAB,"aaeX,aaeA,aaeB",aaeXp,[ICWHO|W|Inferred computationally without huma...,Weak
3,ECK120027435,aas-lplT,aas-lplT,"aas,lplT",,[ICWHO|W|Inferred computationally without huma...,Weak
4,ECK120026783,aat,aat,aat,,[ICWHO|W|Inferred computationally without huma...,Weak


In [3]:
# Number of transcription-unit rows that were loaded.
df_TU.shape[0]

3692

In [4]:
# Expand each transcription unit into one row per gene it contains.
# The rows are collected in a list and the frame is built once:
# DataFrame.append was removed in pandas 2.0, and it copied the whole
# frame on every call, which made the original loop quadratic.
gene_records = [
    {'gene': gene, 'TU': tu_name, 'promoter': promoter}
    for tu_name, genes, promoter in zip(df_TU['TU_name'], df_TU['genes'], df_TU['promoter'])
    for gene in genes.split(',')  # 'genes' holds a comma-separated list of gene names
]
# Passing `columns` keeps the column order fixed even when there are no records.
df_all = pd.DataFrame(gene_records, columns=['gene', 'TU', 'promoter'])

## Coding sequences

In [5]:
# Gene table with coordinates, strand, product and sequence columns;
# comment='#' makes pandas ignore everything from '#' to the end of a line.
fname = '../data/Gene_sequence.txt'
df_cds = pd.read_csv(
    fname,
    sep='\t',
    comment='#',
    names=[
        'id', 'name', 'left_coord', 'right_coord', 'DNA_strand',
        'product_type', 'product_name', 'start_codon', 'stop_codon',
        'seq', 'bnumber', 'other_ids', 'random',
    ],
)
df_cds.head()

Unnamed: 0,id,name,left_coord,right_coord,DNA_strand,product_type,product_name,start_codon,stop_codon,seq,bnumber,other_ids,random
0,ECK120001251,thrL(b0001),190.0,255.0,forward,-,<i>thr</i> operon leader peptide,ATG,TGA,ATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCA...,b0001,,
1,ECK120000987,thrA(b0002),337.0,2799.0,forward,-,fused aspartate kinase/homoserine dehydrogenase 1,ATG,TGA,ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAAC...,b0002,,
2,ECK120000988,thrB(b0003),2801.0,3733.0,forward,-,homoserine kinase,ATG,TAA,ATGGTTAAAGTTTATGCCCCGGCTTCCAGTGCCAATATGAGCGTCG...,b0003,,
3,ECK120000989,thrC(b0004),3734.0,5020.0,forward,-,threonine synthase,ATG,TAA,ATGAAACTCTACAATCTGAAAGATCACAACGAGCAGGTCAGCTTTG...,b0004,,
4,ECK120002701,yaaX(b0005),5234.0,5530.0,forward,-,DUF2502 domain-containing protein YaaX,GTG,TAA,GTGAAAAAGATGCAATCTATCGTACTCGCACTTTCCCTGGTTCTGG...,b0005,,


In [6]:
# Number of gene records with a non-missing strand value (count() ignores NaN).
df_cds['DNA_strand'].count()

4696

In [7]:
def append_gene_name(row):
    """Return the bare gene name from a row's 'name' field.

    The 'name' value looks like 'thrL(b0001)'; everything from the first
    '(' onwards is discarded. A value without '(' is returned unchanged.
    """
    bare_name, _, _ = row['name'].partition('(')
    return bare_name

In [8]:
# Derive the bare gene name for every row; it is the merge key used later.
df_cds['gene'] = df_cds.apply(append_gene_name, axis='columns')

In [10]:
# Keep only the columns needed downstream: merge key, sequence, left coordinate.
df_cds_small = df_cds.loc[:, ['gene', 'seq', 'left_coord']]

In [11]:
# Left merge keeps every (gene, TU, promoter) row even when no sequence matches.
df_all = df_all.merge(
    right=df_cds_small,
    how='left',
    on='gene',
)

In [12]:
# Preview the first five rows after attaching the gene sequences.
df_all.head()

Unnamed: 0,gene,TU,promoter,seq,left_coord
0,C0293,C0293,C0293p,TACTCACCCGGGACTCGCCAGGGGACAGCCAACAGGCATTGGGTGC...,1196711.0
1,aaeR,aaeR,aaeRp,ATGGAACGACTAAAACGCATGTCGGTGTTTGCCAAAGTAGTTGAAT...,3389520.0
2,aaeX,aaeXAB,aaeXp,ATGAGTCTGTTTCCCGTTATCGTGGTGTTTGGGCTGTCCTTCCCAC...,3389134.0
3,aaeA,aaeXAB,aaeXp,GTGAAAACACTAATAAGAAAATTCTCCCGTACGGCCATCACGGTCG...,3388194.0
4,aaeB,aaeXAB,aaeXp,ATGGGTATTTTCTCCATTGCTAACCAACATATTCGCTTTGCGGTAA...,3386221.0


In [13]:
# Row count after the sequence merge.
df_all.shape[0]

6579

## UTRs

In [14]:
# UTR table, one row per TU/promoter combination with 5' and 3' UTR sequences;
# comment='#' makes pandas ignore everything from '#' to the end of a line.
fname = '../data/UTR_5_3_sequence.txt'
df_utr = pd.read_csv(
    fname,
    sep='\t',
    comment='#',
    names=[
        'operon', 'TU', 'promoter', 'TSS', 'TU_DNA',
        'TU_firstgene', 'TU_lastgene', 'terminator',
        'coord', 'coord_5UTR', '5UTR', 'coord_3UTR', '3UTR',
    ],
)
df_utr.head()

Unnamed: 0,operon,TU,promoter,TSS,TU_DNA,TU_firstgene,TU_lastgene,terminator,coord,coord_5UTR,5UTR,coord_3UTR,3UTR
0,thrLABC,thrLABC,thrLp,148.0,forward,"thrL(190,255)","thrC(3734,5020)","rho-independent(5032,5086)",148-5086,148-190,ATAGCGCACAGACAGATAAAAATTACAGAGTACACAACATCCA,5020-5086,AAATCTATTCATTATCTCAATCAGGCCGGGTTTGCTTTTATGCAGC...
1,thrLABC,thrL,thrLp,148.0,forward,"thrL(190,255)","thrL(190,255)","rho-independent(274,310)",148-310,148-190,ATAGCGCACAGACAGATAAAAATTACAGAGTACACAACATCCA,255-310,ACGCGTACAGGAAACACAGAAAAAAGCCCGCACCTGACAGTGCGGG...
2,talB,talB,talBp,8191.0,forward,"talB(8238,9191)","talB(8238,9191)","rho-independent(9199,9226)",8191-9226,8191-8238,AGACCGGTTACATCCCCCTAACAAGCTGTTTAAAGAGAAATACTATCA,9191-9226,ATCATTCTTAGCGTGACCGGGAAGTCGGTCACGCTA
3,mbiA,htgA,htgAp2,10643.0,forward,"mbiA(10830,11315)","mbiA(10830,11315)",,10643-11315,10643-10830,TCAGACCTGAGTGGCGCTAACCATCCGGCGCAGGCAGGCGATTTGC...,,
4,mbiA,htgA,htgAp1,10644.0,forward,"mbiA(10830,11315)","mbiA(10830,11315)",,10644-11315,10644-10830,CAGACCTGAGTGGCGCTAACCATCCGGCGCAGGCAGGCGATTTGCA...,,


In [15]:
# Keep the two merge keys plus the UTR sequences.
df_utr_small = df_utr.loc[:, ['TU', 'promoter', '5UTR', '3UTR']]

In [16]:
# Attach UTR sequences by matching both the transcription unit and its promoter.
df_all = df_all.merge(
    right=df_utr_small,
    how='left',
    on=['TU', 'promoter'],
)

## Promoters

In [17]:
# Promoter table with strand, position, sigma factor and promoter sequence;
# comment='#' makes pandas ignore everything from '#' to the end of a line.
fname = '../data/PromoterSet.txt'
df_promoter = pd.read_csv(
    fname,
    sep='\t',
    comment='#',
    names=[
        'regulondb_id', 'promoter', 'dna_strand', 'position',
        'sigma', 'promoter_seq', 'evidence', 'confidence',
    ],
)
df_promoter.head()

Unnamed: 0,regulondb_id,promoter,dna_strand,position,sigma,promoter_seq,evidence,confidence
0,ECK125302590,C0293p,forward,1196711,,tatgaattaccactccttacacccgctcaaatattgttaaattgcc...,[TIM|S|Transcription initiation mapping],Strong
1,ECK125137714,aaeBp3,reverse,3388263,Sigma24,caaccagcaagagaacatctggcctgcgggcaccactgctacagtg...,[ICWHO|W|Inferred computationally without huma...,Weak
2,ECK125137715,aaeBp7,reverse,3388236,Sigma32,gggcaccactgctacagtggtggtcactggcaaacaagatcgcgac...,[ICWHO|W|Inferred computationally without huma...,Weak
3,ECK125137716,aaeBp8,reverse,3388258,Sigma24,agcaagagaacatctggcctgcgggcaccactgctacagtggtggt...,[ICWHO|W|Inferred computationally without huma...,Weak
4,ECK125135056,aaeRp,forward,3389496,,agattgatatttaatatattagcgtaactgttatgctgttatctat...,[HTTIM|S|High-throughput transcription initiat...,Strong


In [18]:
# Keep only the merge key and the promoter sequence.
df_promoter_small = df_promoter.loc[:, ['promoter', 'promoter_seq']]

In [19]:
# Attach the promoter sequence by promoter name.
df_all = df_all.merge(
    right=df_promoter_small,
    how='left',
    on='promoter',
)

In [20]:
# Keep only rows where every column is present; a gene missing any one of its
# annotations (sequence, coordinate, either UTR, promoter sequence) is dropped.
# Reassigning instead of inplace=True keeps the data lineage explicit.
df_all = df_all.dropna(how='any')

In [21]:
# Preview the first five fully annotated rows.
df_all.head()

Unnamed: 0,gene,TU,promoter,seq,left_coord,5UTR,3UTR,promoter_seq
19,accB,accBC,accBp2,ATGGATATTCGTAAGATTAAAAAACTGATCGAGCTGGTTGAAGAAT...,3405436.0,CTCGTCCTCCCTGACGCAGTTTTTGCGCTGCGGAAAAGGTGACATT...,AGACTGCTAAAGCGTCAAAAGGCCGGATTTTCCGGCC,cttccctgataagaccagtatttagctgccaattgctacgaaatcg...
20,accC,accBC,accBp2,ATGCTGGATAAAATTGTTATTGCCAACCGCGGCGAGATTGCATTGC...,3405917.0,CTCGTCCTCCCTGACGCAGTTTTTGCGCTGCGGAAAAGGTGACATT...,AGACTGCTAAAGCGTCAAAAGGCCGGATTTTCCGGCC,cttccctgataagaccagtatttagctgccaattgctacgaaatcg...
23,accD,accD,accDp,ATGAGCTGGATTGAACGAATTAAAAGCAACATTACTCCCACCCGCA...,2433012.0,CATTCATGGTCTGTTGGGGGCAAAAATGGCATTATGCGTCCCCAAA...,ATAACTGATAAGGGCAGGGCCACTGGCTCTGCCCTTTTGCTATTCT...,actccgcggttcgaccacttttttatccaaagtttcgggctgttat...
41,acrE,acrEF,acrEp,ATGACGAAACATGCCAGGTTTTTCCTCCTGCCCTCCTTTATTCTGA...,3413864.0,TCTTTTTTTGTGTTTATGTGCCTTGAGATGCCTGTATTCATAACTA...,AATCAGAAACATAAAGGCGCTTTCGGGTGCCTTTATTATTT,ttacgaggttttaattctgcctctttcaacccgcgtcaaaataaaa...
42,acrF,acrEF,acrEp,ATGGCAAACTTTTTTATTCGACGACCGATATTTGCATGGGTGCTGG...,3415033.0,TCTTTTTTTGTGTTTATGTGCCTTGAGATGCCTGTATTCATAACTA...,AATCAGAAACATAAAGGCGCTTTCGGGTGCCTTTATTATTT,ttacgaggttttaattctgcctctttcaacccgcgtcaaaataaaa...


In [22]:
# Row count after dropping incomplete rows.
df_all.shape[0]

1086

## Compute additional features

In [23]:
def process_CDS(row):
    """Compute length and base composition of the sequence in row['seq'].

    The sequence is upper-cased before counting, so lower-case input is
    handled. Characters other than A/T/C/G count toward the length but not
    toward any frequency.

    Returns a pd.Series with keys len_CDS, freqA_CDS, freqT_CDS, freqC_CDS,
    freqG_CDS and freqGC_CDS. For an empty sequence the frequencies are NaN
    instead of raising ZeroDivisionError.
    """
    seq = row['seq'].upper()
    seq_len = len(seq)
    if seq_len == 0:
        # Composition of an empty sequence is undefined.
        freqs = dict.fromkeys('ATCG', float('nan'))
    else:
        freqs = {base: seq.count(base) / seq_len for base in 'ATCG'}
    return pd.Series({
        'len_CDS': seq_len,
        'freqA_CDS': freqs['A'],
        'freqT_CDS': freqs['T'],
        'freqC_CDS': freqs['C'],
        'freqG_CDS': freqs['G'],
        # Sum of the two fractions, as in the per-base values above.
        'freqGC_CDS': freqs['C'] + freqs['G'],
    })

In [24]:
# Compute the per-row CDS features and attach them as new columns (index-aligned join).
df_all = df_all.join(
    df_all.apply(process_CDS, axis=1)
)

In [25]:
def process_5UTR(row):
    """Compute length and base composition of the sequence in row['5UTR'].

    The sequence is upper-cased before counting, so lower-case input is
    handled. Characters other than A/T/C/G count toward the length but not
    toward any frequency.

    Returns a pd.Series with keys len_5UTR, freqA_5UTR, freqT_5UTR,
    freqC_5UTR, freqG_5UTR and freqGC_5UTR. For an empty sequence the
    frequencies are NaN instead of raising ZeroDivisionError.
    """
    seq = row['5UTR'].upper()
    seq_len = len(seq)
    if seq_len == 0:
        # Composition of an empty sequence is undefined.
        freqs = dict.fromkeys('ATCG', float('nan'))
    else:
        freqs = {base: seq.count(base) / seq_len for base in 'ATCG'}
    return pd.Series({
        'len_5UTR': seq_len,
        'freqA_5UTR': freqs['A'],
        'freqT_5UTR': freqs['T'],
        'freqC_5UTR': freqs['C'],
        'freqG_5UTR': freqs['G'],
        # Sum of the two fractions, as in the per-base values above.
        'freqGC_5UTR': freqs['C'] + freqs['G'],
    })

In [26]:
# Compute the per-row 5'UTR features and attach them as new columns (index-aligned join).
df_all = df_all.join(
    df_all.apply(process_5UTR, axis=1)
)

In [27]:
def process_3UTR(row):
    """Compute length and base composition of the sequence in row['3UTR'].

    The sequence is upper-cased before counting, so lower-case input is
    handled. Characters other than A/T/C/G count toward the length but not
    toward any frequency.

    Returns a pd.Series with keys len_3UTR, freqA_3UTR, freqT_3UTR,
    freqC_3UTR, freqG_3UTR and freqGC_3UTR. For an empty sequence the
    frequencies are NaN instead of raising ZeroDivisionError.
    """
    seq = row['3UTR'].upper()
    seq_len = len(seq)
    if seq_len == 0:
        # Composition of an empty sequence is undefined.
        freqs = dict.fromkeys('ATCG', float('nan'))
    else:
        freqs = {base: seq.count(base) / seq_len for base in 'ATCG'}
    return pd.Series({
        'len_3UTR': seq_len,
        'freqA_3UTR': freqs['A'],
        'freqT_3UTR': freqs['T'],
        'freqC_3UTR': freqs['C'],
        'freqG_3UTR': freqs['G'],
        # Sum of the two fractions, as in the per-base values above.
        'freqGC_3UTR': freqs['C'] + freqs['G'],
    })

In [28]:
# Compute the per-row 3'UTR features and attach them as new columns (index-aligned join).
df_all = df_all.join(
    df_all.apply(process_3UTR, axis=1)
)

In [29]:
def process_promoter(row):
    """Compute length and base composition of the sequence in row['promoter_seq'].

    The sequence is upper-cased before counting, so lower-case input is
    handled. Characters other than A/T/C/G count toward the length but not
    toward any frequency.

    Returns a pd.Series with keys len_promoter, freqA_promoter,
    freqT_promoter, freqC_promoter, freqG_promoter and freqGC_promoter. For
    an empty sequence the frequencies are NaN instead of raising
    ZeroDivisionError.
    """
    seq = row['promoter_seq'].upper()
    seq_len = len(seq)
    if seq_len == 0:
        # Composition of an empty sequence is undefined.
        freqs = dict.fromkeys('ATCG', float('nan'))
    else:
        freqs = {base: seq.count(base) / seq_len for base in 'ATCG'}
    return pd.Series({
        'len_promoter': seq_len,
        'freqA_promoter': freqs['A'],
        'freqT_promoter': freqs['T'],
        'freqC_promoter': freqs['C'],
        'freqG_promoter': freqs['G'],
        # Sum of the two fractions, as in the per-base values above.
        'freqGC_promoter': freqs['C'] + freqs['G'],
    })

In [30]:
# Compute the per-row promoter features and attach them as new columns (index-aligned join).
df_all = df_all.join(
    df_all.apply(process_promoter, axis=1)
)

In [31]:
# Preview the first five rows including the computed feature columns.
df_all.head()

Unnamed: 0,gene,TU,promoter,seq,left_coord,5UTR,3UTR,promoter_seq,len_CDS,freqA_CDS,...,freqT_3UTR,freqC_3UTR,freqG_3UTR,freqGC_3UTR,len_promoter,freqA_promoter,freqT_promoter,freqC_promoter,freqG_promoter,freqGC_promoter
19,accB,accBC,accBp2,ATGGATATTCGTAAGATTAAAAAACTGATCGAGCTGGTTGAAGAAT...,3405436.0,CTCGTCCTCCCTGACGCAGTTTTTGCGCTGCGGAAAAGGTGACATT...,AGACTGCTAAAGCGTCAAAAGGCCGGATTTTCCGGCC,cttccctgataagaccagtatttagctgccaattgctacgaaatcg...,471.0,0.278132,...,0.189189,0.27027,0.27027,0.540541,81.0,0.234568,0.296296,0.283951,0.185185,0.469136
20,accC,accBC,accBp2,ATGCTGGATAAAATTGTTATTGCCAACCGCGGCGAGATTGCATTGC...,3405917.0,CTCGTCCTCCCTGACGCAGTTTTTGCGCTGCGGAAAAGGTGACATT...,AGACTGCTAAAGCGTCAAAAGGCCGGATTTTCCGGCC,cttccctgataagaccagtatttagctgccaattgctacgaaatcg...,1350.0,0.251111,...,0.189189,0.27027,0.27027,0.540541,81.0,0.234568,0.296296,0.283951,0.185185,0.469136
23,accD,accD,accDp,ATGAGCTGGATTGAACGAATTAAAAGCAACATTACTCCCACCCGCA...,2433012.0,CATTCATGGTCTGTTGGGGGCAAAAATGGCATTATGCGTCCCCAAA...,ATAACTGATAAGGGCAGGGCCACTGGCTCTGCCCTTTTGCTATTCT...,actccgcggttcgaccacttttttatccaaagtttcgggctgttat...,915.0,0.225137,...,0.28,0.3,0.22,0.52,81.0,0.17284,0.37037,0.209877,0.246914,0.45679
41,acrE,acrEF,acrEp,ATGACGAAACATGCCAGGTTTTTCCTCCTGCCCTCCTTTATTCTGA...,3413864.0,TCTTTTTTTGTGTTTATGTGCCTTGAGATGCCTGTATTCATAACTA...,AATCAGAAACATAAAGGCGCTTTCGGGTGCCTTTATTATTT,ttacgaggttttaattctgcctctttcaacccgcgtcaaaataaaa...,1158.0,0.265112,...,0.341463,0.170732,0.195122,0.365854,81.0,0.271605,0.407407,0.17284,0.148148,0.320988
42,acrF,acrEF,acrEp,ATGGCAAACTTTTTTATTCGACGACCGATATTTGCATGGGTGCTGG...,3415033.0,TCTTTTTTTGTGTTTATGTGCCTTGAGATGCCTGTATTCATAACTA...,AATCAGAAACATAAAGGCGCTTTCGGGTGCCTTTATTATTT,ttacgaggttttaattctgcctctttcaacccgcgtcaaaataaaa...,3105.0,0.229952,...,0.341463,0.170732,0.195122,0.365854,81.0,0.271605,0.407407,0.17284,0.148148,0.320988


In [33]:
# Write the final per-gene table; the row index is not written to the file.
df_all.to_csv(
    '../data/ecoli_gene_data.csv',
    index=False,
)