In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

# Notebook summary

This notebook creates a set of `.tsv` files from various sources for both host and viral genomes. It expects the project to be organized so that all input files are located under `../Data`, relative to this notebook.

# Imports

In [2]:
import json
import gff3_parsing ###A separate .py library that should be available in the python path
import pandas as pd
import glob
import subprocess

import sys
sys.path.append('../../iCUB/')
import iCUB

# Custom function definitions

In [3]:
def add_RBS_energy(df, energy_dict, col_name='aSD_binding',\
                   gaps=(4,10), expected_len=30, RBS_len=6):
    '''
    Adds a ribosome binding site (RBS) energy column to the df based off of
    pre-computed free energy values stored in energy_dict.

    For every row, the upstream sequence is converted to RNA (T -> U) and a window
    of RBS_len bases is looked up in energy_dict for each gap offset; the minimum
    of those energies is stored. Rows are silently skipped (left without a value)
    when the upstream sequence is not a string, is not exactly expected_len long,
    or contains anything other than A/C/G/U after conversion.

    Inputs:
        df - pandas dataframe with an 'upstream_sequence' column; modified in place
        energy_dict - mapping from an RNA string of length RBS_len to its energy
        col_name - name of the column that receives the minimum energy
        gaps - (first, last) inclusive range of gap sizes, i.e. the number of bases
                left between the end of the window and the end of the upstream sequence
        expected_len - required length of the upstream sequence
        RBS_len - length of the window looked up in energy_dict

    Outputs:
        df - the transformed df object now containing the new column

    Raises:
        KeyError if a window is missing from energy_dict
    '''
    valid_bases = set('ACGU')
    for index in df.index:
        upstream = df.loc[index,'upstream_sequence']
        ###Missing values (e.g. NaN after a round trip through a file) have no sequence
        if not isinstance(upstream, str):
            continue
        test_string = upstream.replace('T', 'U')
        ###Ensure that the sequence is the proper expected length
        if len(test_string) != expected_len:
            continue
        ###Ensure that the sequence has no abnormal bases
        if not set(test_string) <= valid_bases:
            continue

        ###Calculate the energy for the indicated gap offsets
        ###Explicit positive indices are used because a negative slice end of -0
        ###would be interpreted as 0 and yield an empty window when gap == 0
        seq_len = len(test_string)
        energy_list = []
        for gap in range(gaps[0],gaps[1]+1):
            window_stop = seq_len - gap
            window_start = window_stop - RBS_len
            ###Windows that do not fit inside the sequence are not evaluated
            if window_start < 0:
                continue
            energy_list.append(energy_dict[test_string[window_start:window_stop]])

        if energy_list:
            df.at[index, col_name] = min(energy_list)
    return df


def call_RNAfold(sequence):
    '''
    Runs the external `RNAfold` program on the given input and returns its raw output.

    Every 'T' in the input is replaced by 'U' before it is sent to RNAfold on stdin.
    The program is always started with '-p --noPS --noDP --constraint', and stderr is
    merged into stdout, so the returned bytes contain both streams.

    Inputs:
        sequence - string written to RNAfold's stdin (callers in this notebook pass
                    either a bare sequence or a sequence followed by a newline and a
                    constraint string)

    Outputs:
        the bytes RNAfold wrote to stdout/stderr
    '''
    rna_input = sequence.replace('T', 'U').encode()
    command = ['RNAfold', '-p', '--noPS', '--noDP', '--constraint']
    completed = subprocess.run(command,
                               input=rna_input,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.STDOUT)
    return completed.stdout

def get_energy_RNAfold(stdout_string):
    '''
    Extracts two energies from the raw bytes returned by call_RNAfold.

    The output is decoded as UTF-8 and split on newlines. The fifth line from the end
    is read as the MFE line (value wrapped in parentheses) and the fourth line from the
    end as the ensemble line (value wrapped in square brackets). On each line, the value
    is whatever follows the first space.
    NOTE(review): this line layout presumably corresponds to `RNAfold -p` output ending
    in a trailing newline - confirm against the installed RNAfold version.

    Inputs:
        stdout_string - bytes produced by RNAfold

    Outputs:
        (ensemble_energy, mfe_energy) as floats, in that order
    '''
    output_lines = stdout_string.decode('utf-8').split('\n')
    mfe_line, ensemble_line = output_lines[-5], output_lines[-4]

    def _bracketed_value(line, brackets):
        # Everything from the first space onwards, with the surrounding brackets removed
        after_structure = line[line.index(' '):]
        return after_structure.strip().strip(brackets).strip()

    mfe_text = _bracketed_value(mfe_line, '()')
    ensemble_text = _bracketed_value(ensemble_line, '[]')
    return float(ensemble_text), float(mfe_text)

def add_secondary_structure(df):
    '''
    Adds folding-energy columns for the region around each start codon.

    For every row whose coding sequence has at least 90 bases and whose upstream
    sequence has exactly 30 bases, the upstream sequence is joined to the first 60
    coding bases and folded twice with call_RNAfold:
        'sec_struct'       - first value returned by get_energy_RNAfold for the bare sequence
        'sec_struct_bound' - the same value when the notebook-level
                             `ribo_bound_constraint` string is supplied on a second line
    All other rows are left untouched.

    Inputs:
        df - pandas dataframe with 'upstream_sequence' and 'coding_sequence' columns;
                modified in place

    Outputs:
        df - the same dataframe with the new columns filled in where applicable
    '''
    for row_label in df.index:
        upstream = df.loc[row_label, 'upstream_sequence']
        coding = df.loc[row_label, 'coding_sequence']
        if len(coding) < 90:
            continue
        coding_start = coding[:60]
        if len(upstream) != 30 or len(coding_start) != 60:
            continue

        transcript = upstream + coding_start
        # Unconstrained fold
        free_energy, _ = get_energy_RNAfold(call_RNAfold(transcript))
        df.at[row_label, 'sec_struct'] = free_energy
        # Fold again with the constraint passed as the second input line
        constrained_input = transcript + '\n' + ribo_bound_constraint
        bound_energy, _ = get_energy_RNAfold(call_RNAfold(constrained_input))
        df.at[row_label, 'sec_struct_bound'] = bound_energy

    return df

def add_iCUB_and_GC(df):
    '''
    Adds codon usage bias and GC content columns.

    A row is processed only when its coding sequence is non-empty, consists solely of
    A/T/C/G and has a length divisible by three. For such rows:
        'iCUB'        - value of iCUB.iCUB_Calculator(coding_sequence).get_iCUB()
        'GC_cds'      - fraction of G and C in the coding sequence
        'GC_upstream' - fraction of G and C in the upstream sequence, only when that
                        sequence is exactly 30 bases long

    Inputs:
        df - pandas dataframe with 'coding_sequence' and 'upstream_sequence' columns;
                modified in place

    Outputs:
        df - the same dataframe with the new columns filled in where applicable
    '''
    for row_label in df.index:
        coding = df.loc[row_label, 'coding_sequence']
        n_bases = len(coding)
        if n_bases == 0:
            continue
        base_counts = {base: coding.count(base) for base in 'ATCG'}
        # Any character outside A/T/C/G makes the counts fall short of the length
        if sum(base_counts.values()) != n_bases:
            continue
        if n_bases % 3:
            continue

        df.at[row_label, 'iCUB'] = iCUB.iCUB_Calculator(coding).get_iCUB()
        df.at[row_label, 'GC_cds'] = (base_counts['G'] + base_counts['C']) / n_bases

        upstream = df.loc[row_label, 'upstream_sequence']
        if len(upstream) == 30:
            gc_upstream = upstream.count('G') + upstream.count('C')
            df.at[row_label, 'GC_upstream'] = gc_upstream / len(upstream)
    return df

# Notebook-wide parameters 

In [4]:
# Field separator used when writing the compiled tables
sep = '\t'
# Number of upstream bases requested from gff3_parsing.compile_sequences
upstream_len = 30

# Pre-computed energies looked up by add_RBS_energy
energy_path = '../Data/energy_files/energyRef_CCUCCU_ensemble_noneConstraint.json'
with open(energy_path, 'r') as infile:
    energy_dict = json.load(infile)

# Input locations
base_viral_genome_dir = '../Data/MVP_data/host_linked_genomes/'
host_genome_dir = '../Data/MVP_data/host_genomes/'

# Constraint string handed to RNAfold by add_secondary_structure: 16 unconstrained
# positions, 28 'x' positions, 46 unconstrained positions. Its length (90) matches the
# 30 upstream + 60 coding bases that are folded.
# ('x' marks a position that must stay unpaired in ViennaRNA constraint notation.)
# Earlier variants that were tried: 16/28/16 (length 60) and 17/25/58 (length 100).
ribo_bound_constraint = ''.join(['.' * 16, 'x' * 28, '.' * 46])
assert len(ribo_bound_constraint) == 90

# Creating `.tsv` files for host genomes

**Adding relevant columns for ribosome binding sites, secondary structure, codon usage bias, and GC content. This whole process takes 5-10 minutes per genome. It could definitely be optimized, but for now it is not a bottleneck; just anticipate a few hours for this cell to run.**

In [None]:
# Identifiers of the single-chromosome host genomes; each one is expected to have a
# matching <id>.gff3 and <id>.fasta file inside host_genome_dir
host_ids = [
    36809, 717959, 305, 1590,
    435591, 90371, 1314, 357276,
    657318, 1639, 1428, 470,
    573, 1280, 287, 562,
]

for host_id in host_ids:
    print(host_id)
    genome_prefix = host_genome_dir + str(host_id)

    ###Creates the dataframe based off a gff3 and fasta file
    host_df, host_genome = gff3_parsing.compile_sequences(
        [genome_prefix + '.gff3'],
        [genome_prefix + '.fasta'],
        upstream_len,
    )

    ###Adds the ribosome binding site energy column
    host_df = add_RBS_energy(host_df, energy_dict, col_name='aSD_binding', gaps=(4, 10))

    ###Adds the secondary structure columns, then the codon usage bias / GC columns
    for annotate in (add_secondary_structure, add_iCUB_and_GC):
        host_df = annotate(host_df)

    ###Writes to a file
    host_df.to_csv(genome_prefix + '.tsv', sep=sep)

## Treat the 2 chromosome hosts slightly differently to ensure no errors occur

(There is currently only one of these. And I don't love the solution because it's basically entirely copy/paste but here we are.)

In [None]:
# Hosts whose genome is split over two chromosomes, stored as <id>.1.* and <id>.2.*
host_ids_2chrom = [28450]

for host_id in host_ids_2chrom:
    print(host_id)
    ###One gff3/fasta pair per chromosome, passed together so a single table is built
    gffs = [host_genome_dir + '{}.{}.gff3'.format(host_id, chrom) for chrom in (1, 2)]
    fastas = [host_genome_dir + '{}.{}.fasta'.format(host_id, chrom) for chrom in (1, 2)]

    host_df, host_genome = gff3_parsing.compile_sequences(gffs, fastas, upstream_len)

    ###Adds the ribosome binding site energy column
    host_df = add_RBS_energy(host_df, energy_dict, col_name='aSD_binding', gaps=(4, 10))

    ###Adds the secondary structure columns, then the codon usage bias / GC columns
    for annotate in (add_secondary_structure, add_iCUB_and_GC):
        host_df = annotate(host_df)

    ###Writes to a file
    host_df.to_csv(host_genome_dir + '{}.tsv'.format(host_id), sep=sep)

# Creating `.tsv` files for viral genomes
**There are a lot more of these viral genomes but they're super small so run-time for this cell should also be in the few hour range at most. And it only needs to happen once so this isn't a bottleneck and I'm not concerning myself with speed.**

In [None]:
###Concatenate the two existing host lists
host_ids = host_ids + host_ids_2chrom
for host_id in host_ids:
    print('#####', host_id)
    for gff_file in glob.glob(base_viral_genome_dir+'{}_rep_viruses/*.gff3'.format(host_id)):
        print(gff_file)
        fasta_file = gff_file.replace('.gff3', '.fasta')
        tsv_file = gff_file.replace('.gff3', '.tsv')
        
        viral_df, viral_genome = gff3_parsing.compile_sequences([gff_file], [fasta_file], upstream_len)
        
        ###Adds the ribosome binding site energy column
        viral_df = add_RBS_energy(viral_df, energy_dict, col_name='aSD_binding', gaps=(4,10))

        ###Adds the secondary structure column
        viral_df = add_secondary_structure(viral_df)

        ###Add codon usage bias column
        viral_df = add_iCUB_and_GC(viral_df)

        ###Writes to a file
        viral_df.to_csv(tsv_file, sep=sep)

# Make "clean" `.tsv` files

In [None]:
def common_cleaning(df):
    """
    Cleaning steps shared by host and virus tables.

    The index is first reset to 0..n-1, then every row with a missing value in any of
    the sequence or computed-feature columns is removed. The index is NOT reset again
    afterwards, so the surviving rows keep their position in the original table.

    Input/s:
        df - a pandas dataframe as written by the compile cells and read back in

    Output/s:
        a new dataframe; the input is not modified

    Raises:
        KeyError if one of the required columns is absent
    """
    required_cols = ['upstream_sequence', 'coding_sequence', 'iCUB', 'GC_cds',
                     'GC_upstream', 'aSD_binding', 'sec_struct']
    return df.reset_index(drop=True).dropna(subset=required_cols)

def clean_host_tsv(df):
    """
    Cleans a host genome table: removes incomplete rows, likely prophage genes and
    genes whose locus tag is not unique.

    Development notes: I could also think about testing for stop codons within coding sequences
    and filtering accordingly. Also a way to test for possible non-standard genetic code usage

    Input/s:
        df - a pandas dataframe with numeric indices, outputted from Compile_data.ipynb and read
                back in

    Output/s:
        df - a clean version of the dataframe with one new column (locus_tag) and (potentially)
                several rows removed. The input dataframe is not modified.
    """
    ###Run the main cleaning/additions
    df = common_cleaning(df)

    ###Filter out possible prophage genes by removing anything involving the word phage
    ###Numerous possibilities/ways to do this and not all genomes might have any decent
    ###descriptions in the qualifiers.
    ###The match is case-insensitive so that e.g. "Phage tail protein" is caught as well;
    ###rows without any qualifiers are removed too (na=True).
    filter_word = 'phage'
    mentions_phage = df['qualifiers'].str.contains(filter_word, case=False, regex=False, na=True)
    df = df[~mentions_phage]

    ###Now ensure that each locus tag is only used once and when in doubt remove them both
    ###The tag is the text between the first 'locus_tag=' and the next ';' (or the end);
    ###rows without a tag get a missing value instead of raising. assign() returns a new
    ###frame, which avoids writing into a filtered slice (SettingWithCopyWarning).
    locus_tag = df['qualifiers'].str.extract(r'locus_tag=([^;]*)', expand=False)
    df = df.assign(locus_tag=locus_tag)
    df = df.drop_duplicates(subset = ['locus_tag'], keep = False)
    return df

def clean_virus_tsv(df):
    """
    This is the same basic structure as the "clean_host_tsv" function, without the
    prophage filter and using the 'ID=' qualifier instead of the locus tag.

    Input/s:
        df - a pandas dataframe with numeric indices, outputted from Compile_data.ipynb and read
                back in

    Output/s:
        df - a clean version of the dataframe with one new column (viral_id) and (potentially)
                several rows removed. The input dataframe is not modified.

    """
    ###Run the main cleaning/additions
    df = common_cleaning(df)

    ###Now ensure that each viral_id tag is only used once (and when in doubt remove them BOTH)
    ###The id is the text between the first 'ID=' and the next ';' (or the end of the string)
    viral_id = df['qualifiers'].str.extract(r'ID=([^;]*)', expand=False)
    df = df.assign(viral_id=viral_id)

    df = df.drop_duplicates(subset = ["viral_id"], keep = False)
    return df

**Side analysis just to make sure that all these genomes use the standard translation table**

In [None]:
# For every raw (not yet cleaned) host table, print the number of rows and how often
# each 'transl_table=' value occurs in the qualifiers column
for host_tsv_file in glob.glob(host_genome_dir + '*.tsv'):
    if '.clean.' not in host_tsv_file:
        print(host_tsv_file)
        df = pd.read_csv(host_tsv_file, sep = '\t', index_col = 0)
        # Text after the first 'transl_table=' ...
        after_key = df['qualifiers'].str.split('transl_table=', n=1, expand=True)[1]
        # ... up to the next ';'
        df['transl_table'] = after_key.str.split(';', n=1, expand=True)[0]
        table_counts = list(df['transl_table'].value_counts().items())
        print(df.shape[0], table_counts)

**Now clean the host data tables**

In [None]:
# Write a '<name>.clean.tsv' next to every raw host table.
# Sorted so that the processing order (and the printed log) is reproducible.
for host_tsv_file in sorted(glob.glob(host_genome_dir + '*.tsv')):
    ###Skip the outputs of a previous run of this cell
    if '.clean.' in host_tsv_file:
        continue
    print(host_tsv_file)
    ###
    df = pd.read_csv(host_tsv_file, sep='\t', index_col=0)
    initial_shape = df.shape
    ###A table without rows cannot be cleaned and would cause a division by zero below
    if initial_shape[0] == 0:
        print('Empty table, skipping')
        continue
    df = clean_host_tsv(df)
    final_shape = df.shape
    ###Fraction of genes that survived the cleaning
    gene_ratio = final_shape[0]/initial_shape[0]
    print(gene_ratio)
    if gene_ratio <= 0.8: #This basically shouldn't happen
        ###Stop here so the offending genome can be inspected; nothing is written for it
        break
    ###
    clean_tsv_loc = host_tsv_file.replace('.tsv', '.clean.tsv')
    df.to_csv(clean_tsv_loc, sep='\t')

**And the phages**

In [None]:
# Write a '<name>.clean.tsv' next to every raw virus table in every '*_rep_viruses' folder.
# Both listings are sorted so that the processing order is reproducible.
for virus_folder in sorted(glob.glob(base_viral_genome_dir + '*_rep_viruses/')):
    print('###', virus_folder)
    for virus_tsv_file in sorted(glob.glob(virus_folder + '*.tsv')):
        ###Skip the outputs of a previous run of this cell
        if '.clean.' in virus_tsv_file:
            continue
        ###Skip any file with '.fasta.' in its path
        if '.fasta.' in virus_tsv_file:
            continue
        print(virus_tsv_file)
        ###
        df = pd.read_csv(virus_tsv_file, sep='\t', index_col=0)
        initial_shape = df.shape
        ###A table without rows cannot be cleaned and would cause a division by zero below
        if initial_shape[0] == 0:
            print('Empty table, skipping')
            continue
        df = clean_virus_tsv(df)
        final_shape = df.shape
        ###Fraction of genes that survived the cleaning
        gene_ratio = final_shape[0]/initial_shape[0]
        if gene_ratio <= 0.8:
            print('Strange case', gene_ratio)
            ###NOTE(review): this only leaves the inner loop - the remaining files of this
            ###folder are skipped, but the next folder is still processed. Confirm intent.
            break
        ###
        clean_tsv_loc = virus_tsv_file.replace('.tsv', '.clean.tsv')
        df.to_csv(clean_tsv_loc, sep='\t')

# Add in the well-annotated reference genome/s

In [9]:
# Build the same annotated table for every reference genome in ../Data/benchmark_viruses
for gff_file in glob.glob('../Data/benchmark_viruses/*.gff3'):
    print(gff_file)
    ###The fasta input and tsv output sit next to the gff3 file with the same stem
    fasta_file, tsv_file = (gff_file.replace('.gff3', ext) for ext in ('.fasta', '.tsv'))

    viral_df, viral_genome = gff3_parsing.compile_sequences(
        [gff_file], [fasta_file], upstream_len
    )

    ###Adds the ribosome binding site energy column
    viral_df = add_RBS_energy(viral_df, energy_dict, col_name='aSD_binding', gaps=(4, 10))

    ###Adds the secondary structure columns, then the codon usage bias / GC columns
    for annotate in (add_secondary_structure, add_iCUB_and_GC):
        viral_df = annotate(viral_df)

    ###Writes to a file
    viral_df.to_csv(tsv_file, sep=sep)

../Data/benchmark_viruses/T7.gff3


In [12]:
# Write a '<name>.clean.tsv' next to every raw benchmark virus table.
# Sorted so that the processing order is reproducible.
for virus_tsv_file in sorted(glob.glob('../Data/benchmark_viruses/*.tsv')):
    ###Skip the outputs of a previous run of this cell
    if '.clean.' in virus_tsv_file:
        continue
    ###Skip any file with '.fasta.' in its path
    if '.fasta.' in virus_tsv_file:
        continue
    print(virus_tsv_file)
    ###
    df = pd.read_csv(virus_tsv_file, sep='\t', index_col=0)
    initial_shape = df.shape
    ###A table without rows cannot be cleaned and would cause a division by zero below
    if initial_shape[0] == 0:
        print('Empty table, skipping')
        continue
    df = clean_virus_tsv(df)
    final_shape = df.shape
    ###Fraction of genes that survived the cleaning
    gene_ratio = final_shape[0]/initial_shape[0]
    if gene_ratio <= 0.8:
        print('Strange case', gene_ratio)
        ###Stop here so the offending genome can be inspected; nothing is written for it
        break
    ###
    clean_tsv_loc = virus_tsv_file.replace('.tsv', '.clean.tsv')
    df.to_csv(clean_tsv_loc, sep='\t')

../Data/benchmark_viruses/T7.tsv


In [17]:
# Display the table currently bound to `df`. NOTE(review): `df` is not defined in this
# cell - it is presumably the cleaned benchmark table left over from the last iteration
# of the cleaning loop above, so this cell depends on that loop having run last.
df

Unnamed: 0,genome_id,source,type,start,stop,idk,strand,trash,qualifiers,coding_sequence,upstream_sequence,aSD_binding,sec_struct,sec_struct_bound,iCUB,GC_cds,GC_upstream,viral_id
0,NC_001604.1,RefSeq,CDS,925,1278,.,+,0,ID=cds-NP_041954.1;Parent=rna-T7p01;Dbxref=Uni...,ATGGCTATGTCTAACATGACTTACAACAACGTTTTCGACCACGCTT...,TTCACTAATAACTGCACGAGGTAACACAAG,-5.42,-10.13,-3.81,50.525906,0.468927,0.4,cds-NP_041954.1
1,NC_001604.1,RefSeq,CDS,1278,1433,.,+,0,ID=cds-NP_041955.1;Parent=gene-T7p02;Dbxref=Un...,ATGTCTACTACCAACGTGCAATACGGTCTGACCGCTCAAACTGTAC...,AGTCGAGGAGTACGAGGAGGATGAAGAGTA,-9.31,-19.01,-12.96,43.066882,0.455128,0.5,cds-NP_041955.1
2,NC_001604.1,RefSeq,CDS,2021,3100,.,+,0,ID=cds-NP_041959.1;Parent=rna-T7p03;Dbxref=GOA...,ATGAACATTACCGACATCATGAACGCTATCGACGCAATCAAAGCAC...,TCTCATAACGAACATAAAGGACACAATGCA,-4.0,-7.46,-3.8,55.711182,0.493519,0.366667,cds-NP_041959.1
3,NC_001604.1,RefSeq,CDS,1496,1639,.,+,0,ID=cds-NP_041956.1;Parent=gene-T7p04;Dbxref=Un...,ATGTATATGCTTACTATCGGTCTACTCACCGCTCTAGGTCTAGCTG...,TATGATTATCACTTTACTTATGAGGGAGTA,-5.82,-19.87,-8.5,39.155014,0.458333,0.3,cds-NP_041956.1
6,NC_001604.1,RefSeq,CDS,1636,1797,.,+,0,ID=cds-NP_041958.1;Parent=gene-T7p06;Dbxref=Un...,ATGATGAAGCACTACGTTATGCCAATCCACACGTCCAACGGGGCAA...,CATAGGAATCATCAAAGGGGCACTACGCAA,-5.53,-17.68,-7.52,42.406008,0.481481,0.466667,cds-NP_041958.1
7,NC_001604.1,RefSeq,CDS,3171,5822,.,+,0,ID=cds-NP_041960.1;Parent=rna-T7p07;Dbxref=GOA...,ATGAACACGATTAACATCGCTAAGAACGACTTCTCTGACATCGAAC...,GTACGATTTACTAACTGGAAGAGGCACTAA,-5.13,-10.8,-5.39,53.909493,0.514706,0.4,cds-NP_041960.1
8,NC_001604.1,RefSeq,CDS,6137,6394,.,+,0,ID=cds-NP_041962.1;Parent=rna-T7p08;Dbxref=GOA...,ATGGGACGTTTATATAGTGGTAATCTGGCAGCATTCAAGGCAGCAA...,CTCACAAGCGTAGCTGGGAGGGTCAGTAAG,-9.51,-17.09,-9.78,45.974315,0.422481,0.566667,cds-NP_041962.1
9,NC_001604.1,RefSeq,CDS,6007,6135,.,+,0,ID=cds-NP_041961.1;Parent=gene-T7p09;Dbxref=Un...,ATGCGTAACTTCGAAAAGATGACCAAACGTTCTAACCGTAATGCTC...,TTAAAGAATTACTAAGAGAGGACTTTAAGT,-5.83,-15.83,-6.8,37.097063,0.48062,0.266667,cds-NP_041961.1
10,NC_001604.1,RefSeq,CDS,6475,7554,.,+,0,ID=cds-NP_041963.1;Parent=rna-T7p10;Dbxref=GOA...,ATGATGAACATTAAGACTAACCCGTTTAAAGCCGTGTCTTTCGTAG...,TAGTCATTTAACCAATAGGAGATAAACATT,-7.0,-10.28,-5.72,55.811993,0.469444,0.266667,cds-NP_041963.1
11,NC_001604.1,RefSeq,CDS,7608,7763,.,+,0,ID=cds-NP_041964.1;Parent=gene-T7p11;Dbxref=Un...,ATGTTTAAGAAGGTTGGTAAATTCCTTGCGGCTTTGGCAGCTATCC...,GGCCTTTCTGCGTTTATAAGGAGACACTTT,-7.0,-18.92,-9.77,50.382798,0.455128,0.433333,cds-NP_041964.1
