In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

# Notebook summary

This notebook builds a set of `.tsv` files, from various sources, for both host and viral genomes. It assumes the project layout in which all input files live under `../Data` (relative to the location of this notebook).

# Imports

In [2]:
import json
import gff3_parsing ###A separate .py library that should be available in the python path
import pandas as pd
import glob
import subprocess

import sys
sys.path.append('../../iCUB/')
import iCUB

# Custom function definitions

In [None]:
def add_RBS_energy(df, energy_dict, col_name='RBS_energy',\
                   gaps=(4,10), expected_len=30, RBS_len=6):
    '''
    This function adds a ribosome binding site (RBS) energy column to the df based off of 
    free energy values pre-computed and stored in the corresponding energy_dict. 
    
    For every row, each window of RBS_len bases that ends `gap` bases before the end of the
    upstream sequence (for every gap in the inclusive range given by gaps) is looked up in
    energy_dict, and the smallest value found is written to col_name. Rows whose upstream
    sequence is not a string, is not exactly expected_len long, or contains anything other
    than A/C/G/T(U) are left untouched.
    
    Inputs:
        df - pandas dataframe with an 'upstream_sequence' column
        energy_dict - dictionary keyed by RNA strings (T already replaced by U) of length
                        RBS_len; a missing key raises a KeyError
        col_name - name of the column that receives the energy
        gaps - (first, last) gap sizes to test, both inclusive
        expected_len - required length of the upstream sequence
        RBS_len - length of the window looked up in energy_dict
    
    Outputs:
        df - the same df object, now containing values in the col_name column for every
                row that passed the checks
        
    '''
    valid_bases = set('AUCG')
    for index in df.index:
        upstream = df.loc[index,'upstream_sequence']
        ###Missing values (e.g. NaN) cannot be scanned, skip them rather than crash
        if not isinstance(upstream, str):
            continue
        test_string = upstream.replace('T', 'U')
        ###Ensure that the sequence is the proper expected length
        if len(test_string) != expected_len:
            continue
        ###Ensure that the sequence has no abnormal bases
        if not set(test_string) <= valid_bases:
            continue
            
        ###Calculate the energy for the indicated gap offsets
        energy_list = []
        for gap in range(gaps[0],gaps[1]+1):
            ###A stop of -0 would be read as 0 and give an empty window, so a gap of
            ###zero uses None to run the slice to the end of the sequence
            window = test_string[-gap - RBS_len: (-gap or None)]
            energy_list.append(energy_dict[window])

        df.at[index, col_name] = min(energy_list)        
    return df


def call_RNAfold(sequence):
    '''
    Runs the external RNAfold program on a single sequence and returns what it printed.
    
    Inputs:
        sequence - string, every T is replaced by U before it is sent to RNAfold
    
    Outputs:
        the raw bytes captured from RNAfold (stderr is merged into stdout)
    '''
    rna_bytes = sequence.replace('T', 'U').encode()
    finished = subprocess.run(['RNAfold', '-p', '--noPS', '--constraint'],
                              input=rna_bytes,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.STDOUT)
    return finished.stdout

def get_energy_RNAfold(stdout_string):
    '''
    Parses two energy values out of the raw output captured from RNAfold.
    
    The fifth-from-last line of the output supplies the value wrapped in parentheses and
    the fourth-from-last line supplies the value wrapped in square brackets. On each line
    everything up to the first space is discarded before the number is read.
    
    NOTE(review): with `RNAfold -p` the parenthesised number is presumably the minimum free 
    energy and the bracketed one the ensemble free energy, so the local names below look
    swapped relative to that - confirm. The order of the returned values is unchanged.
    
    Inputs:
        stdout_string - bytes, as returned by call_RNAfold
    
    Outputs:
        (float, float) - the parenthesised value followed by the bracketed value
    '''
    lines = stdout_string.decode('utf-8').split('\n')
    
    energy_line = lines[-5]
    energy_val = energy_line[energy_line.index(' '):]
    energy_val = energy_val.strip().strip('()').strip()
    
    mfe_line = lines[-4]
    ###Find the separator on this line itself rather than reusing the position
    ###found on the previous line
    mfe_val = mfe_line[mfe_line.index(' '):]
    mfe_val = mfe_val.strip().strip('[]').strip()
    return float(energy_val), float(mfe_val)    

def add_secondary_structure(df):
    '''
    Adds up to two RNAfold derived columns to the df, one row at a time.
    
    secondary_structure - filled when the upstream sequence and the first 30 bases of the 
        coding sequence are both exactly 30 long; the two are joined and folded together
    secondary_structure_internal - filled when the coding sequence is longer than 90 
        bases; bases 30 to 90 of the coding sequence are folded
    
    In both cases the stored number is the second value returned by get_energy_RNAfold.
    
    Inputs:
        df - dataframe with 'upstream_sequence' and 'coding_sequence' columns
    
    Outputs:
        df - the same df object with the new values written in
    '''
    for row_label in df.index:
        upstream = df.loc[row_label, 'upstream_sequence']
        coding = df.loc[row_label, 'coding_sequence']
        cds_start = coding[:30]
        if len(upstream) == 30 and len(cds_start) == 30:
            _, second_energy = get_energy_RNAfold(call_RNAfold(upstream + cds_start))
            df.at[row_label, 'secondary_structure'] = second_energy
            
        if len(coding) > 90:
            _, second_energy = get_energy_RNAfold(call_RNAfold(coding[30:90]))
            df.at[row_label, 'secondary_structure_internal'] = second_energy
    return df

def add_iCUB_and_GC(df):
    '''
    Adds the iCUB, GC_cds and GC_upstream columns to the df, one row at a time.
    
    A row is skipped entirely when its coding sequence is empty, contains anything other 
    than A/T/C/G, or has a length that is not a multiple of three. For the remaining rows 
    GC_upstream is only written when the upstream sequence is exactly 30 long.
    
    Inputs:
        df - dataframe with 'coding_sequence' and 'upstream_sequence' columns
    
    Outputs:
        df - the same df object with the new values written in
    '''
    for row_label in df.index:
        coding = df.loc[row_label, 'coding_sequence']
        n_bases = len(coding)
        n_gc = coding.count('G') + coding.count('C')
        n_at = coding.count('A') + coding.count('T')
        ###Empty, non-standard or out of frame coding sequences are not scored
        if n_bases == 0 or n_bases != n_at + n_gc or n_bases % 3 != 0:
            continue
        df.at[row_label, 'iCUB'] = iCUB.iCUB_Calculator(coding).get_iCUB()
        df.at[row_label, 'GC_cds'] = n_gc / n_bases
        upstream = df.loc[row_label, 'upstream_sequence']
        if len(upstream) == 30:
            upstream_gc = upstream.count('G') + upstream.count('C')
            df.at[row_label, 'GC_upstream'] = upstream_gc / len(upstream)
    return df

# Notebook-wide parameters 

In [5]:
###Delimiter used for the tables
sep = '\t'
###Upstream length handed to gff3_parsing.compile_sequences
upstream_len = 30

###Directories holding the viral and host genome files
base_viral_genome_dir = '../Data/MVP_data/host_linked_genomes/'
host_genome_dir = '../Data/MVP_data/host_genomes/'

###Pre-computed energy look-up table used by add_RBS_energy
with open('../Data/energy_files/energyRef_CCUCCU_ensemble_noneConstraint.json', 'r') as infile:
    energy_dict = json.load(infile)

# Creating `.tsv` files for host genomes

**Adding relevant columns for ribosome binding sites, secondary structure, codon usage bias, and GC content. This whole process takes 5-10 minutes per genome. Could definitely be optimized but for now not a bottleneck, just anticipate a few hours for this cell to run.**

In [None]:
###Ids of the single-chromosome hosts; each one names a .gff3/.fasta pair in host_genome_dir
host_ids = [
    36809, 717959, 305, 1590,
    435591, 90371, 1314, 357276,
    657318, 1639, 1428, 470,
    573, 1280, 287, 562,
]

for host_id in host_ids:
    print(host_id)
    ###Creates the dataframe based off a gff3 and fasta file
    gff_paths = [host_genome_dir + str(host_id) + '.gff3']
    fasta_paths = [host_genome_dir + str(host_id) + '.fasta']
    host_df, host_genome = gff3_parsing.compile_sequences(gff_paths, fasta_paths, upstream_len)

    ###Adds the two ribosome binding site energy columns, one per gap window
    for energy_col, gap_window in (('RBS_energy', (4, 10)),
                                   ('RBS_energy_upstream', (11, 17))):
        host_df = add_RBS_energy(host_df, energy_dict, col_name=energy_col, gaps=gap_window)

    ###Adds the secondary structure columns
    host_df = add_secondary_structure(host_df)

    ###Adds the codon usage bias and GC content columns
    host_df = add_iCUB_and_GC(host_df)

    ###Writes to a file
    host_df.to_csv(host_genome_dir + str(host_id) + '.tsv', sep=sep)

## Treat the 2 chromosome hosts slightly differently to ensure no errors occur

(There is currently only one of these. And I don't love the solution because it's basically entirely copy/paste but here we are.)

In [None]:
###Hosts whose genome is split over two .gff3/.fasta pairs named <id>.1.* and <id>.2.*
host_ids_2chrom = [28450]

for host_id in host_ids_2chrom:
    print(host_id)
    gffs = [host_genome_dir + '{}.{}.gff3'.format(host_id, chrom) for chrom in (1, 2)]
    fastas = [host_genome_dir + '{}.{}.fasta'.format(host_id, chrom) for chrom in (1, 2)]

    ###Both chromosomes are compiled into a single dataframe
    host_df, host_genome = gff3_parsing.compile_sequences(gffs, fastas, upstream_len)

    ###Adds the two ribosome binding site energy columns, one per gap window
    for energy_col, gap_window in (('RBS_energy', (4, 10)),
                                   ('RBS_energy_upstream', (11, 17))):
        host_df = add_RBS_energy(host_df, energy_dict, col_name=energy_col, gaps=gap_window)

    ###Adds the secondary structure columns
    host_df = add_secondary_structure(host_df)

    ###Adds the codon usage bias and GC content columns
    host_df = add_iCUB_and_GC(host_df)

    ###Writes one table for the whole host
    host_df.to_csv(host_genome_dir + '{}.tsv'.format(host_id), sep=sep)

# Creating `.tsv` files for viral genomes
**There are a lot more of these viral genomes but they're super small so run-time for this cell should also be in the few hour range at most. And it only needs to happen once so this isn't a bottleneck and I'm not concerning myself with speed.**

In [None]:
###Concatenate the two existing host lists
host_ids = host_ids + host_ids_2chrom
for host_id in host_ids:
    print('#####', host_id)
    for gff_file in glob.glob(base_viral_genome_dir+'{}_rep_viruses/*.gff3'.format(host_id)):
        print(gff_file)
        fasta_file = gff_file.replace('.gff3', '.fasta')
        tsv_file = gff_file.replace('.gff3', '.tsv')
        
        viral_df, viral_genome = gff3_parsing.compile_sequences([gff_file], [fasta_file], upstream_len)
        
        ###Adds the ribosome binding site energy column
        viral_df = add_RBS_energy(viral_df, energy_dict, col_name='RBS_energy', gaps=(4,10))
        viral_df = add_RBS_energy(viral_df, energy_dict, col_name='RBS_energy_upstream', gaps=(11,17))

        ###Adds the secondary structure column
        viral_df = add_secondary_structure(viral_df)

        ###Add codon usage bias column
        viral_df = add_iCUB_and_GC(viral_df)

        ###Writes to a file
        viral_df.to_csv(tsv_file, sep=sep)

# Make "clean" `.tsv` files

In [None]:
def common_cleaning(df):
    '''
    Renumbers the rows from zero and keeps only the rows that have a value in every one
    of the sequence, iCUB, GC and RBS energy columns.
    
    Inputs:
        df - dataframe containing all of the columns listed below
    
    Outputs:
        a dataframe without the incomplete rows (the new row numbers of the kept rows 
        are retained, so gaps can appear in the index)
    '''
    required_cols = ['upstream_sequence',
                     'coding_sequence',
                     'iCUB',
                     'GC_cds',
                     'GC_upstream',
                     'RBS_energy',
                     'RBS_energy_upstream']
    return df.reset_index(drop=True).dropna(subset=required_cols)

def clean_host_tsv(df):
    """
    Cleans a host genome table: removes incomplete rows (see common_cleaning), rows whose
    qualifiers mention the word "phage", and rows whose locus tag is not unique.
    
    Development notes: I could also think about testing for stop codons within coding sequences
    and filtering accordingly. Also a way to test for possible non-standard genetic code usage
    
    Input/s:
        df - a pandas dataframe with numeric indices, outputted from Compile_data.ipynb and read
                back in. Must contain a 'qualifiers' column of strings.
        
    Output/s:
        df - a clean version of the dataframe with one new column (locus_tag) and (potentially)
                several rows removed
    """
    ###Run the main cleaning/additions
    df = common_cleaning(df)
    
    ###Filter out possible prophage genes by removing anything involving the word phage
    ###Numerous possibilities/ways to do this and not all genomes might have any decent
    ###descriptions in the qualifiers.
    ###NOTE(review): the match is case sensitive, so "Phage ..." is kept - confirm intended.
    ###Rows with missing qualifiers are removed here as well.
    filter_word = 'phage'
    df = df[(df['qualifiers'].str.contains(filter_word)==False)]
    
    ###Now ensure that each locus tag is only used once and when in doubt remove them both
    ###The tag is the text between the first 'locus_tag=' and the next ';' (or the end of
    ###the string). Rows without a tag get a missing value instead of raising an error, and
    ###assign builds a new dataframe rather than writing into the filtered slice.
    locus_tags = df['qualifiers'].str.extract(r'locus_tag=([^;]*)', expand=False)
    df = df.assign(locus_tag=locus_tags)
    df = df.drop_duplicates(subset = ['locus_tag'], keep = False)
    return df

def clean_virus_tsv(df):
    """
    This is the same basic structure as the "clean_host_tsv" function, without the phage
    filter: incomplete rows are removed (see common_cleaning) and then every row whose 
    viral id is not unique is removed.
    
    Input/s:
        df - a pandas dataframe with numeric indices, outputted from Compile_data.ipynb and read
                back in. Must contain a 'qualifiers' column of strings.
        
    Output/s:
        df - a clean version of the dataframe with one new column (viral_id) and (potentially)
                several rows removed
                
    """  
    ###Run the main cleaning/additions
    df = common_cleaning(df)

    ###Now ensure that each viral_id tag is only used once (and when in doubt remove them BOTH)
    ###The id is the text between the first 'ID=' and the next ';' (or the end of the
    ###string). Rows without an id get a missing value instead of raising an error, and
    ###assign builds a new dataframe rather than writing into the filtered slice.
    viral_ids = df['qualifiers'].str.extract(r'ID=([^;]*)', expand=False)
    df = df.assign(viral_id=viral_ids)
    
    df = df.drop_duplicates(subset = ["viral_id"], keep = False)
    return df

**Side analysis just to make sure that all these genomes use the standard translation table**

In [None]:
###Report, for every raw host table, how many rows it has and which translation tables
###are named in its qualifiers
for host_tsv_file in glob.glob(host_genome_dir + '*.tsv'):
    ###Tables written by the cleaning step match the same pattern, only raw ones are read
    if '.clean.' not in host_tsv_file:
        print(host_tsv_file)
        df = pd.read_csv(host_tsv_file, sep='\t', index_col=0)
        ###Text after the first 'transl_table=' and before the next ';'
        after_key = df['qualifiers'].str.split('transl_table=', n=1, expand=True)[1]
        df['transl_table'] = after_key.str.split(';', n=1, expand=True)[0]
        table_counts = list(df['transl_table'].value_counts().items())
        print(df.shape[0], table_counts)

../Data/MVP_data/host_genomes/28450.tsv
5727 [('11', 5727)]
../Data/MVP_data/host_genomes/1590.tsv
3013 [('11', 3013)]
../Data/MVP_data/host_genomes/562.tsv
4379 [('11', 4379)]
../Data/MVP_data/host_genomes/357276.tsv
4343 [('11', 4343)]
../Data/MVP_data/host_genomes/657318.tsv
3294 [('11', 3294)]
../Data/MVP_data/host_genomes/573.tsv
5316 [('11', 5316)]
../Data/MVP_data/host_genomes/1280.tsv
2767 [('11', 2767)]
../Data/MVP_data/host_genomes/305.tsv
3466 [('11', 3466)]
../Data/MVP_data/host_genomes/435591.tsv
3979 [('11', 3979)]
../Data/MVP_data/host_genomes/470.tsv
4327 [('11', 4327)]
../Data/MVP_data/host_genomes/287.tsv
5573 [('11', 5573)]
../Data/MVP_data/host_genomes/1314.tsv
1693 [('11', 1690)]
../Data/MVP_data/host_genomes/36809.tsv
4920 [('11', 4920)]
../Data/MVP_data/host_genomes/717959.tsv
3110 [('11', 3110)]
../Data/MVP_data/host_genomes/90371.tsv
4447 [('11', 4447)]
../Data/MVP_data/host_genomes/1639.tsv
2867 [('11', 2867)]
../Data/MVP_data/host_genomes/1428.tsv
5117 [('11'

**First host data tables**

In [None]:
###Write a .clean.tsv next to every raw host table
for host_tsv_file in glob.glob(host_genome_dir + '*.tsv'):
    ###Previously written clean tables match the same pattern, leave them alone
    if '.clean.' in host_tsv_file:
        continue
    print(host_tsv_file)
    clean_tsv_loc = host_tsv_file.replace('.tsv', '.clean.tsv')

    ###Clean the table and record its size before and after
    df = pd.read_csv(host_tsv_file, sep='\t', index_col=0)
    initial_shape = df.shape
    df = clean_host_tsv(df)
    final_shape = df.shape

    ###Fraction of the rows that survived the cleaning
    gene_ratio = final_shape[0]/initial_shape[0]
    print(gene_ratio)
    ###Losing 20% or more of the genes basically shouldn't happen, so stop the whole
    ###loop without writing this table
    if gene_ratio <= 0.8:
        break

    df.to_csv(clean_tsv_loc, sep='\t')

../Data/MVP_data/host_genomes/28450.tsv
0.9776497293521914
../Data/MVP_data/host_genomes/1590.tsv
0.9495519415864587
../Data/MVP_data/host_genomes/562.tsv
0.9271523178807947
../Data/MVP_data/host_genomes/357276.tsv
0.9689154962007829
../Data/MVP_data/host_genomes/657318.tsv
0.9875531268973892
../Data/MVP_data/host_genomes/573.tsv
0.9834462001504891
../Data/MVP_data/host_genomes/1280.tsv
0.9555475243946513
../Data/MVP_data/host_genomes/305.tsv
0.9676860934795153
../Data/MVP_data/host_genomes/435591.tsv
0.9856747926614727
../Data/MVP_data/host_genomes/470.tsv
0.9315923272475156
../Data/MVP_data/host_genomes/287.tsv
0.9971290148932352
../Data/MVP_data/host_genomes/1314.tsv
0.941523922031896
../Data/MVP_data/host_genomes/36809.tsv
0.9902439024390244
../Data/MVP_data/host_genomes/717959.tsv
0.9305466237942123
../Data/MVP_data/host_genomes/90371.tsv
0.947829997751293
../Data/MVP_data/host_genomes/1639.tsv
0.9895361004534357
../Data/MVP_data/host_genomes/1428.tsv
0.9976548759038499


**Next, phage**

In [None]:
###Write a .clean.tsv next to every raw viral table, one host folder at a time
for virus_folder in glob.glob(base_viral_genome_dir + '*_rep_viruses/'):
    print('###', virus_folder)
    for virus_tsv_file in glob.glob(virus_folder + '*.tsv'):
        ###Previously written clean tables match the same pattern, leave them alone
        if '.clean.' in virus_tsv_file:
            continue
        print(virus_tsv_file)
        clean_tsv_loc = virus_tsv_file.replace('.tsv', '.clean.tsv')

        ###Clean the table and record its size before and after
        df = pd.read_csv(virus_tsv_file, sep='\t', index_col=0)
        initial_shape = df.shape
        df = clean_virus_tsv(df)
        final_shape = df.shape

        ###Fraction of the rows that survived the cleaning
        gene_ratio = final_shape[0]/initial_shape[0]
        ###NOTE(review): this break only leaves the inner loop, so the remaining tables of
        ###this folder are skipped while the following folders are still processed -
        ###confirm that this is intended
        if gene_ratio <= 0.8:
            print('Strange case', gene_ratio)
            break

        df.to_csv(clean_tsv_loc, sep='\t')

### ../Data/MVP_data/host_linked_genomes/1314_rep_viruses/
### ../Data/MVP_data/host_linked_genomes/36809_rep_viruses/
### ../Data/MVP_data/host_linked_genomes/28450_rep_viruses/
### ../Data/MVP_data/host_linked_genomes/562_rep_viruses/
### ../Data/MVP_data/host_linked_genomes/470_rep_viruses/
### ../Data/MVP_data/host_linked_genomes/435591_rep_viruses/
### ../Data/MVP_data/host_linked_genomes/305_rep_viruses/
### ../Data/MVP_data/host_linked_genomes/717959_rep_viruses/
### ../Data/MVP_data/host_linked_genomes/1639_rep_viruses/
### ../Data/MVP_data/host_linked_genomes/657318_rep_viruses/
### ../Data/MVP_data/host_linked_genomes/1280_rep_viruses/
### ../Data/MVP_data/host_linked_genomes/1428_rep_viruses/
### ../Data/MVP_data/host_linked_genomes/573_rep_viruses/
### ../Data/MVP_data/host_linked_genomes/90371_rep_viruses/
### ../Data/MVP_data/host_linked_genomes/1590_rep_viruses/
### ../Data/MVP_data/host_linked_genomes/357276_rep_viruses/
### ../Data/MVP_data/host_linked_genomes/287_rep_