# Pseudomonas Aeruginosa Modulome

In [1]:
from pymodulon.core import IcaData
from pymodulon.io import *
from pymodulon.imodulondb import *
from pymodulon.util import *
import pandas as pd

## Load and Initial Check

In [2]:
ica_data = load_json_model('../paeru_116_iModulon.json')

In [3]:
pd.set_option('display.max_colwidth', None)

table_issues, tf_issues, missing_gene_links, missing_dois = \
    imodulondb_compatibility(ica_data)

print('--Table Issues--')
display(table_issues)
print('--TF Issues--')
display(tf_issues)
print('--Missing Gene Links--')
display(missing_gene_links.values)
print('--Missing DOIs--')
display(missing_dois.values)

iModulon Table is missing a name column
iModulon Table is missing a TF column
Note that TF is used for coding purposes, so it uses '+' and '/' and must match the trn.
iModulon Table is missing a Regulator column
Note that Regulator is displayed in the dataset page so it can have spelled out 'and' and 'or' operators.If you have a TF column, a Regulator column will begenerated for you (but not vice versa).
iModulon Table is missing a Function column
iModulon Table is missing a Category column
iModulon Table is missing a n_genes column

Gene Table is missing the optional length column
Gene Table is missing the optional operon column

Sample Table is missing the optional sample column
The 0th column will be used to define the names of samples in the activity bar graph unless you add this column.
Sample Table is missing a Biological Replicates column
Sample Table is missing the optional DOI column
If you would like to be able to access papers by clicking the activity bars, add this column a

TypeError: cannot unpack non-iterable NoneType object

## Add the iModulonDB Table

In [4]:
ica_data.imodulondb_table = {
     'organism': 'Pseudomonas aeruginosa',
     'dataset': 'aeruPRECISE411',
     'strain': 'PA01',
     'publication_name': 'Rajput, et al., Forthcoming',
     'publication_link': '',
     'gene_link_db': 'Pseudomonas Genome DB',
     'organism_folder': 'p_aeruginosa',
     'dataset_folder': 'modulome'
}

## Add the Explained Variance

In [5]:
for k in ica_data.imodulon_table.index:
    ica_data.imodulon_table.loc[k, 'exp_var'] = explained_variance(
        ica_data, imodulons=k)

## Sample Table

In [6]:
pd.set_option('display.max_rows', 370)
pd.set_option('display.max_columns', 60)
ica_data.sample_table

Unnamed: 0,ReleaseDate,TaxID,ScientificName,species_tax_id,class_tax_id,strain_tax_id,species,LibraryLayout,Platform,Model,Run,SRAStudy,BioProject,ProjectID,Sample,BioSample,Submission,LibraryStrategy,LibrarySelection,LibrarySource,CenterName,InsertSize,InsertDev,SRA ID,GEO Series,GEO Sample,PMID,Biosample ID,biosample_strain,strain_final,biosample_genotype,project,condition,refrence,condition_name_old,project_name_old,full_name,replicates,refrence_condition_old,strain_description,base_media,carbon_source,nitrogen_source,aerobicity,supplement,temperature,pH,growth_phase,OD,additional_notes,passed_fastqc,passed_reads_mapped_to_CDS,passed_global_correlation,passed_similar_replicates,passed_number_replicates
SRX7442752,##########,208964.0,Pseudomonas aeruginosa PAO1,287,1236.0,208964.0,Pseudomonas aeruginosa,SINGLE,ILLUMINA,Illumina HiSeq 2500,SRR10768834,SRP238832,PRJNA597754,597754.0,SRS5887073,SAMN13683003,SRA1016704,RNA-Seq,cDNA,TRANSCRIPTOMIC,University of British Columbia,0.0,9734641.0,GSE142611,GSM4232895,32071223,32071223.0,PA01,PAO1,wild type,NC_002516.2,AB569_EDTA_NaNO2,EDTA_NaNo2_wt,EDTA_NaNo2_wt,EDTA_NaNo2_wt,AB569_PAO1,AB569_EDTA_NaNO2:EDTA_NaNo2_wt,,EDTA_NaNo2_wt,LB,,,Aerobic,,37.0,,,,,,True,True,1.0,True,True
SRX7442753,##########,208964.0,Pseudomonas aeruginosa PAO1,287,1236.0,208964.0,Pseudomonas aeruginosa,SINGLE,ILLUMINA,Illumina HiSeq 2500,SRR10768835,SRP238832,PRJNA597754,597754.0,SRS5887074,SAMN13683002,SRA1016704,RNA-Seq,cDNA,TRANSCRIPTOMIC,University of British Columbia,0.0,9734642.0,GSE142611,GSM4232896,32071223,32071223.0,PA01,PAO1,wild type,NC_002516.2,AB569_EDTA_NaNO2,EDTA_NaNo2_wt,EDTA_NaNo2_wt,EDTA_NaNo2_wt,AB569_PAO1,AB569_EDTA_NaNO2:EDTA_NaNo2_wt,,EDTA_NaNo2_wt,LB,,,Aerobic,,37.0,,,,,,True,True,1.0,True,True
SRX7442754,##########,208964.0,Pseudomonas aeruginosa PAO1,287,1236.0,208964.0,Pseudomonas aeruginosa,SINGLE,ILLUMINA,Illumina HiSeq 2500,SRR10768836,SRP238832,PRJNA597754,597754.0,SRS5887075,SAMN13683001,SRA1016704,RNA-Seq,cDNA,TRANSCRIPTOMIC,University of British Columbia,0.0,9734643.0,GSE142611,GSM4232897,32071223,32071223.0,PA01,PAO1,wild type,NC_002516.2,AB569_EDTA_NaNO2,NaNO2,EDTA_NaNo2_wt,NaNO2_1,AB569_PAO1,AB569_EDTA_NaNO2:NaNO2,,EDTA_NaNo2_wt,LB,,,Aerobic,,37.0,,,,,,True,True,1.0,True,True
SRX7442755,##########,208964.0,Pseudomonas aeruginosa PAO1,287,1236.0,208964.0,Pseudomonas aeruginosa,SINGLE,ILLUMINA,Illumina HiSeq 2500,SRR10768837,SRP238832,PRJNA597754,597754.0,SRS5887076,SAMN13682999,SRA1016704,RNA-Seq,cDNA,TRANSCRIPTOMIC,University of British Columbia,0.0,9734644.0,GSE142611,GSM4232898,32071223,32071223.0,PA01,PAO1,wild type,NC_002516.2,AB569_EDTA_NaNO2,NaNO2,EDTA_NaNo2_wt,NaNO2_3,AB569_PAO1,AB569_EDTA_NaNO2:NaNO2,,EDTA_NaNo2_wt,LB,,,Aerobic,,37.0,,,,,,True,True,1.0,True,True
SRX7442756,##########,208964.0,Pseudomonas aeruginosa PAO1,287,1236.0,208964.0,Pseudomonas aeruginosa,SINGLE,ILLUMINA,Illumina HiSeq 2500,SRR10768838,SRP238832,PRJNA597754,597754.0,SRS5887077,SAMN13682998,SRA1016704,RNA-Seq,cDNA,TRANSCRIPTOMIC,University of British Columbia,0.0,9734645.0,GSE142611,GSM4232899,32071223,32071223.0,PA01,PAO1,wild type,NC_002516.2,AB569_EDTA_NaNO2,EDTA,EDTA_NaNo2_wt,EDTA_1,AB569_PAO1,AB569_EDTA_NaNO2:EDTA,,EDTA_NaNo2_wt,LB,,,Aerobic,,37.0,,,,,,True,True,1.0,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRX2711963,##########,287.0,Pseudomonas aeruginosa,287,1236.0,287.0,Pseudomonas aeruginosa,PAIRED,ILLUMINA,Illumina HiSeq 2000,SRR5420449,SRP103149,PRJNA381683,381683.0,SRS2103670,SAMN06687296,SRA551999,RNA-Seq,RANDOM,TRANSCRIPTOMIC,Singapore Centre for Environmental Life Sciences Engineering,0.0,3911992.0,,,29090193,29090193.0,PA01Pa01_wildtype_rep1,PAO1,wild type,NC_002516.2,yhjH_knockout,PA_delyhjH_wt,PA_delyhjH_wt,PA_delyhjH_wt,Knock_out_delyhjH,yhjH_knockout:PA_delyhjH_wt,,PA_delyhjH_wt,ABTGC,,,,,37.0,,late log,,,,True,True,1.0,True,True
SRX2711964,##########,287.0,Pseudomonas aeruginosa,287,1236.0,287.0,Pseudomonas aeruginosa,PAIRED,ILLUMINA,Illumina HiSeq 2000,SRR5420450,SRP103149,PRJNA381683,381683.0,SRS2103671,SAMN06687295,SRA551999,RNA-Seq,RANDOM,TRANSCRIPTOMIC,Singapore Centre for Environmental Life Sciences Engineering,0.0,3911993.0,,,29090193,29090193.0,PA01yhjH_rep2,PAO1,delyhjH,NC_002516.2,yhjH_knockout,PA_delyhjH,PA_delyhjH_wt,PA_delyhjH,Knock_out_delyhjH,yhjH_knockout:PA_delyhjH,,PA_delyhjH_wt,ABTGC,,,,,37.0,,late log,,,,True,True,1.0,True,True
SRX2711965,##########,287.0,Pseudomonas aeruginosa,287,1236.0,287.0,Pseudomonas aeruginosa,PAIRED,ILLUMINA,Illumina HiSeq 2000,SRR5420451,SRP103149,PRJNA381683,381683.0,SRS2103672,SAMN06687294,SRA551999,RNA-Seq,RANDOM,TRANSCRIPTOMIC,Singapore Centre for Environmental Life Sciences Engineering,0.0,3911994.0,,,29090193,29090193.0,PA01yhjH_rep1,PAO1,delyhjH,NC_002516.2,yhjH_knockout,PA_delyhjH,PA_delyhjH_wt,PA_delyhjH,Knock_out_delyhjH,yhjH_knockout:PA_delyhjH,,PA_delyhjH_wt,ABTGC,,,,,37.0,,late log,,,,True,True,1.0,True,True
SRX2711966,##########,287.0,Pseudomonas aeruginosa,287,1236.0,287.0,Pseudomonas aeruginosa,PAIRED,ILLUMINA,Illumina HiSeq 2000,SRR5420452,SRP103149,PRJNA381683,381683.0,SRS2103673,SAMN06687293,SRA551999,RNA-Seq,RANDOM,TRANSCRIPTOMIC,Singapore Centre for Environmental Life Sciences Engineering,0.0,3911995.0,,,29090193,29090193.0,PA01wspF_rep2,PAO1,delwspF,NC_002516.2,yhjH_knockout,PA_delwspF,PA_delyhjH_wt,PA_delwspF,Knock_out_delyhjH,yhjH_knockout:PA_delwspF,,PA_delyhjH_wt,ABTGC,,,,,37.0,,late log,,,,True,True,1.0,True,True


In [8]:
#ica_data.sample_table.to_csv('paeru_sample_table.csv')
ica_data.sample_table = pd.read_csv('paeru_sample_table.csv', index_col = 0)

In [9]:
# add replicate numbers for each sample
ica_data.sample_table = ica_data.sample_table.rename({'replicates':'replicate'}, axis = 1)
for name, group in ica_data.sample_table.groupby(['project', 'condition']):
    ica_data.sample_table.loc[group.index, 'replicate'] = range(1, group.shape[0]+1)

# make sample names that include replicate number
ica_data.sample_table = ica_data.sample_table.rename({'full_name':'full_condition_name'}, axis = 1)
ica_data.sample_table['name'] = \
    ica_data.sample_table.full_condition_name + ':' + \
    ica_data.sample_table.replicate.astype(str)

# important: "Sample" is already a column and we don't want it to be
ica_data.sample_table = ica_data.sample_table.rename({
    'Sample': 'SRS_Sample',
    'name': 'sample'
    }, axis = 1)

## iModulon Table

In [10]:
# deal with the tf_issues.has_gene column
tf_genes = {'PA4021':'eatR',
            'PA0367':'laoR',
            'PA3184':'hexR',
            'PA2511':'antR',
            'PA5499':'zur',
            'PA1663':'sfa2',
            'PA0071':'tagR2',
            'PA4769':'lldR',
            'PA1978':'erbR',
            'PA2020':'mexZ',
            'PA4596':'esrC',
            'PA0716':'xisF4',
            'PA1335':'aauR',
            'PA2523':'czcR',
            'PA0186':'atsR',
            'PA3204':'cpxR',
            'PA0279':'ydfF',
            'PA3381':'phnF',
            'PA5085':'dguR',
            'PA4184':'souR',
            'PA2491':'mexS',
            'PA2449':'gcsR',
            'PA2010':'hmgR',
            'PA0217':'mdcR'
           }

for locus, gene in tf_genes.items():
    if not(locus in ica_data.gene_table.index):
        print(locus)
    ica_data.gene_table.gene_name[locus] = gene

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ica_data.gene_table.gene_name[locus] = gene


In [11]:
# "Novel iModulon" is not a regulator
ica_data.imodulon_table.loc['RiPP', 'regulator'] = None

In [12]:
# capitalization error in the TRN
ica_data.trn.regulator = ica_data.trn.regulator.replace('Prrf', 'PrrF')
ica_data.tf_links['PrrF'] = 'https://www.pseudomonas.com/primarySequenceFeature/list?strain_ids=107&term=Pseudomonas+aeruginosa+PAO1+%28Reference%29&c1=name&v1=PrrF&e1=1&assembly=complete'

In [13]:
# category typo & capitalization
ica_data.imodulon_table.Category = ica_data.imodulon_table.Category.replace({
    'Quorum sening ': 'Quorum Sensing',
    'Quorum sensing': 'Quorum Sensing',
    'Defense mechanisms': 'Defense Mechanisms',
    'Secretion system': 'Secretion System',
    'Biosynthetic gene cluster': 'Biosynthetic Gene Cluster',
    'Stress response':'Stress Response'
    })

## Double-check

In [14]:
table_issues, tf_issues, missing_gene_links, missing_dois = \
    imodulondb_compatibility(ica_data)

print('--Table Issues--')
display(table_issues)
print('--TF Issues--')
display(tf_issues)
print('--Missing Gene Links--')
display(missing_gene_links.values)
print('--Missing DOIs--')
display(missing_dois.values)

--Table Issues--


Unnamed: 0,Table,Missing Column,Solution
0,iModulonDB,publication_link,The publication name will not be a hyperlink.
1,Gene,operon,Operon info will not display.
2,Sample,n_replicates,This column will be generated for you.
3,iModulon,name,imodulon_table.index will be used.
4,iModulon,n_genes,This column will be computed for you.


--TF Issues--


Unnamed: 0,in_trn,has_link,has_gene
Sigma factor X,True,True,False
EutR,True,True,False
RutR,True,True,False
AHL,True,True,False
TagR1,True,True,False
PrpR,True,True,False
NagQ,True,True,False
PrrF,True,True,False
k-opioid,True,True,False
Cobalamin,True,True,False


--Missing Gene Links--


array([], dtype=float64)

--Missing DOIs--


array(['Paeru_0017', 'Paeru_0018', 'Paeru_0019', 'Paeru_0020',
       'Paeru_0033', 'Paeru_0034', 'Paeru_0035', 'Paeru_0036',
       'Paeru_0037', 'Paeru_0038', 'Paeru_0039', 'Paeru_0040',
       'Paeru_0041', 'Paeru_0042', 'Paeru_0043', 'Paeru_0044',
       'Paeru_0045', 'Paeru_0046', 'Paeru_0047', 'Paeru_0048',
       'Paeru_0049', 'Paeru_0050', 'Paeru_0051', 'Paeru_0052',
       'Paeru_0053', 'Paeru_0056', 'Paeru_0057', 'Paeru_0058',
       'Paeru_0059', 'Paeru_0060', 'Paeru_0061', 'Paeru_0062',
       'Paeru_0063', 'Paeru_0064', 'Paeru_0065', 'Paeru_0066',
       'Paeru_0068', 'Paeru_0069', 'Paeru_0070', 'Paeru_0071',
       'Paeru_0072', 'Paeru_0073', 'Paeru_0074', 'Paeru_0075',
       'Paeru_0077', 'Paeru_0106', 'Paeru_0107', 'Paeru_0111',
       'Paeru_0112', 'Paeru_0131', 'Paeru_0132', 'Paeru_0133',
       'Paeru_0134', 'Paeru_0135', 'Paeru_0136', 'Paeru_0137',
       'Paeru_0138', 'Paeru_0161', 'Paeru_0162', 'Paeru_0166',
       'Paeru_0167', 'Paeru_0220', 'Paeru_0221', 'Paeru

## Save

In [13]:
save_to_json(ica_data, 'final_im_objects/p_aeruginosa_modulome_imdb.json')

## Export

In [22]:
categories = ['Carbon Source Utilization',
              'Amino Acid and Nucleotide Biosynthesis',
              'Energy Metabolism',
              'Metal Homeostasis',
              'Miscellaneous Metabolism',
              'Translational', 
              'Biosynthetic Gene Cluster',
              'Secretion System',
              'Structural Components',
              'Quorum Sensing',
              'Stress Response',
              'Defense Mechanisms',
              'Prophages',
              'Singlet', 
              'Uncharacterized'
             ]

In [None]:
imodulondb_export(ica_data, '../iModulonDB', cat_order = categories)