# Pseudomonas Aeruginosa Modulome

In [2]:
from pymodulon.core import IcaData
from pymodulon.io import *
from pymodulon.imodulondb import *
from pymodulon.util import *
import pandas as pd

## Load and Initial Check

In [3]:
ica_data = load_json_model('paeru_data_from_Akanksha.json')

In [4]:
pd.set_option('display.max_colwidth', None)

table_issues, tf_issues, missing_gene_links, missing_dois = \
    imodulondb_compatibility(ica_data)

print('--Table Issues--')
display(table_issues)
print('--TF Issues--')
display(tf_issues)
print('--Missing Gene Links--')
display(missing_gene_links.values)
print('--Missing DOIs--')
display(missing_dois.values)

--Table Issues--


Unnamed: 0,Table,Missing Column,Solution
0,iModulonDB,organism,"The default, ""New Organism"", will be used."
1,iModulonDB,dataset,"The default, ""New Dataset"", will be used."
2,iModulonDB,strain,"The default, ""Unspecified"", will be used."
3,iModulonDB,publication_name,"The default, ""Unpublished Study"", will be used."
4,iModulonDB,publication_link,The publication name will not be a hyperlink.
5,iModulonDB,gene_link_db,"The default, ""External Database"", will be used."
6,iModulonDB,organism_folder,"The default, ""new_organism"", will be used."
7,iModulonDB,dataset_folder,"The default, ""new_dataset"", will be used."
8,Gene,operon,Operon info will not display.
9,Sample,n_replicates,This column will be generated for you.


--TF Issues--


Unnamed: 0,in_trn,has_link,has_gene
EatR,True,True,False
LaoR,True,True,False
HexR,True,True,False
Sigma factor X,True,True,False
AntR,True,True,False
EutR,True,True,False
Zur,True,True,False
RutR,True,True,False
AHL,True,True,False
TagR1,True,True,False


--Missing Gene Links--


array([], dtype=float64)

--Missing DOIs--


array(['SRX7442752', 'SRX7442753', 'SRX7442754', 'SRX7442755',
       'SRX7442756', 'SRX7442757', 'SRX7442758', 'SRX7442759',
       'SRX7442761', 'SRX7874204', 'SRX7874205', 'SRX7874206',
       'SRX7874207', 'SRX8639787', 'SRX8639788', 'SRX8639789',
       'SRX8639790', 'SRX8639791', 'SRX8639792', 'SRX8639793',
       'SRX8639794', 'SRX8639795', 'SRX8639796', 'SRX8639797',
       'SRX8639798', 'SRX8639800', 'SRX1156127', 'SRX1156135',
       'SRX1156136', 'SRX1156137', 'SRX1156138', 'SRX1156140',
       'ERX2195929', 'ERX2195930', 'ERX2195931', 'ERX2195932',
       'ERX2195933', 'ERX2195934', 'SRX4579961', 'SRX4579962',
       'SRX4579963', 'SRX4579964', 'SRX4579965', 'SRX4579966',
       'SRX5630349', 'SRX5630350', 'SRX5630351', 'SRX5630352',
       'SRX1411278', 'SRX1411287', 'SRX1411289', 'SRX1411279',
       'SRX1411280', 'SRX1411281', 'SRX1411282', 'SRX1411283',
       'SRX1411284', 'SRX1411285', 'SRX1411286', 'SRX1411288',
       'SRX2711962', 'SRX2711963', 'SRX2711964', 'SRX27

## Add the iModulonDB Table

In [5]:
ica_data.imodulondb_table = {
     'organism': 'Pseudomonas aeruginosa',
     'dataset': 'Modulome',
     'strain': 'PA01',
     'publication_name': 'Rajput, et al., Forthcoming',
     'publication_link': '',
     'gene_link_db': 'Pseudomonas Genome DB',
     'organism_folder': 'p_aeruginosa',
     'dataset_folder': 'modulome'
}

## Add the Explained Variance

In [6]:
for k in ica_data.imodulon_table.index:
    ica_data.imodulon_table.loc[k, 'exp_var'] = explained_variance(
        ica_data, imodulons=k)

## Sample Table

In [7]:
pd.set_option('display.max_rows', 370)
pd.set_option('display.max_columns', 60)
ica_data.sample_table

Unnamed: 0,ReleaseDate,TaxID,ScientificName,species_tax_id,class_tax_id,strain_tax_id,species,LibraryLayout,Platform,Model,Run,SRAStudy,BioProject,ProjectID,Sample,BioSample,Submission,LibraryStrategy,LibrarySelection,LibrarySource,CenterName,InsertSize,InsertDev,SRA ID,GEO Series,GEO Sample,PMID,Biosample ID,biosample_strain,strain_final,biosample_genotype,project,condition,refrence,condition_name_old,project_name_old,full_name,replicates,refrence_condition_old,strain_description,base_media,carbon_source,nitrogen_source,aerobicity,supplement,temperature,pH,growth_phase,OD,additional_notes,passed_fastqc,passed_reads_mapped_to_CDS,passed_global_correlation,passed_similar_replicates,passed_number_replicates
SRX7442752,2/19/20 16:57,208964.0,Pseudomonas aeruginosa PAO1,287,1236.0,208964.0,Pseudomonas aeruginosa,SINGLE,ILLUMINA,Illumina HiSeq 2500,SRR10768834,SRP238832,PRJNA597754,597754.0,SRS5887073,SAMN13683003,SRA1016704,RNA-Seq,cDNA,TRANSCRIPTOMIC,University of British Columbia,0.0,9734641.0,GSE142611,GSM4232895,32071223,32071223.0,PA01,PAO1,wild type,NC_002516.2,AB569_EDTA_NaNO2,EDTA_NaNo2_wt,EDTA_NaNo2_wt,EDTA_NaNo2_wt,AB569_PAO1,AB569_EDTA_NaNO2:EDTA_NaNo2_wt,,EDTA_NaNo2_wt,LB,,,Aerobic,,37.0,,,,,,1.0,1.0,1.0,1.0,1.0
SRX7442753,2/19/20 16:57,208964.0,Pseudomonas aeruginosa PAO1,287,1236.0,208964.0,Pseudomonas aeruginosa,SINGLE,ILLUMINA,Illumina HiSeq 2500,SRR10768835,SRP238832,PRJNA597754,597754.0,SRS5887074,SAMN13683002,SRA1016704,RNA-Seq,cDNA,TRANSCRIPTOMIC,University of British Columbia,0.0,9734642.0,GSE142611,GSM4232896,32071223,32071223.0,PA01,PAO1,wild type,NC_002516.2,AB569_EDTA_NaNO2,EDTA_NaNo2_wt,EDTA_NaNo2_wt,EDTA_NaNo2_wt,AB569_PAO1,AB569_EDTA_NaNO2:EDTA_NaNo2_wt,,EDTA_NaNo2_wt,LB,,,Aerobic,,37.0,,,,,,1.0,1.0,1.0,1.0,1.0
SRX7442754,2/19/20 16:57,208964.0,Pseudomonas aeruginosa PAO1,287,1236.0,208964.0,Pseudomonas aeruginosa,SINGLE,ILLUMINA,Illumina HiSeq 2500,SRR10768836,SRP238832,PRJNA597754,597754.0,SRS5887075,SAMN13683001,SRA1016704,RNA-Seq,cDNA,TRANSCRIPTOMIC,University of British Columbia,0.0,9734643.0,GSE142611,GSM4232897,32071223,32071223.0,PA01,PAO1,wild type,NC_002516.2,AB569_EDTA_NaNO2,NaNO2,EDTA_NaNo2_wt,NaNO2_1,AB569_PAO1,AB569_EDTA_NaNO2:NaNO2,,EDTA_NaNo2_wt,LB,,,Aerobic,,37.0,,,,,,1.0,1.0,1.0,1.0,1.0
SRX7442755,2/19/20 16:57,208964.0,Pseudomonas aeruginosa PAO1,287,1236.0,208964.0,Pseudomonas aeruginosa,SINGLE,ILLUMINA,Illumina HiSeq 2500,SRR10768837,SRP238832,PRJNA597754,597754.0,SRS5887076,SAMN13682999,SRA1016704,RNA-Seq,cDNA,TRANSCRIPTOMIC,University of British Columbia,0.0,9734644.0,GSE142611,GSM4232898,32071223,32071223.0,PA01,PAO1,wild type,NC_002516.2,AB569_EDTA_NaNO2,NaNO2,EDTA_NaNo2_wt,NaNO2_3,AB569_PAO1,AB569_EDTA_NaNO2:NaNO2,,EDTA_NaNo2_wt,LB,,,Aerobic,,37.0,,,,,,1.0,1.0,1.0,1.0,1.0
SRX7442756,2/19/20 16:57,208964.0,Pseudomonas aeruginosa PAO1,287,1236.0,208964.0,Pseudomonas aeruginosa,SINGLE,ILLUMINA,Illumina HiSeq 2500,SRR10768838,SRP238832,PRJNA597754,597754.0,SRS5887077,SAMN13682998,SRA1016704,RNA-Seq,cDNA,TRANSCRIPTOMIC,University of British Columbia,0.0,9734645.0,GSE142611,GSM4232899,32071223,32071223.0,PA01,PAO1,wild type,NC_002516.2,AB569_EDTA_NaNO2,EDTA,EDTA_NaNo2_wt,EDTA_1,AB569_PAO1,AB569_EDTA_NaNO2:EDTA,,EDTA_NaNo2_wt,LB,,,Aerobic,,37.0,,,,,,1.0,1.0,1.0,1.0,1.0
SRX7442757,2/19/20 16:57,208964.0,Pseudomonas aeruginosa PAO1,287,1236.0,208964.0,Pseudomonas aeruginosa,SINGLE,ILLUMINA,Illumina HiSeq 2500,SRR10768839,SRP238832,PRJNA597754,597754.0,SRS5887078,SAMN13682997,SRA1016704,RNA-Seq,cDNA,TRANSCRIPTOMIC,University of British Columbia,0.0,9734646.0,GSE142611,GSM4232900,32071223,32071223.0,PA01,PAO1,wild type,NC_002516.2,AB569_EDTA_NaNO2,EDTA,EDTA_NaNo2_wt,EDTA_2,AB569_PAO1,AB569_EDTA_NaNO2:EDTA,,EDTA_NaNo2_wt,LB,,,Aerobic,,37.0,,,,,,1.0,1.0,1.0,1.0,1.0
SRX7442758,2/19/20 16:57,208964.0,Pseudomonas aeruginosa PAO1,287,1236.0,208964.0,Pseudomonas aeruginosa,SINGLE,ILLUMINA,Illumina HiSeq 2500,SRR10768840,SRP238832,PRJNA597754,597754.0,SRS5887079,SAMN13682996,SRA1016704,RNA-Seq,cDNA,TRANSCRIPTOMIC,University of British Columbia,0.0,9734647.0,GSE142611,GSM4232901,32071223,32071223.0,PA01,PAO1,wild type,NC_002516.2,AB569_EDTA_NaNO2,EDTA,EDTA_NaNo2_wt,EDTA_3,AB569_PAO1,AB569_EDTA_NaNO2:EDTA,,EDTA_NaNo2_wt,LB,,,Aerobic,,37.0,,,,,,1.0,1.0,1.0,1.0,1.0
SRX7442759,2/19/20 16:57,208964.0,Pseudomonas aeruginosa PAO1,287,1236.0,208964.0,Pseudomonas aeruginosa,SINGLE,ILLUMINA,Illumina HiSeq 2500,SRR10768841,SRP238832,PRJNA597754,597754.0,SRS5887080,SAMN13682994,SRA1016704,RNA-Seq,cDNA,TRANSCRIPTOMIC,University of British Columbia,0.0,9734648.0,GSE142611,GSM4232902,32071223,32071223.0,PA01,PAO1,wild type,NC_002516.2,AB569_EDTA_NaNO2,NaNO2_EDTA,EDTA_NaNo2_wt,NaNO2_EDTA_1,AB569_PAO1,AB569_EDTA_NaNO2:NaNO2_EDTA,,EDTA_NaNo2_wt,LB,,,Aerobic,,37.0,,,,,,1.0,1.0,1.0,1.0,1.0
SRX7442761,2/19/20 16:57,208964.0,Pseudomonas aeruginosa PAO1,287,1236.0,208964.0,Pseudomonas aeruginosa,SINGLE,ILLUMINA,Illumina HiSeq 2500,SRR10768843,SRP238832,PRJNA597754,597754.0,SRS5887082,SAMN13682995,SRA1016704,RNA-Seq,cDNA,TRANSCRIPTOMIC,University of British Columbia,0.0,9734650.0,GSE142611,GSM4232904,32071223,32071223.0,PA01,PAO1,wild type,NC_002516.2,AB569_EDTA_NaNO2,NaNO2_EDTA,EDTA_NaNo2_wt,NaNO2_EDTA_3,AB569_PAO1,AB569_EDTA_NaNO2:NaNO2_EDTA,,EDTA_NaNo2_wt,LB,,,Aerobic,,37.0,,,,,,1.0,1.0,1.0,1.0,1.0
SRX7874204,3/9/20 5:15,208964.0,Pseudomonas aeruginosa PAO1,287,1236.0,208964.0,Pseudomonas aeruginosa,PAIRED,ILLUMINA,Illumina HiSeq 2500,SRR11267574,SRP252017,PRJNA611483,611483.0,SRS6282508,SAMN14332581,SRA1052330,RNA-Seq,PCR,TRANSCRIPTOMIC,Southern University of Science and Technology,0.0,10303907.0,,,,,PAO1,PAO1,wild type,NC_002516.2,Anthranilate_metabolism_GntR_AntP,GntR_AntP_wt,GntR_AntP_wt,GntR_AntP_wt,Anthranilate_GntR_AntP,Anthranilate_metabolism_GntR_AntP:GntR_AntP_wt,,GntR_AntP_wt,,,,,,,,,,,,1.0,1.0,1.0,1.0,1.0


In [8]:
#ica_data.sample_table.to_csv('paeru_sample_table.csv')
ica_data.sample_table = pd.read_csv('paeru_sample_table.csv', index_col = 0)

In [9]:
# add replicate numbers for each sample
ica_data.sample_table = ica_data.sample_table.rename({'replicates':'replicate'}, axis = 1)
for name, group in ica_data.sample_table.groupby(['project', 'condition']):
    ica_data.sample_table.loc[group.index, 'replicate'] = range(1, group.shape[0]+1)

# make sample names that include replicate number
ica_data.sample_table = ica_data.sample_table.rename({'full_name':'full_condition_name'}, axis = 1)
ica_data.sample_table['name'] = \
    ica_data.sample_table.full_condition_name + ':' + \
    ica_data.sample_table.replicate.astype(str)

# important: "Sample" is already a column and we don't want it to be
ica_data.sample_table = ica_data.sample_table.rename({
    'Sample': 'SRS_Sample',
    'name': 'sample'
    }, axis = 1)

## iModulon Table

In [10]:
# deal with the tf_issues.has_gene column
tf_genes = {'PA4021':'eatR',
            'PA0367':'laoR',
            'PA3184':'hexR',
            'PA2511':'antR',
            'PA5499':'zur',
            'PA1663':'sfa2',
            'PA0071':'tagR2',
            'PA4769':'lldR',
            'PA1978':'erbR',
            'PA2020':'mexZ',
            'PA4596':'esrC',
            'PA0716':'xisF4',
            'PA1335':'aauR',
            'PA2523':'czcR',
            'PA0186':'atsR',
            'PA3204':'cpxR',
            'PA0279':'ydfF',
            'PA3381':'phnF',
            'PA5085':'dguR',
            'PA4184':'souR',
            'PA2491':'mexS',
            'PA2449':'gcsR',
            'PA2010':'hmgR',
            'PA0217':'mdcR'
           }

for locus, gene in tf_genes.items():
    if not(locus in ica_data.gene_table.index):
        print(locus)
    ica_data.gene_table.gene_name[locus] = gene

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ica_data.gene_table.gene_name[locus] = gene


In [11]:
# "Novel iModulon" is not a regulator
ica_data.imodulon_table.loc['RiPP', 'regulator'] = None

In [12]:
# capitalization error in the TRN
ica_data.trn.regulator = ica_data.trn.regulator.replace('Prrf', 'PrrF')
ica_data.tf_links['PrrF'] = 'https://www.pseudomonas.com/primarySequenceFeature/list?strain_ids=107&term=Pseudomonas+aeruginosa+PAO1+%28Reference%29&c1=name&v1=PrrF&e1=1&assembly=complete'

In [13]:
# category typo & capitalization
ica_data.imodulon_table.Category = ica_data.imodulon_table.Category.replace({
    'Quorum sening ': 'Quorum Sensing',
    'Quorum sensing': 'Quorum Sensing',
    'Defense mechanisms': 'Defense Mechanisms',
    'Secretion system': 'Secretion System',
    'Biosynthetic gene cluster': 'Biosynthetic Gene Cluster',
    'Stress response':'Stress Response'
    })

## Double-check

In [14]:
table_issues, tf_issues, missing_gene_links, missing_dois = \
    imodulondb_compatibility(ica_data)

print('--Table Issues--')
display(table_issues)
print('--TF Issues--')
display(tf_issues)
print('--Missing Gene Links--')
display(missing_gene_links.values)
print('--Missing DOIs--')
display(missing_dois.values)

--Table Issues--


Unnamed: 0,Table,Missing Column,Solution
0,iModulonDB,publication_link,The publication name will not be a hyperlink.
1,Gene,operon,Operon info will not display.
2,Sample,n_replicates,This column will be generated for you.
3,iModulon,name,imodulon_table.index will be used.
4,iModulon,n_genes,This column will be computed for you.


--TF Issues--


Unnamed: 0,in_trn,has_link,has_gene
Sigma factor X,True,True,False
EutR,True,True,False
RutR,True,True,False
AHL,True,True,False
TagR1,True,True,False
PrpR,True,True,False
NagQ,True,True,False
PrrF,True,True,False
k-opioid,True,True,False
Cobalamin,True,True,False


--Missing Gene Links--


array([], dtype=float64)

--Missing DOIs--


array(['Paeru_0017', 'Paeru_0018', 'Paeru_0019', 'Paeru_0020',
       'Paeru_0033', 'Paeru_0034', 'Paeru_0035', 'Paeru_0036',
       'Paeru_0037', 'Paeru_0038', 'Paeru_0039', 'Paeru_0040',
       'Paeru_0041', 'Paeru_0042', 'Paeru_0043', 'Paeru_0044',
       'Paeru_0045', 'Paeru_0046', 'Paeru_0047', 'Paeru_0048',
       'Paeru_0049', 'Paeru_0050', 'Paeru_0051', 'Paeru_0052',
       'Paeru_0053', 'Paeru_0056', 'Paeru_0057', 'Paeru_0058',
       'Paeru_0059', 'Paeru_0060', 'Paeru_0061', 'Paeru_0062',
       'Paeru_0063', 'Paeru_0064', 'Paeru_0065', 'Paeru_0066',
       'Paeru_0068', 'Paeru_0069', 'Paeru_0070', 'Paeru_0071',
       'Paeru_0072', 'Paeru_0073', 'Paeru_0074', 'Paeru_0075',
       'Paeru_0077', 'Paeru_0106', 'Paeru_0107', 'Paeru_0111',
       'Paeru_0112', 'Paeru_0131', 'Paeru_0132', 'Paeru_0133',
       'Paeru_0134', 'Paeru_0135', 'Paeru_0136', 'Paeru_0137',
       'Paeru_0138', 'Paeru_0161', 'Paeru_0162', 'Paeru_0166',
       'Paeru_0167', 'Paeru_0220', 'Paeru_0221', 'Paeru

## Save

In [13]:
save_to_json(ica_data, 'final_im_objects/p_aeruginosa_modulome_imdb.json')

## Export

In [22]:
categories = ['Carbon Source Utilization',
              'Amino Acid and Nucleotide Biosynthesis',
              'Energy Metabolism',
              'Metal Homeostasis',
              'Miscellaneous Metabolism',
              'Translational', 
              'Biosynthetic Gene Cluster',
              'Secretion System',
              'Structural Components',
              'Quorum Sensing',
              'Stress Response',
              'Defense Mechanisms',
              'Prophages',
              'Singlet', 
              'Uncharacterized'
             ]

In [None]:
imodulondb_export(ica_data, '../iModulonDB', cat_order = categories)