# Bacillus subtilis Modulome

In [3]:
from pymodulon.core import IcaData
from pymodulon.io import *
from pymodulon.imodulondb import *
from pymodulon.util import *
import pandas as pd
from os import path

In [4]:
# Read the saved model from the processed-data folder into `ica_data`.
model_file = path.join('..', 'data', 'processed_data', 'bsu.json.gz')
ica_data = load_json_model(model_file)

In [5]:
# Do not truncate long cell text when the issue tables are displayed.
pd.set_option('display.max_colwidth', None)

# Run the iModulonDB compatibility check; it returns four reports.
table_issues, tf_issues, missing_gene_links, missing_dois = \
    imodulondb_compatibility(ica_data)

# Show each report under its heading; the last two are shown as raw arrays.
for heading, report in (
    ('--Table Issues--', table_issues),
    ('--TF Issues--', tf_issues),
    ('--Missing Gene Links--', missing_gene_links.values),
    ('--Missing DOIs--', missing_dois.values),
):
    print(heading)
    display(report)

--Table Issues--


Unnamed: 0,Table,Missing Column,Solution
0,iModulonDB,organism,"The default, ""New Organism"", will be used."
1,iModulonDB,dataset,"The default, ""New Dataset"", will be used."
2,iModulonDB,strain,"The default, ""Unspecified"", will be used."
3,iModulonDB,publication_name,"The default, ""Unpublished Study"", will be used."
4,iModulonDB,publication_link,The publication name will not be a hyperlink.
5,iModulonDB,gene_link_db,"The default, ""External Database"", will be used."
6,iModulonDB,organism_folder,"The default, ""new_organism"", will be used."
7,iModulonDB,dataset_folder,"The default, ""new_dataset"", will be used."
8,Sample,n_replicates,This column will be generated for you.
9,Sample,doi,Clicking on activity plot bars will not link to relevant papers for the samples.


--TF Issues--


Unnamed: 0,in_trn,has_link,has_gene
S-box,True,False,False
WalR,True,False,True
Rex,True,False,True
Fnr,True,False,True
SigM,True,False,True
EAR riboswitch,True,False,False
SigB,True,False,True
PyrR,True,False,True
SigV,True,False,True
CodY,True,False,True


--Missing Gene Links--


array(['BSU_00010', 'BSU_00020', 'BSU_00030', ..., 'BSU_41040',
       'BSU_41050', 'BSU_41060'], dtype=object)

--Missing DOIs--


array(['SRX8138083', 'SRX8138084', 'SRX8138085', 'SRX8138086',
       'SRX8138087', 'SRX8138088', 'ERX4111143', 'ERX4111144',
       'ERX4111145', 'ERX4111146', 'ERX4111148', 'ERX4111149',
       'ERX4111150', 'ERX4111151', 'ERX4111152', 'ERX4111153',
       'ERX4111154', 'ERX4111156', 'ERX4111157', 'ERX4111158',
       'SRX5577041', 'SRX5577042', 'SRX5577043', 'SRX5577044',
       'SRX5577045', 'SRX5577046', 'SRX5577047', 'SRX5577048',
       'SRX5577049', 'SRX5577050', 'SRX5577051', 'SRX5577052',
       'SRX5577053', 'SRX5577054', 'SRX5577055', 'SRX5577056',
       'SRX5577057', 'SRX5577058', 'SRX5577059', 'SRX5577060',
       'SRX5577061', 'SRX5577062', 'SRX5577063', 'SRX5577064',
       'SRX5577065', 'SRX5577066', 'SRX5577067', 'SRX5577068',
       'SRX533166', 'SRX533171', 'SRX533786', 'SRX533787', 'SRX533789',
       'SRX533791', 'SRX533792', 'SRX533793', 'SRX533794', 'SRX533795',
       'SRX4703435', 'SRX4703436', 'SRX4703437', 'SRX4703438',
       'SRX4703439', 'SRX4703440', 'S

## Add the iModulonDB Table

In [6]:
# Dataset-level entries for the iModulonDB table (organism, dataset, strain,
# publication, gene-link database and output folder names).
ica_data.imodulondb_table = dict(
    organism='Bacillus subtilis',
    dataset='Modulome',
    strain='168',
    publication_name='Sastry, et al., Forthcoming',
    publication_link='',
    gene_link_db='<i>Subti</i>Wiki',
    organism_folder='b_subtilis',
    dataset_folder='modulome',
)

## Add the Explained Variance

In [7]:
# Compute the explained variance of every iModulon, in table order, and store
# the results in the 'exp_var' column of the iModulon table.
imodulon_ids = ica_data.imodulon_table.index
ica_data.imodulon_table['exp_var'] = [
    explained_variance(ica_data, imodulons=imodulon_id)
    for imodulon_id in imodulon_ids
]

## Rename some iModulon columns

In [8]:
# Relabel two iModulon-table columns: the old 'category' becomes
# 'broad_category', and the old 'function' takes over the 'category' name.
column_renames = {'category': 'broad_category', 'function': 'category'}
ica_data.imodulon_table = ica_data.imodulon_table.rename(columns=column_renames)

## Links

In [9]:
# Base URL of the SubtiWiki exact-match gene search; an identifier is appended.
prefix = 'http://subtiwiki.uni-goettingen.de/v3/gene/search/exact/'

# Gene links: one search URL per entry of the gene table index.
for gene_id in ica_data.gene_table.index:
    ica_data.gene_links[gene_id] = prefix + gene_id

# Regulator links: only regulators flagged as having a gene in `tf_issues`;
# the regulator name is converted with name2num before being appended.
has_gene_mask = tf_issues.has_gene.astype(bool)
for regulator in tf_issues.index[has_gene_mask]:
    ica_data.tf_links[regulator] = prefix + ica_data.name2num(regulator)

# Some regulators don't have genes but do have pages.
# They are linked to the page directly, since each page's existence had to be
# checked by hand anyway.
other_regs = {
    'SwrA':'http://subtiwiki.uni-goettingen.de/v3/gene/view/5D479874B43F521DB52EDC2C27CDE4967F22DE47',
    'SigK':'http://subtiwiki.uni-goettingen.de/v3/gene/view/24F7FD5C7C3A68BB2760ABB8CBD8FBD65E5FF7D4',
    'YlxR':'http://subtiwiki.uni-goettingen.de/v3/gene/view/F4097349A563503468A2A14F062AEAC532C7917A',
    'LnrK':'http://subtiwiki.uni-goettingen.de/v3/gene/view/387EF370CE24F7A3C20789A57329A02EBED46F53',
    'KipR':'http://subtiwiki.uni-goettingen.de/v3/gene/view/7DA9A79876C546B78B716A64706A3A3716018C2E'
}
ica_data.tf_links.update(other_regs)

## Sample Table

In [10]:
# Number the samples 1..n within each (project, condition) group and store
# the number in the 'replicate' column.
for _group_key, members in ica_data.sample_table.groupby(['project', 'condition']):
    replicate_numbers = range(1, len(members) + 1)
    ica_data.sample_table.loc[members.index, 'replicate'] = replicate_numbers

# Free up the naming: the existing 'full_name' column becomes
# 'full_condition_name'.
ica_data.sample_table = ica_data.sample_table.rename(
    columns={'full_name': 'full_condition_name'})

# Build per-sample names of the form "<full condition name>:<replicate>".
condition_labels = ica_data.sample_table.full_condition_name
replicate_labels = ica_data.sample_table.replicate.astype(int).astype(str)
ica_data.sample_table['name'] = condition_labels + ':' + replicate_labels

# Important: switch the naming over to the new column. The previous 'Sample'
# column is kept as 'SRS_Sample', and the new names become 'sample'.
ica_data.sample_table = ica_data.sample_table.rename(
    columns={'Sample': 'SRS_Sample', 'name': 'sample'})

In [11]:
#ica_data.sample_table.to_csv('bsub_sample_table.csv')

# TODO: add a DOI column and then replace the existing sample_table

## Double-check Compatibility

In [12]:
# Re-run the iModulonDB compatibility check on the updated object.
table_issues, tf_issues, missing_gene_links, missing_dois = \
    imodulondb_compatibility(ica_data)

# Show each report under its heading; the last two are shown as raw arrays.
for heading, report in (
    ('--Table Issues--', table_issues),
    ('--TF Issues--', tf_issues),
    ('--Missing Gene Links--', missing_gene_links.values),
    ('--Missing DOIs--', missing_dois.values),
):
    print(heading)
    display(report)

--Table Issues--


Unnamed: 0,Table,Missing Column,Solution
0,iModulonDB,publication_link,The publication name will not be a hyperlink.
1,Sample,n_replicates,This column will be generated for you.
2,Sample,doi,Clicking on activity plot bars will not link to relevant papers for the samples.
3,iModulon,name,imodulon_table.index will be used.
4,iModulon,function,"The function will be blank in the dataset table and ""Uncharacterized"" in the iModulon dashboard"
5,iModulon,n_genes,This column will be computed for you.


--TF Issues--


Unnamed: 0,in_trn,has_link,has_gene
S-box,True,False,False
EAR riboswitch,True,False,False
stringent response,True,False,False
YlxR,True,True,False
SwrA,True,True,False
SigK,True,True,False
Thi-box,True,False,False
LnrK,True,True,False
G-box,True,False,False
KipR,True,True,False


--Missing Gene Links--


array([], dtype=float64)

--Missing DOIs--


array(['SRX8138083', 'SRX8138084', 'SRX8138085', 'SRX8138086',
       'SRX8138087', 'SRX8138088', 'ERX4111143', 'ERX4111144',
       'ERX4111145', 'ERX4111146', 'ERX4111148', 'ERX4111149',
       'ERX4111150', 'ERX4111151', 'ERX4111152', 'ERX4111153',
       'ERX4111154', 'ERX4111156', 'ERX4111157', 'ERX4111158',
       'SRX5577041', 'SRX5577042', 'SRX5577043', 'SRX5577044',
       'SRX5577045', 'SRX5577046', 'SRX5577047', 'SRX5577048',
       'SRX5577049', 'SRX5577050', 'SRX5577051', 'SRX5577052',
       'SRX5577053', 'SRX5577054', 'SRX5577055', 'SRX5577056',
       'SRX5577057', 'SRX5577058', 'SRX5577059', 'SRX5577060',
       'SRX5577061', 'SRX5577062', 'SRX5577063', 'SRX5577064',
       'SRX5577065', 'SRX5577066', 'SRX5577067', 'SRX5577068',
       'SRX533166', 'SRX533171', 'SRX533786', 'SRX533787', 'SRX533789',
       'SRX533791', 'SRX533792', 'SRX533793', 'SRX533794', 'SRX533795',
       'SRX4703435', 'SRX4703436', 'SRX4703437', 'SRX4703438',
       'SRX4703439', 'SRX4703440', 'S

## Save & Export

In [14]:
# Write the updated object next to the input file, under a new file name.
output_file = path.join('..', 'data', 'processed_data', 'bsu_imdb.json.gz')
save_to_json(ica_data, output_file)

In [15]:
# Category labels, in the order passed to the export as `cat_order`.
categories = [
    'Carbon Metabolism', 'AA/Nucleotide Metabolism', 'Misc. Metabolism',
    'Homeostasis', 'Lifestyles', 'Cellular Processes', 'Stress Response',
    'Prophages', 'Other', 'Single Gene', 'Uncharacterized',
]

In [17]:
# Write the iModulonDB site files to '../iModulonDB', ordering the categories
# as listed in `categories`.
#
# pandas' chained-assignment check is switched off only for the duration of
# the export (presumably the export triggers those warnings). option_context
# restores the previous setting afterwards, so the warning stays active for
# any later cells instead of being disabled for the whole session.
with pd.option_context('mode.chained_assignment', None):
    imodulondb_export(ica_data, '../iModulonDB', cat_order=categories)

Writing main site files...
Done writing main site files. Writing plot files...
Two progress bars will appear below. The second will take significantly longer than the first.
Writing iModulon page files (1/2)


  0%|          | 0/72 [00:00<?, ?it/s]

Writing Gene page files (2/2)


  0%|          | 0/4325 [00:00<?, ?it/s]

Complete! (Organism = b_subtilis; Dataset = modulome)
