In [1]:
from os import path
import seaborn as sns
import matplotlib.pyplot as plt

from pymodulon.compare import *
from pymodulon.io import load_json_model
from pymodulon.plotting import *
from pymodulon.example_data import load_bsub_data, load_ecoli_data
from sklearn.metrics.pairwise import cosine_similarity

# Comparison of iModulons across species
Try to make some sort of network analysis comparing iMs across multiple species.

In [5]:
from Bio import SeqIO

def parse_gbff(file_path):
    protein_to_locus = {}
    for record in SeqIO.parse(file_path, 'genbank'):
        for feature in record.features:
            if feature.type == 'CDS':
                protein_id = feature.qualifiers.get('protein_id', [''])[0]
                locus_tag = feature.qualifiers.get('locus_tag', [''])[0]
                protein_to_locus[protein_id] = locus_tag
    return protein_to_locus

# Parse the gbff files and extract protein IDs and locus tags for each species
salb_protein_to_locus = parse_gbff('../../data/external/modulome/genomes/Sal.gbff')
ecol_protein_to_locus = parse_gbff('../../data/external/modulome/genomes/Eco.gbff')
mtub_protein_to_locus = parse_gbff('../../data/external/modulome/genomes/Mtu.gbff')
pae_protein_to_locus = parse_gbff('../../data/external/modulome/genomes/Pae.gbff')
sen_protein_to_locus = parse_gbff('../../data/external/modulome/genomes/Sen.gbff')
sac_protein_to_locus = parse_gbff('../../data/external/modulome/genomes/Sac.gbff')
bsu_protein_to_locus = parse_gbff('../../data/external/modulome/genomes/Bsu.gbff')

# Before running orthofinder replace the protein_ids with locus_tags
the orthogroups files is going to get too large to replace the protein id names, so we need to do it before running orthofinder.

In [6]:
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

def create_fasta_with_locus_tags(protein_to_locus, fasta_path, output_path):
    # Parse the FASTA file and create new SeqRecord objects with locus tags as IDs
    new_records = []
    for record in SeqIO.parse(fasta_path, 'fasta'):
        protein_id = record.id
        locus_tag = protein_to_locus.get(protein_id, protein_id)  # Use the protein_id as default if no locus_tag is found
        new_record = SeqRecord(record.seq, id=locus_tag, description="")
        new_records.append(new_record)

    # Write the new SeqRecord objects to a new FASTA file
    with open(output_path, 'w') as output_handle:
        SeqIO.write(new_records, output_handle, 'fasta')

# Call the function for each species
create_fasta_with_locus_tags(bsu_protein_to_locus, '../../data/external/modulome/genomes/bsu.faa', '../../data/external/modulome/genomes/bsu2.faa')
create_fasta_with_locus_tags(sac_protein_to_locus, '../../data/external/modulome/genomes/sac.faa', '../../data/external/modulome/genomes/sac2.faa')
create_fasta_with_locus_tags(sen_protein_to_locus, '../../data/external/modulome/genomes/Sen.faa', '../../data/external/modulome/genomes/sen2.faa')
create_fasta_with_locus_tags(pae_protein_to_locus, '../../data/external/modulome/genomes/Pae.faa', '../../data/external/modulome/genomes/pae2.faa')
create_fasta_with_locus_tags(mtub_protein_to_locus, '../../data/external/modulome/genomes/Mtu.faa', '../../data/external/modulome/genomes/mtub2.faa')
create_fasta_with_locus_tags(ecol_protein_to_locus, '../../data/external/modulome/genomes/Eco.faa', '../../data/external/modulome/genomes/ecol2.faa')
create_fasta_with_locus_tags(salb_protein_to_locus, '../../data/external/modulome/genomes/Sal.faa', '../../data/external/modulome/genomes/salb2.faa')

In [2]:
# Read the orthofinder results
orthogroups = pd.read_csv('../../data/external/modulome/orthofinder/Orthogroups/Orthogroups.tsv', sep='\t')

orthogroups

Unnamed: 0,Orthogroup,Bsu,Eco,Mtu,Pae,Sac,Sal,Sen
0,OG0000000,"BSU_02830, BSU_03930, BSU_08650, BSU_12350, BS...","b0596, b1093, b1619, b2137, b2426, b2774, b284...","Rv0687, Rv0769, Rv0927c, Rv1050, Rv1350, Rv148...","PA1470, PA1649, PA1827, PA1828, PA2003, PA2515...","SACI_RS00960, SACI_RS01155, SACI_RS05250, SACI...","XNR_RS00260, XNR_RS02895, XNR_RS02935, XNR_RS0...","STM0598, STM1195, STM2171, STM2445, STM3017, S..."
1,OG0000001,,,"Rv0109, Rv0124, Rv0150c, Rv0151c, Rv0152c, Rv0...",,,,
2,OG0000002,"BSU_04700, BSU_34110",,"Rv0941c, Rv1364c",PA2786,,"XNR_RS00335, XNR_RS01380, XNR_RS01600, XNR_RS0...",
3,OG0000003,,,"Rv0096, Rv0256c, Rv0280, Rv0286, Rv0305c, Rv03...",,,,
4,OG0000004,"BSU_01450, BSU_01950, BSU_02570, BSU_02750, BS...","b0127, b3201","Rv1218c, Rv1458c, Rv1687c, Rv2936","PA2812, PA3394, PA3672, PA4037, PA4461","SACI_RS01905, SACI_RS03505, SACI_RS03525, SACI...","XNR_RS06865, XNR_RS07525, XNR_RS09885, XNR_RS1...","STM0172, STM2020, STM3319"
...,...,...,...,...,...,...,...,...
5257,OG0005257,,,,,,,"PSLT107, STM3034"
5258,OG0005258,,,,,,,"STM3170, STM4053"
5259,OG0005259,,,,,,,"STM3767, STM4447"
5260,OG0005260,,,,,,,"STM3768, STM4446"


In [8]:
# Split the genes in each cell
orthogroups['Eco'] = orthogroups['Eco'].str.split(', ')
orthogroups['Mtu'] = orthogroups['Mtu'].str.split(', ')
orthogroups['Pae'] = orthogroups['Pae'].str.split(', ')
orthogroups['Sal'] = orthogroups['Sal'].str.split(', ')
orthogroups['Sen'] = orthogroups['Sen'].str.split(', ')
orthogroups['Sac'] = orthogroups['Sac'].str.split(', ')
orthogroups['Bsu'] = orthogroups['Bsu'].str.split(', ')

# Put each gene on a separate row
orthogroups = orthogroups.explode('Eco')
orthogroups = orthogroups.explode('Mtu')
orthogroups = orthogroups.explode('Pae')
orthogroups = orthogroups.explode('Sal')
orthogroups = orthogroups.explode('Sen')
orthogroups = orthogroups.explode('Sac')
orthogroups = orthogroups.explode('Bsu')

orthogroups

Unnamed: 0,Orthogroup,Bsu,Eco,Mtu,Pae,Sac,Sal,Sen
0,OG0000000,BSU_02830,b0596,Rv0687,PA1470,SACI_RS00960,XNR_RS00260,STM0598
0,OG0000000,BSU_03930,b0596,Rv0687,PA1470,SACI_RS00960,XNR_RS00260,STM0598
0,OG0000000,BSU_08650,b0596,Rv0687,PA1470,SACI_RS00960,XNR_RS00260,STM0598
0,OG0000000,BSU_12350,b0596,Rv0687,PA1470,SACI_RS00960,XNR_RS00260,STM0598
0,OG0000000,BSU_13770,b0596,Rv0687,PA1470,SACI_RS00960,XNR_RS00260,STM0598
...,...,...,...,...,...,...,...,...
5259,OG0005259,,,,,,,STM4447
5260,OG0005260,,,,,,,STM3768
5260,OG0005260,,,,,,,STM4446
5261,OG0005261,,,,,,,STM4413


In [10]:
# Too big for github - save to local machine
orthogroups.to_csv('/Users/nilmat/Documents/Streptomyces_main/orthofinder/imodulon/Results_Jan26/Orthogroups/orthogroups_locus.tsv', sep='\t', index=False)  

In [5]:
orthogroups = pd.read_csv('/Users/nilmat/Documents/Streptomyces_main/orthofinder/imodulon/Results_Jan26/Orthogroups/orthogroups_locus.tsv.gz', sep='\t')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [6]:
orthogroups

Unnamed: 0,Orthogroup,Bsu,Eco,Mtu,Pae,Sac,Sal,Sen
0,OG0000000,BSU_02830,b0596,Rv0687,PA1470,SACI_RS00960,XNR_RS00260,STM0598
1,OG0000000,BSU_03930,b0596,Rv0687,PA1470,SACI_RS00960,XNR_RS00260,STM0598
2,OG0000000,BSU_08650,b0596,Rv0687,PA1470,SACI_RS00960,XNR_RS00260,STM0598
3,OG0000000,BSU_12350,b0596,Rv0687,PA1470,SACI_RS00960,XNR_RS00260,STM0598
4,OG0000000,BSU_13770,b0596,Rv0687,PA1470,SACI_RS00960,XNR_RS00260,STM0598
...,...,...,...,...,...,...,...,...
67369147,OG0005259,,,,,,,STM4447
67369148,OG0005260,,,,,,,STM3768
67369149,OG0005260,,,,,,,STM4446
67369150,OG0005261,,,,,,,STM4413


Around 40min to run the cell below.

In [7]:
# Define a dictionary with the orthologous genes for each species
orthologous_genes = {}

# Group by 'Orthogroup' and convert each group to a set
grouped = orthogroups.groupby('Orthogroup')[['Eco', 'Mtu', 'Pae', 'Sal', 'Sen', 'Sac', 'Bsu']].apply(lambda x: set(x.values.flatten()))

# Convert the sets back to lists and store in the dictionary
orthologous_genes = {group: list(values) for group, values in grouped.items()}

# Define a dictionary with the orthologous genes for each species
orthogroup_mapping = {}

# Convert the sets back to lists and store in the dictionary
for group, values in grouped.items():
    for value in values:
        orthogroup_mapping[value] = group

In [8]:
# Remove nan keys from the dictionary
orthogroup_mapping = {k: v for k, v in orthogroup_mapping.items() if not pd.isnull(k)}

In [11]:
df = pd.DataFrame(list(orthogroup_mapping.items()), columns=['Gene', 'Orthogroup'])

df.to_csv('../../data/external/modulome/orthogroup_mapping.tsv', sep='\t', index=False)

In [15]:
# Create function to calculate cosine similarity between two iModulons, ignoring Nan values
def cosine_similarity_ignore_nan(v1, v2):
    mask = ~np.isnan(v1) & ~np.isnan(v2)
    if not mask.any():
        return np.nan
    v1 = v1[mask]
    v2 = v2[mask]
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

In [16]:
# Load the M files
ecol_M = pd.read_csv("../../data/external/modulome/M_files/ecol_M.csv", index_col=0)
mtub_M = pd.read_csv("../../data/external/modulome/M_files/mtub_M.csv", index_col=0)
pae_M = pd.read_csv("../../data/external/modulome/M_files/pae_M.csv", index_col=0)
sac_M = pd.read_csv("../../data/external/modulome/M_files/sac_M.csv", index_col=0)
sen_M = pd.read_csv("../../data/external/modulome/M_files/sen_M.csv", index_col=0)
salb_ica = load_json_model("../../data/processed/modulome/salb.json.gz")
salb_M = salb_ica.M
#salb_M.columns = [col.replace(', ', '-') if ', ' in col else col for col in salb_M.columns]
#bsu_ica = load_json_model("../../data/external/modulome/ica_objects/bsu.json.gz")
#bsu_M = bsu_ica.M
bsu_M = pd.read_csv("../../data/external/modulome/M_files/bsu_microarray_M.csv", index_col=0)

In [17]:
# remove all genes that doesn't start with 'BSU'
bsu_M = bsu_M[bsu_M.index.str.startswith('BSU')]
# Remove all genes with an '_' in the name
bsu_M = bsu_M[~bsu_M.index.str.contains('_')]
# Replace BSU0 with BSU_0
bsu_M.index = bsu_M.index.str.replace('BSU', 'BSU_')

bsu_M

Unnamed: 0,PchR - Pulcherrimin,FadR - Fatty Acids,CodY - BCAA Limitation,AcoR - Acetoin,SigW - Cell Wall Stress,Eps - Exopolymeric Substances,MalR - Malate,SigB-1 - General Stress 1,Fnr - Nitrate Respiration,ResD - Oxygen Limitation,...,TnrA / PucR - Nitrogen Limitation,PhoP-2 - Phosphate Limitation 2,HutP - Histidine Utilization,Empty 2,PhoP-1 - Phosphate Limitation 1,Empty 3,Alb - Antilisterial Bacteriocin,LicR - Lichenan,"MtlR / AnsR - Mannitol, Asparagine, and Aspartate",CcpA-1 - Low Glucose 1
BSU_00010,-0.001543,0.007361,0.007277,-0.008409,-0.009993,0.003756,0.001345,-0.006197,-0.001841,0.000903,...,-0.009127,0.007536,0.001779,0.022096,-0.006085,0.018861,0.011650,0.013356,-0.016221,0.006136
BSU_00020,-0.003736,0.007254,0.007312,-0.014166,-0.010550,0.011069,0.005112,-0.000983,0.005141,0.004626,...,-0.012617,0.014400,0.010801,0.022773,-0.002667,0.017667,0.017097,0.021457,-0.013430,0.002674
BSU_00030,-0.006012,-0.000192,-0.004445,0.000151,0.002552,-0.000105,-0.004826,0.002288,-0.001541,-0.014308,...,0.008559,0.017144,0.010425,0.007857,0.006917,0.021567,0.009625,-0.009871,-0.004375,-0.000956
BSU_00040,-0.002820,0.014130,0.002272,0.001101,0.002937,-0.003265,-0.001369,0.000286,0.000342,-0.004167,...,0.002498,0.004472,0.002515,-0.001003,0.001221,0.012183,0.000635,-0.003473,0.006395,-0.005535
BSU_00050,-0.003375,0.009737,-0.001405,0.005599,0.006397,-0.003275,-0.002238,0.001132,-0.001103,-0.002939,...,0.001899,0.007950,0.002323,0.002780,0.004409,0.013814,-0.000459,-0.003749,0.001712,-0.005012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
BSU_41020,0.009257,-0.002289,-0.002964,-0.000370,-0.002420,-0.003821,-0.003409,0.002319,-0.005529,-0.000847,...,0.005396,0.008049,-0.000360,0.014728,-0.007636,0.010271,-0.001225,0.017292,-0.004420,0.000575
BSU_41030,-0.013181,-0.007268,-0.010047,-0.004201,-0.001315,-0.004296,-0.006896,-0.001965,-0.004033,-0.004407,...,0.008508,0.006449,0.005630,0.017242,0.000474,0.007859,-0.002265,0.000192,0.006863,-0.003114
BSU_41040,-0.019956,-0.005775,-0.009229,-0.009563,-0.001080,-0.001520,-0.004598,-0.004108,-0.010680,-0.001053,...,0.001408,-0.002339,0.005278,0.016300,0.001111,0.002775,-0.006299,-0.006872,0.006987,-0.002776
BSU_41050,-0.035428,-0.019266,0.002586,-0.005106,-0.000171,0.002655,-0.003030,-0.001380,-0.004308,-0.003509,...,0.005703,0.008554,0.008305,0.003841,0.008294,0.018146,0.009615,-0.002800,0.008132,-0.001268


In [18]:
# Modify the mapping to keep the locus_tag if the gene is not associated with an orthogroup
for gene in salb_M.index:
    if gene not in orthogroup_mapping:
        orthogroup_mapping[gene] = gene

for gene in mtub_M.index:
    if gene not in orthogroup_mapping:
        orthogroup_mapping[gene] = gene

for gene in ecol_M.index:
    if gene not in orthogroup_mapping:
        orthogroup_mapping[gene] = gene

for gene in pae_M.index:
    if gene not in orthogroup_mapping:
        orthogroup_mapping[gene] = gene

for gene in sac_M.index:
    if gene not in orthogroup_mapping:
        orthogroup_mapping[gene] = gene

for gene in sen_M.index:
    if gene not in orthogroup_mapping:
        orthogroup_mapping[gene] = gene

for gene in bsu_M.index:
    if gene not in orthogroup_mapping:
        orthogroup_mapping[gene] = gene

salb_M.index = salb_M.index.map(orthogroup_mapping)
mtub_M.index = mtub_M.index.map(orthogroup_mapping)
ecol_M.index = ecol_M.index.map(orthogroup_mapping)
pae_M.index = pae_M.index.map(orthogroup_mapping)
sac_M.index = sac_M.index.map(orthogroup_mapping)
sen_M.index = sen_M.index.map(orthogroup_mapping)
bsu_M.index = bsu_M.index.map(orthogroup_mapping)

# Remove rows with NaN in the index
salb_M = salb_M[salb_M.index.notna()]
mtub_M = mtub_M[mtub_M.index.notna()]
ecol_M = ecol_M[ecol_M.index.notna()]
pae_M = pae_M[pae_M.index.notna()]
sac_M = sac_M[sac_M.index.notna()]
sen_M = sen_M[sen_M.index.notna()]
bsu_M = bsu_M[bsu_M.index.notna()]

# If there is more than one row with the same index, take the max for each column
salb_M = salb_M.groupby(salb_M.index).max()
mtub_M = mtub_M.groupby(mtub_M.index).max()
ecol_M = ecol_M.groupby(ecol_M.index).max()
pae_M = pae_M.groupby(pae_M.index).max()
sac_M = sac_M.groupby(sac_M.index).max()
sen_M = sen_M.groupby(sen_M.index).max()
bsu_M = bsu_M.groupby(bsu_M.index).max()

# Add prefix to column names
salb_M = salb_M.add_prefix('salb_')
mtub_M = mtub_M.add_prefix('mtub_')
ecol_M = ecol_M.add_prefix('ecol_')
pae_M = pae_M.add_prefix('pae_')
sac_M = sac_M.add_prefix('sac_')
sen_M = sen_M.add_prefix('sen_')
bsu_M = bsu_M.add_prefix('bsu_')

# Merge the M files on the index
M = pd.concat([salb_M, mtub_M, ecol_M, pae_M, sen_M, sac_M, bsu_M], axis=1, join='outer')

# Create a dataframe for the edges
edges = []
for i in range(M.shape[1]):
    for j in range(i+1, M.shape[1]):
        sim = cosine_similarity_ignore_nan(M.iloc[:, i], M.iloc[:, j])
        edges.append([M.columns[i], M.columns[j], sim])

edges_df = pd.DataFrame(edges, columns=['Source', 'Target', 'Weight'])

# Export the edges to a CSV file
edges_df.to_csv('../../data/processed/modulome/network2.csv', index=False)
edges_df

Unnamed: 0,Source,Target,Weight
0,salb_BGC-19-20 deletion,salb_Glutamine,0.097194
1,salb_BGC-19-20 deletion,salb_Prophages,0.065303
2,salb_BGC-19-20 deletion,salb_Surugamide repressor,0.074519
3,salb_BGC-19-20 deletion,salb_ArsR,0.062347
4,salb_BGC-19-20 deletion,salb_Paulomycin-1,0.048064
...,...,...,...
257398,bsu_Alb - Antilisterial Bacteriocin,"bsu_MtlR / AnsR - Mannitol, Asparagine, and As...",0.047935
257399,bsu_Alb - Antilisterial Bacteriocin,bsu_CcpA-1 - Low Glucose 1,0.057405
257400,bsu_LicR - Lichenan,"bsu_MtlR / AnsR - Mannitol, Asparagine, and As...",0.080184
257401,bsu_LicR - Lichenan,bsu_CcpA-1 - Low Glucose 1,0.112897


In [19]:
M.to_csv('../../data/processed/modulome/merged_M.csv')

# Change the names of the orthogroups ids to identifiable gene names

In [20]:
M = pd.read_csv('../../data/processed/modulome/merged_M.csv', index_col=0)

In [21]:
from Bio import SeqIO

def parse_gbff(file_path):
    locus_to_gene = {}  # New dictionary to store locus tag to gene name mapping
    protein_to_locus = {}
    for record in SeqIO.parse(file_path, 'genbank'):
        for feature in record.features:
            if feature.type == 'CDS':
                protein_id = feature.qualifiers.get('protein_id', [''])[0]
                locus_tag = feature.qualifiers.get('locus_tag', [''])[0]
                gene_name = feature.qualifiers.get('gene', [''])[0]  # Extract the gene name
                protein_to_locus[protein_id] = locus_tag
                locus_to_gene[locus_tag] = gene_name  # Store the gene name in the dictionary
    return protein_to_locus, locus_to_gene

# Parse the gbff files and extract protein IDs, locus tags, and gene names for each species
salb_protein_to_locus, salb_locus_to_gene = parse_gbff('../../data/external/modulome/genomes/Sal.gbff')
ecol_protein_to_locus, ecol_locus_to_gene = parse_gbff('../../data/external/modulome/genomes/Eco.gbff')
mtub_protein_to_locus, mtub_locus_to_gene = parse_gbff('../../data/external/modulome/genomes/Mtu.gbff')
pae_protein_to_locus, pae_locus_to_gene = parse_gbff('../../data/external/modulome/genomes/Pae.gbff')
sen_protein_to_locus, sen_locus_to_gene = parse_gbff('../../data/external/modulome/genomes/Sen.gbff')
sac_protein_to_locus, sac_locus_to_gene = parse_gbff('../../data/external/modulome/genomes/sac.gbff')
bsu_protein_to_locus, bsu_locus_to_gene = parse_gbff('../../data/external/modulome/genomes/bsu.gbff')

In [22]:
# Create a new dictionary to store the mapping from orthogroups to gene names
orthogroup_to_gene = {}

# List of protein_to_gene dictionaries in the order you want to check them
protein_to_gene_dicts = [salb_locus_to_gene, bsu_locus_to_gene, mtub_locus_to_gene, pae_locus_to_gene, sen_locus_to_gene, sac_locus_to_gene]

class BreakIt(Exception): pass

# For each orthogroup in the orthologous_genes dictionary
for orthogroup, locus_tags in orthologous_genes.items():
    try:
        # For each locus tag
        for locus_tag in locus_tags:
            # Check the ecol_locus_to_gene dictionary first
            if locus_tag in ecol_locus_to_gene:
                # Add the gene name to the orthogroup_to_gene dictionary and break the loop
                orthogroup_to_gene[orthogroup] = ecol_locus_to_gene[locus_tag]
                raise BreakIt
            # For each of the other protein_to_gene dictionaries
            for protein_to_gene in protein_to_gene_dicts[1:]:
                # If the locus tag is in the dictionary
                if locus_tag in protein_to_gene:
                    # Add the gene name to the orthogroup_to_gene dictionary
                    orthogroup_to_gene[orthogroup] = protein_to_gene[locus_tag]
            # If the locus tag is not in any dictionary, add the locus tag itself to the orthogroup_to_gene dictionary
            else:
                orthogroup_to_gene[orthogroup] = locus_tag
    except BreakIt:
        continue

In [23]:
orthogroup_to_gene

{'OG0000000': 'entA',
 'OG0000001': 'Rv2741',
 'OG0000002': 'XNR_RS12970',
 'OG0000003': 'Rv3621c',
 'OG0000004': 'yadG',
 'OG0000005': 'feaB',
 'OG0000006': 'narP',
 'OG0000007': 'galR',
 'OG0000008': 'cynR',
 'OG0000009': 'hprR',
 'OG0000010': 'ydiO',
 'OG0000011': 'hisP',
 'OG0000012': 'yejF',
 'OG0000013': 'menB',
 'OG0000014': 'slyA',
 'OG0000015': 'caiC',
 'OG0000016': 'decR',
 'OG0000017': 'pspF',
 'OG0000018': 'potG',
 'OG0000019': 'yebQ',
 'OG0000020': 'nlpD',
 'OG0000021': 'aroP',
 'OG0000022': 'yqaB',
 'OG0000023': 'XNR_RS20475',
 'OG0000024': 'uidR',
 'OG0000025': 'pdeR',
 'OG0000026': 'ttdR',
 'OG0000027': 'dkgB',
 'OG0000028': 'rutD',
 'OG0000029': 'wcaL',
 'OG0000030': 'glaR',
 'OG0000031': 'malF',
 'OG0000032': 'ycjP',
 'OG0000033': 'XNR_RS29070',
 'OG0000034': 'qorA',
 'OG0000035': 'cspD',
 'OG0000036': 'glnH',
 'OG0000037': 'kdgR',
 'OG0000038': 'PA4307',
 'OG0000039': 'dgcF',
 'OG0000040': 'XNR_RS11690',
 'OG0000041': 'ytfH',
 'OG0000042': 'mngR',
 'OG0000043': 'dbpA

In [24]:
# Replace the index of the M DataFrame with the gene names from the orthogroup_to_gene dictionary
# If the orthogroup does not have a gene name or the gene name is an empty string, keep the orthogroup id as it was
M.index = M.index.map(lambda x: orthogroup_to_gene.get(x, x) if orthogroup_to_gene.get(x, x) != '' else x)

In [25]:
M.to_csv('../../data/processed/modulome/merged_M_max_gene_names.csv')

# Create node table

In [26]:
# create node tables listing all iModulons and the species
salb_M_T = salb_M.T
salb_nodes = pd.DataFrame(index=salb_M_T.index)
salb_nodes['species'] = 'Sal'

mtub_M_T = mtub_M.T
mtub_nodes = pd.DataFrame(index=mtub_M_T.index)
mtub_nodes['species'] = 'Mtu'

ecol_M_T = ecol_M.T
ecol_nodes = pd.DataFrame(index=ecol_M_T.index)
ecol_nodes['species'] = 'Eco'

pae_M_T = pae_M.T
pae_nodes = pd.DataFrame(index=pae_M_T.index)
pae_nodes['species'] = 'Pae'

sen_M_T = sen_M.T
sen_nodes = pd.DataFrame(index=sen_M_T.index)
sen_nodes['species'] = 'Sen'

sac_M_T = sac_M.T
sac_nodes = pd.DataFrame(index=sac_M_T.index)
sac_nodes['species'] = 'sac'

bsu_M_T = bsu_M.T
bsu_nodes = pd.DataFrame(index=bsu_M_T.index)
bsu_nodes['species'] = 'bsu'

# Concatenate the node tables
nodes = pd.concat([salb_nodes, mtub_nodes, ecol_nodes, pae_nodes, sen_nodes, sac_nodes, bsu_nodes])
nodes

Unnamed: 0,species
salb_BGC-19-20 deletion,Sal
salb_Glutamine,Sal
salb_Prophages,Sal
salb_Surugamide repressor,Sal
salb_ArsR,Sal
...,...
bsu_Empty 3,bsu
bsu_Alb - Antilisterial Bacteriocin,bsu
bsu_LicR - Lichenan,bsu
"bsu_MtlR / AnsR - Mannitol, Asparagine, and Aspartate",bsu


In [27]:
nodes.to_csv('../../data/processed/modulome/nodes.csv')

In [None]:
edges_df = pd.read_csv('../../data/processed/modulome/network2.csv')

# 