In [1]:
import pandas
import cobra
import os
from tqdm import tqdm


directory = "./thauera-files/ThaueraPaper/"

# sampling_dir = "OptimalHeterotrophic/"
# sampling_dir = "OptimalAnaerobicDesnitrification/"

# sampling_dir = "OptSolarom/"
# sampling_dir = "optimalAromatic/"

# sampling_dir = "aerobicAromatic/"
sampling_dir = "anaerobicAromatic/"

# ranks_dir = "ranks/"
ranks_dir = "ranks-simplified/"

In [2]:
log_format = '%(asctime)s %(message)s' #%(clientip)-15s %(user)-8s
import logging
from coralme.builder.main import ListHandler
log = logging.getLogger()
logging.basicConfig(filename = "./tmp.log", filemode = 'w', level = logging.WARNING, format = log_format)
logging.captureWarnings(True)

### Run

In [3]:
model = pandas.read_excel(directory + "ModelThauera.xls")
model["Subsystem"].head()

0    Cofactor and Prosthetic Group Biosynthesis
1    Cofactor and Prosthetic Group Biosynthesis
2    Cofactor and Prosthetic Group Biosynthesis
3    Cofactor and Prosthetic Group Biosynthesis
4            Purine and Pyrimidine Biosynthesis
Name: Subsystem, dtype: object

In [4]:
Sampling = pandas.DataFrame()
samples = []
for file in os.listdir(directory+sampling_dir):
    if "csv" not in file:
        continue
    met = file.split(".csv")[0]
    df = pandas.read_csv(directory+sampling_dir+file,header=None)

    # Full
    # Sampling[met] = df.stack()

    # Simplified
    Sampling[met] = df.mean(axis=1)
    samples.append(met)

In [5]:
# Full
# Sampling = pandas.concat([Sampling.reset_index().set_index("level_0"),model["Subsystem"]],axis=1).reset_index()

In [8]:
# Simplified
Sampling = pandas.concat([Sampling,model["Subsystem"]],axis=1).reset_index()[samples+["Subsystem"]]

In [7]:
Sampling.to_csv(sampling_dir+"Sampling.csv")

In [10]:
for col in samples:
    tmp = Sampling[col].sort_values()
    tmp.rank().to_csv(sampling_dir+ranks_dir+"{}_rank.csv".format(col))

### Load

In [9]:
Sampling = pandas.read_csv(sampling_dir+"Sampling.csv",index_col=0)

In [10]:
samples = [file.split(".csv")[0] for file in os.listdir(directory+sampling_dir) if "csv" in file]

In [11]:
from scipy.stats import mannwhitneyu
classification = "Subsystem"
def mannwhitney(class_value,col1,col2,alt="greater"):
    Rank = pandas.concat([
        pandas.read_csv(sampling_dir+ranks_dir+"{}_rank.csv".format(col1),index_col=0),
        pandas.read_csv(sampling_dir+ranks_dir+"{}_rank.csv".format(col2),index_col=0),
        Sampling["Subsystem"]
    ],axis=1)
    taxa_df = Rank[Rank[classification] == class_value]
    
    x = taxa_df[col1].values
    y = taxa_df[col2].values
    return mannwhitneyu(x,y, method="exact", alternative=alt)

In [12]:
ref = "ac_e"
# ref = "glc__D_e"

# alt = "greater"
alt = "less"

In [13]:
dct = {}
for col in samples:
    if col == ref:
        continue
    dct[col] = {}
    for subs in Sampling["Subsystem"].unique():
        if not isinstance(subs,str) or "Transport" in subs or "Exchange" in subs:
            continue
        ListHandler.print_and_log("Running {}:{}".format(col,subs))
        U1, p = mannwhitney(subs,col,ref,alt=alt)
        dct[col][subs] = p
        ListHandler.print_and_log("Solution: {},{},{}".format(col,subs,p))

Running phenol_e:Cofactor and Prosthetic Group Biosynthesis
Solution: phenol_e,Cofactor and Prosthetic Group Biosynthesis,0.9999999833661593
Running phenol_e:Purine and Pyrimidine Biosynthesis
Solution: phenol_e,Purine and Pyrimidine Biosynthesis,0.9502405057787295
Running phenol_e:Alternate Carbon Metabolism
Solution: phenol_e,Alternate Carbon Metabolism,1.0
Running phenol_e:Tyrosine, Tryptophan, and Phenylalanine Metabolism
Solution: phenol_e,Tyrosine, Tryptophan, and Phenylalanine Metabolism,0.9775382272605322
Running phenol_e:Oxidative Phosphorylation
Solution: phenol_e,Oxidative Phosphorylation,0.9999999958270008
Running phenol_e:Extracellular exchange
Solution: phenol_e,Extracellular exchange,0.999999999999841
Running phenol_e:Valine, Leucine, and Isoleucine Metabolism
Solution: phenol_e,Valine, Leucine, and Isoleucine Metabolism,0.9077662607288839
Running phenol_e:Arginine and Proline Metabolism
Solution: phenol_e,Arginine and Proline Metabolism,0.9732225387163239
Running phenol

In [14]:
pandas.DataFrame.from_dict(dct).to_csv(sampling_dir+"MWU-{}-results.csv".format(alt))