In [25]:
## Preparing the abundance data ##


import pandas as pd

# Load data
df3 = pd.read_csv('/Users/nfarooqi/masters-thesis-evolvey/space-evolvey/code/evolvey_analysis/Testerosa/data/mmc1_Table.S3.txt', sep='\t')

# Replace '-' with 0 and fill NaN with 0
df3 = df3.replace('-', 0)
df3 = df3.fillna(0)

# Drop the second column
df3 = df3.drop(df3.columns[1], axis=1)

# Convert columns to numeric, except the first column, handling non-numeric values
df3.iloc[:, 1:] = df3.iloc[:, 1:].apply(pd.to_numeric, errors='coerce').fillna(0)

# Normalize each column by the column's maximum value
df3.iloc[:, 1:] = df3.iloc[:, 1:].div(df3.iloc[:, 1:].max())

# Now substract all values from 1 to invert the values

df3.iloc[:, 1:] = 1 - df3.iloc[:, 1:]

# Display the result
print(df3.head())

   Name SvL1chip71-batch41.280602 PheH1.chip37-batch47.181002  \
0  ybeF                  0.983924                         1.0   
1  glpQ                  0.981708                    0.871917   
2  ybdT                  0.948823                     0.43833   
3  ybxH                  0.985057                     0.88425   
4  ybdM                  0.977327                    0.886148   

  ah1.chip5.200301 ah1.chip7.200301 ah1.chip10.200301 ah2.chip6.200301  \
0         0.683299         0.686465          0.785778         0.849071   
1         0.696497         0.724377          0.768707         0.864026   
2         0.682857         0.678485          0.753008         0.833862   
3         0.703401         0.709461          0.778522         0.795993   
4         0.697551         0.708653          0.768628         0.781421   

  ah2.chip8.200301 ah2.chip9.200301 al2.chip9-batch8.240401  ...  \
0         0.831115              1.0                0.819421  ...   
1         0.855892         0

In [None]:
## Preparing the GSM data ##

import cobra 

# Load the model

model = cobra.io.read_sbml_model('/Users/nfarooqi/masters-thesis-evolvey/space-evolvey/code/evolvey_analysis/Testerosa/models/subtilis.xml')


rxn_gene_dict = {rxn.id: rxn.gene_name_reaction_rule for rxn in model.reactions}

# make it into a dataframe

rxn_gene_df = pd.DataFrame(rxn_gene_dict.items(), columns=['rxn_id', 'gene_names'])

#filter out reactions with no gene names

rxn_gene_df = rxn_gene_df[rxn_gene_df['gene_names'] != '']

df = rxn_gene_df

def parse_genes(genes):
    # Replace logical connectors with commas, and remove parentheses
    genes = genes.replace(' and ', ', ').replace(' or ', ', ').replace('(', '').replace(')', '')
    # Split on commas and strip extra spaces, then deduplicate
    genes = set([gene.strip() for gene in genes.split(',')])
    return ', '.join(sorted(genes))

# Apply the function to clean up gene names
df['gene_names'] = df['gene_names'].apply(parse_genes)

# If needed, aggregate by 'rxn_id' (if there are duplicate 'rxn_id' values)
df = df.groupby('rxn_id')['gene_names'].agg(lambda x: ', '.join(sorted(set(', '.join(x).split(', '))))).reset_index()

print(df.head(50))

        rxn_id                                         gene_names
179    2S6HCCi                                               menD
180     26DPAi                                  spoVFA and spoVFB
181    3AMBAt2                                               gabP
182        FDH  (fdhD and yjgC) or (fdhD and yrhE) or (fdhD an...
183     FDMO_1                                               ssuD
185    AAMYL_1                                               amyE
189       ABTA                                               gabT
193        FBA                                       iolJ or fbaA
195       FBA2                                       fbaA or iolJ
196        FBP                                                fbp
197     FCLT_2                                               hemH
198      FDMO1                                               ssuD
199    FDMO2_1                                               ssuD
200    FDMO3_1                                               ssuD
201     AB

In [24]:
import pandas as pd

# Assuming df and df3 are already defined and df3 is set with 'Name' as the index
df3.set_index('Name', inplace=True)

# Function to retrieve expression values for genes
def get_expression_values(gene_list, col):
    values = []
    genes = gene_list.split(', ')
    for gene in genes:
        if gene in df3.index:
            value = df3.at[gene, col]  # Retrieve the value for the gene
            values.append(str(value))
    return ', '.join(values)

# Prepare a dictionary to hold the new columns data
new_columns = {}

# Loop over each column in df3 (except the index 'Name')
for col in df3.columns:
    # Apply the function to each row in df and store results in the dictionary
    new_columns[col] = df['gene_names'].apply(lambda x: get_expression_values(x, col))

# Create a new DataFrame from the dictionary
new_df = pd.DataFrame(new_columns)

# Concatenate this new DataFrame with the original df
df = pd.concat([df, new_df], axis=1)

print(df)

      rxn_id        gene_names  \
0    23CN2P1              yfkN   
1    23CN2P2              yfkN   
2    23CN2P3              yfkN   
3    23CN2P4              yfkN   
4     26DPAi    spoVFA, spoVFB   
..       ...               ...   
899    XYLI2              xylA   
900     XYLK              xylB   
901    XYLt2              araE   
902    ZN2t4              czcD   
903    ZNabc  znuA, znuB, znuC   

                             SvL1chip71-batch41.280602  \
0    Name\nyfkN    0.977327\nyfkN         1.0\nName...   
1    Name\nyfkN    0.977327\nyfkN         1.0\nName...   
2    Name\nyfkN    0.977327\nyfkN         1.0\nName...   
3    Name\nyfkN    0.977327\nyfkN         1.0\nName...   
4    Name\nspoVFA    0.984008\nspoVFA         1.0\n...   
..                                                 ...   
899  Name\nxylA    1.0\nxylA    1.0\nName: SvL1chip...   
900  Name\nxylB    0.97452\nxylB        1.0\nName: ...   
901  Name\naraE    0.987888\naraE         1.0\nName...   
902  Name\n