## Comprehensive Earth Mover's Distance (EMD) Analysis for Median_TC and Median_UC

In [1]:
import pandas as pd
from scipy.stats import wasserstein_distance

In [2]:
# Read the tab-delimited results table into `data`, then preview its first rows.
data = pd.read_csv(
    filepath_or_buffer="data/RESULTS_FDR_LASTLEVELONLY_v2.TAB",
    delimiter="\t",
)
data.head()

Unnamed: 0,strain,condition,slevel,locus_tag,Annotation_TC,Median_TC,controlMedian_TC,fdr_TC,Effect_TC,Annotation_UC,Median_UC,controlMedian_UC,fdr_UC,Effect_UC
1,DCLPA,canavanine,HIGH,CCNA_00001,Essential,0.0,0.0,,,Essential,0.0,0.0,,
2,DCLPA,canavanine,HIGH,CCNA_00002,Conditionally_Neutral,7603.0,5948.5,1.0,0.209904,Conditionally_Neutral,42.0,50.5,1.0,-0.178476
3,DCLPA,canavanine,HIGH,CCNA_00003,Conditionally_Neutral,3672.0,3681.0,1.0,0.355211,Conditionally_Neutral,30.0,36.0,1.0,-0.317923
4,DCLPA,canavanine,HIGH,CCNA_00004,Essential,0.0,0.0,,,Essential,0.0,0.0,,
5,DCLPA,canavanine,HIGH,CCNA_00005,Essential,0.0,0.0,,,Essential,0.0,0.0,,


In [3]:
# Filtering data based on conditions
#
# The same selection is applied once per metric; only the annotation column
# used for the exclusion differs between the TC and UC variants.

def _select_stress_rows(frame, annotation_column):
    """Return ``(heat_low, oxidative, combined)`` row subsets of ``frame``.

    Rows whose ``annotation_column`` is 'Essential' or 'Conditionally_Neutral'
    are dropped, as are rows for strains other than wild-type, DCLPA and DCLPB.
    From what remains, ``heat_low`` holds the heat rows at slevel LOW,
    ``oxidative`` holds every oxidative-peroxide row (all slevels), and
    ``combined`` is their concatenation (heat rows first).
    """
    kept = frame[~frame[annotation_column].isin(['Essential', 'Conditionally_Neutral'])]
    kept = kept[kept['strain'].isin(['wild-type', 'DCLPA', 'DCLPB'])]
    heat_low = kept[(kept['condition'] == 'heat') & (kept['slevel'] == 'LOW')]
    oxidative = kept[kept['condition'] == 'oxidative-peroxide']
    return heat_low, oxidative, pd.concat([heat_low, oxidative])

# For Median_TC
heat_data_TC, oxidative_data_TC, filtered_data_TC = _select_stress_rows(data, 'Annotation_TC')

# For Median_UC
heat_data_UC, oxidative_data_UC, filtered_data_UC = _select_stress_rows(data, 'Annotation_UC')

In [5]:
# Define functions to calculate EMD for Median_TC and Median_UC

def calculate_emd_for_strain(data):
    """Compute the EMD of ``Median_UC`` between heat/LOW and each condition/level.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows for one strain; must provide the ``condition``, ``slevel`` and
        ``Median_UC`` columns.

    Returns
    -------
    dict
        Maps ``("heat", "LOW", condition, slevel)`` to the Wasserstein distance
        between the heat/LOW ``Median_UC`` values and the values of that
        condition/level. The value is ``None`` when either side has no rows.
    """
    metric = "Median_UC"
    conditions = ["heat", "oxidative-peroxide"]
    slevels = ["LOW", "MEDIUM", "HIGH"]
    emd_results = {}
    base_data = data[(data["condition"] == "heat") & (data["slevel"] == "LOW")][metric].values
    for condition in conditions:
        for slevel in slevels:
            comparison_data = data[(data["condition"] == condition) & (data["slevel"] == slevel)][metric].values
            # wasserstein_distance rejects empty samples with a ValueError, so a
            # missing condition/level group is reported as None instead of
            # aborting the whole computation.
            if len(base_data) == 0 or len(comparison_data) == 0:
                emd_value = None
            else:
                emd_value = wasserstein_distance(base_data, comparison_data)
            emd_results[("heat", "LOW", condition, slevel)] = emd_value
    return emd_results

def calculate_emd_for_strain_TC(data):
    """Compute the EMD of ``Median_TC`` between heat/LOW and each condition/level.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows for one strain; must provide the ``condition``, ``slevel`` and
        ``Median_TC`` columns.

    Returns
    -------
    dict
        Maps ``("heat", "LOW", condition, slevel)`` to the Wasserstein distance
        between the heat/LOW ``Median_TC`` values and the values of that
        condition/level. The value is ``None`` when either side has no rows.
    """
    metric = "Median_TC"
    conditions = ["heat", "oxidative-peroxide"]
    slevels = ["LOW", "MEDIUM", "HIGH"]
    emd_results = {}
    base_data = data[(data["condition"] == "heat") & (data["slevel"] == "LOW")][metric].values
    for condition in conditions:
        for slevel in slevels:
            comparison_data = data[(data["condition"] == condition) & (data["slevel"] == slevel)][metric].values
            # wasserstein_distance rejects empty samples with a ValueError, so a
            # missing condition/level group is reported as None instead of
            # aborting the whole computation.
            if len(base_data) == 0 or len(comparison_data) == 0:
                emd_value = None
            else:
                emd_value = wasserstein_distance(base_data, comparison_data)
            emd_results[("heat", "LOW", condition, slevel)] = emd_value
    return emd_results

def calculate_emd_for_strain_with_check(data, metric):
    """EMD of ``metric`` between heat/LOW and every condition/level combination.

    ``data`` must provide ``condition``, ``slevel`` and the ``metric`` column.
    The returned dict is keyed by ``("heat", "LOW", condition, slevel)``; a
    value is ``None`` whenever the heat/LOW reference or the compared group
    contains no rows, otherwise it is the Wasserstein distance between the two.
    """
    def _values_for(cond, level):
        # Metric values of the rows matching one condition / stress-level pair.
        selector = (data["condition"] == cond) & (data["slevel"] == level)
        return data[selector][metric].values

    reference = _values_for("heat", "LOW")
    distances = {}
    for cond in ("heat", "oxidative-peroxide"):
        for level in ("LOW", "MEDIUM", "HIGH"):
            sample = _values_for(cond, level)
            if len(reference) > 0 and len(sample) > 0:
                distance = wasserstein_distance(reference, sample)
            else:
                distance = None
            distances[("heat", "LOW", cond, level)] = distance
    return distances

In [6]:
# Calculate EMD for each strain for both metrics

strains = ["wild-type", "DCLPA", "DCLPB"]

emd_results_TC = {}
emd_results_UC = {}

# One pass per metric: (filtered frame, metric column, dict receiving the results).
for source_frame, metric_name, results in (
    (filtered_data_TC, "Median_TC", emd_results_TC),
    (filtered_data_UC, "Median_UC", emd_results_UC),
):
    for strain in strains:
        strain_data = source_frame[source_frame["strain"] == strain]
        results[strain] = calculate_emd_for_strain_with_check(strain_data, metric_name)

In [7]:
# Flatten the nested {strain: {pair: emd}} results into one long table,
# listing all Median_TC rows first and then all Median_UC rows.
emd_values = []

for metric_label, per_strain in (("Median_TC", emd_results_TC), ("Median_UC", emd_results_UC)):
    for strain, values in per_strain.items():
        for pair, emd_value in values.items():
            # pair is (base condition, base slevel, compared condition, compared slevel)
            condition_pair = f"{pair[0]}:{pair[1]}∥{pair[2]}:{pair[3]}"
            emd_values.append([strain, condition_pair, emd_value, metric_label])

emd_df = pd.DataFrame(
    emd_values,
    columns=["Strain", "Condition Pair", "EMD Value", "Metric"],
)
emd_df

Unnamed: 0,Strain,Condition Pair,EMD Value,Metric
0,wild-type,heat:LOW∥heat:LOW,0.0,Median_TC
1,wild-type,heat:LOW∥heat:MEDIUM,,Median_TC
2,wild-type,heat:LOW∥heat:HIGH,,Median_TC
3,wild-type,heat:LOW∥oxidative-peroxide:LOW,71.735074,Median_TC
4,wild-type,heat:LOW∥oxidative-peroxide:MEDIUM,88.967635,Median_TC
5,wild-type,heat:LOW∥oxidative-peroxide:HIGH,123.641578,Median_TC
6,DCLPA,heat:LOW∥heat:LOW,0.0,Median_TC
7,DCLPA,heat:LOW∥heat:MEDIUM,,Median_TC
8,DCLPA,heat:LOW∥heat:HIGH,,Median_TC
9,DCLPA,heat:LOW∥oxidative-peroxide:LOW,134.841847,Median_TC


In [None]:
# Write the EMD summary table to CSV without the row index, then echo the path.
csv_path = "emd_values_corrected.csv"
emd_df.to_csv(path_or_buf=csv_path, index=False)
csv_path