In [56]:
import numpy as np
import pandas as pd
from statsmodels.stats.anova import AnovaRM
from scipy.stats import friedmanchisquare
from scipy.stats import shapiro

In [64]:
# Read the data
data_new = pd.read_csv('mapping_file_taxonomy.csv', index_col=0)

# Filter rows where 'Sample date' contains '2021'
data_new = data_new[data_new['Sample date'].str.contains('2021')]

# Convert 'Sample date' to datetime
data_new['Sample date'] = pd.to_datetime(data_new['Sample date'])

# Calculate the 'Days elapsed' variable
# Assuming the first date in your data is the start date
start_date = data_new['Sample date'].min()
data_new['Days_elapsed'] = (data_new['Sample date'] - start_date).dt.days

# Create a mapping for Water and Nitrogen
water_mapping = {'Low': 70, 'High': 100}
nitrogen_mapping = {'Low': 22, 'Optimal': 224, 'High': 275}

# Map the Water and Nitrogen values to their numeric representations
data_new['Water Numeric'] = data_new['Water'].map(water_mapping)
data_new['Nitrogen Numeric'] = data_new['Nitrogen'].map(nitrogen_mapping)

# Remove bacteria columns with values that sum to 0
bacteria_columns = [col for col in data_new.columns if col.startswith("Bacteria")]
for col in bacteria_columns:
    if data_new[col].sum() == 0:
        data_new.drop(col, axis=1, inplace=True)

# Display the first few rows of the dataframe
data_new.head()

Unnamed: 0.1,Unnamed: 0,Sample name,Sample date,Sample ID,Plot,ID,Nitrogen,Water,Block,Direction,...,Bacteria Silanimonas mangrovi,Bacteria Stenotrophomonas maltophilia,Bacteria Thermomonas carbonis,Bacteria Thermomonas sp. SY21,Bacteria Mycoplasma yeatsii,Bacteria Akkermansia glycaniphila,Bacteria Luteolibacter gellanilyticus,Days_elapsed,Water Numeric,Nitrogen Numeric
0,0,193,2021-06-15,2N,2N,111.0,Low,High,1,N,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,100,22
1,1,205,2021-06-15,2S,2S,111.0,Low,High,1,S,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,100,22
2,2,212,2021-06-15,23S,23S,211.0,Low,High,2,S,...,0.0,0.0,0.0,0.0,0.0,0.0,500381.771773,0,100,22
3,3,221,2021-06-15,23N,23N,211.0,Low,High,2,N,...,0.0,0.0,933210.653486,1337378.0,0.0,0.0,0.0,0,100,22
4,4,195,2021-06-15,32N,32N,311.0,Low,High,3,N,...,0.0,0.0,0.0,0.0,0.0,517624.104143,0.0,0,100,22


In [65]:
import warnings
warnings.filterwarnings('ignore')

In [49]:
# Filter columns for bacteria
bacteria_columns = [col for col in data_new.columns if col.startswith("Bacteria")]

# Initialize a list to store results
anova_results_list = []

# Repeated Measures ANOVA for each bacteria column
for bacteria in bacteria_columns:
    # Preparing the dataset for this specific bacterium
    df_anova = data_new[['Days_elapsed', 'Nitrogen', 'Water', bacteria]]
    df_anova = df_anova.rename(columns={bacteria: 'Abundance'})

    # Convert 'Abundance' to numeric, coercing non-numeric values to NaN
    df_anova['Abundance'] = pd.to_numeric(df_anova['Abundance'], errors='coerce')

    # Handle NaN values in 'Abundance'
    df_anova['Abundance'].fillna(0, inplace=True)

    # Perform Shapiro-Wilk test for normality
    shapiro_test = shapiro(df_anova['Abundance'])
    normality = 'Yes' if shapiro_test.pvalue > 0.05 else 'No'

    # ANOVA for Nitrogen
    try:
        df_nitrogen = df_anova.drop('Water', axis=1)
        df_nitrogen = df_nitrogen.groupby(['Days_elapsed', 'Nitrogen']).mean().reset_index()
        model_nitrogen = AnovaRM(df_nitrogen, 'Abundance', 'Days_elapsed', within=['Nitrogen']).fit()
        f_val_nitrogen = model_nitrogen.anova_table['F Value'][0]
        p_val_nitrogen = model_nitrogen.anova_table['Pr > F'][0]
    except Exception as e:
        f_val_nitrogen, p_val_nitrogen = None, None

    # ANOVA for Water
    try:
        df_water = df_anova.drop('Nitrogen', axis=1)
        df_water = df_water.groupby(['Days_elapsed', 'Water']).mean().reset_index()
        model_water = AnovaRM(df_water, 'Abundance', 'Days_elapsed', within=['Water']).fit()
        f_val_water = model_water.anova_table['F Value'][0]
        p_val_water = model_water.anova_table['Pr > F'][0]
    except Exception as e:
        f_val_water, p_val_water = None, None

    # Append the results to the list
    anova_results_list.append({
        'Bacteria': bacteria,
        'Normality (Shapiro-Wilk)': normality,
        'F_Value_Nitrogen': f_val_nitrogen,
        'P_Value_Nitrogen': p_val_nitrogen,
        'F_Value_Water': f_val_water,
        'P_Value_Water': p_val_water
    })

# Convert the list of results into a DataFrame
anova_results = pd.DataFrame(anova_results_list)

# Display the first few rows of the results
anova_results.head()

Unnamed: 0,Bacteria,Normality (Shapiro-Wilk),F_Value_Nitrogen,P_Value_Nitrogen,F_Value_Water,P_Value_Water
0,Bacteria Acidicapsa acidisoli,No,2.925958,0.111255,2.282489,0.205365
1,Bacteria Acidipila dinghuensis,No,2.42354,0.150363,0.163393,0.706731
2,Bacteria Acidipila rosea,No,5.336928,0.033684,0.31968,0.602007
3,Bacteria Acidisarcina polymorpha,No,1.424773,0.295606,2.540529,0.186182
4,Bacteria Acidobacterium capsulatum,No,1.127808,0.370265,5.227043,0.084202


In [55]:
anova_results[anova_results['Normality (Shapiro-Wilk)'] == 'Yes'].sort_values('F_Value_Nitrogen', ascending = False)

Unnamed: 0,Bacteria,Normality (Shapiro-Wilk),F_Value_Nitrogen,P_Value_Nitrogen,F_Value_Water,P_Value_Water
180,Bacteria Gaiella occulta,Yes,1.108653,0.37585,4.958914,0.089924
35,Bacteria Corynebacterium imitans,Yes,,,,
36,Bacteria Corynebacterium tuscaniense,Yes,,,,
42,Bacteria Rhodococcus canchipurensis,Yes,,,,
120,Bacteria Rothia amarae,Yes,,,,
...,...,...,...,...,...,...
2752,Bacteria Lacunisphaera anatis,Yes,,,,
2753,Bacteria Haloferula sargassicola,Yes,,,,
2754,Bacteria Luteolibacter arcticus,Yes,,,,
2755,Bacteria Prosthecobacter vanneervenii,Yes,,,,


In [66]:
# Filter columns for bacteria
bacteria_columns = [col for col in data_new.columns if col.startswith("Bacteria")]

# Initialize a list to store results
friedman_results_list = []

# Friedman Test for each bacteria column
for bacteria in bacteria_columns:
    # Preparing the dataset for this specific bacterium
    df_friedman = data_new[['Days_elapsed', 'Nitrogen', 'Water', bacteria]]
    df_friedman = df_friedman.rename(columns={bacteria: 'Abundance'})

    # Convert 'Abundance' to numeric, coercing non-numeric values to NaN
    df_friedman['Abundance'] = pd.to_numeric(df_friedman['Abundance'], errors='coerce')

    # Handle NaN values in 'Abundance'
    df_friedman['Abundance'].fillna(0, inplace=True)

    # Pivot the data for Friedman test
    df_pivot = df_friedman.pivot_table(index='Days_elapsed', columns=['Nitrogen', 'Water'], values='Abundance')

    # Perform Friedman test
    try:
        friedman_test = friedmanchisquare(*[df_pivot[col] for col in df_pivot.columns])
        friedman_p_value = friedman_test.pvalue
    except Exception as e:
        friedman_p_value = None

    # Append the results to the list
    friedman_results_list.append({
        'Bacteria': bacteria,
        'Friedman_Test_P_Value': friedman_p_value
    })

# Convert the list of results into a DataFrame
friedman_results = pd.DataFrame(friedman_results_list)

# Display the first few rows of the results
friedman_results.head()

Unnamed: 0,Bacteria,Friedman_Test_P_Value
0,Bacteria Acidicapsa acidisoli,0.011271
1,Bacteria Acidipila dinghuensis,0.459917
2,Bacteria Acidipila rosea,0.188866
3,Bacteria Acidisarcina polymorpha,0.03592
4,Bacteria Acidobacterium capsulatum,0.310582


In [67]:
len(bacteria_columns)

1262

In [62]:
print(friedman_results.sort_values('Friedman_Test_P_Value').head(n = 20))

                                        Bacteria  Friedman_Test_P_Value
408           Bacteria Desnuesiella massiliensis               0.000139
331  Bacteria Pseudogracilibacillus endophyticus               0.000314
337              Bacteria Virgibacillus kekensis               0.000492
392           Bacteria Novibacillus thermophilus               0.000492
509                     Bacteria Microvirga soli               0.000534
401               Bacteria Clostridium saudiense               0.000561
340            Bacteria Virgibacillus sp. Bac332               0.000561
586                Bacteria Skermanella aerolata               0.000601
696                Bacteria Sorangium cellulosum               0.000631
332       Bacteria Pseudogracilibacillus marinus               0.000645
560             Bacteria Rubellimicrobium roseum               0.000956
308               Bacteria Bacillus thermolactis               0.001250
404       Bacteria Clostridium sp. SYSU GA15002T               0

In [69]:
friedman_results[friedman_results['Friedman_Test_P_Value'] < 0.05 / len(bacteria_columns)]

Unnamed: 0,Bacteria,Friedman_Test_P_Value
