In [3]:
import pandas as pd
from statsmodels.stats.anova import AnovaRM

In [41]:
# Load the sample mapping / taxonomy table; the first CSV column becomes the index.
data_new = pd.read_csv('mapping_file_taxonomy.csv', index_col=0)

# Keep only the samples whose 'Sample date' text contains '2021'.
# na=False makes rows with a missing date count as "no match" instead of
# producing NaN in the mask (which pandas refuses to index with), and .copy()
# gives an independent frame so the column assignments below neither raise
# SettingWithCopyWarning nor get lost on a temporary view.
is_2021_sample = data_new['Sample date'].str.contains('2021', na=False)
data_new = data_new.loc[is_2021_sample].copy()

# Parse the date strings so date arithmetic is possible.
data_new['Sample date'] = pd.to_datetime(data_new['Sample date'])

# Days elapsed since the earliest 2021 sampling date (that date itself is day 0).
start_date = data_new['Sample date'].min()
data_new['Days_elapsed'] = (data_new['Sample date'] - start_date).dt.days

# Numeric codes for the categorical treatment levels.
# NOTE(review): the units of these numbers are not stated anywhere in this
# notebook (presumably an irrigation percentage and a nitrogen rate) - confirm.
water_mapping = {'Low': 70, 'High': 100}
nitrogen_mapping = {'Low': 22, 'Optimal': 224, 'High': 275}

# Translate the treatment labels to their numeric codes. Any label that is not a
# key of the mapping becomes NaN in the numeric column.
data_new['Water Numeric'] = data_new['Water'].map(water_mapping)
data_new['Nitrogen Numeric'] = data_new['Nitrogen'].map(nitrogen_mapping)

# Preview the prepared frame.
data_new.head()

Unnamed: 0.1,Unnamed: 0,Sample name,Sample date,Sample ID,Plot,ID,Nitrogen,Water,Block,Direction,...,Bacteria Methylacidiphilum kamchatkense,Bacteria Cephaloticoccus primus,Bacteria Lacunisphaera anatis,Bacteria Haloferula sargassicola,Bacteria Luteolibacter arcticus,Bacteria Prosthecobacter vanneervenii,Bacteria Roseibacillus persicicus,Days_elapsed,Water Numeric,Nitrogen Numeric
0,0,193,2021-06-15,2N,2N,111.0,Low,High,1,N,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,100,22
1,1,205,2021-06-15,2S,2S,111.0,Low,High,1,S,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,100,22
2,2,212,2021-06-15,23S,23S,211.0,Low,High,2,S,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,100,22
3,3,221,2021-06-15,23N,23N,211.0,Low,High,2,N,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,100,22
4,4,195,2021-06-15,32N,32N,311.0,Low,High,3,N,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,100,22


In [44]:
import warnings
warnings.filterwarnings('ignore')

In [45]:
# Every taxon abundance column is identified by its "Bacteria" name prefix.
bacteria_columns = [col for col in data_new.columns if col.startswith("Bacteria")]

# Design columns shared by all per-taxon models; selected once, not once per taxon.
design_columns = data_new[['Days_elapsed', 'Nitrogen', 'Water']]


def _rm_anova_single_factor(df_long, factor):
    """Fit a one-factor repeated-measures ANOVA on 'Abundance' and return (F, p).

    Parameters
    ----------
    df_long : pandas.DataFrame
        Frame holding the columns 'Days_elapsed', 'Abundance' and `factor`.
    factor : str
        Name of the treatment column used as the within factor.

    Returns
    -------
    tuple
        The 'F Value' and 'Pr > F' entries of the first row of the fitted
        model's ANOVA table.

    Raises
    ------
    Exception
        Whatever AnovaRM raises for data it cannot fit; the caller decides how
        to handle it.
    """
    # AnovaRM is given one observation per (day, level) cell, so replicate
    # samples are averaged first.
    # NOTE(review): 'Days_elapsed' is passed as the subject identifier, i.e.
    # sampling days play the role of subjects and the treatment is the repeated
    # factor. Confirm this is the intended design (as opposed to plots being the
    # subjects and time the repeated factor).
    cell_means = (
        df_long[['Days_elapsed', factor, 'Abundance']]
        .groupby(['Days_elapsed', factor])
        .mean()
        .reset_index()
    )
    table = AnovaRM(cell_means, 'Abundance', 'Days_elapsed', within=[factor]).fit().anova_table
    # .iloc[0] is explicit positional access; a plain [0] on this label-indexed
    # column relies on a deprecated integer-key fallback.
    return table['F Value'].iloc[0], table['Pr > F'].iloc[0]


# One result record per taxon; converted to a DataFrame in one go afterwards.
anova_results_list = []

for bacteria in bacteria_columns:
    # Coerce the abundance column to numbers; unparseable entries become NaN.
    abundance = pd.to_numeric(data_new[bacteria], errors='coerce')

    # Missing abundances are treated as zero. Reassigning (rather than
    # fillna(inplace=True) on a column of a slice) guarantees the fill is applied.
    if abundance.isnull().any():
        print(f"NaN values found in {bacteria}. Handling NaNs...")
        abundance = abundance.fillna(0)

    # Long-format frame for this taxon; assign() builds a new frame, so data_new
    # is never modified.
    df_anova = design_columns.assign(Abundance=abundance)

    record = {'Bacteria': bacteria}
    for factor in ('Nitrogen', 'Water'):
        try:
            f_val, p_val = _rm_anova_single_factor(df_anova, factor)
        except Exception as e:
            # Best effort: report the failure and keep going with the next
            # model. NaN (not None) keeps the result columns numeric.
            print(f"Error in ANOVA for {factor} in {bacteria}: {e}")
            f_val, p_val = float('nan'), float('nan')
        record[f'F_Value_{factor}'] = f_val
        record[f'P_Value_{factor}'] = p_val

    anova_results_list.append(record)

# Convert the list of results into a DataFrame
# NOTE(review): the p-values are reported per taxon without any multiple-testing
# correction - consider adjusting them before interpreting significance.
anova_results = pd.DataFrame(anova_results_list)

# Display the first few rows of the results
anova_results.head()

                             Bacteria  F_Value_Nitrogen  P_Value_Nitrogen  \
0       Bacteria Acidicapsa acidisoli          2.925958          0.111255   
1      Bacteria Acidipila dinghuensis          2.423540          0.150363   
2            Bacteria Acidipila rosea          5.336928          0.033684   
3    Bacteria Acidisarcina polymorpha          1.424773          0.295606   
4  Bacteria Acidobacterium capsulatum          1.127808          0.370265   

   F_Value_Water  P_Value_Water  
0       2.282489       0.205365  
1       0.163393       0.706731  
2       0.319680       0.602007  
3       2.540529       0.186182  
4       5.227043       0.084202  


In [47]:
# Rank taxa from the largest to the smallest nitrogen F statistic; rows whose
# F value is NaN are placed at the end.
anova_results.sort_values(by='F_Value_Nitrogen', ascending=False)

Unnamed: 0,Bacteria,F_Value_Nitrogen,P_Value_Nitrogen,F_Value_Water,P_Value_Water
391,Bacteria Melghirimyces thermohalophilus,16.451925,0.001463,33.530748,0.004421
490,Bacteria Hyphomicrobium sulfonivorans,11.763812,0.004146,10.506088,0.031635
750,Bacteria Dokdonella koreensis,11.206307,0.004788,4.588635,0.098853
393,Bacteria Shimazuella kribbensis,10.562687,0.005692,10.085910,0.033669
189,Bacteria Patulibacter medicamentivorans,10.559682,0.005697,6.295982,0.066115
...,...,...,...,...,...
2752,Bacteria Lacunisphaera anatis,,,,
2753,Bacteria Haloferula sargassicola,,,,
2754,Bacteria Luteolibacter arcticus,,,,
2755,Bacteria Prosthecobacter vanneervenii,,,,
