In [1]:
import ast
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the collated results from the csv file
final_df = pd.read_csv('Data/Brute_force/collated_results3.csv')

# Keep only rows where both 'Microbe' and 'Interaction Microbe' start with 'Bacteria'.
# na=False makes a missing name count as "no match" explicitly, so the mask is strictly boolean.
is_bacteria_pair = (
    final_df['Microbe'].str.startswith('Bacteria', na=False)
    & final_df['Interaction Microbe'].str.startswith('Bacteria', na=False)
)
final_df = final_df[is_bacteria_pair]

# Sort by 'R2', highest values first
final_df = final_df.sort_values('R2', ascending=False)

# Print the shape of the dataframe
print(final_df.shape)

# Display the top rows of the dataframe
final_df.head()

(440232, 5)


Unnamed: 0.1,Unnamed: 0,Microbe,Interaction Microbe,Feature Importance,R2
0,393656,Bacteria_Patulibacter_brassicae,Bacteria_Sphingomonas_sediminicola,{'Bacteria_Patulibacter_brassicae': 0.08318639...,0.578298
1,344864,Bacteria_Mesorhizobium_opportunistum,Bacteria_Nitrosospira_lacus,{'Bacteria_Mesorhizobium_opportunistum': 0.003...,0.510694
2,250436,Bacteria_Planctomyces_sp._SH-PL14,Bacteria_Nitrosospira_lacus,{'Bacteria_Planctomyces_sp._SH-PL14': 0.087813...,0.503452
3,319314,Bacteria_Nitrosospira_lacus,Bacteria_Mesorhizobium_japonicum,{'Bacteria_Nitrosospira_lacus': 0.333182826547...,0.489512
4,21262,Bacteria_Dokdonella_fugitiva,Bacteria_Nitrosospira_lacus,{'Bacteria_Dokdonella_fugitiva': 0.02120302892...,0.485206


In [3]:
def count_reverse_duplicates(df):
    """Count unordered microbe pairs that occur in more than one row.

    A pair is identified by the sorted tuple of its 'Microbe' and
    'Interaction Microbe' values, so (A, B) and (B, A) are the same pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Microbe' and 'Interaction Microbe'.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns 'sorted_combination' (the pair tuple) and 'counts' (number of
        rows for that pair), restricted to pairs with counts > 1.
    """
    # Build the order-independent keys in a scratch frame instead of adding a
    # helper column to the caller's dataframe; zip avoids a row-wise apply.
    pair_keys = pd.DataFrame({
        'sorted_combination': [
            tuple(sorted(pair))
            for pair in zip(df['Microbe'], df['Interaction Microbe'])
        ]
    })

    # Group by the sorted combination and count
    duplicate_counts = pair_keys.groupby('sorted_combination').size().reset_index(name='counts')

    # Keep only combinations that appear more than once (reverse duplicates)
    return duplicate_counts[duplicate_counts['counts'] > 1]

# Count the unordered pairs that occur more than once in the dataframe
reverse_duplicates = count_reverse_duplicates(final_df)
print(reverse_duplicates.shape[0])

220116


In [4]:
# Number of unique pairs (taken from the top of the frame) kept for the following cells
TOP_N_PAIRS = 2000

# Order-independent key for each pair: (A, B) and (B, A) map to the same tuple.
# Built with zip rather than a row-wise DataFrame.apply, which is slow on a large frame.
final_df['sorted_combination'] = pd.Series(
    [tuple(sorted(pair)) for pair in zip(final_df['Microbe'], final_df['Interaction Microbe'])],
    index=final_df.index,
)

# Drop duplicates based on the sorted combination; keep='first' retains whichever
# orientation of a pair comes first in final_df's current row order.
final_df_deduplicated = (
    final_df
    .drop_duplicates(subset='sorted_combination', keep='first')
    .drop(columns=['sorted_combination'])
    .reset_index(drop=True)  # fresh 0..n-1 index for better data readability
    .head(TOP_N_PAIRS)
)
print(final_df_deduplicated.shape)

(2000, 5)


In [5]:
# Show up to the first 50 rows of the deduplicated pairs
final_df_deduplicated.head(50)

Unnamed: 0.1,Unnamed: 0,Microbe,Interaction Microbe,Feature Importance,R2
0,393656,Bacteria_Patulibacter_brassicae,Bacteria_Sphingomonas_sediminicola,{'Bacteria_Patulibacter_brassicae': 0.08318639...,0.578298
1,344864,Bacteria_Mesorhizobium_opportunistum,Bacteria_Nitrosospira_lacus,{'Bacteria_Mesorhizobium_opportunistum': 0.003...,0.510694
2,250436,Bacteria_Planctomyces_sp._SH-PL14,Bacteria_Nitrosospira_lacus,{'Bacteria_Planctomyces_sp._SH-PL14': 0.087813...,0.503452
3,319314,Bacteria_Nitrosospira_lacus,Bacteria_Mesorhizobium_japonicum,{'Bacteria_Nitrosospira_lacus': 0.333182826547...,0.489512
4,21262,Bacteria_Dokdonella_fugitiva,Bacteria_Nitrosospira_lacus,{'Bacteria_Dokdonella_fugitiva': 0.02120302892...,0.485206
5,384180,Bacteria_Geobacter_uraniireducens,Bacteria_Sphingomonas_sediminicola,{'Bacteria_Geobacter_uraniireducens': 0.008699...,0.481874
6,319083,Bacteria_Nitrosospira_lacus,Bacteria_Rathayibacter_tritici,{'Bacteria_Nitrosospira_lacus': 0.298813971563...,0.481862
7,318785,Bacteria_Nitrosospira_lacus,Bacteria_Nitrosospira_briensis,{'Bacteria_Nitrosospira_lacus': 0.190575376680...,0.470386
8,318995,Bacteria_Nitrosospira_lacus,Bacteria_Mesorhizobium_sp._M9A.F.Ca.ET.002.03.1.2,{'Bacteria_Nitrosospira_lacus': 0.196304484301...,0.469384
9,319064,Bacteria_Nitrosospira_lacus,Bacteria_Rhodoplanes_azumiensis,{'Bacteria_Nitrosospira_lacus': 0.188307947192...,0.466317


In [6]:
# Stack both columns so a microbe is counted whichever side of the pair it appears on
all_microbes = pd.concat([final_df_deduplicated['Microbe'], final_df_deduplicated['Interaction Microbe']])

# Count the occurrences of each microbe (most frequent first)
microbe_counts = all_microbes.value_counts()

# Extract the 10 most common microbes
top_10_microbes = microbe_counts.head(10)
# Backward-compatible alias: the original name said "20" although only 10 entries are taken
top_20_microbes = top_10_microbes
top_10_microbes

Bacteria_Nitrosospira_lacus           566
Bacteria_Longilinea_arvoryzae         311
Bacteria_Microlunatus_aurantiacus     188
Bacteria_Sphingomonas_sediminicola    142
Bacteria_Paraflavitalea_soli          139
Bacteria_Brevitalea_deliciosa          94
Bacteria_Variovorax_paradoxus          90
Bacteria_Nitrosospira_briensis         77
Bacteria_Legionella_pneumophila        63
Bacteria_Povalibacter_uvarum           46
Name: count, dtype: int64

In [7]:
# Keep rows where BOTH 'Microbe' and 'Interaction Microbe' are one of the two microbes of interest
microbes_of_interest = ['Bacteria_Sphingomonas_sediminicola', 'Bacteria_Nitrosospira_lacus']
filtered_df = final_df_deduplicated[
    final_df_deduplicated['Microbe'].isin(microbes_of_interest)
    & final_df_deduplicated['Interaction Microbe'].isin(microbes_of_interest)
].copy()  # independent copy, so later column assignments do not target a slice of final_df_deduplicated

# Display the filtered dataframe
filtered_df

Unnamed: 0.1,Unnamed: 0,Microbe,Interaction Microbe,Feature Importance,R2
39,239171,Bacteria_Sphingomonas_sediminicola,Bacteria_Nitrosospira_lacus,{'Bacteria_Sphingomonas_sediminicola': 0.14189...,0.434074


In [8]:
# Look for rows that share the same ordered ('Microbe', 'Interaction Microbe') pair.
# keep=False flags every member of a duplicated group, not only the repeats.
pair_columns = ['Microbe', 'Interaction Microbe']
is_repeated_pair = final_df_deduplicated.duplicated(subset=pair_columns, keep=False)
duplicates = final_df_deduplicated[is_repeated_pair]

print(duplicates)

Empty DataFrame
Columns: [Unnamed: 0, Microbe, Interaction Microbe, Feature Importance, R2]
Index: []


In [9]:
def to_dict(val):
    """Parse a string holding a Python literal; pass non-strings through.

    Parameters
    ----------
    val : object
        Typically the string form of a dict, e.g. "{'a': 0.5}".

    Returns
    -------
    object
        The parsed literal for a string input, ``{}`` if the string cannot be
        parsed, or ``val`` unchanged when it is not a string.
    """
    if not isinstance(val, str):
        return val
    try:
        return ast.literal_eval(val)
    except (ValueError, SyntaxError):
        # literal_eval raises SyntaxError for malformed text (e.g. a truncated
        # dict) and ValueError for valid syntax that is not a plain literal.
        return {}

# Parse the stringified dictionaries. filtered_df is rebound to the new frame returned by
# assign(), so nothing is written into a slice of another dataframe (the source of
# SettingWithCopyWarning). Re-running is safe because to_dict passes parsed values through.
filtered_df = filtered_df.assign(
    **{'Feature Importance': filtered_df['Feature Importance'].apply(to_dict)}
)

# Running total of importance and number of occurrences for each feature
cumulative_importance = {}
feature_count = {}

# Iterate over the 'Feature Importance' column
for feature_dict in filtered_df['Feature Importance']:
    # to_dict returns non-string values unchanged (e.g. a missing value); those carry no features
    if not isinstance(feature_dict, dict):
        continue
    for feature, importance in feature_dict.items():
        cumulative_importance[feature] = cumulative_importance.get(feature, 0) + importance
        feature_count[feature] = feature_count.get(feature, 0) + 1

# Average importance = cumulative importance / number of rows in which the feature occurred
average_importance = {
    feature: total / feature_count[feature]
    for feature, total in cumulative_importance.items()
}

# Convert the average importance dictionary to a pandas DataFrame, most important feature first
average_importance_df = (
    pd.DataFrame(list(average_importance.items()), columns=['Feature', 'Average Importance'])
    .sort_values(by='Average Importance', ascending=False)
)

# Display the average importance dataframe
average_importance_df

Unnamed: 0,Feature,Average Importance
1,Water trt target,0.511394
3,Bacteria_Nitrosospira_lacus,0.201272
2,N trt target,0.145438
0,Bacteria_Sphingomonas_sediminicola,0.141896


In [10]:
# Show up to the first 20 rows of the average-importance table
average_importance_df.head(20)

Unnamed: 0,Feature,Average Importance
1,Water trt target,0.511394
3,Bacteria_Nitrosospira_lacus,0.201272
2,N trt target,0.145438
0,Bacteria_Sphingomonas_sediminicola,0.141896


# Model stacking

In [11]:
import warnings

from sklearn.model_selection import GridSearchCV, KFold
import xgboost as xgb

In [12]:
# Function to clean column names
def clean_column_names(df):
    """Strip '[', ']' and ',' from the column names of ``df``.

    The columns are replaced on ``df`` itself (in place) and the same object is
    returned, so both ``clean_column_names(df)`` and
    ``df = clean_column_names(df)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels should be cleaned.

    Returns
    -------
    pandas.DataFrame
        ``df`` with cleaned column names. Non-string labels are left as they are.
    """
    strip_chars = str.maketrans('', '', '[],')
    df.columns = [
        # Only strings can be cleaned; leave e.g. integer labels untouched
        col.translate(strip_chars) if isinstance(col, str) else col
        for col in df.columns
    ]
    return df

In [13]:
# One-hot encode the two microbe columns. dtype=bool is requested explicitly so the
# indicator columns are boolean regardless of the pandas version's default.
dummies = pd.get_dummies(final_df[['Microbe', 'Interaction Microbe']], dtype=bool)

# Only the boolean indicator columns are used as features, so there is no need to
# concatenate them back onto final_df first. Work on a copy so that cleaning the
# column names does not rename the columns of `dummies` as well.
X_second_layer = clean_column_names(dummies.copy())

y_second_layer = final_df["R2"]

In [14]:
# Show the first 5 rows of the one-hot encoded feature matrix
X_second_layer.head(5)

Unnamed: 0,Microbe_Bacteria_Achromobacter_insolitus,Microbe_Bacteria_Acidibrevibacterium_fodinaquatile,Microbe_Bacteria_Acidicapsa_acidisoli,Microbe_Bacteria_Acidimicrobium_ferrooxidans,Microbe_Bacteria_Acidipila_dinghuensis,Microbe_Bacteria_Acidipila_rosea,Microbe_Bacteria_Acidisarcina_polymorpha,Microbe_Bacteria_Acidisphaera_rubrifaciens,Microbe_Bacteria_Aciditerrimonas_ferrireducens,Microbe_Bacteria_Acidobacterium_capsulatum,...,Interaction Microbe_Bacteria_Virgibacillus_kekensis,Interaction Microbe_Bacteria_Virgibacillus_necropolis,Interaction Microbe_Bacteria_Virgibacillus_sp._Bac332,Interaction Microbe_Bacteria_Williamsia_limnetica,Interaction Microbe_Bacteria_Woeseia_oceani,Interaction Microbe_Bacteria_Youhaiella_tibetensis,Interaction Microbe_Bacteria_Zavarzinella_formosa,Interaction Microbe_Bacteria_Zhizhongheella_caldifontis,Interaction Microbe_Bacteria_Brevibacterium_frigoritolerans,Interaction Microbe_Bacteria_Desulfotomaculum_salinum
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [15]:
# Seed shared by the model and the CV splitter so repeated runs are comparable
RANDOM_STATE = 42

with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=FutureWarning)

    # Define hyperparameter grid to search
    param_grid = {
        'learning_rate': [0.01, 0.1, 0.3],
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 7]
    }

    # Initialize the model
    second_layer_model = xgb.XGBRegressor(n_jobs=-1, random_state=RANDOM_STATE)

    # An integer cv on a regressor means unshuffled KFold, i.e. contiguous row blocks.
    # final_df was sorted by R2 (the target), so each such fold would cover a different
    # band of target values; shuffle the rows into folds instead.
    cv_splitter = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

    # Initialize and fit GridSearchCV
    grid_search = GridSearchCV(estimator=second_layer_model, param_grid=param_grid, cv=cv_splitter)
    grid_search.fit(X_second_layer, y_second_layer)

    # Extract best estimator and its feature importances
    best_estimator = grid_search.best_estimator_
    importances = best_estimator.feature_importances_

In [16]:
# Importance scores from the best estimator and the matching feature names
importances = best_estimator.feature_importances_
feature_names = X_second_layer.columns

# Pair each score with its feature name and order the pairs from highest to lowest
feature_importance_pairs = list(zip(importances, feature_names))
feature_importance_pairs.sort(reverse=True)

# Keep the 20 highest-scoring pairs
top_20_features = feature_importance_pairs[:20]

# Print them as "name: score"
for importance, feature in top_20_features:
    print(f"{feature}: {importance}")

Microbe_Bacteria_Nitrosospira_lacus: 0.010803837329149246
Interaction Microbe_Bacteria_Nitrosospira_lacus: 0.010774942114949226
Microbe_Bacteria_Nitrosospira_briensis: 0.010164840146899223
Interaction Microbe_Bacteria_Variovorax_paradoxus: 0.010093113407492638
Interaction Microbe_Bacteria_Rhizomicrobium_electricum: 0.008559852838516235
Interaction Microbe_Bacteria_Microlunatus_aurantiacus: 0.00842492189258337
Interaction Microbe_Bacteria_Brevitalea_deliciosa: 0.008417671546339989
Microbe_Bacteria_Povalibacter_uvarum: 0.007811404298990965
Interaction Microbe_Bacteria_Nitrosospira_briensis: 0.007806373760104179
Microbe_Bacteria_Variovorax_paradoxus: 0.007783808279782534
Interaction Microbe_Bacteria_Geobacter_sp._M18: 0.007699323818087578
Microbe_Bacteria_Terrimonas_suqianensis: 0.0075937495566904545
Interaction Microbe_Bacteria_Microlunatus_phosphovorus: 0.007349255960434675
Microbe_Bacteria_Brevitalea_deliciosa: 0.007098400965332985
Microbe_Bacteria_Legionella_pneumophila: 0.00699494546