In [1]:
import ast
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the collated results table.
final_df = pd.read_csv('../collated_results3.csv')

# Keep only rows where BOTH 'Microbe' and 'Interaction Microbe' start with 'Bacteria'.
# na=False makes a missing name count as "does not start with 'Bacteria'", so each mask is
# strictly boolean and the row filter cannot fail on NaN entries.
microbe_is_bacteria = final_df['Microbe'].str.startswith('Bacteria', na=False)
partner_is_bacteria = final_df['Interaction Microbe'].str.startswith('Bacteria', na=False)
final_df = final_df[microbe_is_bacteria & partner_is_bacteria]

# Sort by the 'R2' column, highest values first.
final_df = final_df.sort_values('R2', ascending=False)

# Print the shape of the dataframe
print(final_df.shape)

# Display the top rows of the dataframe
final_df.head()

(511940, 5)


Unnamed: 0.1,Unnamed: 0,Microbe,Interaction Microbe,Feature Importance,R2
0,307964,Bacteria_Candidatus_Saccharibacteria_bacterium...,Bacteria_Sphingomonas_sediminicola,{'Bacteria_Candidatus_Saccharibacteria_bacteri...,0.547459
1,3909,Bacteria_Nitrosospira_lacus,Bacteria_Reyranella_terrae,{'Bacteria_Nitrosospira_lacus': 0.205791482032...,0.513115
2,4040,Bacteria_Nitrosospira_lacus,Bacteria_Methylibium_petroleiphilum,{'Bacteria_Nitrosospira_lacus': 0.226117117156...,0.512003
3,4293,Bacteria_Nitrosospira_lacus,Bacteria_Bradyrhizobium_mercantei,{'Bacteria_Nitrosospira_lacus': 0.229663619571...,0.51119
4,1669,Bacteria_Devosia_sp._I507,Bacteria_Nitrosospira_lacus,{'Bacteria_Devosia_sp._I507': 0.00558376083560...,0.495946


In [3]:
def count_reverse_duplicates(df):
    """Count unordered (Microbe, Interaction Microbe) pairs that occur on more than one row.

    Two rows belong to the same pair when they name the same two microbes,
    regardless of which one is in 'Microbe' and which is in
    'Interaction Microbe'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Microbe' and 'Interaction Microbe'.
        The frame is only read; no column is added to it.

    Returns
    -------
    pandas.DataFrame
        Columns 'sorted_combination' (the alphabetically sorted pair as a
        tuple) and 'counts' (number of rows for that pair), restricted to
        pairs with counts > 1.
    """
    # Build the order-independent keys in a standalone Series instead of writing a helper
    # column onto the caller's frame. zip() also copes with an empty frame, where a
    # row-wise DataFrame.apply would not yield a Series.
    pair_keys = pd.Series(
        [tuple(sorted(pair)) for pair in zip(df['Microbe'], df['Interaction Microbe'])],
        index=df.index,
        dtype=object,
    )
    keyed = pd.DataFrame({'sorted_combination': pair_keys})

    # Number of rows per unordered pair.
    duplicate_counts = keyed.groupby('sorted_combination').size().reset_index(name='counts')

    # Pairs that appear more than once.
    return duplicate_counts[duplicate_counts['counts'] > 1]

# Find the unordered pairs that occur on more than one row of final_df
# and report how many such pairs there are.
reverse_duplicates = count_reverse_duplicates(final_df)
print(reverse_duplicates.shape[0])

255970


In [4]:
# Order-independent key for every row: the two microbe names as an alphabetically sorted tuple.
# The keys live in their own Series (aligned on final_df's index) so final_df itself is not
# modified, and zip() avoids a slow row-wise DataFrame.apply over the whole table.
pair_keys = pd.Series(
    [tuple(sorted(pair)) for pair in zip(final_df['Microbe'], final_df['Interaction Microbe'])],
    index=final_df.index,
    dtype=object,
)

# Keep the first row of each unordered pair (rows keep final_df's current order),
# drop the helper key again and renumber the rows from 0.
final_df_deduplicated = (
    final_df
    .assign(sorted_combination=pair_keys)
    .drop_duplicates(subset='sorted_combination', keep='first')
    .drop(columns=['sorted_combination'])
    .reset_index(drop=True)
)

# Persist the complete deduplicated table before truncating it.
final_df_deduplicated.to_csv('Data/SCINet/collated_deduplicated_results.csv')

# From here on, work with the first 2000 rows only.
final_df_deduplicated = final_df_deduplicated.iloc[:2000, :]
print(final_df_deduplicated.shape)

(2000, 5)


In [5]:
# Show the first 50 rows of the truncated, deduplicated table.
final_df_deduplicated.head(50)

Unnamed: 0.1,Unnamed: 0,Microbe,Interaction Microbe,Feature Importance,R2
0,307964,Bacteria_Candidatus_Saccharibacteria_bacterium...,Bacteria_Sphingomonas_sediminicola,{'Bacteria_Candidatus_Saccharibacteria_bacteri...,0.547459
1,3909,Bacteria_Nitrosospira_lacus,Bacteria_Reyranella_terrae,{'Bacteria_Nitrosospira_lacus': 0.205791482032...,0.513115
2,4040,Bacteria_Nitrosospira_lacus,Bacteria_Methylibium_petroleiphilum,{'Bacteria_Nitrosospira_lacus': 0.226117117156...,0.512003
3,4293,Bacteria_Nitrosospira_lacus,Bacteria_Bradyrhizobium_mercantei,{'Bacteria_Nitrosospira_lacus': 0.229663619571...,0.51119
4,1669,Bacteria_Devosia_sp._I507,Bacteria_Nitrosospira_lacus,{'Bacteria_Devosia_sp._I507': 0.00558376083560...,0.495946
5,4012,Bacteria_Nitrosospira_lacus,Bacteria_Arthrobacter_sp._YN,{'Bacteria_Nitrosospira_lacus': 0.232273489090...,0.495326
6,75620,Bacteria_Longilinea_arvoryzae,Bacteria_Nitrosospira_briensis,{'Bacteria_Longilinea_arvoryzae': 0.2746430979...,0.494154
7,493640,Bacteria_Paludisphaera_borealis,Bacteria_Nitrosospira_lacus,{'Bacteria_Paludisphaera_borealis': 0.02989450...,0.491315
8,4064,Bacteria_Nitrosospira_lacus,Bacteria_Actinoplanes_sp._N902-109,{'Bacteria_Nitrosospira_lacus': 0.236549634547...,0.486672
9,4201,Bacteria_Nitrosospira_lacus,Bacteria_Nocardioides_islandensis,{'Bacteria_Nitrosospira_lacus': 0.224949141406...,0.478289


In [6]:
# Stack the 'Microbe' and 'Interaction Microbe' columns into one long Series of names
microbe_columns = [final_df_deduplicated[col] for col in ('Microbe', 'Interaction Microbe')]
all_microbes = pd.concat(microbe_columns)

# How often each microbe appears, on either side of a pair
microbe_counts = all_microbes.value_counts()

# The 10 most frequent microbes.
# NOTE: despite its name, top_20_microbes holds 10 entries, not 20.
top_20_microbes = microbe_counts.iloc[:10]
top_20_microbes

Bacteria_Nitrosospira_lacus                   597
Bacteria_Longilinea_arvoryzae                 234
Bacteria_Microlunatus_aurantiacus             157
Bacteria_Sphingomonas_sediminicola            142
Bacteria_Paraflavitalea_soli                  116
Bacteria_Sterolibacteriaceae_bacterium_M52    106
Bacteria_Brevitalea_deliciosa                  94
Bacteria_Variovorax_paradoxus                  75
Bacteria_Nitrosospira_briensis                 66
Bacteria_Povalibacter_uvarum                   50
Name: count, dtype: int64

In [7]:
# Keep the rows in which BOTH 'Microbe' and 'Interaction Microbe' are one of the two target
# microbes, i.e. the interaction between these two species.
target_microbes = ['Bacteria_Sphingomonas_sediminicola', 'Bacteria_Nitrosospira_lacus']
filtered_df = final_df_deduplicated[
    final_df_deduplicated['Microbe'].isin(target_microbes)
    & final_df_deduplicated['Interaction Microbe'].isin(target_microbes)
].copy()  # independent copy: columns of filtered_df can be reassigned without SettingWithCopyWarning

# Display the filtered dataframe
filtered_df

Unnamed: 0.1,Unnamed: 0,Microbe,Interaction Microbe,Feature Importance,R2
24,324444,Bacteria_Sphingomonas_sediminicola,Bacteria_Nitrosospira_lacus,{'Bacteria_Sphingomonas_sediminicola': 0.14176...,0.450309


In [8]:
# Checking for duplicates
duplicates = final_df_deduplicated[final_df_deduplicated.duplicated(subset=['Microbe', 'Interaction Microbe'], keep=False)]

print(duplicates)

Empty DataFrame
Columns: [Unnamed: 0, Microbe, Interaction Microbe, Feature Importance, R2]
Index: []


In [9]:
def to_dict(val):
    """Parse the string representation of a dictionary into a real ``dict``.

    Parameters
    ----------
    val : object
        Typically a string such as ``"{'feature': 0.25}"``. Non-string values
        are returned unchanged.

    Returns
    -------
    object
        The parsed dictionary for a string input, an empty dict when the
        string cannot be parsed or does not describe a dictionary, or ``val``
        itself when it is not a string.
    """
    if not isinstance(val, str):
        return val
    try:
        parsed = ast.literal_eval(val)
    except (ValueError, SyntaxError, TypeError):
        # literal_eval raises SyntaxError for truncated/garbled text and TypeError for
        # e.g. unhashable keys, not only ValueError; all of them mean "not a usable dict".
        return {}
    # A string holding some other literal (a number, a list, ...) is not a dictionary either.
    return parsed if isinstance(parsed, dict) else {}

# Parse the stringified dictionaries in 'Feature Importance' and write the parsed
# values back into the same column through .loc
parsed_importances = filtered_df['Feature Importance'].apply(to_dict)
filtered_df.loc[:, 'Feature Importance'] = parsed_importances

# Running total of the importance values and number of occurrences, per feature
cumulative_importance = {}
feature_count = {}

# Walk over every dictionary in the 'Feature Importance' column
for importance_by_feature in filtered_df['Feature Importance']:
    for name, value in importance_by_feature.items():
        if name not in cumulative_importance:
            # First time this feature is seen: start its total and its counter
            cumulative_importance[name] = value
            feature_count[name] = 1
            continue
        # Feature seen before: extend its total and bump its counter
        cumulative_importance[name] += value
        feature_count[name] += 1

# Average importance per feature = total / number of occurrences
average_importance = {}
for name, total in cumulative_importance.items():
    average_importance[name] = total / feature_count[name]

# Tabulate the averages, largest first
average_importance_df = pd.DataFrame(
    list(average_importance.items()),
    columns=['Feature', 'Average Importance'],
)
average_importance_df = average_importance_df.sort_values(by='Average Importance', ascending=False)

# Display the average importance dataframe
average_importance_df

Unnamed: 0,Feature,Average Importance
1,Water trt target,0.507231
2,N trt target,0.18128
3,Bacteria_Nitrosospira_lacus,0.169727
0,Bacteria_Sphingomonas_sediminicola,0.141762


# Model stacking

In [11]:
import warnings

import xgboost as xgb
from sklearn.model_selection import GridSearchCV, KFold

In [12]:
# Function to clean column names
def clean_column_names(df):
    """Remove the characters '[', ']' and ',' from every column label of ``df``.

    The labels are replaced on ``df`` itself, and that same object is
    returned so the call can be used in an assignment.
    """
    # Translation table that deletes the three unwanted characters.
    strip_table = str.maketrans('', '', '[],')
    df.columns = [label.translate(strip_table) for label in df.columns]
    return df

In [13]:
# One-hot encode the two microbe name columns
microbe_columns = ['Microbe', 'Interaction Microbe']
dummies = pd.get_dummies(final_df[microbe_columns])

# Put the indicator columns next to the remaining columns of final_df,
# then keep only the boolean-typed columns as model features
remaining_columns = final_df.drop(columns=microbe_columns)
X_second_layer = pd.concat([remaining_columns, dummies], axis=1).select_dtypes(include=['bool'])

# Strip '[', ']' and ',' from the feature names
X_second_layer = clean_column_names(X_second_layer)

# Regression target: the 'R2' column of final_df
y_second_layer = final_df["R2"]

In [14]:
# Preview the first five rows of the feature matrix.
X_second_layer.head(5)

Unnamed: 0,Microbe_Bacteria_Achromobacter_insolitus,Microbe_Bacteria_Acidibacter_ferrireducens,Microbe_Bacteria_Acidibrevibacterium_fodinaquatile,Microbe_Bacteria_Acidicapsa_acidisoli,Microbe_Bacteria_Acidimicrobium_ferrooxidans,Microbe_Bacteria_Acidipila_dinghuensis,Microbe_Bacteria_Acidipila_rosea,Microbe_Bacteria_Acidisarcina_polymorpha,Microbe_Bacteria_Acidisphaera_rubrifaciens,Microbe_Bacteria_Aciditerrimonas_ferrireducens,...,Interaction Microbe_Bacteria_Virgibacillus_necropolis,Interaction Microbe_Bacteria_Virgibacillus_sp._Bac332,Interaction Microbe_Bacteria_Williamsia_limnetica,Interaction Microbe_Bacteria_Woeseia_oceani,Interaction Microbe_Bacteria_Youhaiella_tibetensis,Interaction Microbe_Bacteria_Zavarzinella_formosa,Interaction Microbe_Bacteria_Zhizhongheella_caldifontis,Interaction Microbe_Bacteria_Brevibacterium_frigoritolerans,Interaction Microbe_Bacteria_Desulfotomaculum_salinum,Interaction Microbe_Bacteria_Polyangium_brachysporum
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [None]:
with warnings.catch_warnings():
    # Silence FutureWarning messages for the duration of the search only.
    warnings.filterwarnings("ignore", category=FutureWarning)

    # Hyperparameter grid to search (3 x 3 x 3 = 27 combinations)
    param_grid = {
        'learning_rate': [0.01, 0.1, 0.3],
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 7]
    }

    # Initialize the model
    second_layer_model = xgb.XGBRegressor(n_jobs=-1)

    # final_df is sorted by 'R2' in descending order and y_second_layer is taken straight
    # from it. An integer `cv` on a regressor means KFold without shuffling, i.e. each fold
    # would be one contiguous band of target values that the other folds never cover.
    # Shuffle the rows into folds instead; the fixed seed keeps the split reproducible.
    cv_splitter = KFold(n_splits=5, shuffle=True, random_state=42)

    # Initialize and fit GridSearchCV
    grid_search = GridSearchCV(estimator=second_layer_model, param_grid=param_grid, cv=cv_splitter)
    grid_search.fit(X_second_layer, y_second_layer)

    # Extract best estimator and its feature importances
    best_estimator = grid_search.best_estimator_
    importances = best_estimator.feature_importances_

In [None]:
# Importance score of every feature in the best estimator, and the matching feature names
importances = best_estimator.feature_importances_
feature_names = X_second_layer.columns

# (importance, name) pairs ordered from most to least important
feature_importance_pairs = list(zip(importances, feature_names))
feature_importance_pairs.sort(reverse=True)

# The 20 highest-ranked features
top_20_features = feature_importance_pairs[:20]

# Print them as "name: importance", one per line
for score, name in top_20_features:
    print(f"{name}: {score}")