In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from pyod.models.iforest import IForest


In [13]:
# Load the dataset from CSV. The path is relative, so it is resolved against
# the kernel's current working directory.
df = pd.read_csv('dataset.csv')

In [14]:
# Keep only complete rows: a row is dropped if ANY of its columns is missing,
# not just the feature columns listed below.
df_cleaned = df.dropna(axis=0, how='any')

# Columns fed to the anomaly detector, grouped by name prefix.
# The order is significant: it fixes the column order of the feature matrix.
feature_columns = [
    # chem_* columns
    'chem_num_atoms',
    'chem_volume',
    'chem_density',
    'chem_avg_atomic_mass',
    'chem_avg_electronegativity',
    'chem_electronegativity_variance',
    'chem_metal_fraction',
    'chem_num_unique_elements',
    'chem_metal_atom_count',
    'chem_volume_per_atom',
    # geo_* columns
    'geo_surface_area_m2g',
    'geo_surface_area_m2cm3',
    'geo_void_fraction',
    'geo_pld',
    'geo_lcd',
    # link_* columns
    'link_linker_atom_fraction',
    'link_linker_bond_length_mean',
    'link_linker_bond_length_std',
    'link_metal_coord_number_mean',
    # topo_* columns
    'topo_avg_node_connectivity',
    'topo_avg_ring_size',
    'topo_coordination_number_mean',
    'topo_degree_assortativity',
    'topo_degree_centrality_mean',
    'topo_graph_density',
    'topo_graph_entropy',
    'topo_graph_transitivity',
    'topo_largest_cc_fraction',
    'topo_node_connectivity_std',
    'topo_num_connected_components',
    'topo_num_edges',
    'topo_num_nodes',
]

In [15]:
# Pull out the model inputs and standardise them in a single step.
# The fitted scaler stays available as scaler_X so the same transformation
# can be reapplied to other data later.
X = df_cleaned[feature_columns]
scaler_X = StandardScaler()
X_scaled = scaler_X.fit_transform(X)


In [16]:
# Independent copy of the identifier column; it keeps df_cleaned's row index.
mof_ids = df_cleaned.loc[:, 'MOF_ID'].copy()

In [17]:

# Expected proportion of outliers in the data; passed to the detector as its
# `contamination` setting.
contamination_rate = 0.05

# Isolation Forest hyper-parameters. The fixed seed makes runs reproducible.
iforest_params = {
    'n_estimators': 100,
    'contamination': contamination_rate,
    'random_state': 42,
}
clf = IForest(**iforest_params)

# Fit on the standardised features. Kept as the cell's final expression so
# the notebook echoes the fitted estimator.
clf.fit(X_scaled)




IForest(behaviour='old', bootstrap=False, contamination=0.05,
    max_features=1.0, max_samples='auto', n_estimators=100, n_jobs=1,
    random_state=42, verbose=0)

In [21]:
# Score the same rows the detector was fitted on. Under PyOD's convention a
# larger decision_function value means "more anomalous", and predict() returns
# 1 for outliers and 0 for inliers.
anomaly_labels = clf.predict(X_scaled)
anomaly_scores = clf.decision_function(X_scaled)

# NOTE(review): the columns are called "deep_forest_*", but the values come
# from the Isolation Forest (`clf`) fitted above. The names are left unchanged
# because other cells reference them.
# `mof_ids` is a Series, so the resulting frame inherits its row index.
result_columns = {
    'mof_id': mof_ids,
    'deep_forest_score': anomaly_scores,
    'deep_forest_anomaly_label': anomaly_labels,
}
results_df = pd.DataFrame(result_columns)

In [22]:
# Rank rows from the highest anomaly score to the lowest.
results_df_sorted = results_df.sort_values('deep_forest_score', ascending=False)

In [23]:
# Show the ten highest-scoring rows of the already-sorted results.
top_anomalies = results_df_sorted.head(10)
print("\n--- Top 10 Most Anomalous MOFs (according to Deep forest) ---")
print(top_anomalies)


--- Top 10 Most Anomalous MOFs (according to Deep forest) ---
                             mof_id  deep_forest_score  \
2844                   FORXIU_clean           0.165616   
25696                  WOMXED_clean           0.155426   
937    c6ce00407e_c6ce00407e6_clean           0.151795   
936    c6ce00407e_c6ce00407e5_clean           0.151279   
406                    AVAJUE_clean           0.150909   
23348                  RIVDEH_clean           0.147690   
19690                  JALLOA_clean           0.147291   
20335                LAFRAN01_clean           0.143713   
116                    ADUROI_clean           0.140370   
24469                  UGOCAW_clean           0.139359   

       deep_forest_anomaly_label  
2844                           1  
25696                          1  
937                            1  
936                            1  
406                            1  
23348                          1  
19690                          1  
20335             