In [5]:
import pandas as pd
from sklearn.metrics import roc_auc_score

def calculate_auc(df, tasks):
    """
    Calculate the ROC AUC for each task in the dataframe.

    For every task the labels are read from the ``actual_{task}`` column and
    the scores from the ``logit_{task}`` column. Scoring is best-effort per
    task: a task whose columns are missing, or whose AUC cannot be computed,
    is reported on stdout and recorded as ``None`` instead of aborting the
    remaining tasks.

    Args:
        df (pd.DataFrame): DataFrame containing ``actual_{task}`` and
            ``logit_{task}`` columns for the requested tasks
        tasks (list): List of task names

    Returns:
        dict: Maps each task name to its AUC score, or to ``None`` when the
            score could not be computed
    """
    auc_scores = {}

    for task in tasks:
        actual_col = f'actual_{task}'
        logit_col = f'logit_{task}'

        # A missing column is handled like any other scoring failure so that
        # one incomplete task does not abort the remaining ones.
        missing = [col for col in (actual_col, logit_col) if col not in df.columns]
        if missing:
            print(f"Error calculating AUC for {task}: missing column(s) {missing}")
            auc_scores[task] = None
            continue

        y_true = df[actual_col]
        y_pred = df[logit_col]

        # Rows without a label cannot be scored; drop them rather than letting
        # a single NaN label fail the whole task.
        # NOTE(review): assumes unlabeled rows are stored as NaN - confirm
        # against the code that writes test_results.csv.
        labelled = y_true.notna()

        try:
            auc_scores[task] = roc_auc_score(y_true[labelled], y_pred[labelled])
        except Exception as e:
            # Deliberately broad: any scoring failure (for example a task with
            # a single class present) is reported and recorded as None.
            print(f"Error calculating AUC for {task}: {e}")
            auc_scores[task] = None

    return auc_scores

# Evaluation setup: the dataset name and the structure variants to score.
dataset = "BBBP"
structure_list = ["precise3d",]

for structure in structure_list:
    print(f"\nStructure: {structure}")

    # Load the test-set results written for this dataset/structure pair.
    results_path = f"/home/amin/equiformer/output/{dataset}-{structure}/test_results.csv"
    df = pd.read_csv(results_path)
    tasks = ["p_np"]

    # One AUC per task; tasks that could not be scored come back as None.
    auc_scores = calculate_auc(df, tasks)

    # Report only the tasks that produced a score.
    for task, auc in auc_scores.items():
        if auc is None:
            continue
        print(f"{task} AUC: {auc:.4f}")


Structure: precise3d
p_np AUC: 0.6950


In [15]:
# Load gdb9.sdf.csv from the qm9-precise3d raw data directory and display the
# mean of its `homo` column as the cell output.
df = pd.read_csv("/home/amin/equiformer/data/qm9-precise3d/raw/gdb9.sdf.csv")
df.loc[:, "homo"].mean()

-0.23997669940620683

In [7]:
def get_tasks(df):
    """
    Derive task names from the prediction columns of a results dataframe.

    A task is any column whose name starts with ``'pred_'``; the task name is
    the remainder of the column name after that prefix. Only ``'pred_'``
    columns are inspected here.

    Args:
        df (pd.DataFrame): Results dataframe with ``pred_{task}`` columns

    Returns:
        list: Task names in column order (empty if there are no ``pred_`` columns)
    """
    prefix = 'pred_'
    # Strip only the leading prefix: str.replace would also remove any later
    # 'pred_' inside the task name and yield a name that matches no column.
    # Non-string column labels cannot carry the prefix and are skipped.
    return [
        col[len(prefix):]
        for col in df.columns
        if isinstance(col, str) and col.startswith(prefix)
    ]

In [9]:
# For each structure variant of the Esol results, print the mean absolute value
# of every task's `diff_` column and the average of those values over all tasks.
# NOTE(review): `diff_{task}` is presumably prediction minus target, which
# would make these numbers mean absolute errors - confirm against the writer.
structure_list=["precise3d","optimized3d","rdkit3d","rdkit2d"]
dataset="Esol"
for structure in structure_list:
    df=pd.read_csv(f"/home/amin/equiformer/scripts/output/{dataset}-{structure}/test_results.csv")
    tasks=get_tasks(df)
    if not tasks:
        # A results file without 'pred_' columns yields no tasks; averaging
        # over zero tasks would raise ZeroDivisionError, so report and move on.
        print(f"structure {structure}: no 'pred_' columns found, skipping")
        continue
    dataset_error=0
    for task in tasks:
        task_error=df[f'diff_{task}'].abs().mean()
        dataset_error+=task_error
        print(f"structure {structure}-{task}:{task_error}")
    # Unweighted average of the per-task errors.
    print(f"structure {structure}:{dataset_error/len(tasks)}")

structure precise3d-measured log solubility in mols per litre:0.6990828288679246
structure precise3d:0.6990828288679246
structure optimized3d-measured log solubility in mols per litre:0.7139583149469027
structure optimized3d:0.7139583149469027
structure rdkit3d-measured log solubility in mols per litre:0.6585405857247787
structure rdkit3d:0.6585405857247787
structure rdkit2d-measured log solubility in mols per litre:0.6637784696371681
structure rdkit2d:0.6637784696371681


86.47916823550662