In [2]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import matthews_corrcoef, accuracy_score

# Input files: one CSV export per assay. Each file is run through the same
# temporal-validation loop below; paths are relative to the working directory.
# NOTE(review): in the recorded output the Rab9 and M1 muscarinic files report
# identical MCC, accuracy and test CID range — verify the two CSVs are not
# copies of the same data.
datasets = [
    "HF_dopamine_D3_receptor_antagonists_CID_SID.csv",
    "HF_Rab9_promoter_activators_CID_SID.csv",
    "HF_CHOP_inhibitors_CID_SID.csv",
    "HF_human_M1_muscarinic_receptor_antagonists_CID_SID.csv",
]

# Accumulator: the loop appends one metrics dict per dataset.
results_summary = []

# Split / model settings, named so they are easy to find and tune.
TRAIN_FRACTION = 0.8   # lowest-CID share of rows used as the 'past' (training) set
RANDOM_STATE = 42      # fixed seed so a Restart & Run All reproduces the table

for file in datasets:
    # 1. Load data
    df = pd.read_csv(file)

    # 2. CHRONOLOGICAL SORTING
    # CID is used as a stand-in for time (assumes a larger CID means a later
    # record — TODO confirm). SID breaks ties so rows sharing a CID always end
    # up in the same order, which keeps the split below deterministic.
    df = df.sort_values(by=['CID', 'SID'], ascending=True)

    # 3. TEMPORAL SPLIT (80% Train / 20% Test)
    # We do not use train_test_split(random) here.
    # We take the first 80% as 'past' and last 20% as 'future'.
    split_index = int(len(df) * TRAIN_FRACTION)
    if 0 < split_index < len(df):
        # Move the cut back to the first row of the CID it lands on, so a
        # single CID is never present in both train and test (that would leak
        # 'future' rows into the 'past', and CID is itself a feature).
        boundary_cid = df['CID'].iloc[split_index]
        split_index = int(df['CID'].searchsorted(boundary_cid, side='left'))
    train_df = df.iloc[:split_index]
    test_df = df.iloc[split_index:]

    # Define features (X) and target (y)
    X_train = train_df[['CID', 'SID']]
    y_train = train_df['target']
    X_test = test_df[['CID', 'SID']]
    y_test = test_df['target']

    cid_range = (
        f"{test_df['CID'].min()}-{test_df['CID'].max()}" if len(test_df) else 'n/a'
    )

    # A dataset that cannot be split, or whose training part holds a single
    # class, gives no meaningful classifier. Record NaN metrics and move on
    # instead of aborting the whole loop and losing the other datasets.
    if train_df.empty or test_df.empty or y_train.nunique() < 2:
        print(f"Skipping model fit for {file}: empty split or single-class training set")
        results_summary.append({
            'Dataset': file,
            'Temporal_MCC': np.nan,
            'Temporal_ACC': np.nan,
            'Test_Set_CID_Range': cid_range
        })
        continue

    # 4. Train Model
    model = XGBClassifier(eval_metric='logloss', random_state=RANDOM_STATE)
    model.fit(X_train, y_train)

    # 5. Predict and Evaluate
    y_pred = model.predict(X_test)
    mcc = matthews_corrcoef(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)

    results_summary.append({
        'Dataset': file,
        'Temporal_MCC': round(mcc, 3),
        'Temporal_ACC': round(acc, 3),
        'Test_Set_CID_Range': cid_range
    })

# 6. Display Results
# Turn the collected per-dataset records into one table, then write the
# heading and the table to stdout, each on its own line.
summary_df = pd.DataFrame(data=results_summary)
print("Temporal Validation Results (Sorted by CID):", summary_df, sep="\n")

Temporal Validation Results (Sorted by CID):
                                             Dataset  Temporal_MCC  \
0    HF_dopamine_D3_receptor_antagonists_CID_SID.csv         0.572   
1            HF_Rab9_promoter_activators_CID_SID.csv         0.298   
2                     HF_CHOP_inhibitors_CID_SID.csv         0.632   
3  HF_human_M1_muscarinic_receptor_antagonists_CI...         0.298   

   Temporal_ACC Test_Set_CID_Range  
0         0.850  9265700-136153826  
1         0.855  7216388-135915051  
2         0.858  6365505-135856003  
3         0.855  7216388-135915051  
