In [2]:
import os
import traceback
from collections import Counter

import pandas as pd

from Feature_extraction.graph_builder import build_graphs_from_files

# ========== Adjustable Parameters ==========
# CSV file to turn into a graph dataset (path is relative to the working directory).
INPUT_FILE = "1_Data/Processed_data_set/Final_merged_data_set/TS.csv"
# Root output directory; passed to the builder as `data.output_base_dir` and reused
# below when reporting the output path, so both always agree.
OUTPUT_BASE_DIR = "3_Graph_Data"
# Feature switches forwarded to the builder as `graph.use_esmc` / `graph.use_aaindex`.
USE_ESMC = False
USE_AAINDEX = True
# Forwarded as `graph.distance_threshold`; reported in Å by the messages below.
DISTANCE_THRESHOLD = 4.0
# Forwarded unchanged to `build_graphs_from_files(force_reprocess=..., validate_output=...)`.
FORCE_REPROCESS = False
VALIDATE_OUTPUT = True


# ========== Helpers ==========
def build_feature_suffix(use_esmc, use_aaindex):
    """Return the feature tag used in the reported output filename.

    Args:
        use_esmc: Whether the "ESMC" tag is included.
        use_aaindex: Whether the "AAIndex" tag is included.

    Returns:
        The enabled tags joined with "+", e.g. "AAIndex" or "ESMC+AAIndex";
        an empty string when both flags are false.
    """
    enabled = []
    if use_esmc:
        enabled.append("ESMC")
    if use_aaindex:
        enabled.append("AAIndex")
    return "+".join(enabled)


def compute_success_rate(stats):
    """Return successful_graphs / total_sequences as a percentage.

    Args:
        stats: Mapping that may contain 'successful_graphs' and 'total_sequences'.

    Returns:
        The percentage as a float, or 0.0 when 'total_sequences' is missing or
        zero (avoids a ZeroDivisionError on an empty dataset).
    """
    total = stats.get('total_sequences', 0)
    if not total:
        return 0.0
    return (stats.get('successful_graphs', 0) / total) * 100


# ========== Processing Pipeline ==========
print("🔬 Processing External Dataset")

if not os.path.exists(INPUT_FILE):
    print(f"❌ File not found: {INPUT_FILE}")
else:
    print(f"✅ File exists: {INPUT_FILE}")

    try:
        # Load data (used here only for the summary printed below; the builder
        # is given the file path, not this DataFrame).
        df = pd.read_csv(INPUT_FILE)
        base_filename = os.path.splitext(os.path.basename(INPUT_FILE))[0]

        print(f"📊 Data rows: {len(df)}, columns: {len(df.columns)}")

        # Check labels and sequences (both summaries are skipped if the column is absent)
        if 'Label' in df.columns:
            label_dist = Counter(df['Label'])
            print(f"🏷️  Label distribution: {dict(label_dist)}")

        if 'Sequence' in df.columns:
            seq_lengths = df['Sequence'].str.len()
            print(f"📏 Sequence lengths: {seq_lengths.min()}-{seq_lengths.max()}, average: {seq_lengths.mean():.1f}")

        # Configuration parameters
        config = {
            'graph': {
                'use_esmc': USE_ESMC,
                'use_aaindex': USE_AAINDEX,
                'distance_threshold': DISTANCE_THRESHOLD
            },
            'data': {
                'output_base_dir': OUTPUT_BASE_DIR,
            }
        }

        print(f"⚙️  Configuration: ESMC={USE_ESMC}, AAIndex={USE_AAINDEX}, distance={DISTANCE_THRESHOLD}Å")

        # Build graph dataset
        print("🚀 Starting graph dataset construction...")
        graphs_iamp, stats_iamp = build_graphs_from_files(
            file_configs=[(INPUT_FILE, None)],
            output_name=base_filename,
            config=config,
            force_reprocess=FORCE_REPROCESS,
            validate_output=VALIDATE_OUTPUT
        )

        # Display results
        if stats_iamp:
            success_rate = compute_success_rate(stats_iamp)
            print(f"📊 Total sequences: {stats_iamp.get('total_sequences', 0)}, successful: {stats_iamp.get('successful_graphs', 0)}, success rate: {success_rate:.1f}%")

        if graphs_iamp:
            # Label distribution
            labels = [graph.y.item() for graph in graphs_iamp]
            label_counts = Counter(labels)
            print(f"🎯 Graph label distribution: {dict(label_counts)}")

            # Save path
            # NOTE(review): this path is reconstructed here rather than returned by
            # the builder; it assumes the builder's naming scheme - TODO confirm
            # against Feature_extraction.graph_builder.
            feature_suffix = build_feature_suffix(USE_ESMC, USE_AAINDEX)
            filename = f"{OUTPUT_BASE_DIR}/{base_filename}/{base_filename}_{feature_suffix}_{DISTANCE_THRESHOLD}A.pkl"
            # Only claim the file was saved if it is actually on disk.
            if os.path.exists(filename):
                print(f"💾 Saved to: {filename}")
            else:
                print(f"⚠️  Expected output file not found: {filename}")

        print("✅ Processing completed!")

    except Exception as e:
        # Best-effort: keep the notebook running, but show the full traceback so
        # failures inside the graph builder can be diagnosed.
        print(f"❌ Processing failed: {e}")
        traceback.print_exc()

🔬 Processing External Dataset
✅ File exists: 1_Data/Processed_data_set/Final_merged_data_set/TS.csv
📊 Data rows: 9297, columns: 7
🏷️  Label distribution: {0: 7894, 1: 1403}
📏 Sequence lengths: 5-50, average: 29.6
⚙️  Configuration: ESMC=False, AAIndex=True, distance=4.0Å
🚀 Starting graph dataset construction...
Graph builder initialized (strict mode - CSV input only)
Using features: ESMC=False, AAIndex=True
Distance threshold: 4.0Å
[32m2025-06-27 15:46:56[0m | [1mINFO    [0m | [36mFeature_extraction.utils[0m:[36msetup_dataset_logging[0m:[36m66[0m - [1mDataset-specific logging initialized: 2_Log/2.2_Feature_extraction/TS/TS_AAIndex_4.0A.log[0m
[32m2025-06-27 15:46:56[0m | [1mINFO    [0m | [36mFeature_extraction.graph_builder[0m:[36mautomated_pipeline[0m:[36m765[0m - [1mAutomated CSV Graph Construction Pipeline (Strict Mode)[0m
[32m2025-06-27 15:46:56[0m | [1mINFO    [0m | [36mFeature_extraction.graph_builder[0m:[36mautomated_pipeline[0m:[36m767[0m - [1

Building graphs-strict mode (AAIdx, 4.0Å): 100%|██████████| 9297/9297 [00:51<00:00, 181.69graph/s, success=9297/9297, time=0.011s, ETA=0s, status=✓] 

[32m2025-06-27 15:47:48[0m | [1mINFO    [0m | [36mFeature_extraction.graph_builder[0m:[36mbuild_graphs_from_csv[0m:[36m645[0m - [1mGraph construction complete: 9297/9297 successful[0m
[32m2025-06-27 15:47:48[0m | [1mINFO    [0m | [36mFeature_extraction.graph_builder[0m:[36mbuild_graphs_from_csv[0m:[36m646[0m - [1mTotal time: 51s[0m





[32m2025-06-27 15:47:48[0m | [1mINFO    [0m | [36mFeature_extraction.graph_builder[0m:[36m_report_statistics[0m:[36m656[0m - [1mDetailed statistics:[0m
[32m2025-06-27 15:47:48[0m | [1mINFO    [0m | [36mFeature_extraction.graph_builder[0m:[36m_report_statistics[0m:[36m657[0m - [1m  Successful graphs: 9297[0m
[32m2025-06-27 15:47:48[0m | [1mINFO    [0m | [36mFeature_extraction.graph_builder[0m:[36m_report_statistics[0m:[36m658[0m - [1m  Failed sequences: 0[0m
[32m2025-06-27 15:47:48[0m | [1mINFO    [0m | [36mFeature_extraction.graph_builder[0m:[36m_report_statistics[0m:[36m659[0m - [1m  Length mismatch errors: 0[0m
[32m2025-06-27 15:47:48[0m | [1mINFO    [0m | [36mFeature_extraction.graph_builder[0m:[36m_report_statistics[0m:[36m660[0m - [1m  Missing PDB: 0[0m
[32m2025-06-27 15:47:48[0m | [1mINFO    [0m | [36mFeature_extraction.graph_builder[0m:[36m_report_statistics[0m:[36m661[0m - [1m  Missing ESMC: 0[0m
[32m2025-