In [1]:
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import polars as pl

In [2]:
from extra_code.preprocess import *
from extra_code.qsar_analysis import *

2025-07-05 23:27:28 - rdkit - INFO - Enabling RDKit 2024.03.5 jupyter extensions


=== splitters.py loading ===
✅ RDKit successfully imported and working
✅ FilterCatalog available
✅ Advanced RDKit features available

=== RDKit Status ===
RDKIT_AVAILABLE: True
RDKIT_ADVANCED: True
FILTER_CATALOG_AVAILABLE: True
=== splitters.py loaded ===

✓ Successfully imported ConcurrentQSARAnalyzer
✓ Successfully imported configuration


In [3]:
# Load all data files through `load_data`, which is not defined in this
# notebook and comes from the wildcard `extra_code` imports above.
# The result is a dict keyed by dataset name (e.g. 'SAMPL', 'huusk').
# NOTE(review): the values look like polars DataFrames, since a later cell
# calls `.filter(pl.col(...))` on them — confirm against `load_data`.
df_dict = load_data()

    Processing huusk.csv with encoding: utf-8
    Processing curated-solubility-dataset.csv with encoding: ascii
    Successfully read huusk.csv: (1291, 6)
    Columns: ['ID', 'Name', 'InChI', 'InChIKey', 'SMILES', 'Solubility']
    Successfully read curated-solubility-dataset.csv: (9982, 26)
    Columns: ['ID', 'Name', 'InChI', 'InChIKey', 'SMILES', 'Solubility', 'SD', 'Ocurrences', 'Group', 'MolWt', 'MolLogP', 'MolMR', 'HeavyAtomCount', 'NumHAcceptors', 'NumHDonors', 'NumHeteroatoms', 'NumRotatableBonds', 'NumValenceElectrons', 'NumAromaticRings', 'NumSaturatedRings', 'NumAliphaticRings', 'RingCount', 'TPSA', 'LabuteASA', 'BalabanJ', 'BertzCT']

Data Loading Complete!
Success: 8 files
Failed: 0 files
Total data: 72,841 rows
Memory usage: 3.0 MB
System memory: 38.5% used

Usage example:
  df_dict = load_data()
  Lovric2020_logS0_df = df_dict['Lovric2020_logS0']
  print(Lovric2020_logS0_df.head())


In [4]:
# List every loaded dataset with its total row count and the number of rows
# whose two target columns ("target_x" and "target_y") are both non-null.
# The predicate is the same for every dataset, so it is built once up front
# instead of being re-created on each loop iteration.
both_targets_present = (
    pl.col("target_x").is_not_null() & pl.col("target_y").is_not_null()
)

print("\n📋 Available datasets:")  # plain string: there is nothing to interpolate
for idx, (name, df) in enumerate(df_dict.items()):
    # `.height` is the row count of a polars DataFrame (same value as `.shape[0]`).
    valid_samples = df.filter(both_targets_present).height
    print(f" [{idx}] {name}: {df.height} total, {valid_samples} valid samples")


📋 Available datasets:
 [0] Lovric2020_logS0: 829 total, 829 valid samples
 [1] SAMPL: 642 total, 642 valid samples
 [2] ws496_logS: 496 total, 496 valid samples
 [3] delaney-processed: 1128 total, 1128 valid samples
 [4] huusk: 1291 total, 1291 valid samples
 [5] Lipophilicity: 4200 total, 4200 valid samples
 [6] curated-solubility-dataset: 9982 total, 9982 valid samples
 [7] BigSolDB: 54273 total, 54273 valid samples


In [5]:
# Display the dataset names available as keys of `df_dict`.
df_dict.keys()

dict_keys(['Lovric2020_logS0', 'SAMPL', 'ws496_logS', 'delaney-processed', 'huusk', 'Lipophilicity', 'curated-solubility-dataset', 'BigSolDB'])

In [6]:
# Dataset names handed to the analysis as test-only; the recorded analysis
# summary labels exactly these four entries "(test-only)".
test_only_datasets = [
    "SAMPL",
    "Lipophilicity",
    "curated-solubility-dataset",
    "BigSolDB",
]

In [7]:
# Run the enhanced QSAR / applicability-domain (AD) analysis over every loaded
# dataset; results and per-split directories are written under
# result/1_preprocess (see the "Created directory" / "Results saved" lines in
# the recorded output).
# NOTE: this cell is expensive — the recorded run reports ~3659 s (61 minutes).
analyzer = run_enhanced_analysis(
    df_dict=df_dict,
    # Names listed here are reported as "(test-only)" in the analysis summary.
    test_only_datasets=test_only_datasets,
    output_dir="result/1_preprocess",
    performance_mode=False,
    ad_analysis_mode='all', # with 'all', the recorded run saved strict, flexible and adaptive AD results in turn
    # NOTE(review): presumably an upper bound on rows sampled per dataset/split
    # (BigSolDB has 54,273 rows) — confirm against run_enhanced_analysis.
    max_samples=30000,
    show_recommendations=False,
)

🚀 Concurrent QSAR Analyzer Initialized
📈 Performance Mode: ✗ Disabled
🔧 Parallel Jobs: 3
🔧 Backend: threading
💾 Max Samples: 30000
💾 Memory Limit: 7.8 GB
    Created directory: result/1_preprocess/train/rm
    Created directory: result/1_preprocess/train/sc
    Created directory: result/1_preprocess/train/cs
    Created directory: result/1_preprocess/train/cl
    Created directory: result/1_preprocess/train/pc
    Created directory: result/1_preprocess/train/ac
    Created directory: result/1_preprocess/train/sa
    Created directory: result/1_preprocess/train/ti
    Created directory: result/1_preprocess/train/en
    Created directory: result/1_preprocess/train/to
    Created directory: result/1_preprocess/test/rm
    Created directory: result/1_preprocess/test/sc
    Created directory: result/1_preprocess/test/cs
    Created directory: result/1_preprocess/test/cl
    Created directory: result/1_preprocess/test/pc
    Created directory: result/1_preprocess/test/ac
    Created director

2025-07-05 23:59:47 - extra_code.qsar_analysis.utils - INFO - Results saved to result/1_preprocess/ad_analysis/by_dataset/all_datasets_results/ad_results_strict.json



🔄 Analyzing with FLEXIBLE mode

🔬 Step 4: Performing AD analysis (mode: flexible)
    Processing Lovric2020_logS0/random... (1/40)
      Sampled: 663 train, 166 test (actual test: 166)
          Calculating regulatory-approved AD methods...
            leverage... ✓
            descriptor_range... ✓
            euclidean_distance... ✓
            dmodx... ✓
            knn_distance... ✓
    Processing Lovric2020_logS0/time_series... (2/40)
      Sampled: 663 train, 166 test (actual test: 166)
          Calculating regulatory-approved AD methods...
            leverage... ✓
            euclidean_distance... ✓
            descriptor_range... ✓
            dmodx... ✓
            knn_distance... ✓
    Processing Lovric2020_logS0/scaffold... (3/40)
      Sampled: 645 train, 184 test (actual test: 184)
          Calculating regulatory-approved AD methods...
            leverage... ✓
            descriptor_range... ✓
            euclidean_distance... ✓
            dmodx... ✓
            knn_

2025-07-06 00:14:17 - extra_code.qsar_analysis.utils - INFO - Results saved to result/1_preprocess/ad_analysis/by_dataset/all_datasets_results/ad_results_flexible.json



🔄 Analyzing with ADAPTIVE mode

🔬 Step 4: Performing AD analysis (mode: adaptive)
    Processing Lovric2020_logS0/random... (1/40)
      Sampled: 663 train, 166 test (actual test: 166)
          Calculating regulatory-approved AD methods...
            leverage... ✓
            descriptor_range... ✓
            euclidean_distance... ✓
            dmodx... ✓
            knn_distance... ✓
    Processing Lovric2020_logS0/time_series... (2/40)
      Sampled: 663 train, 166 test (actual test: 166)
          Calculating regulatory-approved AD methods...
            leverage... ✓
            descriptor_range... ✓
            euclidean_distance... ✓
            dmodx... ✓
            knn_distance... ✓
    Processing Lovric2020_logS0/scaffold... (3/40)
      Sampled: 645 train, 184 test (actual test: 184)
          Calculating regulatory-approved AD methods...
            leverage... ✓
            descriptor_range... ✓
            euclidean_distance... ✓
            dmodx... ✓
            knn_

2025-07-06 00:28:26 - extra_code.qsar_analysis.utils - INFO - Results saved to result/1_preprocess/ad_analysis/by_dataset/all_datasets_results/ad_results_adaptive.json



📊 Creating mode comparison visualizations
  ✓ Mode comparison visualizations created
  ✓ All modes summary report saved: all_modes_summary_report.txt

📊 ANALYSIS MODE INFORMATION:
  • Scientific Consensus AD standards
  • Reference: Sahigara et al. (2012), Roy et al. (2015) - Practical AD implementation

🎉 ENHANCED QSAR ANALYSIS COMPLETE!

📊 DATASETS ANALYZED:
  • Lovric2020_logS0: 829 samples
  • SAMPL: 642 samples (test-only)
  • ws496_logS: 496 samples
  • delaney-processed: 1,128 samples
  • huusk: 1,291 samples
  • Lipophilicity: 4,200 samples (test-only)
  • curated-solubility-dataset: 9,982 samples (test-only)
  • BigSolDB: 54,273 samples (test-only)

⏰ Total analysis time: 3658.93 seconds (61.0 minutes)
💾 Peak memory usage: 1708.93 MB
💾 Average memory usage: 1446.35 MB
💾 Memory increase: 1077.50 MB

✅ KEY FEATURES IMPLEMENTED:
  • Scientific Consensus AD standards
  • Reference: Sahigara et al. (2012), Roy et al. (2015) - Practical AD implementation
  • Applicability Domain An