In [None]:
# Data Preprocessing Pipeline for Molecular Solubility Prediction
# ==============================================================
#
# This notebook implements a comprehensive preprocessing pipeline for molecular datasets
# including data loading, splitting, feature generation, and applicability domain analysis.
#
# Key Features:
# - Multiple molecular dataset loading
# - Various data splitting strategies (random, scaffold, chemical space, etc.)
# - Molecular descriptor and fingerprint generation
# - Applicability Domain (AD) analysis
# - Visualization and statistical analysis

import os
import shutil
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl

In [None]:
# Import custom preprocessing and QSAR analysis modules
# These modules provide:
# - Data loading utilities
# - Molecular feature extraction
# - Data splitting strategies
# - Applicability domain analysis
# - Statistical metrics and visualizations

from extra_code.preprocess import *
from extra_code.qsar_analysis import *

In [None]:
# Load every molecular dataset
# ============================
#
# load_data() comes from the project modules star-imported earlier in this
# notebook. According to the original notes it:
#   1. scans the data/ directory for CSV files,
#   2. loads the molecular datasets containing SMILES and solubility values,
#   3. returns a dictionary mapping dataset name -> polars DataFrame.
# (NOTE(review): not verifiable from this notebook - confirm against the
# load_data() implementation.)
#
# IMPORTANT: pre-split 'train' / 'test' entries must not be treated as datasets.
# load_data() is expected to skip them; the guard below drops them from the
# dictionary if they were loaded anyway.
df_dict = load_data()

# Defensive clean-up: make sure no pre-split data is carried forward.
if any(split_name in df_dict for split_name in ('train', 'test')):
    print("⚠️ WARNING: 'train' or 'test' found in datasets. Removing...")
    for _split_name in ('train', 'test'):
        # pop() with a default never raises when only one of the keys exists
        df_dict.pop(_split_name, None)
    print("✓ Cleaned dataset dictionary")

# Summary of what is available for the rest of the notebook
print(f"\n📊 Loaded {len(df_dict)} datasets: {list(df_dict.keys())}")

In [None]:
# Display the names (dictionary keys) of the loaded datasets.
# The bare expression on the last line is rendered as the cell's output.
# Per the notebook's notes these are the molecular solubility datasets
# available for analysis.
df_dict.keys()

In [None]:
# Test-only datasets
# ==================
#
# Dataset names listed here are reserved exclusively for testing, to evaluate
# model generalization. They are intended NOT to be used during training or
# hyperparameter optimization; the list is passed to the analysis pipeline as
# its `test_only_datasets` argument.
#
# Leave the list empty if no dataset should be held out this way.
test_only_datasets = [
    'SAMPL',                       # SAMPL challenge dataset
    'Lipophilicity',               # Lipophilicity dataset
    'curated-solubility-dataset',  # Curated solubility collection
    'BigSolDB',                    # Large solubility database
]

In [None]:
# Main Analysis Pipeline Configuration
# ===================================
#
# This cell configures and runs the main preprocessing and analysis pipeline.
#
# Key Components:
# 1. Output Directory Management
#    - All results are saved to result/1_preprocess_enhanced/
#    - This cell never deletes or writes anything under data/
#    - FORCE_FULL_ANALYSIS decides between a clean re-run and keeping
#      existing splits
#
# 2. Analysis Modes:
#    - Full Analysis: Complete train/test splitting + AD analysis + visualizations
#    - AD-Only: Intended for the case where splits already exist.
#      NOT IMPLEMENTED YET - that branch only prints guidance, runs nothing,
#      and leaves `analyzer` set to None.
#
# 3. Applicability Domain (AD) Modes:
#    - Strict: Conservative predictions only within training space
#    - Flexible: Balanced approach for practical applications
#    - Adaptive: Dynamic thresholds based on local density
#
# 4. Generated Outputs:
#    - Train/test splits for 9 different strategies (rm, sc, cs, cl, pc, ac, sa, ti, en)
#    - Molecular fingerprints and descriptors
#    - AD analysis results and visualizations
#    - Statistical summaries and decision reports
#
# NOTE: `os` and `shutil` are imported in the notebook's top import cell.

# Output directory for analysis results
output_dir = "result/1_preprocess_enhanced"

# IMPORTANT: NEVER touch the data/ directory - it contains original data!
# Train/test splits will be saved in the output directory

# Option to force full analysis.
# WARNING: when True, any existing `output_dir` is deleted and everything is
# recomputed from scratch (a previous full run was annotated "189m 32s").
FORCE_FULL_ANALYSIS = True  # Change this to False if you want to keep existing results

# Clean up ONLY the output directory if forcing full analysis
if FORCE_FULL_ANALYSIS and os.path.exists(output_dir):
    print(f"🗑️ Removing existing output directory: {output_dir}")
    shutil.rmtree(output_dir)
    print("✓ Output directory cleaned. Running full analysis...")

# Locations where a previous run would have stored its train/test splits
train_dir = os.path.join(output_dir, "train")
test_dir = os.path.join(output_dir, "test")
splits_exist = os.path.exists(train_dir) and os.path.exists(test_dir)

if not FORCE_FULL_ANALYSIS and splits_exist:
    print("✓ Train and test data exist in output directory. Running AD analysis only...")
    print(f"  - Train directory: {train_dir}")
    print(f"  - Test directory: {test_dir}")
    print("\n⚠️ To run full analysis, set FORCE_FULL_ANALYSIS = True")

    # [Previous AD-only analysis code would go here]
    print("\n❌ AD-only analysis is not fully implemented yet.")
    print("Please set FORCE_FULL_ANALYSIS = True to run complete analysis.")

    # Nothing was run in this branch. Bind the name explicitly so that later
    # cells see a well-defined value (None) instead of an unbound name on a
    # fresh kernel or a stale object left over from an earlier execution.
    analyzer = None

else:
    print("Starting full analysis (including train/test split)...")
    print("\n⚠️ IMPORTANT: Original data in 'data/' directory will NOT be modified!")

    # Run complete analysis with all results saved in output_dir
    analyzer = run_enhanced_analysis(
        df_dict=df_dict,
        test_only_datasets=test_only_datasets,
        output_dir=output_dir,
        performance_mode=False,
        ad_mode='flexible',  # Default mode for initial analysis
        ad_analysis_mode='all',  # This will analyze all modes: strict, flexible, adaptive
        max_samples=30000,
        show_recommendations=False,
        enable_reliability_scoring=False
    )

    print("\n" + "="*60)
    print("✅ FULL ANALYSIS COMPLETED!")
    print("="*60)
    print(f"\nAll results saved to: {output_dir}/")
    print("\nGenerated outputs:")
    print("  - Train/test splits (in output directory)")
    print("  - Molecular features and descriptors")
    print("  - AD analysis for all modes (strict, flexible, adaptive)")
    print("  - Statistical analysis")
    print("  - Comprehensive visualizations")
    print("  - Decision reports for each AD mode")
    print("\n✅ Original data in 'data/' directory is preserved!")
# Author's timing note from a previous execution of this cell: 189m 32s
# (presumably wall-clock time of one full analysis - TODO confirm)

In [8]:
# print(f"\n📋 Available datasets:")
# for idx, (name, df) in enumerate(df_dict.items()):
#     valid_samples = df.filter(
#         (pl.col("target_x").is_not_null()) & 
#         (pl.col("target_y").is_not_null())
#     ).shape[0]
#     print(f" [{idx}] {name}: {df.shape[0]} total, {valid_samples} valid samples")

In [9]:
# df_dict.keys()

In [10]:
# test_only_datasets=['SAMPL','Lipophilicity','curated-solubility-dataset','BigSolDB']

In [11]:
# analyzer = run_enhanced_analysis(
#     df_dict=df_dict,
#     test_only_datasets=test_only_datasets,
#     output_dir="result/1_preprocess",
#     performance_mode=False,
#     ad_analysis_mode='all', # ('strict', 'flexible', 'adaptive')
#     max_samples=30000,
#     show_recommendations=False,
# )