Test the Integrated Thompson Sampler.

In [None]:
import json


import polars as pl
from PRISMS.thompson_sampling.ts_main import run_ts
from PRISMS.thompson_sampling.thompson_sampling import IntegratedThompsonSampler
from PRISMS.thompson_sampling.evaluators import LookupEvaluator
from PRISMS.thompson_sampling.baseline import random_baseline
from PRISMS.library_analysis.visualization import TS_Benchmarks

In [6]:
# Test Block for Integrated Thompson Sampler with TS_Benchmarks Visualization
# This version includes all the fixes and can be run in a Jupyter notebook

import json
import polars as pl
import pandas as pd
import numpy as np
from typing import List, Dict, Any
import os
import traceback
import sys
# Make the PRISMS checkout importable.
# The checkout location is machine-specific, so it can be overridden with the
# PRISMS_ROOT environment variable; the original hard-coded path is kept as the
# default so existing setups behave exactly as before.
project_root = os.environ.get(
    "PRISMS_ROOT", '/Users/aakankschitnandkeolyar/Desktop/PRISMS'
)
# Guard against adding the same entry again when the cell is re-run.
if project_root not in sys.path:
    sys.path.append(project_root)
# Import the required modules
from PRISMS.thompson_sampling.thompson_sampling import IntegratedThompsonSampler
from PRISMS.thompson_sampling.evaluators import LookupEvaluator
from PRISMS.thompson_sampling.baseline import random_baseline
from PRISMS.library_analysis.visualization import TS_Benchmarks

print("="*60)
print("INTEGRATED THOMPSON SAMPLER TEST WITH VISUALIZATION")
print("="*60)

# 1. Load your JSON config
# NOTE: the file paths below are absolute and specific to one machine; edit
# them before running elsewhere. They are checked for existence right after
# parsing so a wrong path fails here with a clear message.
input_json_file = '''{
    "reagent_file_list": [
        "/Users/aakankschitnandkeolyar/Desktop/PRISMS/tests/Data/Thrombin/input_files/acids.smi",
        "/Users/aakankschitnandkeolyar/Desktop/PRISMS/tests/Data/Thrombin/input_files/coupled_aa_sub.smi"
    ],
    "reaction_smarts": "[#6:1](=[O:2])[OH].[#7X3;H1,H2;!$(N[!#6]);!$(N[#6]=[O]);!$(N[#6]~[!#6;!#16]):3]>>[#6:1](=[O:2])[#7:3]",
    "num_warmup_trials": 3,
    "num_ts_iterations": 200,
    "sampler_type": "integrated",
    "ts_mode": "minimize",
    "evaluator_class_name": "LookupEvaluator",
    "evaluator_arg": {
        "ref_filename": "/Users/aakankschitnandkeolyar/Desktop/TS_Chem_Space/Thrombin/Linear_amide/docking_scores/product_scores.csv",
        "ref_colname": "Score"
    },
    "log_filename": "ts_logs.txt",
    "results_filename": "ts_results_integrated.csv",
    "processes": 1,
    "scaling": 1.0,
    "percent_of_library": 0.05,
    "search_stop": 50,
    "min_cpds_per_core": 5
}'''

try:
    input_dict = json.loads(input_json_file)

    # Fail fast on missing inputs. The later per-cycle loops catch every
    # exception and only record a failed cycle, so a bad reagent path would
    # otherwise surface much later as "no valid results".
    required_files = list(input_dict["reagent_file_list"])
    required_files.append(input_dict["evaluator_arg"]["ref_filename"])
    missing_files = [path for path in required_files if not os.path.isfile(path)]
    if missing_files:
        raise FileNotFoundError(
            "Input file(s) not found: " + ", ".join(missing_files)
        )

    # The evaluator instance is stored in the config dict under
    # "evaluator_class"; later stages read it from there.
    input_dict["evaluator_class"] = LookupEvaluator(input_dict["evaluator_arg"])
    print("✅ Configuration loaded successfully")
except Exception as e:
    # Re-raising already shows the full traceback in the notebook, so only a
    # one-line summary is printed here.
    print(f"❌ Error loading configuration: {e}")
    raise

def _remove_stale_results(results_file):
    """Delete a results file left over from an earlier run, if one exists.

    Without this, a cycle that finishes without writing its output would be
    validated against the previous run's file of the same name.
    """
    if os.path.exists(results_file):
        os.remove(results_file)


def _load_scored_results(results_file, run_label):
    """Read one cycle's results CSV and normalise its score column.

    Args:
        results_file: Path of the CSV written by the cycle.
        run_label: Text used in the success message, e.g. "Cycle 1".

    Returns:
        A polars DataFrame whose "score" column is cast to Float64, or None
        when the file is missing or has neither a "score" nor a "Score"
        column.
    """
    if not os.path.exists(results_file):
        print(f"  ❌ Results file not found: {results_file}")
        return None

    results_df = pl.read_csv(results_file)
    print(f"  ✅ {run_label} completed successfully: {len(results_df)} compounds")
    print(f"  Columns: {results_df.columns}")

    if "score" not in results_df.columns:
        if "Score" not in results_df.columns:
            # A frame without scores cannot be benchmarked, so it is reported
            # and treated as a failed cycle rather than passed downstream.
            print(f"  Warning: No score column found in {results_file}")
            print(f"  Available columns: {results_df.columns}")
            return None
        results_df = results_df.rename({"Score": "score"})

    return results_df.with_columns(pl.col("score").cast(pl.Float64))


# 2. Run Integrated Thompson Sampler (multiple cycles for benchmarking)
n_cycles = 2  # Reduced for testing
integrated_results = []  # one entry per cycle: DataFrame on success, None on failure

print(f"\n🔄 Running Integrated Thompson Sampler ({n_cycles} cycles)...")
for cycle in range(1, n_cycles + 1):
    try:
        # Set a unique output file for this cycle
        cycle_results_file = f"ts_results_integrated_cycle{cycle}.csv"
        print(f"\n🔄 Running Integrated Thompson Sampler - Cycle {cycle}")
        _remove_stale_results(cycle_results_file)

        # Create sampler instance
        # NOTE(review): input_dict["ts_mode"] is never passed to the sampler
        # here - confirm the sampler's default direction matches "minimize".
        sampler = IntegratedThompsonSampler(
            processes=input_dict.get("processes", 1),
            scaling=input_dict.get("scaling", 1.0),
            percent_lib=input_dict.get("percent_of_library", 0.1),
            search_stop=input_dict.get("search_stop", 100),
            min_cpds_per_core=input_dict.get("min_cpds_per_core", 10),
            log_filename=input_dict.get("log_filename")
        )

        # Configure sampler
        sampler.set_hide_progress(True)
        sampler.set_evaluator(input_dict["evaluator_class"])
        sampler.read_reagents(input_dict["reagent_file_list"])
        sampler.set_reaction(input_dict["reaction_smarts"])

        # Run warm-up
        print("  Running warm-up...")
        sampler.warm_up_integrated(num_warmup_trials=input_dict.get("num_warmup_trials", 3))

        # Run search
        print("  Running search...")
        sampler.search_integrated(
            min_cpds_per_core=input_dict.get("min_cpds_per_core", 10),
            percent_of_library=input_dict.get("percent_of_library", 0.1),
            stop=input_dict.get("search_stop", 100),
            results_filename=cycle_results_file
        )

        # Load and validate results (None marks an unusable cycle)
        integrated_results.append(
            _load_scored_results(cycle_results_file, f"Cycle {cycle}")
        )

    except Exception as e:
        # Best effort: a failed cycle is reported and recorded as None so the
        # remaining cycles still run.
        print(f"❌ Error in cycle {cycle}: {e}")
        print(f"Traceback: {traceback.format_exc()}")
        integrated_results.append(None)

# 3. Run Random Baseline (multiple cycles for benchmarking)
random_results = []  # same layout as integrated_results

print(f"\n🔄 Running Random Baseline ({n_cycles} cycles)...")
for cycle in range(1, n_cycles + 1):
    try:
        random_file = f"random_results_cycle{cycle}.csv"
        print(f"\n🔄 Running Random Baseline - Cycle {cycle}")
        _remove_stale_results(random_file)

        # Run random baseline
        random_baseline(
            input_dict,
            num_trials=100,  # Adjust as needed
            outfile_name=random_file,
            num_to_save=100,
            ascending_output=True
        )

        # Load and validate results (None marks an unusable cycle)
        random_results.append(
            _load_scored_results(random_file, f"Random cycle {cycle}")
        )

    except Exception as e:
        print(f"❌ Error in random cycle {cycle}: {e}")
        print(f"Traceback: {traceback.format_exc()}")
        random_results.append(None)

# 4. Create visualization
print("\n🔄 Creating visualization...")

# Reset first so a failure below cannot leave objects from an earlier
# execution of this cell looking like the result of this run.
bench = None
chart = None

try:
    # Keep only cycles that produced a frame with a "score" column; failed
    # cycles are stored as None by the loops above.
    valid_integrated = [
        df for df in integrated_results
        if df is not None and "score" in df.columns
    ]
    valid_random = [
        df for df in random_results
        if df is not None and "score" in df.columns
    ]

    if not valid_integrated:
        raise ValueError("No valid integrated sampler results found")
    if not valid_random:
        raise ValueError("No valid random baseline results found")

    print("\n📊 Preparing visualization data:")
    print(f"  - Valid integrated cycles: {len(valid_integrated)}")
    print(f"  - Valid random cycles: {len(valid_random)}")

    # Prepare data for TS_Benchmarks.
    # Both methods are trimmed to the same number of cycles so the lists agree
    # with the no_of_cycles value passed below.
    # NOTE(review): assumes TS_Benchmarks expects no_of_cycles frames per
    # method - confirm against its implementation.
    n_common_cycles = min(len(valid_integrated), len(valid_random))
    methods_list = ["IntegratedTS", "Random"]
    TS_runs_data = {
        "IntegratedTS": valid_integrated[:n_common_cycles],
        "Random": valid_random[:n_common_cycles]
    }

    # Create TS_Benchmarks instance
    print("\n🔄 Creating TS_Benchmarks visualization...")
    bench = TS_Benchmarks(
        no_of_cycles=n_common_cycles,
        methods_list=methods_list,
        TS_runs_data=TS_runs_data,
        reference_data=None,  # Not needed for strip plot
        top_n=100,
        sort_type="minimize"
    )

    print("✅ TS_Benchmarks created successfully")

except Exception as e:
    # Best effort: report the problem and still reach the closing banner.
    print(f"❌ Error creating TS_Benchmarks: {e}")
    print(f"Traceback: {traceback.format_exc()}")

# 5. Generate strip plot (only when the benchmark object was built)
if bench is not None:
    print("\n📊 Generating strip plot...")
    try:
        chart = bench.stripplot_TS_results(show_plot=True)
        print("✅ Strip plot generated successfully")
    except Exception as e:
        print(f"❌ Error generating strip plot: {e}")
        print(f"Traceback: {traceback.format_exc()}")

print("\n" + "="*60)
print("TEST COMPLETED")
print("="*60)

INTEGRATED THOMPSON SAMPLER TEST WITH VISUALIZATION
‚úÖ Configuration loaded successfully

ÔøΩÔøΩ Running Integrated Thompson Sampler (2 cycles)...

ÔøΩÔøΩ Running Integrated Thompson Sampler - Cycle 1
  Running warm-up...
‚ùå Error in cycle 1: unsupported operand type(s) for *: 'float' and 'NoneType'
Traceback: Traceback (most recent call last):
  File "/var/folders/sn/gcrsw56j5k93g0p6gnkvvn7c0000gn/T/ipykernel_4359/247604726.py", line 89, in <module>
    sampler.warm_up_integrated(num_warmup_trials=input_dict.get("num_warmup_trials", 3))
  File "/Users/aakankschitnandkeolyar/Desktop/PRISMS/PRISMS/thompson_sampling/thompson_sampling.py", line 153, in warm_up_integrated
    batch_size = min(100, self.num_prods // num_warmup_trials)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/aakankschitnandkeolyar/Desktop/PRISMS/PRISMS/thompson_sampling/thompson_sampling.py", line 48, in generate_unique_batch
    means = np.array([r.posterior_mean if r.posterior_mean 

  0%|          | 0/100 [00:00<?, ?it/s]

‚ùå Error in random cycle 1: <rdkit.Chem.rdchem.Mol object at 0x35a544900>
Traceback: Traceback (most recent call last):
  File "/var/folders/sn/gcrsw56j5k93g0p6gnkvvn7c0000gn/T/ipykernel_4359/247604726.py", line 136, in <module>
    random_baseline(
  File "/Users/aakankschitnandkeolyar/Desktop/PRISMS/PRISMS/thompson_sampling/baseline.py", line 111, in random_baseline
    product_smiles = Chem.MolToSmiles(product_mol)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/aakankschitnandkeolyar/Desktop/PRISMS/PRISMS/thompson_sampling/evaluators.py", line 146, in evaluate
    return self.ref_dict[product_name] # Changed to product code for easier lookup
           ~~~~~~~~~~~~~^^^^^^^^^^^^^^
KeyError: <rdkit.Chem.rdchem.Mol object at 0x35a544900>


üîÑ Running Random Baseline - Cycle 2
5.00e+05 products


  0%|          | 0/100 [00:00<?, ?it/s]

‚ùå Error in random cycle 2: <rdkit.Chem.rdchem.Mol object at 0x3510dcc80>
Traceback: Traceback (most recent call last):
  File "/var/folders/sn/gcrsw56j5k93g0p6gnkvvn7c0000gn/T/ipykernel_4359/247604726.py", line 136, in <module>
    random_baseline(
  File "/Users/aakankschitnandkeolyar/Desktop/PRISMS/PRISMS/thompson_sampling/baseline.py", line 111, in random_baseline
    product_smiles = Chem.MolToSmiles(product_mol)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/aakankschitnandkeolyar/Desktop/PRISMS/PRISMS/thompson_sampling/evaluators.py", line 146, in evaluate
    return self.ref_dict[product_name] # Changed to product code for easier lookup
           ~~~~~~~~~~~~~^^^^^^^^^^^^^^
KeyError: <rdkit.Chem.rdchem.Mol object at 0x3510dcc80>


üîÑ Creating visualization...
‚ùå Error creating TS_Benchmarks: No valid integrated sampler results found
Traceback: Traceback (most recent call last):
  File "/var/folders/sn/gcrsw56j5k93g0p6gnkvvn7c0000gn/T/ipykernel_4359/247604726.py