# Analysis of sEH DEL Data 
This is the data provided by Anagenex to develop Chris Zhang's workflow. It contains a list of experimentally validated binders. The goal here is to check whether any of our search strategies can recover these same experimentally validated binders across different runs.

In [1]:
import sys
from pathlib import Path
import polars as pl 

# Add src directory to path since TACTICS is in src/TACTICS
tactics_root = Path.cwd().parent / 'src'
sys.path.insert(0, str(tactics_root))

from TACTICS.thompson_sampling import (
    ThompsonSampler,
    GreedySelection,
    RouletteWheelSelection,
    UCBSelection,
    EpsilonGreedySelection,
    BayesUCBSelection,
    ROCSEvaluator,
    StratifiedWarmup
)

## Load the Data

In [2]:
# Read the full sEH DEL compound table from disk with polars.
total_compounds_csv = "./input_files/DEL_seH/total_compounds.csv"
data = pl.read_csv(total_compounds_csv)

In [10]:
# Show the column names of the loaded table.
# NOTE(review): the recorded output below already lists 'binder'; this cell
# ran as In [10], i.e. after the In [3] cell that (re)defines that column --
# confirm whether the CSV itself ships a 'binder' column.
data.columns

['structure',
 'read_count',
 'bb1',
 'bb2',
 'bb3',
 'bb1_iso',
 'bb2_iso',
 'bb3_iso',
 'binder']

In [3]:
# Flag each compound as a binder: 1 when read_count is non-zero, 0 when it is
# zero. The result is stored as an Int32 column named "binder".
is_binder = pl.col("read_count") != 0
data = data.with_columns(is_binder.cast(pl.Int32).alias("binder"))

In [9]:
# Get the Number of Binders
# Filter once and reuse the result, rather than filtering again inside the
# f-string and counting a third time with a Python-level sum().
binders = data.filter(pl.col("binder") == 1)
print(f"The total number of binders: {binders.height}")

# Size of the full combinatorial library: every bb1 x bb2 x bb3 combination.
# n_unique() counts distinct values natively in polars instead of pulling each
# whole column into a Python set.
library_size = (
    data["bb1"].n_unique() * data["bb2"].n_unique() * data["bb3"].n_unique()
)
percent_binders = (binders.height / library_size) * 100
print(f" That is {percent_binders}% of the library")

The total number of binders: 103136
 That is 0.010919584134177126% of the library


In [17]:
# Collect the distinct building blocks seen among the binders: one sorted
# Series per building-block position.
bb1_unique, bb2_unique, bb3_unique = (
    binders[iso_column].unique().sort()
    for iso_column in ("bb1_iso", "bb2_iso", "bb3_iso")
)

# Make sure the destination folder (and any missing parents) exists.
output_dir = Path("./input_files/DEL_seH/unique_binder_bbs")
output_dir.mkdir(parents=True, exist_ok=True)

# Write one single-column CSV per building-block position.
for csv_name, iso_column, unique_values in (
    ("bb1_unique.csv", "bb1_iso", bb1_unique),
    ("bb2_unique.csv", "bb2_iso", bb2_unique),
    ("bb3_unique.csv", "bb3_iso", bb3_unique),
):
    pl.DataFrame({iso_column: unique_values}).write_csv(output_dir / csv_name)

# Report how many distinct building blocks each position contributes.
print(f"Unique BB1: {bb1_unique.len()}")
print(f"Unique BB2: {bb2_unique.len()}")
print(f"Unique BB3: {bb3_unique.len()}")

Unique BB1: 562
Unique BB2: 167
Unique BB3: 932


In [None]:
# Percentage of the full bb1 x bb2 x bb3 combinatorial library that is flagged
# as a binder. Counting is done with polars-native sum()/n_unique() rather
# than Python-level sum() and set() over whole columns.
(
    (data["binder"] == 1).sum()
    / (data["bb1"].n_unique() * data["bb2"].n_unique() * data["bb3"].n_unique())
) * 100

0.010919584134177126

In [16]:
# Total number of possible bb1 x bb2 x bb3 combinations (the denominator used
# for the binder percentage). n_unique() counts distinct values natively
# instead of materialising each column as a Python set.
data["bb1"].n_unique() * data["bb2"].n_unique() * data["bb3"].n_unique()

944504834

In [23]:
# Attach a sequential "bb_names" label to every row of each building-block
# table, with a per-table prefix (HPA / CA / PA) followed by the row index.
# NOTE(review): bb1_data / bb2_data / bb3_data are defined in a cell outside
# this view -- presumably polars DataFrames; confirm.
bb1_data = bb1_data.with_columns(
    pl.Series(name="bb_names", values=[f"HPA{idx}" for idx in range(bb1_data.height)])
)
bb2_data = bb2_data.with_columns(
    pl.Series(name="bb_names", values=[f"CA{idx}" for idx in range(bb2_data.height)])
)
bb3_data = bb3_data.with_columns(
    pl.Series(name="bb_names", values=[f"PA{idx}" for idx in range(bb3_data.height)])
)