# Building Scalable Drug Discovery Applications

![Active learning can accelerate DMTL cycles](img/dmtl.png)

---
## 1. Setup

In [None]:
# %pip install -U py3dmol biotite xformers evo_prot_grad

In [None]:
import tempfile

import biotite
import numpy as np
import pandas as pd
import py3Dmol
from biotite.database import rcsb
from biotite.sequence import ProteinSequence
from biotite.structure.io import pdb

import helpers


---
## 2. View Nanobody sequence and structure

In [None]:
# Fetch the PDB entry from RCSB and parse it into a biotite structure.
pdb_id = "7eow"
pdb_file = pdb.PDBFile.read(rcsb.fetch(pdb_id, "pdb"))
stack = pdb.get_structure(pdb_file)

# Use the first model only and split it by chain ID. Judging by the variable
# names, chain "A" is the target and chain "B" is the nanobody.
first_model = stack[0]
vf_factor = helpers.clean_structure(first_model[first_model.chain_id == "A"])
caplacizumab = helpers.clean_structure(first_model[first_model.chain_id == "B"])

# Keep the first sequence from the first item returned by to_sequence().
nanobody_sequences = biotite.structure.to_sequence(caplacizumab)[0]
caplacizumab_seq = nanobody_sequences[0]

In [None]:
# Residue indices of the three CDR loops. The `cdrs_1_base` name below implies
# these are 0-based positions into the sequence.
cdr1 = [*range(25, 32)]
cdr2 = [*range(51, 57)]
cdr3 = [*range(98, 117)]
cdrs = [*cdr1, *cdr2, *cdr3]
cdrs_1_base = [position + 1 for position in cdrs]

# (start, stop) index pairs for the stretches before, between and after the
# CDRs. Presumably half-open spans - confirm against the helpers that use them.
region_starts = [0, cdr1[-1] + 1, cdr2[-1] + 1, cdr3[-1] + 1]
region_stops = [cdr1[0], cdr2[0], cdr3[0], len(caplacizumab_seq)]
preserved_regions = list(zip(region_starts, region_stops))

# Show the raw sequence, then the same sequence formatted around the CDRs.
print(caplacizumab_seq)
print(helpers.format_cdrs(caplacizumab_seq, cdrs))

In [None]:
# Interactive 3D view of the two chains. py3Dmol is imported with the other
# dependencies in the setup cell so a missing package fails fast.
view = py3Dmol.view(width=600, height=600)

# Load each cleaned chain as its own model.
view.addModel(helpers.to_pdb_string(vf_factor))
view.addModel(helpers.to_pdb_string(caplacizumab))

# Base style: translucent cartoons, one colour per chain.
view.setStyle({"chain": "A"}, {"cartoon": {"color": "orange", "opacity": 0.6}})
view.setStyle({"chain": "B"}, {"cartoon": {"color": "blue", "opacity": 0.6}})

# Highlight the CDR residues on chain B using the 1-based index list.
view.addStyle(
    {"chain": "B", "resi": cdrs_1_base}, {"cartoon": {"color": "#57C4F8", "opacity": 1.0}}
)
view.zoomTo()
view.show()

---
## 3. Generate Sequence Variants

![Generate sequence variants using directed evolution](img/gen.png)

### 3.1. Random Mutation

In [None]:
# Generate candidate variants from the wild-type nanobody sequence.
generated_seqs = helpers.random_evolution(
    wt_protein=str(caplacizumab_seq),  # wild-type sequence as a plain string
    n_output_seqs=1000,
    preserved_regions=preserved_regions,  # leave the framework regions unchanged
)

# No variant has been measured yet, so start with an all-NaN result column.
# Use `np.nan`: the `np.NaN` alias was removed in NumPy 2.0.
generated_seqs['lab_result'] = np.nan

# Preview the first few variants underneath the wild type for comparison.
n=25
print(caplacizumab_seq)
for variant in generated_seqs[:n].itertuples():
    print(helpers.format_cdrs(variant.seq, cdrs))

In [None]:
# Show the generated variants table, including the still-empty `lab_result` column.
generated_seqs

### 3.2. Directed Evolution

In [None]:
# generated_seqs = helpers.run_evo_prot_grad(
#     wt_protein=caplacizumab_seq,  # wild-type protein sequence
#     output="all",  # return best, last, all variants
#     expert="esm",  # list of experts to compose
#     parallel_chains=10,  # number of parallel chains to run
#     n_steps=20,  # number of MCMC steps per chain
#     max_mutations=-1,  # maximum number of mutations per variant
#     preserved_regions=preserved_regions,  # leave the framework regions unchanged
# )['seq']

# display(generated_seqs)

---
## 4. Select Samples

![Identify candidates for lab testing using a selection model](img/select.png)

In [None]:
# First batch: draw up to 100 variants at random from those that have no lab
# measurement yet. The batch size is capped at the number of untested variants
# so `sample` cannot raise when fewer than 100 remain. The fixed seed makes the
# draw repeatable for a given `generated_seqs` table.
no_lab_data = generated_seqs[generated_seqs['lab_result'].isnull()]
selected_seqs = no_lab_data.sample(n=min(100, len(no_lab_data)), random_state=42)
selected_seqs

---
## 5. Submit to Lab

![Submit selected samples for experimental testing](img/lab.png)

In [None]:
# Hand the selected variants to the lab helper and keep the returned table.
# Later cells read `lab_result` from it and match its rows to `generated_seqs`
# by index label.
# NOTE(review): delay=0 presumably skips any simulated turnaround time -
# confirm in helpers.submit_seqs_to_lab.
lab_results = helpers.submit_seqs_to_lab(selected_seqs, delay=0)
lab_results

---
## 6. Fine-Tune Scoring Model

![Improve the scoring model using experimental results](img/ft.png)

In [None]:
# lab_results

---
## 7. Score

![Predict high-performing variants using a scoring model](img/score.png)

In [None]:
# Copy the new measurements into the master table. Rows are matched by index
# label, so `lab_results` must carry the index labels of the rows that were
# submitted from `generated_seqs`. One vectorised `.loc` write replaces a
# per-row Python loop; `.to_numpy()` pairs each label with its own row's value.
generated_seqs.loc[lab_results.index, 'lab_result'] = lab_results['lab_result'].to_numpy()
display(generated_seqs)

In [None]:
# Run the scoring model over the generated variants. The `prediction` values of
# the returned table are copied into `generated_seqs` in the next cell.
# NOTE(review): whether the model uses the `lab_result` column is not visible
# here - confirm in helpers.run_scoring_model.
predicted_results = helpers.run_scoring_model(generated_seqs)

In [None]:
# Store the latest model scores next to each variant. Rows are matched by index
# label; the `last_prediction` column is created on first use and stays NaN for
# any row that received no prediction. One vectorised `.loc` write replaces a
# per-row Python loop.
generated_seqs.loc[predicted_results.index, 'last_prediction'] = (
    predicted_results['prediction'].to_numpy()
)
display(generated_seqs)

---
## 8. Repeat

In [None]:
# Select another batch of samples without lab data, highest predicted score first
selected_seqs = (
    generated_seqs[generated_seqs['lab_result'].isnull()]
    .sort_values(by='last_prediction', ascending=False)
    .head(100)
)

# Submit to lab
lab_results = helpers.submit_seqs_to_lab(selected_seqs, delay=0)

# Record the new measurements (rows are matched by index label)
generated_seqs.loc[lab_results.index, 'lab_result'] = lab_results['lab_result'].to_numpy()

# Fine-Tune on new data
# TODO: not implemented yet.

# Score generated sequences. The scoring model is run again here; re-using the
# `predicted_results` from the previous section would only write the same stale
# values back into the table.
predicted_results = helpers.run_scoring_model(generated_seqs)
generated_seqs.loc[predicted_results.index, 'last_prediction'] = (
    predicted_results['prediction'].to_numpy()
)
display(generated_seqs)

# Non-null counts per column, i.e. how many variants now have a lab result
generated_seqs.count()