# Implementation of "Diffusion Probabilistic Modeling of Protein Backbones in 3D for the Motif-Scaffolding Problem"

source: https://github.com/blt2114/protdiff_smcdiff

Please note that the **data, model, analysis, experiments, and inpainting** folders from the repository must be uploaded to the Colab working directory (`/content`) before this notebook can be run.

Unconditional samples are written to **model_samples**; the motif-scaffolding (inpainting) outputs are written to **inpaint_test_out**.

## Library Requirements

In [2]:
# Pinned to the version this notebook was last run with; %pip installs into the running kernel's environment.
%pip install gputil==1.4.0

Collecting gputil
  Downloading GPUtil-1.4.0.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gputil
  Building wheel for gputil (setup.py) ... [?25l[?25hdone
  Created wheel for gputil: filename=GPUtil-1.4.0-py3-none-any.whl size=7394 sha256=fd82e2ae4016974b8b4877eacf95902d59394388be74e5e5e66925d267cebe1a
  Stored in directory: /root/.cache/pip/wheels/a9/8a/bd/81082387151853ab8b6b3ef33426e98f5cbfebc3c397a9d4d0
Successfully built gputil
Installing collected packages: gputil
Successfully installed gputil-1.4.0


In [3]:
# Pinned to the version this notebook was last run with; %pip installs into the running kernel's environment.
%pip install ml_collections==0.1.1

Collecting ml_collections
  Downloading ml_collections-0.1.1.tar.gz (77 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/77.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: ml_collections
  Building wheel for ml_collections (setup.py) ... [?25l[?25hdone
  Created wheel for ml_collections: filename=ml_collections-0.1.1-py3-none-any.whl size=94506 sha256=76012f7fb6387d0f843633d94c01c91c353544364c3f55ff7aa80d8ca28741be
  Stored in directory: /root/.cache/pip/wheels/7b/89/c9/a9b87790789e94aadcfc393c283e3ecd5ab916aed0a31be8fe
Successfully built ml_collections
Installing collected packages: ml_collections
Successfully installed ml_collections-0.1.1


In [4]:
# Pinned to the version this notebook was last run with; %pip installs into the running kernel's environment.
%pip install biopython==1.83

Collecting biopython
  Downloading biopython-1.83-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/3.1 MB[0m [31m5.7 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━[0m [32m2.1/3.1 MB[0m [31m30.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython
Successfully installed biopython-1.83


## Imports

In [5]:
import numpy as np
import importlib
import os
import torch
import GPUtil
import ml_collections
import time
import matplotlib.pyplot as plt
import tree
from plotly.subplots import make_subplots
# Re-create the `np.int` attribute, bound here to np.int64.
# NOTE(review): presumably needed because the repository code imported below
# still references `np.int`, which newer NumPy releases no longer provide — confirm.
np.int = np.int64

In [6]:
from data import diffuser
from data import utils as du
from model import reverse_diffusion

from experiments import torch_train_diffusion
from analysis import plotting
from analysis import utils as au

# Seed the global PyTorch and NumPy random number generators with a fixed value.
torch.manual_seed(0)
np.random.seed(0)

## Setup experiments

In [7]:
# Enumerate CUDA devices in PCI bus order (the ordering GPUtil's README recommends
# so that its device ids line up with CUDA's).
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# CUDA_VISIBLE_DEVICES takes a comma-separated list of ids. Joining with ''
# would turn ids [0, 1] into the single bogus id "01".
chosen_gpu = ','.join(
    str(gpu_id) for gpu_id in GPUtil.getAvailable(order='memory'))
os.environ["CUDA_VISIBLE_DEVICES"] = chosen_gpu
print(chosen_gpu)

0


In [8]:
# Read ckpt
ckpt_dir = '/content/drive/MyDrive/ckpt/'
if not os.path.isdir(ckpt_dir):
    raise FileNotFoundError(
        f'Checkpoint directory {ckpt_dir!r} not found; mount Google Drive and '
        'place the checkpoint there before running this cell.')

# os.listdir returns entries in arbitrary order, so sort to make the choice reproducible.
ckpt_files = sorted(os.listdir(ckpt_dir))
if not ckpt_files:
    raise FileNotFoundError(f'No checkpoint files found in {ckpt_dir!r}.')
# The first entry may carry a '.pth' suffix; the pickle with the same stem is what gets read.
ckpt_path = os.path.join(ckpt_dir, ckpt_files[0]).replace('.pth', '.pkl')

print(ckpt_path)
# NOTE(review): read_pkl presumably unpickles the file, which can execute arbitrary
# code — only load checkpoints obtained from a trusted source.
ckpt_pkl = du.read_pkl(ckpt_path)
ckpt_cfg = ckpt_pkl['cfg']          # configuration stored with the checkpoint
ckpt_state = ckpt_pkl['exp_state']  # later passed to exp.model.load_state_dict

/content/drive/MyDrive/ckpt/checkpoint_1000000.pkl


In [9]:
# Set-up experiment: start from the default training config, overlay the
# experiment/model settings stored in the checkpoint, then build the
# Experiment and load the checkpoint weights into its model.

data_setting = 'pdb'
cfg = torch_train_diffusion.get_config()
cfg = dict(cfg)
cfg['experiment'].update(ckpt_cfg.experiment)
cfg['experiment']['data_setting'] = data_setting
cfg['model'].update(ckpt_cfg.model)

# Pop unexpected model parameters. `pop(key, None)` keeps this cell from raising
# KeyError when a key is already absent (e.g. a checkpoint that never had it).
cfg['model'] = dict(cfg['model'])
for unexpected_key in ('cross_prod_num_neighbors', 'inpainting_training', 'num_heads'):
    cfg['model'].pop(unexpected_key, None)

cfg = ml_collections.ConfigDict(cfg)
# Data settings: reuse the checkpoint's max_len and override the other data options.
cfg['data']['max_len'] = ckpt_cfg.data.max_len
cfg['data']['inpainting_training'] = False
cfg['data']['rmsd_filter'] = None
cfg['data']['monomer_only'] = True
print(cfg['data']['pdb_self_consistency_path'])


exp_cfg = cfg['experiment']
cfg['experiment']['batch_size'] = 4

exp = torch_train_diffusion.Experiment(cfg)
# Left as the last expression so the cell displays the load_state_dict result.
exp.model.load_state_dict(ckpt_state)




<All keys matched successfully>

## Unconditional Sampling (ProtDiff)

In [11]:
# Sampling configuration: where the sampled PDB files are written and how many
# samples to draw. (The sampled chain length is fixed inside `sampling`.)
sample_dir = 'model_samples'
batch_size = 4

In [12]:
# Run sampling
def sampling(model, batch_size, directory, sample_lengths=(80,)):
    """Draw unconditional samples and write each final structure to a PDB file.

    Args:
        model: Object exposing `sample_reverse_diffusion(bb_mask)`; the notebook
            passes its `Experiment` instance.
        batch_size: Number of samples to draw for each length.
        directory: Output directory; created if it does not exist.
        sample_lengths: Iterable of residue counts to sample. Defaults to
            `(80,)`, the length that used to be hard-coded.

    Returns:
        Tuple `(sampled_diffusion, bb_mask)` for the LAST length in
        `sample_lengths`: the value returned by `sample_reverse_diffusion`
        and the all-ones mask of shape `(batch_size, length)` passed to it.

    Raises:
        ValueError: If `sample_lengths` is empty.
    """
    sample_lengths = list(sample_lengths)
    if not sample_lengths:
        raise ValueError("sample_lengths must contain at least one length")

    os.makedirs(directory, exist_ok=True)
    for num_res_sample in sample_lengths:
        # The mask spans exactly `num_res_sample` columns and is 1 everywhere.
        bb_mask = np.ones((batch_size, num_res_sample))
        print("shape:  ", bb_mask.shape)
        sampled_diffusion = model.sample_reverse_diffusion(bb_mask)

        # Save only the last element of each sample's trajectory (not a movie).
        for b_idx in range(batch_size):
            save_path = f'{directory}/len_{num_res_sample}_{b_idx}.pdb'
            au.write_prot_to_pdb(sampled_diffusion[b_idx][-1], save_path, no_indexing=True)
    return sampled_diffusion, bb_mask

In [13]:
# Draw the unconditional samples, then report the container type and the
# shape the samples have once viewed as a single array.
sampled_diffusion, bb_mask = sampling(
    model=exp, batch_size=batch_size, directory=sample_dir)
print(type(sampled_diffusion))
print(np.shape(sampled_diffusion))

shape:   (4, 80)
On 999
On 899
On 799
On 699
On 599
On 499
On 399
On 299
On 199
On 99
<class 'list'>
(4, 1025, 80, 3)


## Visualize Samples

In [None]:
## Plot samples
# Per-sample count of unmasked positions, used in the trace labels.
num_res = np.sum(bb_mask, axis=-1)

# Take last time step
last_sample = [x[-1] for x in sampled_diffusion]
num_samples = len(last_sample)

# Smallest near-square grid that holds every sample. Taking int(sqrt(n)) for both
# sides would silently drop samples whenever n is not a perfect square.
ncols = max(1, int(np.ceil(np.sqrt(num_samples))))
nrows = max(1, int(np.ceil(num_samples / ncols)))
fig = make_subplots(
    rows=nrows, cols=ncols,
    # `specs` is indexed [row][col]: one inner list per row, one entry per column.
    specs=[[{'type': 'surface'}] * ncols for _ in range(nrows)])

fig.update_layout(
    title_text='Samples',
    height=1000,
    width=1000,
)
for i in range(nrows):
    for j in range(ncols):
        # Row-major flat index: the stride is the number of columns.
        b_idx = i*ncols+j
        if b_idx >= num_samples:
            break  # the last row may be only partially filled
        sample_ij = last_sample[b_idx]
        sample_bb_3d = plotting.create_scatter(
            sample_ij, mode='lines+markers', marker_size=3,
            opacity=1.0, name=f'Sample {b_idx}: length={num_res[b_idx]}')
        # Plotly subplot coordinates are 1-based.
        fig.add_trace(sample_bb_3d, row=i+1, col=j+1)

fig.show()

## Conditional Sampling (SMCDiff) - Condition on motifs

In [None]:
from inpainting import motif_problems
from inpainting import inpaint_experiment
# Re-execute the already-imported motif_problems module; presumably here so that
# edits to the module file take effect without restarting the kernel.
importlib.reload(motif_problems)

<module 'inpainting.motif_problems' from '/content/inpainting/motif_problems.py'>

In [None]:
# Output directory passed as `output_dir` to the inpainting runs below.
# Note: this rebinds `sample_dir`, which earlier held 'model_samples'.
sample_dir = "inpaint_test_out/"
os.makedirs(sample_dir, exist_ok=True)

In [None]:
# 6e6r test: load the motif problem for PDB entry 6e6r with motif bounds 10 and 52.
name = "6e6r"
motif_start, motif_end = 10, 52
(
    pdb_name,
    target_len,
    motif_ca_xyz,
    full_ca_xyz_true,
    motif_idcs,
    inpainting_task_name,
) = motif_problems.load_pdb_motif_problem(
    motif_start, motif_end, pdb_name=name, base_dir="./")

In [None]:
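# Optional (Colab): mount Google Drive. The checkpoint directory used in the
# "Read ckpt" cell lives under /content/drive, so uncomment and run these lines
# BEFORE that cell if the drive is not already mounted.
# from google.colab import drive
# drive.mount('/content/drive')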

In [None]:
# Test with the "replacement" inpainting method on the motif problem loaded above.
out = inpaint_experiment.run_inpainting(
    exp,
    target_len,
    motif_ca_xyz,
    motif_idcs,
    exp.diffuser,
    T=exp.cfg.experiment.T,
    N_samples_per_diffusion=4,
    inpainting_task_name="test",
    output_dir=sample_dir,
    inpaint_method="replacement",
    num_save=4,
)

On 999
On 899
On 799
On 699
On 599
On 499
On 399
On 299
On 199
On 99


In [None]:
# Test with SMC-Diff: same problem, but using the "particle" inpainting method
# with 64 samples per diffusion.
out = inpaint_experiment.run_inpainting(
    exp,
    target_len,
    motif_ca_xyz,
    motif_idcs,
    exp.diffuser,
    T=exp.cfg.experiment.T,
    N_samples_per_diffusion=64,
    inpainting_task_name="test",
    output_dir=sample_dir,
    inpaint_method="particle",
    num_save=4,
)

On 999
948 resampling, departure=48.75
On 899
896 resampling, departure=48.13
853 resampling, departure=49.38
821 resampling, departure=50.89
On 799
706 resampling, departure=49.72
On 699
676 resampling, departure=48.13
On 599
567 resampling, departure=51.38
510 resampling, departure=48.11
On 499
431 resampling, departure=49.04
On 399
364 resampling, departure=52.32
On 299
280 resampling, departure=49.46
233 resampling, departure=49.81
212 resampling, departure=55.62
On 199
169 resampling, departure=53.70
136 resampling, departure=49.82
129 resampling, departure=49.23
On 99
96 resampling, departure=49.36
81 resampling, departure=48.29
72 resampling, departure=53.61
53 resampling, departure=50.98
45 resampling, departure=53.14
36 resampling, departure=58.03
27 resampling, departure=51.72
16 resampling, departure=50.19
12 resampling, departure=90.47
5 resampling, departure=62.87
2 resampling, departure=57.25


In [None]:
# 5trv scaffolding test
name = "5trv"
pad = 20
# The minimal motif bounds are 42 and 62; widen them by `pad` on each side.
motif_start, motif_end = 42 - pad, 62 + pad
(
    pdb_name,
    target_len,
    motif_ca_xyz,
    full_ca_xyz_true,
    motif_idcs,
    inpainting_task_name,
) = motif_problems.load_pdb_motif_problem(
    motif_start, motif_end, pdb_name=name, base_dir="./")
# Run the "particle" inpainting method, labelling the run with the task name
# returned by the loader.
out = inpaint_experiment.run_inpainting(
    exp,
    target_len,
    motif_ca_xyz,
    motif_idcs,
    exp.diffuser,
    T=exp.cfg.experiment.T,
    N_samples_per_diffusion=64,
    inpainting_task_name=inpainting_task_name,
    output_dir=sample_dir,
    inpaint_method="particle",
    num_save=1,
)

On 999
On 899
894 resampling, departure=48.02
865 resampling, departure=50.12
805 resampling, departure=50.52
On 799
777 resampling, departure=49.78
707 resampling, departure=53.19
On 699
On 599
599 resampling, departure=48.97
541 resampling, departure=48.00
On 499
490 resampling, departure=48.66
439 resampling, departure=48.33
409 resampling, departure=48.12
On 399
341 resampling, departure=48.68
304 resampling, departure=49.13
On 299
259 resampling, departure=49.00
222 resampling, departure=48.86
208 resampling, departure=48.85
On 199
186 resampling, departure=51.13
162 resampling, departure=54.18
152 resampling, departure=59.86
132 resampling, departure=49.56
115 resampling, departure=50.05
102 resampling, departure=49.48
On 99
93 resampling, departure=48.27
78 resampling, departure=51.29
65 resampling, departure=55.83
59 resampling, departure=53.16
54 resampling, departure=50.14
35 resampling, departure=50.35
29 resampling, departure=49.70
23 resampling, departure=82.04
20 resampli