# Initialization

In [None]:
!pip install pandas==1.4.2
!pip install sequence-models
!pip install biopython
!pip install fair-esm

from Bio import SeqIO
import pandas as pd
import json
import numpy as np
import torch
import sys
import time


from sequence_models.pretrained import load_model_and_alphabet

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pandas==1.4.2
  Downloading pandas-1.4.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.7/11.7 MB[0m [31m77.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 1.4.4
    Uninstalling pandas-1.4.4:
      Successfully uninstalled pandas-1.4.4
Successfully installed pandas-1.4.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sequence-models
  Downloading sequence_models-1.6.0-py3-none-any.whl (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.5/60.5 KB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sequence-models
Successfully installed sequence-models-1.6.0
Looking in indexes: https://py

In [5]:
def update_dict(data, code, chain, domain, boundaries):
    """Record one domain's boundary string in the nested ``data`` mapping.

    The mapping is laid out as ``data[code][chain][domain_id] = boundaries``,
    where ``domain_id`` is ``domain`` normalised through ``str(int(domain))``
    (so ``"01"`` is stored under ``"1"``). Missing ``code`` and ``chain``
    levels are created on demand.

    Returns:
        The same ``data`` object, which is also modified in place.
    """
    per_chain = data.setdefault(code, {})
    per_domain = per_chain.setdefault(chain, {})
    per_domain[str(int(domain))] = boundaries
    return data


def get_cath(lines_range=None, file="../data/cath/cath_domain_boundaries.txt", verb=False):
    """Parse a tab-separated domain-boundary file into a nested dictionary.

    Each non-empty line must have the form ``<name>\\t<boundaries>``. The name
    is split positionally: the first four characters are the code, the fifth
    character is the chain and the remainder is the domain number. Entries are
    stored through ``update_dict`` as ``data[code][chain][domain] = boundaries``.

    Args:
        lines_range: optional ``(start, stop)`` pair; when given, only
            ``lines[start:stop]`` of the file are parsed.
        file: path of the boundary file to read.
        verb: accepted for backward compatibility; currently has no effect.

    Returns:
        dict mapping code -> chain -> domain id -> boundary string.

    Raises:
        ValueError: if a non-empty line does not contain exactly one tab.
    """
    with open(file, "r") as f:
        lines = f.readlines()
    if lines_range is not None:
        lines = lines[lines_range[0]:lines_range[1]]

    data = {}
    for raw_line in lines:
        line = raw_line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline at the end of the file)
            # carry no entry and would otherwise break the unpacking below.
            continue
        name, boundaries = line.split("\t")
        code = name[:4]
        chain = name[4]
        domain = name[5:]
        data = update_dict(data, code, chain, domain, boundaries)
    return data

def boundaries(len_seq, domain, discontinuity_delimiter=','):
    """
        Build a boolean per-residue mask marking where domain segments begin.

        ``domain`` maps a domain id to a string of ``start-end`` segments
        joined by ``discontinuity_delimiter`` (e.g. ``"1-40,80-120"``).
        Positions are treated as 1-based: a segment starting at ``start`` sets
        index ``start - 1``. The smallest start over all segments is then
        cleared again, so the very first segment of the protein is never
        reported as a boundary (a protein made of one continuous domain yields
        an all-False mask).

        An empty ``domain`` mapping yields an all-False mask.

        Returns a ``np.bool_`` array of length ``len_seq``.

        Raises ``ValueError`` for a segment that is not of the form
        ``int-int`` and ``IndexError`` for a start beyond ``len_seq``.
    """
    bounds = np.zeros(len_seq, dtype=np.bool_)
    first_start = None
    for segments in domain.values():
        for segment in segments.split(discontinuity_delimiter):
            # The end position is parsed only to validate the segment format.
            start, _end = [int(i) for i in segment.split('-')]
            if first_start is None or start < first_start:
                first_start = start
            bounds[start - 1] = True
    # Without any segment there is no "first start" to clear.
    if first_start is not None:
        bounds[first_start - 1] = False
    return bounds


# Parse the domain-boundary file (default path) into the nested lookup
# cath[code][chain][domain] -> boundary string.
cath = get_cath()

# JSON mapping that the cells below iterate as (chain key, sequence) pairs.
with open('../data/cath/iid/chains_to_seq_iid.json') as json_file:
    key_to_seq_dict = json.load(json_file)

# Run on the GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# CASP13

In [None]:
# Load the pretrained ESM-2 checkpoint and the helper that tokenizes
# (label, sequence) pairs for it.
model, alphabet = torch.hub.load("facebookresearch/esm:main", "esm2_t12_35M_UR50D")

batch_converter = alphabet.get_batch_converter()

model = model.to(device)
model.eval()  # disables dropout for deterministic results

with open('data/data_generation/casp13_data.json') as json_file:
    casp13 = json.load(json_file)

total = len(casp13['domains'])

# One single-row frame per protein. They are concatenated once after the loop:
# calling pd.concat inside the loop re-copies every previous row on each
# iteration, which is quadratic in the number of proteins.
frames = []
counter = 0

start = time.time()
for name, domain in casp13['domains'].items():
    seq = casp13['seqs'][name]
    data = [[f"p{counter}", seq]]

    _, _, batch_tokens = batch_converter(data)
    batch_tokens = batch_tokens.to(device)

    # Forward pass on `device` without autograd bookkeeping. Only the layer-12
    # representations are requested; contact maps were never used, so they are
    # no longer computed.
    with torch.no_grad():
        results = model(batch_tokens, repr_layers=[12])

    # NOTE(review): ESM's batch converter normally wraps each sequence in
    # begin/end tokens, in which case this array has len(seq) + 2 rows while
    # `out` has len(seq) entries -- confirm that consumers drop the two extra
    # rows (e.g. with [1:len(seq) + 1]) before aligning `in` with `out`.
    embedding = results["representations"][12][0].cpu().detach().numpy().astype(np.float16)
    target = boundaries(len(seq), domain, ';')

    frames.append(pd.DataFrame({'in':[embedding], 'out':[target], 'seq_len':[len(seq)], 'key':name}))
    counter += 1
    if counter % 10 == 0:
        print(f'Written {counter}/{total} entries')

    torch.cuda.empty_cache()

# Same empty, object-typed frame the rows used to be appended to one by one.
df = pd.DataFrame(columns=['in', 'out', 'seq_len', 'key'])
df['in'] = df['in'].astype(object)
df['out'] = df['out'].astype(object)
df = pd.concat([df, *frames], ignore_index=True)

end = time.time()
# The message says "train", but the interval measured is the embedding loop.
print(f"Time to train: {end - start}")

print(f'Written {counter}/{total} entries')

Downloading: "https://github.com/facebookresearch/esm/zipball/main" to /root/.cache/torch/hub/main.zip
Downloading: "https://dl.fbaipublicfiles.com/fair-esm/models/esm2_t12_35M_UR50D.pt" to /root/.cache/torch/hub/checkpoints/esm2_t12_35M_UR50D.pt
Downloading: "https://dl.fbaipublicfiles.com/fair-esm/regression/esm2_t12_35M_UR50D-contact-regression.pt" to /root/.cache/torch/hub/checkpoints/esm2_t12_35M_UR50D-contact-regression.pt


Written 10/80 entries
Written 20/80 entries
Written 30/80 entries
Written 40/80 entries
Written 50/80 entries
Written 60/80 entries
Written 70/80 entries
Written 80/80 entries
Time to train: 5.114464521408081
Written 80/80 entries


In [None]:
# NOTE: despite the ".csv" suffix this writes a pickle file (DataFrame.to_pickle);
# read it back with pandas.read_pickle, not read_csv.
df.to_pickle('casp13_test_model.csv')

# CARP

In [None]:
# Load the CARP checkpoint. The second return value is the collater that the
# embedding loop below uses to turn raw sequences into model inputs.
# This rebinds `model`, replacing whichever model an earlier cell loaded.
model, collater = load_model_and_alphabet('data/data_generation/carp_38M.pt')
model = model.to(device)
# Last expression of the cell, so the notebook also displays the module repr.
model.eval() # disable dropout

CARP(
  (model): ByteNetLM(
    (embedder): ByteNet(
      (embedder): Embedding(30, 8, padding_idx=28)
      (up_embedder): PositionFeedForward(
        (conv): Conv1d(8, 1024, kernel_size=(1,), stride=(1,))
      )
      (layers): ModuleList(
        (0): ByteNetBlock(
          (conv): MaskedConv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
          (sequence1): Sequential(
            (0): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (1): GELU(approximate='none')
            (2): PositionFeedForward(
              (conv): Conv1d(1024, 512, kernel_size=(1,), stride=(1,))
            )
            (3): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
            (4): GELU(approximate='none')
          )
          (sequence2): Sequential(
            (0): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
            (1): GELU(approximate='none')
            (2): PositionFeedForward(
              (conv): Conv1d(512, 1024, kernel_size=(1,), s

In [None]:
# One single-row frame per chain. They are concatenated once after the loop:
# calling pd.concat inside the loop re-copies every previous row on each
# iteration, which is quadratic in the number of chains.
frames = []
counter = 0

start = time.time()
for chain, seq in key_to_seq_dict.items():

    # Keys start with the 4-character code and end with the chain identifier.
    pdb_code = chain[:4]

    seqs = [[seq]]
    x = collater(seqs)[0] # (n, max_len)
    x = x.to(device)
    # Inference only: without no_grad every forward pass would build an
    # autograd graph that is discarded right away by .detach().
    with torch.no_grad():
        rep = model(x)  # (n, max_len, d_model)
    embedding = rep['representations'][16][0].cpu().detach().numpy().astype(np.float16)
    domain = cath[pdb_code][chain[-1]]
    target = boundaries(len(seq), domain)

    frames.append(pd.DataFrame({'in':[embedding], 'out':[target], 'seq_len':[len(seq)], 'key':chain}))
    counter += 1
    if counter % 1000 == 0:
        print(f'Written {counter}/{len(key_to_seq_dict)} entries')

# Same empty, object-typed frame the rows used to be appended to one by one.
df = pd.DataFrame(columns=['in', 'out', 'seq_len', 'key'])
df['in'] = df['in'].astype(object)
df['out'] = df['out'].astype(object)
df = pd.concat([df, *frames], ignore_index=True)

end = time.time()
# The message says "train", but the interval measured is the embedding loop.
print(f"Time to train: {end - start}")
print(f'Written {counter}/{len(key_to_seq_dict)} entries')

Written 1000/8497 entries
Written 2000/8497 entries
Written 3000/8497 entries
Written 4000/8497 entries
Written 5000/8497 entries
Written 6000/8497 entries
Written 7000/8497 entries
Written 8000/8497 entries
Time to train: 168.65171456336975
Written 8497/8497 entries


In [None]:
# NOTE: despite the ".csv" suffix this writes a pickle file (DataFrame.to_pickle);
# read it back with pandas.read_pickle, not read_csv.
df.to_pickle('carp38M_data.csv')

# ESM

In [None]:
# Load the ESM-2 checkpoint again: `model` was rebound to the CARP model in the
# previous section, so the embedding loop below needs the ESM model back.
model, alphabet = torch.hub.load("facebookresearch/esm:main", "esm2_t12_35M_UR50D")

# Helper that tokenizes a list of (label, sequence) pairs for this model.
batch_converter = alphabet.get_batch_converter()

model = model.to(device)
# Last expression of the cell, so the notebook also displays the module repr.
model.eval()  # disables dropout for deterministic results

Using cache found in /root/.cache/torch/hub/facebookresearch_esm_main


ESM2(
  (embed_tokens): Embedding(33, 480, padding_idx=1)
  (layers): ModuleList(
    (0): TransformerLayer(
      (self_attn): MultiheadAttention(
        (k_proj): Linear(in_features=480, out_features=480, bias=True)
        (v_proj): Linear(in_features=480, out_features=480, bias=True)
        (q_proj): Linear(in_features=480, out_features=480, bias=True)
        (out_proj): Linear(in_features=480, out_features=480, bias=True)
        (rot_emb): RotaryEmbedding()
      )
      (self_attn_layer_norm): LayerNorm((480,), eps=1e-05, elementwise_affine=True)
      (fc1): Linear(in_features=480, out_features=1920, bias=True)
      (fc2): Linear(in_features=1920, out_features=480, bias=True)
      (final_layer_norm): LayerNorm((480,), eps=1e-05, elementwise_affine=True)
    )
    (1): TransformerLayer(
      (self_attn): MultiheadAttention(
        (k_proj): Linear(in_features=480, out_features=480, bias=True)
        (v_proj): Linear(in_features=480, out_features=480, bias=True)
        (

In [None]:
# One single-row frame per chain. They are concatenated once after the loop:
# calling pd.concat inside the loop re-copies every previous row on each
# iteration, which is quadratic in the number of chains.
frames = []
counter = 0

start = time.time()

for chain, seq in key_to_seq_dict.items():
    # Keys start with the 4-character code and end with the chain identifier.
    pdb_code = chain[:4]

    data = [[f"p{counter}", seq]]

    _, _, batch_tokens = batch_converter(data)
    batch_tokens = batch_tokens.to(device)

    # Forward pass on `device` without autograd bookkeeping. Only the layer-12
    # representations are requested; contact maps were never used, so they are
    # no longer computed.
    with torch.no_grad():
        results = model(batch_tokens, repr_layers=[12])

    # NOTE(review): ESM's batch converter normally wraps each sequence in
    # begin/end tokens, in which case this array has len(seq) + 2 rows while
    # `out` has len(seq) entries -- confirm that consumers drop the two extra
    # rows (e.g. with [1:len(seq) + 1]) before aligning `in` with `out`.
    embedding = results["representations"][12][0].cpu().detach().numpy().astype(np.float16)
    domain = cath[pdb_code][chain[-1]]
    target = boundaries(len(seq), domain)

    frames.append(pd.DataFrame({'in':[embedding], 'out':[target], 'seq_len':[len(seq)], 'key':chain}))
    counter += 1
    if counter % 1000 == 0:
        print(f'Written {counter}/{len(key_to_seq_dict)} entries')

    torch.cuda.empty_cache()

# Same empty, object-typed frame the rows used to be appended to one by one.
df = pd.DataFrame(columns=['in', 'out', 'seq_len', 'key'])
df['in'] = df['in'].astype(object)
df['out'] = df['out'].astype(object)
df = pd.concat([df, *frames], ignore_index=True)

end = time.time()
# The message says "train", but the interval measured is the embedding loop.
print(f"Time to train: {end - start}")
print(f'Written {counter}/{len(key_to_seq_dict)} entries')

Written 1000/8497 entries
Written 2000/8497 entries
Written 3000/8497 entries
Written 4000/8497 entries
Written 5000/8497 entries
Written 6000/8497 entries
Written 7000/8497 entries
Written 8000/8497 entries
Time to train: 203.44633269309998
Written 8497/8497 entries


In [None]:
# NOTE: despite the ".csv" suffix this writes a pickle file (DataFrame.to_pickle);
# read it back with pandas.read_pickle, not read_csv.
df.to_pickle('esm35M_data.csv')

# One-hot encoding

In [None]:
def unique_amino_acids(sequences=None):
    """Return the distinct characters found in a collection of sequences.

    Args:
        sequences: iterable of sequence strings. When omitted, the values of
            the notebook-level ``key_to_seq_dict`` are used, which is what
            this function always did before the argument existed.

    Returns:
        The ``str()`` of the sorted list of distinct characters, e.g.
        ``"['A', 'C', 'G']"`` (a string, not a list).
    """
    if sequences is None:
        sequences = key_to_seq_dict.values()
    amino_acids = set()
    for seq in sequences:
        amino_acids.update(seq)
    return str(sorted(amino_acids))


def one_hot_seq(seq):
    """One-hot encode a sequence over a fixed 21-letter alphabet.

    The alphabet is the 20 standard amino-acid letters plus ``X``, in
    alphabetical order. Row ``i`` of the result is the indicator of the
    ``i``-th letter of that alphabet, so the row order is the same in every
    interpreter session. (Building the alphabet from a ``set`` made the order
    depend on string hashing, which can change between sessions, and could
    drop one letter while encoding ``A`` twice.)

    Characters outside the alphabet produce an all-zero column.

    Returns:
        ``np.float16`` array of shape ``(21, len(seq))``.
    """
    amino_acids = "ACDEFGHIKLMNPQRSTVWXY"
    row_of = {aa: row for row, aa in enumerate(amino_acids)}
    encoded = np.zeros((len(amino_acids), len(seq)), dtype=np.float16)
    for col, residue in enumerate(seq):
        row = row_of.get(residue)
        if row is not None:
            encoded[row, col] = 1
    return encoded

In [None]:
# One single-row frame per chain. They are concatenated once after the loop:
# calling pd.concat inside the loop re-copies every previous row on each
# iteration, which is quadratic in the number of chains.
frames = []
counter = 0

for chain, seq in key_to_seq_dict.items():
    # Keys start with the 4-character code and end with the chain identifier.
    pdb_code = chain[:4]
    # Transposed so that rows follow sequence positions.
    encoding = one_hot_seq(seq).T
    domain = cath[pdb_code][chain[-1]]
    target = boundaries(len(seq), domain)

    frames.append(pd.DataFrame({'in':[encoding], 'out':[target], 'seq_len':[len(seq)], 'key':chain}))
    counter += 1
    if counter % 1000 == 0:
        print(f'Written {counter}/{len(key_to_seq_dict)} entries')

# Same empty, object-typed frame the rows used to be appended to one by one.
df = pd.DataFrame(columns=['in', 'out', 'seq_len', 'key'])
df['in'] = df['in'].astype(object)
df['out'] = df['out'].astype(object)
df = pd.concat([df, *frames], ignore_index=True)

print(f'Written {counter}/{len(key_to_seq_dict)} entries')

Written 1000/8497 entries
Written 2000/8497 entries
Written 3000/8497 entries
Written 4000/8497 entries
Written 5000/8497 entries
Written 6000/8497 entries
Written 7000/8497 entries
Written 8000/8497 entries
Written 8497/8497 entries


In [None]:
# NOTE: despite the ".csv" suffix this writes a pickle file (DataFrame.to_pickle);
# read it back with pandas.read_pickle, not read_csv.
df.to_pickle('onehot_data.csv')