In [1]:
import pandas as pd
import numpy as np
import torch
import py3Dmol

from transformers import AutoTokenizer, EsmForProteinFolding
from transformers.models.esm.openfold_utils.protein import to_pdb, Protein as OFProtein
from transformers.models.esm.openfold_utils.feats import atom14_to_atom37

In [2]:
#https://github.com/huggingface/notebooks/blob/main/examples/protein_folding.ipynb
def convert_outputs_to_pdb(outputs):
    """Convert a batch of folding-model outputs into PDB-format strings.

    Args:
        outputs: Mapping of name -> tensor as returned by the folding model.
            The keys read here are "positions", "atom37_atom_exists",
            "aatype", "residue_index", "plddt" and, optionally,
            "chain_index". Tensors may live on any device.

    Returns:
        list: One ``to_pdb(...)`` result per entry along the first axis of
        ``outputs["aatype"]``.
    """
    # The atom14 -> atom37 conversion needs the original tensors, so it must
    # run before `outputs` is replaced by NumPy arrays below.
    # NOTE(review): positions[-1] is presumably the last refinement step — confirm.
    final_atom_positions = atom14_to_atom37(outputs["positions"][-1], outputs)

    # `.cpu()` is required before `.numpy()`: NumPy conversion of a CUDA tensor
    # raises, so without it this function only worked for CPU inference.
    outputs = {k: v.detach().cpu().numpy() for k, v in outputs.items()}
    final_atom_positions = final_atom_positions.detach().cpu().numpy()
    final_atom_mask = outputs["atom37_atom_exists"]

    # Loop-invariant: decide once whether chain information is available.
    has_chain_index = "chain_index" in outputs

    pdbs = []
    for i in range(outputs["aatype"].shape[0]):
        pred = OFProtein(
            aatype=outputs["aatype"][i],
            atom_positions=final_atom_positions[i],
            atom_mask=final_atom_mask[i],
            # Shift the model's residue indices up by one for the PDB record.
            residue_index=outputs["residue_index"][i] + 1,
            # pLDDT is written into the B-factor field.
            b_factors=outputs["plddt"][i],
            chain_index=outputs["chain_index"][i] if has_chain_index else None,
        )
        pdbs.append(to_pdb(pred))
    return pdbs

In [3]:
# Base directory for the notebook's files (the current working directory).
PATH = './'

# Prefer CUDA when PyTorch reports it as available; otherwise fall back to CPU.
# NOTE(review): none of the cells shown here move the model or the tokenized
# inputs onto `device`, so defining it does not by itself enable GPU inference.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [4]:
# NOTE(review): `map_location` is not referenced by any cell visible here.
map_location = torch.device('cpu')

print('Reading data...')
df = pd.read_csv('disorder_full_amino.csv', delimiter=',', header=0)

# Keep only the two extremes of the disorder spectrum: ratio >= 0.7 or <= 0.05.
df = df[df['disorder_ratio'].ge(0.7) | df['disorder_ratio'].le(0.05)]

# After the filter no ratio lies strictly between 0.05 and 0.7, so the 0.1
# cut-off assigns label 1 to exactly the rows with ratio >= 0.7.
labels = np.where(df['disorder_ratio'].ge(0.1), 1, 0)

# NOTE(review): the source column is called 'mRNA', but the displayed rows look
# like amino-acid sequences — confirm the column naming.
classification_df = pd.DataFrame({'text': df['mRNA'], 'label': labels})

# Restrict to sequences of exactly 103 characters. reset_index() (without
# drop=True) keeps the pre-filter row label as an 'index' column.
lengths = list(map(len, classification_df['text']))
classification_df = (
    classification_df
    .assign(len=lengths)
    .loc[lambda frame: frame['len'] == 103]
    .reset_index()
)
classification_df

Reading data...


Unnamed: 0,index,text,label,len
0,377,MADFTLSKSLFSGKYRNASSTPGNIAYALFVLFCFWAGAQLLNLLV...,0,103
1,563,VNISNSQVNRLRHFVRAGLRSLFRPEPQTAVEWADANYYLPKESAY...,0,103
2,1133,MNEQNLKHVIALLLEDKARLQQIEPNAGTEARILLKAQALKTRQAP...,0,103
3,1281,VKYLLIFLLVLAIFVISVTLGAQNDQQVFTNYLLAQGEYRISTLLA...,0,103
4,1616,MKKVLALVVKMGLSSAAFAAETTTTPAPTATTTKAAPKATTHHKKQ...,1,103


In [5]:
# Pick one example sequence per class by position in the filtered table.
# In the displayed output, row 4 carries label 1 and row 3 carries label 0.
# (Rows 2 and 11 were tried earlier as the disordered/ordered examples.)
disordered = classification_df['text'].iloc[4]
ordered = classification_df['text'].iloc[3]

In [6]:
# Preview the selected disordered sequence with its final character sliced off.
disordered[:-1]

'MKKVLALVVKMGLSSAAFAAETTTTPAPTATTTKAAPKATTHHKKQHKAAPAQKAQKAAKHHKNTKAEQKAPEQKAQAAKKHAKKHSHQQPAAAAPAAQPAA'

In [7]:
# Preview the selected ordered sequence with its final character sliced off.
ordered[:-1]

'VKYLLIFLLVLAIFVISVTLGAQNDQQVFTNYLLAQGEYRISTLLAVLFKFAIGWLICGLFWLRVRVSLARAERKIKRLENQLSPATDVAVVPHSSAAAAGE'

In [8]:
print('Tokenizing...')
tokenizer = AutoTokenizer.from_pretrained('facebook/esmfold_v1')

# Tokenize the sequences selected from `classification_df` instead of pasted
# copies of them, so changing the selection changes what gets folded.
# The final character of each sequence is dropped, which reproduces exactly the
# strings that were previously hard-coded here.
d_inputs = tokenizer([disordered[:-1]], return_tensors="pt", add_special_tokens=False, padding=True)
o_inputs = tokenizer([ordered[:-1]], return_tensors="pt", add_special_tokens=False, padding=True)

Tokenizing...


In [9]:
print('Building Model...')
model = EsmForProteinFolding.from_pretrained('facebook/esmfold_v1')

# Inference only: disable autograd so the forward passes do not record a
# computation graph or retain activations that are never back-propagated.
with torch.no_grad():
    d_outputs = model(**d_inputs)
    o_outputs = model(**o_inputs)

Building Model...


Some weights of EsmForProteinFolding were not initialized from the model checkpoint at facebook/esmfold_v1 and are newly initialized: ['esm.contact_head.regression.bias', 'esm.contact_head.regression.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Turn each prediction into its list of PDB strings (disordered first, then ordered).
d_pdb, o_pdb = map(convert_outputs_to_pdb, (d_outputs, o_outputs))

In [11]:
# Show the predicted structure for the disordered sequence as a
# spectrum-coloured cartoon.
view = py3Dmol.view(
    width=800,
    height=400,
    js='https://3dmol.org/build/3Dmol.js',
)
view.addModel("".join(d_pdb), 'pdb')
# Model -1 selects the most recently added model.
view.setStyle(
    {'model': -1},
    {"cartoon": {'color': 'spectrum'}},
)

<py3Dmol.view at 0x7f4749f000d0>

In [12]:
# Show the predicted structure for the ordered sequence as a
# spectrum-coloured cartoon.
view = py3Dmol.view(
    width=800,
    height=400,
    js='https://3dmol.org/build/3Dmol.js',
)
view.addModel("".join(o_pdb), 'pdb')
# Model -1 selects the most recently added model.
view.setStyle(
    {'model': -1},
    {"cartoon": {'color': 'spectrum'}},
)

<py3Dmol.view at 0x7f4574b39160>