## **The Language Model Protein Folder**

For more details, see the [GitHub repository](https://github.com/YaadLuria/NanoFoldLM).

#### **Tips and Instructions**
- Click the little ▶ play icon to the left of each cell below.
- Enter a sequence and choose the model you prefer: one-hot encoding, or a model based on ESM embeddings of different sizes.
- Display the fold visualization as you prefer.
- Compare it to a known PDB structure.
- You can download the output as a PDB file.
- Use a GPU runtime to get shorter run times.

#### **Colab Limitations**
- The 640 and 1280 embedding-size models are not available because of Git storage limitations.

Noa Klugman, Yaad Luria, Elad Applbaum, Eyal Zur

**About The Project**








# Installs and Imports

In [None]:
%%time
#@title import

!pip install import-ipynb
import import_ipynb
import os, time, re
from google.colab import files
import tensorflow as tf
from tensorflow import keras
from IPython.display import clear_output

In [None]:
#@title Clone NanoFoldLM trained model


%%bash
# rm -rf NanoFoldLM
# git clone https://github.com/YaadLuria/NanoFoldLM --quiet

# The NanoFoldLMReady marker file makes this cell idempotent: once the setup
# below has run, re-executing the cell skips it entirely.
if [ ! -f NanoFoldLMReady ]; then
  # install dependencies
  pip -q install biopython
  pip install py3Dmol
  pip install pdb-tools


  # download model
  # Guard on the directory that `git clone` actually creates (NanoFoldLM/).
  # `git clone` refuses to write into an existing non-empty directory, so the
  # clone must be skipped when the checkout is already present.
  if [ ! -d "NanoFoldLM/" ]; then
    git clone https://github.com/YaadLuria/NanoFoldLM --quiet
  fi
  touch NanoFoldLMReady
fi

In [None]:
# %% [code]
#@title Test git use
# Switch the working directory into the cloned package folder. Presumably this
# is what lets the bare `import utils` below resolve to the repository's utils
# module -- confirm against the repository layout.
os.chdir("/content/NanoFoldLM/NanoFoldLM")
import utils
# Earlier import approach (via sys.path), kept for reference:
# if "NanoFoldLM" not in dir():
#   from timeit import default_timer as timer
#   import sys
#   sys.path.insert(0, '/content/NanoFoldLM/')
#   from NanoFoldLM import *

# Smoke test: printing a constant from utils shows the clone and import worked.
print(utils.NB_MAX_LENGTH) # just for git clone check

# Set Parameters

In [6]:
jobname = "6xw6640" #@param {type:"string"}
# Keep only word characters in the job name, then cap it at 50 characters.
jobname = re.sub(r'\W+', '', jobname)
jobname = jobname[:50]


download_path = "/drive/MyDrive/Hackathon3D" #@param {type:"string"}

sequence = "QVQLQESGGGLVQAGDSLRVSCAASGRTISSSPMGWFRQAPGKEREFVAAISGNGGNTYYLDSVKGRFTTSRDNAKNTVYLQLNNLKPEDTAIYYCAARSRFSAMHLAYRRLVDYDDWGQGTQVTVSSHHHHHH" #@param {type:"string"}
# Normalise the raw input step by step:
#   1. treat "/" as the ":" separator and upper-case everything,
#   2. drop every character that is not A-Z or ":",
#   3. collapse runs of ":" into a single ":",
#   4. strip separators from both ends.
sequence = sequence.replace("/",":").upper()
sequence = re.sub("[^A-Z:]", "", sequence)
sequence = re.sub(":+",":",sequence)
sequence = re.sub("[:]+$","",re.sub("^[:]+","",sequence))



# Report (without aborting) every character outside the 20 standard
# one-letter amino-acid codes, together with its 0-based position.
amino_acids = set("ACDEFGHIKLMNPQRSTVWY")
for i,aa in enumerate(sequence):
  if aa.upper() in amino_acids:
    continue
  print(f"Invalid amino acid: {aa} in position {i}")

# Split on ":" and record the length of every part plus the overall total.
seqs = sequence.split(":")
lengths = list(map(len, seqs))
length = sum(lengths)



download_pdb = True #@param {type:"boolean"}
print_atom_coordinates = False #@param {type:"boolean"}
embeding_model = "480" #@param ["320", "480", "One_hot_encoding"] {type:"string"}
# The later cells read the choice through `embeding_size`.
embeding_size = embeding_model
# The "640" and "1280" options are intentionally not offered in the form above.




# Run model

In [None]:
%%time

# Build the model input: a one-hot encoding when that option was selected,
# otherwise an ESM embedding of the selected size.
seq_embeddings = (
  utils.generate_input_one_hot(sequence)
  if embeding_size ==  "One_hot_encoding"
  else utils.get_esm_embedding_for_protein_sequence(sequence, int(embeding_size))
)

# Saved models live under ./Models, one file per input representation.
model_path = "./Models"
model_dict = {
  "320": "model_320",
  "480": "model_480",
  "640": "model_640",
  "1280": "model_1280",
  "One_hot_encoding": "model_22",
}
model_file = model_dict[embeding_size]
model = tf.keras.models.load_model(f'{model_path}/{model_file}.tf')
import numpy as np
print(seq_embeddings.shape)

# Run the network and hand the first prediction to the PDB writer.
# NOTE(review): utils.matrix_to_pdb presumably writes "<jobname>.pdb", which
# the later cells open -- confirm in utils.
predict = model.predict(seq_embeddings)
utils.matrix_to_pdb(sequence, predict[0], jobname)



In [None]:
#@title Download predicted pdb (optional)
# When the `download_pdb` form toggle is on, send the predicted structure file
# "<jobname>.pdb" to the user's browser through the Colab files helper.
if download_pdb:
  from google.colab import files
  files.download(f"{jobname}.pdb")
  # files.download(f"{pdb_id}_{chain_id}")





# Display (optional)

In [None]:

import py3Dmol
# File name of the predicted structure for the current job.
# NOTE(review): presumably produced by utils.matrix_to_pdb in the "Run model"
# cell -- confirm.
pdb_predict = f"{jobname}.pdb"
# pdb_file_path =  "/content/drive/MyDrive/Hackathon3D/Datasets/Train_set/12E8_1.pdb"

# Load the whole predicted PDB file as text; the viewer and the
# "print atom coordinates" cell both consume `pdb_str`.
with open(pdb_predict, 'r') as pdb_predicts:
  pdb_str = pdb_predicts.read()


from string import ascii_uppercase, ascii_lowercase
# 52 single-letter labels: A-Z followed by a-z.
alphabet_list = list(ascii_uppercase+ascii_lowercase)
# Palette of 40 hex colours. Neither this list nor `alphabet_list` is
# referenced by the code visible in this section of the notebook.
pymol_color_list = ["#33ff33","#00ffff","#ff33cc","#ffff00","#ff9999","#e5e5e5","#7f7fff","#ff7f00",
                    "#7fff7f","#199999","#ff007f","#ffdd5e","#8c3f99","#b2b2b2","#007fff","#c4b200",
                    "#8cb266","#00bfbf","#b27f7f","#fcd1a5","#ff7f7f","#ffbfdd","#7fffff","#ffff7f",
                    "#00ff7f","#337fcc","#d8337f","#bfff3f","#ff7fff","#d8d8ff","#3fffbf","#b78c4c",
                    "#339933","#66b2b2","#ba8c84","#84bf00","#b24c66","#7f7f7f","#3f3fa5","#a5512b"]



def show_pdb(pdb_str, show_sidechains=False, display='cartoons',
             color="spectrum", chains=None, vmin=50, vmax=90,
             size=(800,480), hbondCutoff=4.0,
             Ls=None,
             animate=False):
  """Build a py3Dmol view for a structure given as PDB-format text.

  Args:
    pdb_str: Contents of a PDB file as a string.
    show_sidechains: When true, add stick/sphere styles for side-chain atoms
      on top of the base style.
    display: 'cartoons' sets a cartoon style coloured by `color`; 'atoms'
      adds sticks for the backbone atoms C, O, N and CA. Any other value
      adds no base style.
    color: Value passed as the cartoon 'color' option; only used when
      `display` is 'cartoons'.
    chains, vmin, vmax, Ls: Not used by this function. They are kept in the
      signature so existing calls that pass them keep working.
    size: (width, height) forwarded to `py3Dmol.view`.
    hbondCutoff: Forwarded in the options dict when the model is added.
    animate: When true, load the text with `addModelsAsFrames` and start the
      animation; otherwise load it as a single model.

  Returns:
    The configured py3Dmol view; call `.show()` on it to render.
  """
  view = py3Dmol.view(js='https://3dmol.org/build/3Dmol.js', width=size[0], height=size[1])

  model_options = {'hbondCutoff': hbondCutoff}
  if animate:
    view.addModelsAsFrames(pdb_str, 'pdb', model_options)
  else:
    view.addModel(pdb_str, 'pdb', model_options)

  # The same thin white-carbon stick style is used for every stick selection.
  stick_style = {'stick': {'colorscheme': "WhiteCarbon", 'radius': 0.3}}

  if display == 'cartoons':
    view.setStyle({'cartoon': {'color': color}})
  elif display == 'atoms':
    view.addStyle({'atom': ['C', 'O', 'N', 'CA']}, stick_style)

  if show_sidechains:
    backbone = ['C', 'O', 'N']
    # Every residue except GLY/PRO: sticks for all non-backbone atoms.
    view.addStyle({'and': [{'resn': ["GLY", "PRO"], 'invert': True},
                           {'atom': backbone, 'invert': True}]},
                  stick_style)
    # GLY: mark the CA atom with a sphere.
    view.addStyle({'and': [{'resn': "GLY"}, {'atom': 'CA'}]},
                  {'sphere': {'colorscheme': "WhiteCarbon", 'radius': 0.3}})
    # PRO: sticks for everything except C and O.
    view.addStyle({'and': [{'resn': "PRO"}, {'atom': ['C', 'O'], 'invert': True}]},
                  stick_style)

  view.zoomTo()

  if animate:
    view.animate()
  return view

# Colab form fields controlling how the predicted structure is rendered.
# color = "chain" #@param ["confidence", "rainbow", "chain"]
# NOTE(review): in an IPython/Colab session the name `display` normally refers
# to the IPython display helper; assigning a string here shadows it for the
# rest of the session -- confirm nothing later relies on the helper.
display = "cartoons" #@param ["atoms", "cartoons"]
color = "spectrum" #@param ["red", "green", "blue", "yellow", "orange", "spectrum"]
# NOTE(review): "confidence" is not among the options offered by the form
# above, so this mapping only triggers if the value is edited by hand.
if color == "confidence": color = "pLDDT"
show_sidechains = False #@param {type:"boolean"}
# Render the predicted structure that was read into `pdb_str`.
show_pdb(pdb_str, color=color,
         show_sidechains=show_sidechains,
         display=display,
         Ls=lengths).show()


In [10]:
#@title print atom coordinates
# Dump the raw text of the predicted PDB file when the form toggle is on.
if print_atom_coordinates:
  print(pdb_str)

In [None]:
#@title Compare to known pdb structure (optional) {run: "auto"}

# Form fields: whether a reference structure exists, and which PDB entry and
# chain to show next to the prediction.
have_solved_structure = True #@param {type:"boolean"}
pdb_id = "6xw6" #@param {type:"string"}
chain_id = "C" #@param {type:"string"}
if have_solved_structure:
  # NOTE(review): `atoms` is opened as a file path below, so the helper
  # presumably returns (sequence, path to a PDB file) -- confirm in utils.
  # `seq` is not used in this cell.
  seq, atoms = utils.get_seq_aa_by_id(pdb_id, chain_id)


  print("********pdb_test********")
  # Read the reference structure as text for the viewer.
  with open(atoms, 'r') as pdb_:
    pdb_test = pdb_.read()
  # print(pdb_test)

  # Render the reference with the same display settings as the prediction.
  # `Ls` still carries the lengths computed from the input sequence.
  show_pdb(pdb_test, color=color,
        show_sidechains=show_sidechains,
        display=display,
        Ls=lengths).show()
