# Enhancing File Accessibility and Operations in Colab Notebooks through Google Drive Integration

In [None]:
# Mount Google Drive at /content/drive so that the cells below can reach
# files stored in Drive through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Change into the "github" folder of the mounted Drive; the path is relative
# to the current working directory.
%cd ./drive/MyDrive/github

In [None]:
# Enter the "infrastructure" sub-directory of the current folder.
%cd infrastructure

# Setting the load requirements of the ESM model

In [None]:
# Install or upgrade the third-party packages imported by the next cell
# (transformers, accelerate and the cuda-python bindings).
!pip install --upgrade transformers accelerate cuda-python

In [None]:
from transformers import (
    EsmForMaskedLM,
    AutoTokenizer,
    )
from accelerate import (
    init_empty_weights
)
import numpy as np
import torch
from cuda import (
    cuda, 
    nvrtc
)
import random

from typing import Dict, Iterable, List, Optional, Tuple, Union
import time
import os
import argparse
import random

from dataclasses import dataclass

In [None]:
# Load the tokenizer and the masked-language-model weights of the
# "facebook/esm2_t6_8M_UR50D" checkpoint.
# NOTE(review): the following cell rebinds both `tokenizer` and `model` to the
# "facebook/esmfold_v1" checkpoint, so these two objects are replaced as soon
# as that cell runs.
tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")
model = EsmForMaskedLM.from_pretrained("facebook/esm2_t6_8M_UR50D")

In [None]:
from transformers import (
    AutoTokenizer,
    EsmForProteinFolding
)

# Bind `tokenizer` and `model` to the ESMFold checkpoint. The later cells rely
# on this model: they use `model.esm`, `model.trunk` and `model.infer_pdb`.
tokenizer = AutoTokenizer.from_pretrained("facebook/esmfold_v1")
model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1")

In [None]:
# Colab form toggle: tick the box to discard the currently loaded model.
delete_model = False #@param {type:"boolean"}
if delete_model:
    # Drop the notebook's reference to the model, then release the GPU memory
    # that PyTorch keeps cached but no longer uses.
    del model
    torch.cuda.empty_cache()

## Higher performance

In [None]:
# Colab form toggle: when ticked, a later cell writes the predicted structure
# to "result_high_performance.pdb".
Save_Higher_performance_Results = False #@param {type:"boolean"}
# Put the model in evaluation mode, move it to the GPU and release cached GPU
# memory that is no longer in use.
model.eval()
model.cuda()
torch.cuda.empty_cache()
# Use a chunk size of 64 in the folding trunk (per this notebook's notes, this
# divides the input into smaller pieces).
model.trunk.set_chunk_size(64)

## Optimizations

- Set half-precision: model.esm.half()
- Increasing the processing speed: enabling TensorFloat32 (this action will not have any effect if the hardware is incompatible with TensorFloat32)
- Ensuring the PyTorch model is ready for inference by disabling certain training behaviors and gradient calculations: model.eval()
- Once the model and its tokenizer have been loaded, transferring the model to the GPU: model.cuda()
- Freeing up space and improving the GPU's performance: torch.cuda.empty_cache()
- Dividing the input data into smaller chunks: model.trunk.set_chunk_size(64)

In [None]:
# Convert the `esm` sub-module of the model to half precision.
model.esm = model.esm.half()
# Allow TensorFloat32 matrix multiplications on CUDA (per the notes above, this
# has no effect on hardware without TensorFloat32 support).
torch.backends.cuda.matmul.allow_tf32 = True
# Same inference setup as in the "Higher performance" cell: evaluation mode,
# model on the GPU, cached GPU memory released, trunk chunk size of 64.
model.eval()
model.cuda()
torch.cuda.empty_cache()
model.trunk.set_chunk_size(64)

In [None]:
# Display the class of the object currently bound to `model`.
type(model)

# Processing the folding of a singular protein chain

In [None]:
!wget https://rest.uniprot.org/uniprotkb/P68871.fasta

## Module to seamlessly handle tasks such as reading, writing, and manipulating protein sequences

In [None]:
!pip install Bio
from Bio import SeqIO

In [None]:
# The downloaded file is expected to hold exactly one sequence. SeqIO.read
# raises ValueError when the file contains zero or several records, rather
# than leaving `protein_sequence` undefined or silently keeping only the last
# record of a multi-record file.
record = SeqIO.read("P68871.fasta", "fasta")
print(record)
protein_sequence = str(record.seq)

In [None]:
from collections import Counter
import matplotlib.pyplot as plt

# Tally every symbol of the sequence, then discard the '*' entry if present
# (presumably a terminator symbol rather than a residue -- confirm).
common_amino = Counter(protein_sequence)
common_amino.pop('*', None)

# Rebuild the tally as a dict ordered from the rarest to the most frequent
# symbol.
sorted_common_amino = {
    amino: count
    for amino, count in sorted(common_amino.items(), key=lambda pair: pair[1])
}
print(f"Amino acids frequency: {sorted_common_amino}")
print(f"Total number of amino acids: {sum(common_amino.values())}")

# "F" is highlighted in yellow; every other bar is green.
bar_colors = ['yellow' if amino == "F" else 'green' for amino in sorted_common_amino]

plt.bar(sorted_common_amino.keys(), sorted_common_amino.values(), color=bar_colors)
plt.title('Protein Sequence Frequency')
plt.xlabel('Amino Acid')
plt.ylabel('Frequency')
# One tick per integer count, from 0 up to the highest frequency.
plt.yticks(range(max(common_amino.values()) + 1))

plt.show()

## Tokenizer, PDB extension for the inference & 3D outputs

In [None]:
# Tokenize the sequence as PyTorch tensors without special tokens, then keep
# only the token ids and move them to the GPU.
encoded_sequence = tokenizer(
    protein_sequence,
    add_special_tokens=False,
    return_tensors="pt",
)
inputs_ids = encoded_sequence['input_ids'].cuda()

In [None]:
# Predict the structure straight from the raw sequence string with gradient
# tracking disabled; `output` is later written to a .pdb file and loaded into
# the 3D viewer as PDB text.
with torch.no_grad():
    output = model.infer_pdb(protein_sequence)

In [None]:
# Write the predicted structure to disk only when the Colab toggle is enabled.
if Save_Higher_performance_Results:
    with open("result_high_performance.pdb", "w") as f:
        f.write(output)

In [None]:
!pip install --upgrade py3Dmol
import py3Dmol

# Create an 800x400 viewer and load the predicted structure as PDB text.
view = py3Dmol.view(js='https://3dmol.org/build/3Dmol.js', width=800, height=400)
view.addModel("".join(output), 'pdb')
# Draw model -1 (presumably the most recently added model -- confirm) as a
# cartoon coloured with the 'spectrum' scheme.
view.setStyle({'model': -1}, {"cartoon": {'color': 'spectrum'}})
# Add sticks with the 'yellowCarbon' scheme on residues named PHE ...
view.addStyle({'resn':'PHE'}, {'stick': {'colorscheme':'yellowCarbon'}})
# ... and default sticks on whatever lies within distance 5 of those residues.
view.addStyle({'within':{'distance':'5', 'sel':{'resn':'PHE'}}}, {'stick': {}})
view.zoomTo()

In [None]:
# Run the model on the token ids with gradient tracking disabled; the
# following cells read the 'plddt' entry of the result.
with torch.no_grad():
    model_output = model(inputs_ids)

In [None]:
# The colour limits below assume plDDT values no larger than 1.0. Print the
# check so its result is actually visible: a bare comparison in the middle of
# a cell is evaluated and then thrown away.
print(f"Max plDDT <= 1.0: {bool(torch.max(model_output['plddt']) <= 1.0)}")
print(model_output['plddt'])
print(model_output['plddt'].shape)
# Lower and upper bounds of the colour gradient.
vmin = 0.5
vmax = 0.95

# Recolour the cartoon by the "b" atom property with a "roygb" gradient
# clamped to [vmin, vmax], then re-centre and render the viewer.
view.setStyle({"cartoon": {"colorscheme": {"prop":"b","gradient": "roygb","min": vmin,"max": vmax}}})
view.zoomTo()
view.show()

In [None]:
def plot_plddt(plddt_tensor):
    """Plot the mean plDDT per position and summarise the lowest scores.

    Args:
        plddt_tensor: Tensor exposing ``squeeze()``, ``cpu()`` and ``numpy()``.
            Assumed to be 2-D once singleton dimensions are squeezed out, with
            positions along axis 0 -- TODO confirm against the model output.

    Returns:
        A ``(min_values, min_indices, mean_value)`` tuple: the two lowest
        per-position means, the positions they occur at, and the mean over
        all per-position means.
    """
    values = plddt_tensor.squeeze().cpu().numpy()
    # Collapse axis 1 so that every position is described by a single score.
    per_position = values.mean(axis=1)

    # Ascending order puts the two lowest-scoring positions first.
    lowest_two = np.argsort(per_position)[:2]

    plt.figure(figsize=(10, 4), dpi=100)
    plt.plot(per_position)
    plt.title('Predicted lDDT')
    plt.xlabel('Position')
    plt.ylabel('plDDT')
    plt.ylim(0, 1.0)  # Keep the y-axis fixed to the 0-1.0 range
    plt.show()

    return per_position[lowest_two], lowest_two, per_position.mean()

# Plot the plDDT scores from the forward pass and print a short summary.
plddt_tensor = model_output['plddt']
# plot_plddt returns the two lowest per-position means together with their
# indices, so `min_plddt_value` and `min_index` each hold two entries.
min_plddt_value, min_index, mean_value = plot_plddt(plddt_tensor)
print(f"Minimum plDDT value: {min_plddt_value}")
print(f"Position of minimum plDDT value: {min_index}")
print(f"Mean plDDT value: {mean_value}")

In [None]:
# Add a grey van der Waals surface at 0.85 opacity over everything except
# residues named UH7 or DMS, then re-centre and render the viewer.
view.addSurface(
    py3Dmol.VDW,
    {'opacity': 0.85, 'color': 'grey'},
    {'not': {'or': [{'resn': 'UH7'}, {'resn': 'DMS'}]}},
)
view.zoomTo()
view.show()

In [None]:
# Three unlinked viewers side by side, each showing the predicted structure
# with a different representation.
view = py3Dmol.view(js='https://3dmol.org/build/3Dmol.js', width=1500, height=500,
                    linked=False, viewergrid=(1, 3))
# Calls without a `viewer=` argument apply to every viewer in the grid, so the
# model is added exactly once. Removing and re-adding it after styling would
# throw away the stick style of the first panel and stack a duplicate model in
# the other two.
view.addModel("".join(output), 'pdb')
view.setViewStyle({'style':'outline','color':'black','width':0.1})
# Panel (0,0): sticks with the 'greenCarbon' scheme.
view.setStyle({'stick':{'colorscheme':'greenCarbon'}},viewer=(0,0))
# Panel (0,1): cartoon coloured with the 'spectrum' scheme.
view.setStyle({'cartoon':{'color':'spectrum'}},viewer=(0,1))
# Panel (0,2): cartoon coloured by the "b" atom property between vmin and vmax.
view.setStyle({"cartoon": {"colorscheme": {"prop":"b","gradient": "roygb","min": vmin,"max": vmax}}},viewer=(0,2))
view.spin()
view.zoomTo()
view.show()