In [2]:
# %pip install numpy
# %pip install torch
# %pip install tqdm
# %pip install biopython

In [11]:
import os
import unittest
import tempfile
import numpy as np
import torch
from tqdm import tqdm # for progress bar
import Bio.PDB

In [31]:
def process_fasta_files(folder_name):
    """Read every FASTA file in ``folder_name`` into ``(id, sequence, name)`` tuples.

    Each file is expected to hold one record whose header follows the layout
    ``>id|chains|name|optional:species``. Sequence lines are upper-cased and
    joined, so a sequence wrapped over several lines is read in full. If a
    file holds several records, the last one is kept.

    Files are visited in sorted name order so the result is deterministic, and
    sub-directories are ignored. Files that are empty, have a header with
    fewer than three fields, or lack an id, name or sequence are skipped with
    a printed message.

    Args:
        folder_name: path of the directory containing the FASTA files.

    Returns:
        A ``(data, lengths)`` pair. ``data`` is a list of
        ``(id, sequence, name)`` tuples and ``lengths[i] == len(data[i][1])``.
    """
    data = []
    lengths = []
    # os.listdir gives no ordering guarantee; sort for reproducible output
    for file in sorted(os.listdir(folder_name)):
        path = os.path.join(folder_name, file)
        if not os.path.isfile(path):
            # e.g. a checkpoint folder created by Jupyter; open() would fail on it
            continue
        with open(path, 'r') as fastaFile:
            lines = fastaFile.readlines()
        if not lines: # empty file
            print(f"Skipped {file} as it is empty")
            continue

        protein_id = ""
        name = ""
        seq_parts = []
        header_malformed = False
        for line in lines:
            line = line.strip()
            if not line:
                # blank lines carry no sequence data
                continue
            if line.startswith(">"):
                # a new record starts: forget whatever belonged to the previous one
                seq_parts = []
                contents = line.split("|")
                # std format is id|chains|name|optional:species
                if len(contents) < 3:
                    protein_id, name = "", ""
                    header_malformed = True
                    print(f"Skipped {file} as it is missing required metadata")
                    continue
                header_malformed = False
                protein_id = contents[0][1:]
                name = contents[2]
            else:
                # sequences may be wrapped over several lines
                seq_parts.append(line.upper())

        seq = "".join(seq_parts)
        if protein_id != "" and seq != "" and name != "":
            data.append((protein_id, seq, name))
            lengths.append(len(seq))
        elif not header_malformed:
            # a malformed header has already been reported above
            print(f"Skipped {file} as it is missing required data")
    return data,lengths

In [32]:
class TestProcessFastaFiles(unittest.TestCase):
    def setUp(self):
        self.test_dir = tempfile.TemporaryDirectory()

    def tearDown(self):
        self.test_dir.cleanup()
        
    def make_test_file(self, filename, fasta):
        file = os.path.join(self.test_dir.name, filename)
        with open(file, 'w') as fastaFile:
            fastaFile.write(fasta)
        return file

    def test_empty_file(self):
        self.make_test_file("empty.fasta", "")
        data, lengths = process_fasta_files(self.test_dir.name)
        # protein should not be created
        self.assertEqual(len(data),0) 
        self.assertEqual(lengths,[])

    def test_standard_file(self):
        self.make_test_file("1ABC.fasta", ">1ABC|Chain A|Protein A-B-C\nABCDEFGHIJKLMNOPQRSTUVWXYZ\n")
        data, lengths = process_fasta_files(self.test_dir.name)
        self.assertEqual(len(data),1)
        self.assertEqual(lengths,[26])
        self.assertEqual(data[0][0],"1ABC") # id
        self.assertEqual(data[0][1],"ABCDEFGHIJKLMNOPQRSTUVWXYZ") # sequence
        self.assertEqual(data[0][2],"Protein A-B-C") # protein name
        
    def test_multiple_files(self):
        self.make_test_file("1ABC.fasta", ">1ABC|Chain A|Protein A-B-C\nABCDEFGHIJKLMNOPQRSTUVWXYZ\n")
        self.make_test_file("2DEF.fasta", ">2DEF|Chains A, B, C, D|Protein D-E-F|Homo Sapiens (9606)\nABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ\n")
        self.make_test_file("3GHI.fasta", ">3GHI|Chains A,B|Protein G-H-I\nABCDEFGHIJKLMNOPQRSTUVWXYZ\n")
        data, lengths = process_fasta_files(self.test_dir.name)
        self.assertEqual(len(data),3)
        self.assertEqual(lengths,[26,52,26])
        self.assertEqual(data[0][0],"1ABC") # id
        self.assertEqual(data[0][1],"ABCDEFGHIJKLMNOPQRSTUVWXYZ") # sequence
        self.assertEqual(data[0][2],"Protein A-B-C") # protein name
        self.assertEqual(data[1][0],"2DEF") # id
        self.assertEqual(data[1][1],"ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ") # sequence
        self.assertEqual(data[1][2],"Protein D-E-F") # protein name
        self.assertEqual(data[2][0],"3GHI") # id
        self.assertEqual(data[2][1],"ABCDEFGHIJKLMNOPQRSTUVWXYZ") # sequence
        self.assertEqual(data[2][2],"Protein G-H-I") # protein name
        
# If the unit tests are run, remember to delete the .ipynb folder in your sequence folder, which is created as a result of the temporary test folder.
#unittest.main(argv=['first-arg-is-ignored'], exit=False)

...
----------------------------------------------------------------------
Ran 3 tests in 0.016s

OK


Skipped empty.fasta as it is empty


<unittest.main.TestProgram at 0x12840323310>

In [1]:
# retrieve the first model from the pdb file
# library: https://biopython.org/docs/1.75/api/Bio.PDB.Structure.html
# library code: https://github.com/biopython/biopython/blob/master/Bio/PDB/
def process_pdb_files(folder_name):
    """Parse every PDB file in ``folder_name`` and keep the first model of each.

    Files are visited in sorted name order and sub-directories are skipped,
    since the parser can only read regular files.

    Args:
        folder_name: path of the directory containing the PDB files.

    Returns:
        dict mapping the upper-cased file name (extension removed) to
        ``structure[0]``, the first model of the parsed structure.
    """
    models = {}
    parser = Bio.PDB.PDBParser() # one parser instance serves every file
    for file in sorted(os.listdir(folder_name)):
        path = os.path.join(folder_name, file)
        if not os.path.isfile(path):
            continue
        protein_name = os.path.splitext(file)[0].upper()
        structure = parser.get_structure(protein_name, path)
        models[protein_name] = structure[0]
    return models

In [None]:
# retrieve all residues from pdb file 
def process_pdb_files_by_residues(folder_name):
    """Parse every PDB file in ``folder_name`` and collect its residues.

    Files are visited in sorted name order and sub-directories are skipped,
    since the parser can only read regular files.

    Args:
        folder_name: path of the directory containing the PDB files.

    Returns:
        dict mapping the upper-cased file name (extension removed) to the list
        of residues yielded by ``structure.get_residues()``.
    """
    proteins = {}
    parser = Bio.PDB.PDBParser() # one parser instance serves every file
    for file in sorted(os.listdir(folder_name)):
        path = os.path.join(folder_name, file)
        if not os.path.isfile(path):
            continue
        protein_name = os.path.splitext(file)[0].upper()
        structure = parser.get_structure(protein_name, path)
        # get_residues() is a generator, so materialise it into a list
        proteins[protein_name] = list(structure.get_residues())
    return proteins

In [None]:
# retrieve all chains from pdb files 
def process_pdb_files_by_chains(folder_name):
    """Parse every PDB file in ``folder_name`` and collect its chains.

    Files are visited in sorted name order and sub-directories are skipped,
    since the parser can only read regular files.

    Args:
        folder_name: path of the directory containing the PDB files.

    Returns:
        dict mapping the upper-cased file name (extension removed) to the list
        of chains yielded by ``structure.get_chains()``.
    """
    proteins = {}
    parser = Bio.PDB.PDBParser() # one parser instance serves every file
    for file in sorted(os.listdir(folder_name)):
        path = os.path.join(folder_name, file)
        if not os.path.isfile(path):
            continue
        protein_name = os.path.splitext(file)[0].upper()
        structure = parser.get_structure(protein_name, path)
        # get_chains() is a generator, so materialise it into a list
        proteins[protein_name] = list(structure.get_chains())
    return proteins

In [2]:
#Source: https://github.com/zzhangzzhang/pLMs-interpretability/blob/main/jac/utils.py
def do_apc(x, rm=1):
  '''Return a corrected copy of matrix ``x``; the input is left untouched.

  rm=0  -> plain copy, nothing removed (the diagonal is kept as is)
  rm=1  -> average product correction: subtract
           (column sums * row sums) / total sum
  other -> rebuild the matrix from its singular value decomposition with
           the ``rm`` largest components dropped
  For every ``rm`` other than 0 the diagonal of the result is set to 0.
  '''
  mat = np.copy(x)
  if rm == 0:
    return mat

  if rm == 1:
    col_sums = mat.sum(0, keepdims=True)
    row_sums = mat.sum(1, keepdims=True)
    total = mat.sum()
    corrected = mat - (col_sums * row_sums) / total
  else:
    # np.linalg.svd returns the singular values in descending order
    u, s, vh = np.linalg.svd(mat)
    scaled_u = s[rm:] * u[:, rm:]
    corrected = scaled_u @ vh[rm:, :]

  np.fill_diagonal(corrected, 0)
  return corrected

In [3]:
# Source: https://github.com/zzhangzzhang/pLMs-interpretability/blob/main/jac/01_jac_calculate_visualise.ipynb 
def get_categorical_jacobian(x,ln,model,device='cpu'):
  '''Change of the output logits under every single-position substitution.

  For each of the ``ln`` positions the input token is replaced by each of the
  token ids 4..23 in turn (one row of a batch of 20), the model is evaluated,
  and the logits of the unmodified input are subtracted.

  x      : token tensor; the code tiles it to 20 rows and indexes columns
           1..ln, so it is presumably of shape (1, ln+2) with one extra token
           on each side -- TODO confirm against the tokenizer in use
  ln     : number of sequence positions to mutate
  model  : callable returning a mapping with a "logits" tensor
  device : device the inputs are moved to before calling ``model``; the model
           itself is not moved here

  Returns a numpy array of shape (ln, 20, ln, 20) indexed as
  [mutated position, substituted token, output position, output logit].
  '''
  # ∂out/∂in
  with torch.no_grad():
    # keep only positions 1..ln and logit columns 4..23
    f = lambda x: model(x)["logits"][...,1:(ln+1),4:24].cpu().numpy()
    fx = f(x.to(device))[0]
    x = torch.tile(x,[20,1]).to(device)
    fx_h = np.zeros((ln,20,ln,20))
    with tqdm(total=ln) as pbar:
      for n in range(ln): # for each position
        x_h = torch.clone(x)
        x_h[:,n+1] = torch.arange(4,24,device=x_h.device) # mutate to all 20 aa
        fx_h[n] = f(x_h)
        pbar.update(1) # advance once per position so the bar tracks progress
    return fx_h - fx

In [4]:
# Source: https://github.com/zzhangzzhang/pLMs-interpretability/blob/main/jac/utils.py
def get_contacts(x, symm=True, center=True, rm=1):
  '''Collapse a jacobian of shape (L,A,L,A) into an (L,L) contact map.

  x      : 4-d array; a copy is taken, so the input is not modified
  center : subtract the mean along each of the four axes, one after another
  rm     : forwarded to ``do_apc`` as its ``rm`` argument
  symm   : average the corrected map with its transpose
  '''
  jac = x.copy()
  if center:
    for axis in (0, 1, 2, 3):
      jac -= jac.mean(axis, keepdims=True)

  # root of the summed squares over axes 1 and 3 leaves an (L,L) matrix
  norms = np.sqrt(np.square(jac).sum((1, 3)))
  np.fill_diagonal(norms, 0)

  contacts = do_apc(norms, rm=rm)
  if not symm:
    return contacts
  return (contacts + contacts.T) / 2