<a href="https://colab.research.google.com/github/yc386/anubis_palaeoproteomics/blob/main/get_dihedrals.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Use Biopython to calculate dihedrals (phi, psi angles), SASA and atom distance from a .pdb file

In [None]:
#@title install libs and functions

!pip install biopython


import pandas as pd
import Bio.PDB
from Bio.PDB.SASA import ShrakeRupley
import math
from functools import reduce

# Lookup from three-letter residue names to one-letter codes, listed
# alphabetically by three-letter name. 'HOH' is not an amino acid; it is
# mapped to the label 'Water' so that water residues read explicitly in
# any text built from these codes.
AA = {
    'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D',
    'CYS': 'C', 'GLN': 'Q', 'GLU': 'E', 'GLY': 'G',
    'HIS': 'H', 'ILE': 'I', 'LEU': 'L', 'LYS': 'K',
    'MET': 'M', 'PHE': 'F', 'PRO': 'P', 'SER': 'S',
    'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V',
    'HOH': 'Water',
}

#get phi and psi angles from a .pdb file

def get_phi_psi(pdb_file_path, chain_id='A'):
  """Return the backbone phi/psi angles (in degrees) of one chain of a PDB file.

  Only the first model of the structure is read.

  Args:
    pdb_file_path: Path to the .pdb file.
    chain_id: Identifier of the chain to analyse (defaults to 'A').

  Returns:
    A pandas DataFrame with one row per residue of the chain and the columns
    'Residue' (three-letter name), 'Residue_ID' (residue sequence number),
    'Phi' and 'Psi'. An angle is None wherever Biopython reports no value.
    The columns are present even when the chain yields no rows.

  Raises:
    KeyError: If the first model has no chain called `chain_id`.
  """
  # Only `import Bio.PDB` is in scope at file level, so the parser has to be
  # reached through the package; a bare `PDBParser` raises NameError.
  parser = Bio.PDB.PDBParser(QUIET=True)
  structure = parser.get_structure("structure", pdb_file_path)
  chain = structure[0][chain_id]

  poly = Bio.PDB.Polypeptide.Polypeptide(chain)

  # get_phi_psi_list() yields one (phi, psi) pair in radians per residue of
  # `poly`, in the same order, so the two sequences can be walked together.
  phi_psi_data = [
      {
          'Residue': residue.get_resname(),
          'Residue_ID': residue.get_id()[1],
          'Phi': math.degrees(phi) if phi is not None else None,
          'Psi': math.degrees(psi) if psi is not None else None,
      }
      for residue, (phi, psi) in zip(poly, poly.get_phi_psi_list())
  ]

  # Fixing the columns keeps the frame mergeable on Residue/Residue_ID even
  # when no rows were collected.
  return pd.DataFrame(phi_psi_data,
                      columns=['Residue', 'Residue_ID', 'Phi', 'Psi'])

#get SASA from a .pdb file

def get_SASA(pdb_file_path, chain_id='A'):
  """Return the per-residue solvent accessible surface area of one chain.

  The Shrake-Rupley calculation is run on the first model only and the
  values are assigned at residue level.

  Args:
    pdb_file_path: Path to the .pdb file.
    chain_id: Identifier of the chain to report (defaults to 'A').

  Returns:
    A pandas DataFrame with the columns 'Residue' (three-letter name),
    'Residue_ID' (residue sequence number) and 'SASA', one row per residue
    of the chain that carries a `sasa` value. The columns are present even
    when no rows were collected.

  Raises:
    KeyError: If the first model has no chain called `chain_id`.
  """
  # Only `import Bio.PDB` is in scope at file level, so the parser has to be
  # reached through the package; a bare `PDBParser` raises NameError.
  parser = Bio.PDB.PDBParser(QUIET=True)
  structure = parser.get_structure("structure", pdb_file_path)
  model = structure[0]
  chain = model[chain_id]

  # Compute on the model rather than the whole structure: in a multi-model
  # file the atoms of every model would otherwise take part in the same
  # calculation as model 0. For single-model files the result is unchanged.
  sr = ShrakeRupley()
  sr.compute(model, level="R")

  sasa_data = []
  for residue in chain:
    try:
      sasa = residue.sasa
    except AttributeError:
      # Residue without a computed value: leave it out of the table.
      continue
    sasa_data.append({
        'Residue': residue.get_resname(),
        'Residue_ID': residue.get_id()[1],
        'SASA': sasa,
    })

  # Fixing the columns keeps the frame mergeable on Residue/Residue_ID even
  # when no rows were collected.
  return pd.DataFrame(sasa_data, columns=['Residue', 'Residue_ID', 'SASA'])

#get atom distance from a .pdb file

def get_atom_distance(pdb_file_path, chain_id='A'):
  """Return the distance from each residue's CG atom to the next residue's N atom.

  Residues are taken in chain order from the first model. A residue is
  skipped when it has no 'CG' atom or its successor has no 'N' atom. The
  last residue of the chain has no successor and is never reported.

  Args:
    pdb_file_path: Path to the .pdb file.
    chain_id: Identifier of the chain to analyse (defaults to 'A').

  Returns:
    A pandas DataFrame with the columns 'Residue_before', 'Residue',
    'Residue_ID', 'Residue_after' (three-letter names and the residue
    sequence number) and 'CG_N_Distance'. 'Residue_before' is an empty
    string for the first residue of the chain, which has no predecessor.
    The columns are present even when no rows were collected.

  Raises:
    KeyError: If the first model has no chain called `chain_id`.
  """
  # Only `import Bio.PDB` is in scope at file level, so the parser has to be
  # reached through the package; a bare `PDBParser` raises NameError.
  parser = Bio.PDB.PDBParser(QUIET=True)
  structure = parser.get_structure("structure", pdb_file_path)
  chain = structure[0][chain_id]
  residues = chain.get_list()

  distances = []
  for i, (current, following) in enumerate(zip(residues, residues[1:])):
    try:
      # Subtracting two Bio.PDB atoms gives the distance between them.
      distance = current['CG'] - following['N']
    except KeyError:
      continue

    # Index -1 would wrap around to the LAST residue of the chain, so the
    # first residue must not look up residues[i - 1]. An empty string keeps
    # the column a plain string for later concatenation.
    previous_name = residues[i - 1].get_resname() if i > 0 else ''

    distances.append({
        'Residue_before': previous_name,
        'Residue': current.get_resname(),
        'Residue_ID': current.get_id()[1],
        'Residue_after': following.get_resname(),
        'CG_N_Distance': distance,
    })

  # Fixing the columns keeps the frame mergeable on Residue/Residue_ID even
  # when no rows were collected.
  return pd.DataFrame(
      distances,
      columns=['Residue_before', 'Residue', 'Residue_ID', 'Residue_after',
               'CG_N_Distance'])


def get_all_data(pdb_file_path):
  """Combine dihedrals, SASA and CG-N distances for the ASN and GLN residues.

  The three per-residue tables are built from the same .pdb file and joined
  on the residue name and residue number; residues absent from any table
  drop out of the result.

  Args:
    pdb_file_path: Path to the .pdb file.

  Returns:
    A pandas DataFrame restricted to ASN and GLN rows, with missing values
    replaced by 0, residue names translated through the AA lookup, and an
    extra 'matched_pattern' column that concatenates the previous, current
    and next residue codes.
  """
  join_columns = ['Residue', 'Residue_ID']
  tables = [
      get_phi_psi(pdb_file_path),
      get_SASA(pdb_file_path),
      get_atom_distance(pdb_file_path),
  ]

  # Fold the tables together left to right with the default (inner) merge.
  combined = tables[0]
  for table in tables[1:]:
    combined = pd.merge(combined, table, on=join_columns)

  # Keep only asparagine and glutamine rows and renumber them from zero.
  is_asn_or_gln = combined['Residue'].isin(['ASN', 'GLN'])
  selected = combined[is_asn_or_gln].reset_index(drop=True)

  # Missing values become 0 before the names are translated via AA.
  recoded = selected.fillna(0).replace(AA)

  pattern = (recoded['Residue_before']
             + recoded['Residue']
             + recoded['Residue_after'])
  return recoded.assign(matched_pattern=pattern)


Collecting biopython
  Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.84


In [None]:
# Path to the input .pdb file (a relative path is resolved against the
# current working directory).
pdb_file='1beb.pdb'
# Build the combined table of phi/psi angles, SASA and CG-N distances for
# the ASN and GLN residues of the file.
df=get_all_data (pdb_file)
# To save the table to disk: df.to_csv(output_path, index=False)
# Remember to go to the IUPRED3 website to collect measurements.

Unnamed: 0,Residue,Residue_ID,Phi,Psi,SASA,Residue_before,Residue_after,CG_N_Distance,matched_pattern
0,Q,5,0.0,111.615688,107.899721,Water,T,4.853964,WaterQT
1,Q,13,-69.30563,-25.123339,42.095029,I,K,4.704711,IQK
2,Q,35,-48.346937,-31.406966,32.860406,A,S,4.586759,AQS
3,Q,59,-113.108423,135.191849,1.698973,L,K,4.664249,LQK
4,N,63,-102.157928,150.717464,43.85735,E,G,3.606013,ENG
5,Q,68,-112.718703,129.82091,86.914134,A,K,3.721913,AQK
6,N,88,64.413044,13.200449,58.328129,L,E,4.783754,LNE
7,N,90,-136.973122,2.588254,7.528198,E,K,3.865857,ENK
8,N,109,-107.605287,111.54745,18.368792,E,S,3.710419,ENS
9,Q,115,-104.637435,13.020071,60.994939,E,S,4.86398,EQS
