<a href="https://colab.research.google.com/github/akshayonly/Mini-Projects/blob/main/RNA_Lignads_Data_Extraction_Processing_NALDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Packages and Libraries

In [None]:
! pip install pubchempy
! pip install biopython



In [None]:
# For web scraping & REST APIs
import re
import requests
from bs4 import BeautifulSoup

# Accessing Bio/ Chem databases
from Bio import Entrez
from Bio import Medline
import pubchempy as pcp

# Data processing
import numpy as np
import pandas

# Miscellaneous
from tqdm import tqdm
import time
from datetime import datetime

In [None]:
# Timestamp taken before the helper definitions and the data collection run;
# the last cell of the notebook prints `end-start` as the total run time.
start=datetime.now()

# Helper Functions

In [None]:
def NALDBFetchComp(comp_id):
  """
  Scrape the NALDB entry page of one compound and return its metadata.

  Parameters
  ----------
  comp_id : str
      NALDB compound ID (e.g., APBD10); it is interpolated into the page URL.

  Returns
  -------
  dict
      Keys "page_title", "target_name", "target_sequence", "ligand_name",
      "binding_detail", "pubmed_id", "canonical_smiles",
      "molecular_formula", "molecular_weight" and "img_URL".
      The values read from <td> cells are whitespace-stripped.
      "page_title" is None when the page has no <title>, and "img_URL" is
      None when the page has no <img> tag with a src attribute.

  Raises
  ------
  requests.HTTPError
      If the server answers with a 4xx/5xx status.
  IndexError
      If the page holds fewer <td> cells than the positions read below.
  """
  base_url = "http://bsbe.iiti.ac.in/bsbe/naldb"

  # A timeout keeps one unresponsive request from stalling a long scraping
  # loop; raise_for_status stops an error page from being parsed as data.
  page = requests.get(f"{base_url}/{comp_id}.php", timeout=30)
  page.raise_for_status()

  soup = BeautifulSoup(page.content, 'html.parser')

  # All <td> cells and <img> tags of the page, in document order
  td_html_tag = soup.select('td')
  img_html_tag = soup.select('img')

  # Position of every field among the page's <td> cells
  td_positions = {
    "target_name": 2,
    "target_sequence": 3,
    "ligand_name": 4,
    "binding_detail": 5,
    "pubmed_id": 9,
    "canonical_smiles": 11,
    "molecular_formula": 12,
    "molecular_weight": 13,
  }

  meta_data = dict()

  # storing title of the page
  meta_data["page_title"] = soup.title.text if soup.title else None

  # Every field is stripped, so the SMILES / formula / weight can be handed
  # to other services without stray whitespace from the HTML layout.
  for field, position in td_positions.items():
    meta_data[field] = td_html_tag[position].text.strip()

  # The first image on the page is taken as the structure image
  src = img_html_tag[0].get('src') if img_html_tag else None
  meta_data["img_URL"] = f"{base_url}/{src}" if src else None

  return meta_data

In [None]:
def pubmedSearch(pubmed_id,
                 abstract=False, PDB_record=False,
                 authors=False, title=False):
  """
  Fetch the MEDLINE record of a PubMed entry and return one field of it.

  The first flag that is true decides what is returned, checked in this
  order:

  - abstract   -> 'AB' field (abstract text)
  - PDB_record -> 'SI' field (databank cross-references, e.g. PDB IDs)
  - authors    -> 'AU' field
  - title      -> 'TI' field

  A field that is absent from the record, or a response without any
  record, gives None. With no flag set, an iterator over all parsed
  records is returned.
  """
  handle = Entrez.efetch(
      db="pubmed",
      id=pubmed_id,
      rettype="medline",
      retmode="text")

  try:
    # Medline.parse is lazy; read everything now so the handle can be
    # closed before returning instead of leaking one connection per call.
    records = list(Medline.parse(handle))
  finally:
    handle.close()

  field_by_flag = (
      (abstract, 'AB'),
      (PDB_record, 'SI'),
      (authors, 'AU'),
      (title, 'TI'),
  )

  for requested, field in field_by_flag:
    if requested:
      return records[0].get(field) if records else None

  # Callers of the default mode used to receive a one-shot iterator
  return iter(records)

In [None]:
def PubChemCID(SMILE):
  """
  Returns the PubChem CID (ID) of the first compound that PubChem matches
  to the given SMILES string, looked up through the pubchempy (pcp)
  python library.

  The CID is returned as a string of digits.

  Raises IndexError when PubChem reports no compound with a CID for the
  SMILES string.
  """
  matches = pcp.get_compounds(SMILE, 'smiles')

  # Read the identifier from the Compound object itself rather than parsing
  # digits out of its printed representation.
  cid = matches[0].cid if matches else None

  if cid is None:
    raise IndexError(f"PubChem returned no CID for SMILES {SMILE!r}")

  return str(cid)

def PubChemData(pubchem_cid):
  """
  Returns PubChem information of given compound using
  PubChem CID through pubchempy (pcp) python library

  The result is a dict with the keys "xlogp", "pubchem_id",
  "molecular_formula", "molecular_weight", "isomeric_smiles",
  "iupac_name", "synonyms", "image_url" and "SDF_file". "pubchem_id" is
  the `pubchem_cid` argument itself; the last two values are PubChem
  PUG REST URLs built from that CID (nothing is downloaded here).
  """
  compound = pcp.Compound.from_cid(pubchem_cid)

  rest_prefix = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{pubchem_cid}"

  pubchem_meta_data = {
    "xlogp": compound.xlogp,
    # Report the CID this function was called with, not whatever a
    # notebook-level `cid_extracted` variable happens to hold.
    "pubchem_id": pubchem_cid,
    "molecular_formula": compound.molecular_formula,
    "molecular_weight": compound.molecular_weight,
    "isomeric_smiles": compound.isomeric_smiles,
    "iupac_name": compound.iupac_name,
    "synonyms": compound.synonyms,
    "image_url": f"{rest_prefix}/PNG",
    "SDF_file": f"{rest_prefix}/SDF",
  }

  return pubchem_meta_data

In [None]:
def LigandSearchPDB(SMILE):
  """
  Search the RCSB PDB chemical search service for entries whose ligand
  matches the given SMILES string ("graph-exact" match type).

  Returns a list with the 'identifier' of every hit in the response's
  'result_set'; the list is empty when the service reports no hits.

  Raises requests.HTTPError when the service answers with a 4xx/5xx status.
  """
  search_request = {
    "query": {
      "type": "terminal",
      "service": "chemical",
      "parameters": {
        "value": SMILE,
        "type": "descriptor",
        "descriptor_type": "SMILES",
        "match_type": "graph-exact"
      }
    },
    "return_type": "entry"
  }

  # NOTE(review): this is the v1 search endpoint -- confirm it is still served.
  search_url = "https://search.rcsb.org/rcsbsearch/v1/query?json="

  response = requests.post(url=search_url, json=search_request, timeout=30)
  response.raise_for_status()

  # A search without hits is answered with 204 No Content and an empty
  # body, which cannot be decoded as JSON.
  if response.status_code == 204 or not response.content:
    return []

  search_results = response.json()

  return [hit.get('identifier') for hit in search_results.get('result_set', [])]

# pubchempy


In [None]:
# Disabled scratch code: everything below sits inside a string literal, so
# none of it runs -- the cell only evaluates to that string, which is what the
# cell output echoes. It reads `naldb_dataa`, a name no visible cell defines.
# NOTE(review): looks like an earlier inline version of PubChemCID /
# PubChemData -- confirm, then consider deleting this cell.
"""

cid = pcp.get_compounds(naldb_dataa.get("smiles"), 'smiles')

temp = re.findall(r'\d+', str(cid[0]))

cid_extracted = temp[0]

print(cid_extracted)

download_url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid_extracted}/SDF"
image_url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid_extracted}/PNG"

c = pcp.Compound.from_cid(cid_extracted)

pubchem_meta_data = dict()

pubchem_meta_data["molecular_formula"] = c.molecular_formula
pubchem_meta_data["molecular_weight"] = c.molecular_weight
pubchem_meta_data["isomeric_smiles"] = c.isomeric_smiles
pubchem_meta_data["iupac_name"] = c.iupac_name
pubchem_meta_data["synonyms"] = c.synonyms[0:5]
pubchem_meta_data["image_url"] = image_url

"""

'\n\ncid = pcp.get_compounds(naldb_dataa.get("smiles"), \'smiles\')\n\ntemp = re.findall(r\'\\d+\', str(cid[0]))\n\ncid_extracted = temp[0]\n\nprint(cid_extracted)\n\ndownload_url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid_extracted}/SDF"\nimage_url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid_extracted}/PNG"\n\nc = pcp.Compound.from_cid(cid_extracted)\n\npubchem_meta_data = dict()\n\npubchem_meta_data["molecular_formula"] = c.molecular_formula\npubchem_meta_data["molecular_weight"] = c.molecular_weight\npubchem_meta_data["isomeric_smiles"] = c.isomeric_smiles\npubchem_meta_data["iupac_name"] = c.iupac_name\npubchem_meta_data["synonyms"] = c.synonyms[0:5]\npubchem_meta_data["image_url"] = image_url\n\n'

In [None]:
# Disabled scratch code: the SDF download below is wrapped in a string literal,
# so nothing is fetched or written -- the cell only evaluates to that string.
# It depends on `download_url`, which is assigned only inside the disabled
# cell above.
# NOTE(review): candidate for deletion, or for promotion to a helper function.
"""
from tqdm import tqdm
import requests

response = requests.get(download_url, stream=True)

with open(f"{cid_extracted}.sdf", "wb") as handle:
    for data in tqdm(response.iter_content()):
        handle.write(data)

"""        

'\nfrom tqdm import tqdm\nimport requests\n\nresponse = requests.get(download_url, stream=True)\n\nwith open(f"{cid_extracted}.sdf", "wb") as handle:\n    for data in tqdm(response.iter_content()):\n        handle.write(data)\n\n'

In [None]:
# ! cat /content/238.sdf

# Accessing PDB IDs

In [None]:
# Nucleic acid aptamer binding ligands IDs: APBD1 ... APBD483
NALDB_compounds_ID = [f"APBD{number}" for number in range(1, 484)]

# comp_id -> PDB ID (str), tuple of GenBank accessions, or None when the
# record mentions RNA but carries no databank cross-reference
RNA_present = dict()
# comp_id -> PubMed ID of records that mention RNA in neither abstract nor title
RNA_absent = dict()
counter = 0
Entrez.email = 'akishirsath@gmail.com'

for comp_id in tqdm(NALDB_compounds_ID):
  # Grabs metadata (PubMed ID and smiles etc.)
  # from NALDB website
  naldb_data = NALDBFetchComp(comp_id)
  pubmed_id = naldb_data.get('pubmed_id')

  # One Entrez request per compound: abstract, title and databank
  # cross-references are all read from the same MEDLINE record.
  medline_records = list(pubmedSearch(pubmed_id))

  if medline_records:
    record = medline_records[0]

    # A record may lack an abstract or a title; treat the missing field as
    # empty text so the remaining one is still checked.
    abstract = record.get('AB') or ''
    title = record.get('TI') or ''

    # Checks whether the pubmed abstract or title of
    # given pubmed id has 'RNA' mentioned in it.
    if ('RNA' in abstract) or ('RNA' in title):

      result = record.get('SI')

      if not result:
        RNA_present[comp_id] = None

      elif "PDB" in result[0]:
        RNA_present[comp_id] = result[0].split('/')[1]

      elif "GENBANK" in result[0]:
        # Up to the first two accessions; a record with a single entry
        # yields a one-element tuple.
        RNA_present[comp_id] = tuple(entry.split('/')[1] for entry in result[:2])

    else:
      RNA_absent[comp_id] = pubmed_id

  counter += 1

  # Pause for a second after every 10 compounds to go easy on the remote servers
  if counter >= 10:
    time.sleep(1)
    counter = 0

100%|██████████| 483/483 [14:01<00:00,  1.74s/it]


In [None]:
# Compounds whose PubMed abstract/title mention RNA, mapped to the databank
# accession found in the record (None = no cross-reference was listed).
RNA_present

{'APBD114': '1F1T',
 'APBD115': '1F1T',
 'APBD116': '1F1T',
 'APBD117': '1F1T',
 'APBD142': None,
 'APBD143': None,
 'APBD144': None,
 'APBD145': None,
 'APBD146': None,
 'APBD147': None,
 'APBD148': None,
 'APBD149': None,
 'APBD150': None,
 'APBD151': None,
 'APBD152': None,
 'APBD153': None,
 'APBD154': None,
 'APBD155': None,
 'APBD156': None,
 'APBD169': '1NEM',
 'APBD175': None,
 'APBD176': None,
 'APBD177': None,
 'APBD178': None,
 'APBD179': None,
 'APBD180': None,
 'APBD181': None,
 'APBD182': None,
 'APBD183': None,
 'APBD184': None,
 'APBD185': None,
 'APBD186': None,
 'APBD187': None,
 'APBD188': None,
 'APBD189': None,
 'APBD190': None,
 'APBD191': None,
 'APBD192': None,
 'APBD193': None,
 'APBD194': '2G9C',
 'APBD195': '2G9C',
 'APBD196': '2G9C',
 'APBD197': '2G9C',
 'APBD198': '2B57',
 'APBD199': '2B57',
 'APBD200': '2B57',
 'APBD201': '2B57',
 'APBD202': '2B57',
 'APBD203': '2B57',
 'APBD204': '3DS7',
 'APBD205': '3DS7',
 'APBD206': '3DS7',
 'APBD207': '3DS7',
 'APBD20

# PDB REST API

In [None]:
# NOTE(review): this cell used to read a `naldb_data` that no visible cell
# defines, so it failed on a fresh kernel. The last compound of the scan list
# is assumed here -- pick the compound you actually want to inspect.
naldb_data = NALDBFetchComp(NALDB_compounds_ID[-1])

# Fields taken from the NALDB record
naldb_comp_id = naldb_data['page_title']
pubmed_id = naldb_data['pubmed_id']
canonical_smiles = naldb_data['canonical_smiles']

# PubChem: SMILES -> CID -> compound properties
cid_extracted = PubChemCID(canonical_smiles)
pubchem_data = PubChemData(cid_extracted)
isomeric_smiles = pubchem_data.get('isomeric_smiles')

# RCSB PDB: entries whose ligand matches the SMILES string
pdb_ids_list = LigandSearchPDB(canonical_smiles)

In [None]:
# The PDB search may yield no entries; show None instead of failing on [0].
first_pdb_id = pdb_ids_list[0] if pdb_ids_list else None
print(f"{naldb_comp_id} | {pubmed_id} | {cid_extracted} | {first_pdb_id} | {canonical_smiles}")

In [None]:
# Same mapping as displayed right after the scan (compound ID -> accession or None).
RNA_present

In [None]:
# Compounds filed by the scan's else branch, i.e. 'RNA' was found in neither
# the abstract nor the title of the linked PubMed record.
RNA_absent

In [None]:
# Timestamp at the end of the notebook; paired with `start` from the top.
end=datetime.now()

In [None]:
# Elapsed time between the `start` and `end` timestamps (a timedelta).
print(f"{end-start}")