<a href="https://colab.research.google.com/github/akshayonly/RNA-Ligand-Database/blob/main/RNA_Lignads_Data_Extraction_Processing_NALDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Packages and Libraries

In [1]:
# Install the third-party packages that the import cell below brings in as
# `pcp` (pubchempy) and `Bio` (biopython). Versions are not pinned here.
! pip install pubchempy
! pip install biopython



In [2]:
# For web scraping & REST API
import re
import requests
from bs4 import BeautifulSoup

# Accessing Bio/ Chem databases
from Bio import Entrez
from Bio import Medline
import pubchempy as pcp

# Data processing
import numpy as np
import pandas as pd

# Miscellaneous
from tqdm import tqdm
import time
from datetime import datetime

In [3]:
# Wall-clock timestamp taken when this cell runs; `start` is not read again
# in the cells visible here (presumably used later to time the whole run).
start=datetime.now()

# Helper Functions

## NALDBFetchComp(APBD1)

In [4]:
def NALDBFetchComp(comp_id, timeout=30):
  """
  Scrape the NALDB html page of one compound and return its fields.

  Parameters
  ----------
  comp_id : str
      NALDB compound ID (e.g., 'APBD10'); used to build the page URL.
  timeout : float, optional
      Seconds to wait for the server before giving up (default 30).
      Without a timeout `requests.get` can block forever.

  Returns
  -------
  tuple
      17 scraped text values, in this order:
      (naldb_id, pubmed_ID, ligand_name, target_name, canonical_SMILES,
       molecular_formula, molecular_weight, binding_detail,
       minimized_energy, energy, net_charge, a_log_p,
       num_aromatic_rings, num_h_acceptors, num_h_donors,
       num_rings, img_URL)
      Only pubmed_ID, ligand_name, target_name and binding_detail are
      whitespace-stripped; the other cells are returned as found.

  Raises
  ------
  requests.HTTPError
      If the server answers with an error status (e.g. unknown ID).
  IndexError
      If the page holds fewer <td> / <img> tags than the positions read here.
  """
  base_url = "http://bsbe.iiti.ac.in/bsbe/naldb"

  page = requests.get(f"{base_url}/{comp_id}.php", timeout=timeout)
  # Fail loudly on 4xx/5xx instead of parsing an error page as if it were data
  page.raise_for_status()

  soup = BeautifulSoup(page.content, 'html.parser')

  # Every value lives in a <td> cell and is addressed by its position on the page
  td_html_tag = soup.select('td')

  # The first <img> on the page is used as the structure image
  img_html_tag = soup.select('img')

  def cell(position, strip=False):
    """Text of the <td> at `position`, optionally whitespace-stripped."""
    text = td_html_tag[position].text
    return text.strip() if strip else text

  # The page title carries the NALDB ID
  naldb_id = soup.title.text

  pubmed_ID = cell(9, strip=True)
  ligand_name = cell(4, strip=True)
  target_name = cell(2, strip=True)
  canonical_SMILES = cell(11)
  molecular_formula = cell(12)
  molecular_weight = cell(13)
  binding_detail = cell(5, strip=True)
  minimized_energy = cell(22)
  energy = cell(21)
  net_charge = cell(14)
  a_log_p = cell(15)
  num_aromatic_rings = cell(16)
  num_h_acceptors = cell(17)
  num_h_donors = cell(18)
  num_rings = cell(19)

  # The image src is relative to the NALDB base URL
  src = img_html_tag[0].get('src')
  img_URL = f"{base_url}/{src}"

  return (naldb_id, pubmed_ID, ligand_name, target_name, canonical_SMILES,
          molecular_formula, molecular_weight, binding_detail,
          minimized_energy, energy, net_charge, a_log_p,
          num_aromatic_rings, num_h_acceptors, num_h_donors,
          num_rings, img_URL)

## pubmedSearch(pubmed_id, abstract, PDB_record, authors, title)

In [5]:
def pubmedSearch(pubmed_id, 
                 abstract=False, PDB_record=False, 
                 authors=False, title=False):
  """
  Fetch the MEDLINE record of a PubMed entry and return one field of it.

  The flags are checked in the order below and the first truthy one wins:
    abstract   -> 'AB' field
    PDB_record -> 'SI' field (where the PDB ID is expected, if present)
    authors    -> 'AU' field
    title      -> 'TI' field

  Returns
  -------
  The requested field of the first record, or None when the field is
  absent or no record came back at all.
  With no flag set, the lazy record iterator from `Medline.parse` is
  returned instead; the network handle is then left open because the
  iterator still reads from it.

  NOTE(review): `Entrez.email` is not set in the cells visible here;
  NCBI asks callers to identify themselves - confirm it is set elsewhere.
  """
  handle = Entrez.efetch(
      db="pubmed", 
      id=pubmed_id, 
      rettype="medline", 
      retmode="text")
    
  records = Medline.parse(handle)

  if abstract:
    field = 'AB'
  elif PDB_record:
    field = 'SI'
  elif authors:
    field = 'AU'
  elif title:
    field = 'TI'
  else:
    return records

  # Only the first record is needed; close the handle once it has been read
  try:
    first_record = next(iter(records), None)
  finally:
    handle.close()

  if first_record is None:
    return None
  return first_record.get(field)

## PubChemCID(SMILE)

In [6]:
def PubChemCID(SMILE):
  """
  Look up the PubChem CID (ID) of a compound from its SMILES string
  through the pubchempy (pcp) python library.

  Returns the CID of the first match as a string of digits, or None when
  PubChem returns no compound or the first compound carries no CID.
  """
  matches = pcp.get_compounds(SMILE, 'smiles')

  # Read the CID attribute directly instead of regex-parsing repr(Compound)
  if not matches or matches[0].cid is None:
    return None

  return str(matches[0].cid)

## PubChemData(pubchem_cid)

In [7]:
def PubChemData(pubchem_cid):
  """
  Collect PubChem information about one compound, looked up by its
  PubChem CID through the pubchempy (pcp) python library.

  Returns a dict with the keys: xlogp, pubchem_id, molecular_formula,
  molecular_weight, isomeric_smiles, iupac_name, synonyms, image_url and
  SDF_file. `pubchem_id` echoes the `pubchem_cid` argument; the two URLs
  point at the PUG REST PNG and SDF resources for that CID.
  """
  compound = pcp.Compound.from_cid(pubchem_cid)

  pug_rest_base = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{pubchem_cid}"

  return {
      "xlogp": compound.xlogp,
      # Previously read the undefined name `cid_extracted` (NameError, or a
      # stale notebook global); the CID is the function's own argument.
      "pubchem_id": pubchem_cid,
      "molecular_formula": compound.molecular_formula,
      "molecular_weight": compound.molecular_weight,
      "isomeric_smiles": compound.isomeric_smiles,
      "iupac_name": compound.iupac_name,
      "synonyms": compound.synonyms,
      "image_url": f"{pug_rest_base}/PNG",
      "SDF_file": f"{pug_rest_base}/SDF",
  }

## LigandSearchPDB(SMILE)

In [8]:
def LigandSearchPDB(SMILE,
                    search_url="https://search.rcsb.org/rcsbsearch/v2/query",
                    timeout=30):
  """
  Search RCSB PDB for entries containing a ligand that exactly matches
  the given SMILES string (chemical search, 'graph-exact' match).

  Parameters
  ----------
  SMILE : str
      SMILES descriptor of the ligand.
  search_url : str, optional
      RCSB Search API endpoint the query is POSTed to.
  timeout : float, optional
      Seconds to wait for the server before giving up (default 30).

  Returns
  -------
  list
      The 'identifier' of every hit in the response's 'result_set';
      an empty list when the search has no hits.

  Raises
  ------
  requests.HTTPError
      If the server answers with an error status.
  """
  search_request = {
    "query": {
      "type": "terminal",
      "service": "chemical",
      "parameters": {
        "value": SMILE,
        "type": "descriptor",
        "descriptor_type": "SMILES",
        "match_type": "graph-exact"
      }
    },
    # Without this the API only returns the first page of hits
    "request_options": {
      "return_all_hits": True
    },
    "return_type": "entry"
  }

  response = requests.post(url=search_url, json=search_request, timeout=timeout)
  response.raise_for_status()

  # The search API answers "204 No Content" (empty body) when nothing
  # matches; calling .json() on that would raise instead of returning [].
  if response.status_code == 204:
    return []

  search_results = dict(response.json())

  return [hit.get('identifier') for hit in search_results.get('result_set', [])]

## PubChemSDF( )


In [9]:
"""
from tqdm import tqdm
import requests

response = requests.get(download_url, stream=True)

with open(f"{cid_extracted}.sdf", "wb") as handle:
    for data in tqdm(response.iter_content()):
        handle.write(data)

"""        

'\nfrom tqdm import tqdm\nimport requests\n\nresponse = requests.get(download_url, stream=True)\n\nwith open(f"{cid_extracted}.sdf", "wb") as handle:\n    for data in tqdm(response.iter_content()):\n        handle.write(data)\n\n'

In [10]:
# ! cat /content/238.sdf

# MAIN

## NALDB - Nucleic Acid Aptamer Binding Ligands Data

In [11]:
# Ligand IDs of every entry in NALDB's
# "Nucleic Acid Aptamer Binding Ligands Data" section: APBD1 ... APBD483
naldb_ligand_IDs = ["APBD" + str(no) for no in range(1, 484)]

# Column names for the pandas dataframe and the excel sheet; the order
# mirrors the tuple returned for each scraped ligand
column_names = (
    'naldb_ID',
    'pubmed_ID',
    'ligand_name',
    'target_name',
    'canonical_SMILES',
    'molecular_formula',
    'molecular_weight',
    'binding_detail',
    'minimized_energy',
    'energy',
    'net_charge',
    'a_log_p',
    'num_aromatic_rings',
    'num_h_acceptors',
    'num_h_donors',
    'num_rings',
    'img_URL',
)

In [12]:
# Iterating over all the entries of the 
# NALDB's Nucleic Acid Aptamer Binding Ligands Data section
# and storing the values of each ligand entry.
# The loop variable is deliberately not called `id`: at notebook level that
# would rebind the builtin id() for every later cell.

naldb_data = list()
for ligand_id in tqdm(naldb_ligand_IDs):
  naldb_data.append(NALDBFetchComp(ligand_id))

100%|██████████| 483/483 [04:35<00:00,  1.75it/s]


In [13]:
# One row per scraped 'Nucleic acid aptamer binding ligands' entry,
# one column per name in column_names
naldb_dataframe = pd.DataFrame(data=naldb_data, columns=column_names)

In [None]:
# Converting 'text' data type obtained from 
# web scraping (html file) to 'numeric'

float_columns = ['molecular_weight', 'minimized_energy', 'energy',
                 'net_charge', 'a_log_p']

int_columns = ['num_aromatic_rings', 'num_h_acceptors', 'num_h_donors',
               'num_rings']

numeric_dtypes = {**dict.fromkeys(float_columns, float),
                  **dict.fromkeys(int_columns, int)}

# Blank / whitespace-only cells must become NaN *before* the casts:
# float('') raises, so replacing afterwards could never rescue a blank cell.
# A blank in an integer column still raises (errors='raise'), because NaN
# cannot be stored in a plain int column.
naldb_dataframe = (
    naldb_dataframe
    .replace(r'^\s*$', np.nan, regex=True)
    .astype(numeric_dtypes, errors='raise')
)

In [14]:
# Preview the first ten scraped entries as a sanity check
naldb_dataframe.head(10)

Unnamed: 0,naldb_ID,pubmed_ID,ligand_name,target_name,canonical_SMILES,molecular_formula,molecular_weight,binding_detail,minimized_energy,energy,net_charge,a_log_p,num_aromatic_rings,num_h_acceptors,num_h_donors,num_rings,img_URL
0,APBD1,24168267,ATP,SSA-1,NC1=NC=NC2=C1N=CN2C1OC(COP(O)(=O)OP(O)(=O)OP(O...,C10 H16 N5 O13 P3,507.181,EC50= 29.9 ± 5.8µM,39.82,156.8,0.0,-2.526,2,17,7,3,http://bsbe.iiti.ac.in/bsbe/naldb/img/APBD1.png
1,APBD2,23971905,ABA,Aptamer 2,C\C(\C=C\[C@@]1(O)C(C)=CC(=O)CC1(C)C)=C\C(O)=O,C15 H20 O4,264.317,kd= 0.98 – 0.14 µM,2.49,6.59,0.0,2.161,0,4,2,1,http://bsbe.iiti.ac.in/bsbe/naldb/img/APBD2.png
2,APBD3,23971905,ABA,aptamer 9,C\C(\C=C\[C@@]1(O)C(C)=CC(=O)CC1(C)C)=C\C(O)=O,C15 H20 O4,264.317,kd= 0.80 – 0.07 µM,2.49,6.59,0.0,2.161,0,4,2,1,http://bsbe.iiti.ac.in/bsbe/naldb/img/APBD3.png
3,APBD4,23830440,opiµM alkaloid codeine (3-methylmorphine),HL7-14,[H][C@@]12OC3=C(OC)C=CC4=C3C11CCN(C)[C@H](C4)[...,C18 H21 N O3,299.364,Kd= 0.91 ± 0.19 M,37.35,77.2,0.0,1.637,1,4,1,5,http://bsbe.iiti.ac.in/bsbe/naldb/img/APBD4.png
4,APBD5,23734784,CPT1,CMA-70,[H]NC(=O)COCC(=O)NCCOCCOCCOCCOCCOCCOCCOCCOCCOC...,C28 H57 N3 O14,659.764,0.039( µM),10.89,35.63,0.0,-4.305,0,15,3,0,http://bsbe.iiti.ac.in/bsbe/naldb/img/APBD5.png
5,APBD6,23734784,CPT1,CDA-36,[H]NC(=O)COCC(=O)NCCOCCOCCOCCOCCOCCOCCOCCOCCOC...,C28 H57 N3 O14,659.764,1.1 ( µM),10.89,35.63,0.0,-4.305,0,15,3,0,http://bsbe.iiti.ac.in/bsbe/naldb/img/APBD6.png
6,APBD7,23734784,CPT1,CMA-59,[H]NC(=O)COCC(=O)NCCOCCOCCOCCOCCOCCOCCOCCOCCOC...,C28 H57 N3 O14,659.764,0.086( µM),10.89,35.63,0.0,-4.305,0,15,3,0,http://bsbe.iiti.ac.in/bsbe/naldb/img/APBD7.png
7,APBD8,23734784,CPT1,CMA-53,[H]NC(=O)COCC(=O)NCCOCCOCCOCCOCCOCCOCCOCCOCCOC...,C28 H57 N3 O14,659.764,0.79( µM),10.89,35.63,0.0,-4.305,0,15,3,0,http://bsbe.iiti.ac.in/bsbe/naldb/img/APBD8.png
8,APBD9,22702719,ATP,Flanked by two dsDNAfragments that end withand...,NC1=NC=NC2=C1N=CN2C1OC(COP(O)(=O)OP(O)(=O)OP(O...,C10 H16 N5 O13 P3,507.181,By laser tweezers instrµMent 2.0 ± 0.2 µM,39.82,156.8,0.0,-2.526,2,17,7,3,http://bsbe.iiti.ac.in/bsbe/naldb/img/APBD9.png
9,APBD10,19361229,Ethanolamine,ethanolamine DNA aptamer EA#14.3,NCCO,C2 H7 N O,61.0831,aptamer eluted = 84 %,-0.01,0.72,0.0,-1.187,0,2,2,0,http://bsbe.iiti.ac.in/bsbe/naldb/img/APBD10.png


In [17]:
# Export the table to nuc_acid_aptamer_binding_lig_data_RNA_DNA.xlsx
# (written to the current working directory, single sheet 'NALDB_RNA_DNA')
naldb_dataframe.to_excel(
    excel_writer='nuc_acid_aptamer_binding_lig_data_RNA_DNA.xlsx',
    sheet_name='NALDB_RNA_DNA',
)

In [18]:
# Copying - nuc_acid_aptamer_binding_lig_data_RNA_DNA.xlsx file to Google Drive
# NOTE(review): assumes Google Drive is already mounted at /content/drive and
# that the target folder exists - no mount call is visible in this notebook; confirm.
! cp -r /content/nuc_acid_aptamer_binding_lig_data_RNA_DNA.xlsx /content/drive/MyDrive/Colab\ Notebooks/RNA-LIGAND-DATABASE/

## Selecting entries with RNA ligand only

In [24]:
# Number of unique (non-redundant) pubmed references.
# Series.nunique() ignores missing values; np.unique has to sort the column
# and fails when it mixes text IDs with NaN.
unique_pubmed_ids = naldb_dataframe['pubmed_ID'].nunique()

# Total number of entries (rows), whether or not they share a reference
total_pubmed_ids = len(naldb_dataframe['pubmed_ID'])

print(f"{unique_pubmed_ids} pubmed references are repeated or used for all the {total_pubmed_ids} entries")

54 pubmed references are repeated or used for all the 483 entries
