<a href="https://colab.research.google.com/github/akshayonly/Mini-Projects/blob/main/RNA_Lignads_Data_Extraction_Processing_NALDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Packages and Libraries

In [24]:
! pip install pubchempy
! pip install biopython



In [25]:
# For web scraping & REST API
import re
import requests
from bs4 import BeautifulSoup

# Accessing Bio/ Chem databases
from Bio import Entrez
from Bio import Medline
import pubchempy as pcp

# Data processing
import numpy as np
import pandas

# Miscellaneous
from tqdm import tqdm
import time
from datetime import datetime

In [26]:
# Timestamp taken when this cell runs; presumably read later to report the
# notebook's total run time — TODO confirm against the cells that use `start`.
start=datetime.now()

# Helper Functions

## NALDBFetchComp(APBD1)

In [27]:
def NALDBFetchComp(comp_id, timeout=30):
  """
  Scrape the NALDB page of one compound and return its metadata.

  The page http://bsbe.iiti.ac.in/bsbe/naldb/{comp_id}.php is downloaded and
  parsed. Values are read from fixed positions in the list of <td> cells, so
  a change in the page layout would shift the fields without any error.

  Args:
    comp_id: NALDB compound ID, e.g. "APBD10".
    timeout: Seconds to wait for the server before giving up (default 30).

  Returns:
    dict of strings with the keys NALDB_ID, target_name, target_sequence,
    ligand_name, binding_detail, pubmed_id, canonical_smiles,
    molecular_formula, molecular_weight and img_URL (in that order).

  Raises:
    requests.exceptions.RequestException: on a network failure, a timeout,
      or an HTTP error status.
    IndexError: if the page has fewer <td> cells than expected or no <img>.
  """
  base_url = "http://bsbe.iiti.ac.in/bsbe/naldb"

  # (key, <td> position) for values stored with surrounding whitespace removed
  stripped_fields = (
      ("target_name", 2),
      ("target_sequence", 3),
      ("ligand_name", 4),
      ("binding_detail", 5),
      ("pubmed_id", 9),
  )
  # (key, <td> position) for values stored exactly as they appear in the page
  verbatim_fields = (
      ("canonical_smiles", 11),
      ("molecular_formula", 12),
      ("molecular_weight", 13),
  )

  # Without a timeout a stalled server would block the notebook forever
  page = requests.get(f"{base_url}/{comp_id}.php", timeout=timeout)
  # Fail here on 4xx/5xx instead of parsing an error page as if it were data
  page.raise_for_status()

  soup = BeautifulSoup(page.content, 'html.parser')
  cells = soup.select('td')
  images = soup.select('img')

  # The page title carries the NALDB ID
  meta_data = {"NALDB_ID": soup.title.text}

  for key, position in stripped_fields:
    meta_data[key] = cells[position].text.strip()

  for key, position in verbatim_fields:
    meta_data[key] = cells[position].text

  # The first <img> of the page is used; its src is relative to the NALDB base URL
  src = images[0].get('src')
  meta_data["img_URL"] = f"{base_url}/{src}"

  return meta_data

## pubmedSearch(pubmed_id, abstract, PDB_record, authors, title)

In [28]:
def pubmedSearch(pubmed_id, 
                 abstract=False, PDB_record=False, 
                 authors=False, title=False):
  """
  Fetch the MEDLINE record(s) of a PubMed entry and return one field.

  The flags are checked in the order abstract, PDB_record, authors, title;
  the first one that is true selects the MEDLINE tag that is returned from
  the first fetched record:

    abstract   -> 'AB'
    PDB_record -> 'SI'
    authors    -> 'AU'
    title      -> 'TI'

  Args:
    pubmed_id: PubMed ID passed to Entrez.efetch as `id`.
    abstract, PDB_record, authors, title: field selectors, see above.

  Returns:
    The value stored under the selected tag, or None when the record has no
    such tag or no record came back. When no flag is set, an iterator over
    all parsed records.
  """
  # (flag, MEDLINE tag) pairs in the order in which the flags take precedence
  field_by_flag = (
      (abstract, 'AB'),
      (PDB_record, 'SI'),
      (authors, 'AU'),
      (title, 'TI'),
  )

  handle = Entrez.efetch(
      db="pubmed", 
      id=pubmed_id, 
      rettype="medline", 
      retmode="text")

  try:
    # Medline.parse hands back an iterator over the handle, so read every
    # record before the handle is closed.
    records = list(Medline.parse(handle))
  finally:
    # Always release the connection, including when parsing fails
    handle.close()

  for requested, tag in field_by_flag:
    if requested:
      return records[0].get(tag) if records else None

  return iter(records)

## PubChemCID(SMILE)

In [29]:
def PubChemCID(SMILE):
  """
  Look up the PubChem CID of a compound from its SMILES string, using the
  pubchempy (pcp) python library.

  Args:
    SMILE: SMILES string of the compound.

  Returns:
    The CID of the first compound returned by PubChem, as a string of
    digits (e.g. "2244").

  Raises:
    IndexError: if PubChem returns no compound, or a compound without a CID.
  """
  matches = pcp.get_compounds(SMILE, 'smiles')

  # Read the CID from the Compound object itself rather than parsing digits
  # out of its repr(), which depends on how pubchempy formats the repr.
  first_cid = matches[0].cid if matches else None

  if first_cid is None:
    # Same exception type the regex-based lookup produced on a miss
    raise IndexError(f"No PubChem CID found for SMILES {SMILE!r}")

  return str(first_cid)

## PubChemData(pubchem_cid)

In [30]:
def PubChemData(pubchem_cid):
  """
  Collect PubChem information about a compound, given its PubChem CID,
  through the pubchempy (pcp) python library.

  Args:
    pubchem_cid: PubChem CID of the compound.

  Returns:
    dict with the keys xlogp, pubchem_id, molecular_formula,
    molecular_weight, isomeric_smiles, iupac_name, synonyms, image_url and
    SDF_file. "pubchem_id" is the `pubchem_cid` argument as given; the last
    two are PUG REST URLs built from the CID (nothing is downloaded here).
  """
  compound = pcp.Compound.from_cid(pubchem_cid)

  rest_base = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{pubchem_cid}"

  return {
      "xlogp": compound.xlogp,
      # Use the argument: this function never defines `cid_extracted`, so the
      # old lookup only worked if a global of that name happened to exist.
      "pubchem_id": pubchem_cid,
      "molecular_formula": compound.molecular_formula,
      "molecular_weight": compound.molecular_weight,
      "isomeric_smiles": compound.isomeric_smiles,
      "iupac_name": compound.iupac_name,
      "synonyms": compound.synonyms,
      "image_url": f"{rest_base}/PNG",
      "SDF_file": f"{rest_base}/SDF",
  }

## LigandSearchPDB(SMILE)

In [31]:
def LigandSearchPDB(SMILE, timeout=30):
  """
  Search the RCSB PDB for entries containing a ligand that matches a SMILES.

  A "chemical" descriptor query with match type "graph-exact" is POSTed to
  the RCSB search service and the identifiers of the returned entries are
  collected.

  Args:
    SMILE: SMILES string of the ligand.
    timeout: Seconds to wait for the server before giving up (default 30).

  Returns:
    list with the 'identifier' of every item in the response's 'result_set';
    an empty list when the service reports no hits.

  Raises:
    requests.exceptions.RequestException: on a network failure, a timeout,
      or an HTTP error status.
  """
  search_request = {
    "query": {
      "type": "terminal",
      "service": "chemical",
      "parameters": {
        "value": SMILE,
        "type": "descriptor",
        "descriptor_type": "SMILES",
        "match_type": "graph-exact"
      }
    },
    "return_type": "entry"
  }

  # NOTE(review): this is the v1 endpoint — confirm it is still served.
  search_url = "https://search.rcsb.org/rcsbsearch/v1/query?json="

  response = requests.post(url=search_url, json=search_request, timeout=timeout)

  # "No hits" comes back as HTTP 204 with an empty body, which .json()
  # cannot decode; report it as an empty result instead of crashing.
  if response.status_code == 204 or not response.content:
    return []

  # Surface 4xx/5xx replies as HTTP errors rather than as a missing key
  response.raise_for_status()

  search_results = response.json()

  return [hit.get('identifier') for hit in search_results['result_set']]

## PubChemSDF( )


In [32]:
# Draft of an SDF downloader kept as an inert string literal: nothing inside
# the quotes runs, the cell only displays the text. The draft refers to
# `download_url` and `cid_extracted`, neither of which this cell defines.
"""
from tqdm import tqdm
import requests

response = requests.get(download_url, stream=True)

with open(f"{cid_extracted}.sdf", "wb") as handle:
    for data in tqdm(response.iter_content()):
        handle.write(data)

"""        

'\nfrom tqdm import tqdm\nimport requests\n\nresponse = requests.get(download_url, stream=True)\n\nwith open(f"{cid_extracted}.sdf", "wb") as handle:\n    for data in tqdm(response.iter_content()):\n        handle.write(data)\n\n'

In [33]:
# ! cat /content/238.sdf

# MAIN

In [34]:
# Build and display the IDs "APBD1" ... "APBD483" (np.arange excludes its
# stop value, 484). The list is not bound to a name in this cell.
[f"APBD{no}" for no in np.arange(1, 484)]

['APBD1',
 'APBD2',
 'APBD3',
 'APBD4',
 'APBD5',
 'APBD6',
 'APBD7',
 'APBD8',
 'APBD9',
 'APBD10',
 'APBD11',
 'APBD12',
 'APBD13',
 'APBD14',
 'APBD15',
 'APBD16',
 'APBD17',
 'APBD18',
 'APBD19',
 'APBD20',
 'APBD21',
 'APBD22',
 'APBD23',
 'APBD24',
 'APBD25',
 'APBD26',
 'APBD27',
 'APBD28',
 'APBD29',
 'APBD30',
 'APBD31',
 'APBD32',
 'APBD33',
 'APBD34',
 'APBD35',
 'APBD36',
 'APBD37',
 'APBD38',
 'APBD39',
 'APBD40',
 'APBD41',
 'APBD42',
 'APBD43',
 'APBD44',
 'APBD45',
 'APBD46',
 'APBD47',
 'APBD48',
 'APBD49',
 'APBD50',
 'APBD51',
 'APBD52',
 'APBD53',
 'APBD54',
 'APBD55',
 'APBD56',
 'APBD57',
 'APBD58',
 'APBD59',
 'APBD60',
 'APBD61',
 'APBD62',
 'APBD63',
 'APBD64',
 'APBD65',
 'APBD66',
 'APBD67',
 'APBD68',
 'APBD69',
 'APBD70',
 'APBD71',
 'APBD72',
 'APBD73',
 'APBD74',
 'APBD75',
 'APBD76',
 'APBD77',
 'APBD78',
 'APBD79',
 'APBD80',
 'APBD81',
 'APBD82',
 'APBD83',
 'APBD84',
 'APBD85',
 'APBD86',
 'APBD87',
 'APBD88',
 'APBD89',
 'APBD90',
 'APBD91',
 'APBD92

In [35]:
# Try the scraper on a single entry and display the returned dict
NALDBFetchComp("APBD54")

{'NALDB_ID': 'APBD54 ',
 'binding_detail': '',
 'canonical_smiles': 'C[N+]1=CC=C(C=C1)C1=C2\\C=CC(=N2)\\C(=C2/N\\C(\\C=C2)=C(/C2=N/C(/C=C2)=C(\\C2=CC=C\\1N2)C1=CC=[N+](C)C=C1)C1=CC=[N+](C)C=C1)\\C1=CC=[N+](C)C=C1',
 'img_URL': 'http://bsbe.iiti.ac.in/bsbe/naldb/img/APBD54.png',
 'ligand_name': 'TMPyP4',
 'molecular_formula': 'C44 H38 N8',
 'molecular_weight': '678.826',
 'pubmed_id': '20166743',
 'target_name': 'AS1411 aptamer',
 'target_sequence': ''}

In [36]:
# One-off scrape of the APBD54 page that records every field read from it:
# the fields NALDBFetchComp returns plus the cells from net charge through
# minimized energy.
# NOTE(review): the PubMed key is spelled "pubmed_ID" here but "pubmed_id"
# in NALDBFetchComp.
page = requests.get("http://bsbe.iiti.ac.in/bsbe/naldb/APBD54.php")
soup = BeautifulSoup(page.content, 'html.parser')

# Every <td> cell and every <img> tag of the page
td_html_tag = soup.select('td')
img_html_tag = soup.select('img')

meta_data = dict()

# The page title carries the NALDB ID
meta_data["NALDB_ID"] = soup.title.text

# Cells whose text is stored with surrounding whitespace removed
meta_data.update(
    (field, td_html_tag[cell].text.strip())
    for field, cell in (
        ("target_name", 2),
        ("target_sequence", 3),
        ("ligand_name", 4),
        ("binding_detail", 5),
        ("pubmed_ID", 9),
    )
)

# Cells 11 to 22 are stored verbatim, one field per consecutive cell
meta_data.update(
    (field, td_html_tag[cell].text)
    for cell, field in enumerate(
        (
            "canonical_smiles",
            "molecular_formula",
            "molecular_weight",
            "net_charge",
            "a_log_p",
            "num_aromatic_rings",
            "num_h_acceptors",
            "num_h_donors",
            "num_rings",
            "num_rotatable_bonds",
            "energy",
            "minimized_energy",
        ),
        start=11,
    )
)

# src of the first image on the page, made absolute with the NALDB base URL
src = img_html_tag[0].get('src')
meta_data["img_URL"] = f"http://bsbe.iiti.ac.in/bsbe/naldb/{src}"

In [37]:
# Display the dict filled in by the scraping cell above
meta_data

{'NALDB_ID': 'APBD54 ',
 'a_log_p': '9.799',
 'binding_detail': '',
 'canonical_smiles': 'C[N+]1=CC=C(C=C1)C1=C2\\C=CC(=N2)\\C(=C2/N\\C(\\C=C2)=C(/C2=N/C(/C=C2)=C(\\C2=CC=C\\1N2)C1=CC=[N+](C)C=C1)C1=CC=[N+](C)C=C1)\\C1=CC=[N+](C)C=C1',
 'energy': '250.03',
 'img_URL': 'http://bsbe.iiti.ac.in/bsbe/naldb/img/APBD54.png',
 'ligand_name': 'TMPyP4',
 'minimized_energy': '101.64',
 'molecular_formula': 'C44 H38 N8',
 'molecular_weight': '678.826',
 'net_charge': '4',
 'num_aromatic_rings': '8',
 'num_h_acceptors': '2',
 'num_h_donors': '2',
 'num_rings': '9',
 'num_rotatable_bonds': '4',
 'pubmed_ID': '20166743',
 'target_name': 'AS1411 aptamer',
 'target_sequence': ''}

In [38]:
# Number of fields collected in meta_data
len(meta_data)

19

In [39]:
# Field names in the order chosen for output; presumably the column order of
# the table assembled later — TODO confirm where `columns` is consumed.
# NOTE(review): compared with the keys of `meta_data`, 'target_sequence' and
# 'num_rotatable_bonds' are not listed; confirm that this is intended.
columns=[
 "NALDB_ID",
 "pubmed_ID",
 "ligand_name",
 "target_name",
 "canonical_SMILES",  # NOTE(review): meta_data spells this key 'canonical_smiles'
 "molecular_formula",
 "molecular_weight",
 "binding_detail",
 "minimized_energy",
 "energy",
 "net_charge",
 "a_log_p",
 "num_aromatic_rings",
 "num_h_acceptors",
 "num_h_donors",
 "num_rings",
 "img_URL"
 ]

In [40]:
# Number of names in `columns`
len(columns)

17

In [41]:
# Print the column names as a single-line list
print(columns)

['NALDB_ID', 'pubmed_ID', 'ligand_name', 'target_name', 'canonical_SMILES', 'molecular_formula', 'molecular_weight', 'binding_detail', 'minimized_energy', 'energy', 'net_charge', 'a_log_p', 'num_aromatic_rings', 'num_h_acceptors', 'num_h_donors', 'num_rings', 'img_URL']
