# Packages and Libraries

In [12]:
! pip install pubchempy
! pip install biopython



In [13]:
# For web scraping & REST APIs
import re
import requests
from bs4 import BeautifulSoup

# Accessing Bio/Chem databases
from Bio import Entrez
from Bio import Medline
import pubchempy as pcp

# Data processing
import numpy as np
import pandas

# Miscellaneous
from tqdm import tqdm
import time
from datetime import datetime

In [14]:
# Timestamp taken when the notebook starts; the final cell subtracts it
# from the finishing time to report the elapsed run time.
start = datetime.now()

# Helper Functions

## NALDBFetchComp(APBD1)

In [15]:
def NALDBFetchComp(comp_id):
  """
  Scrape the NALDB entry page of a compound and return its metadata.

  Parameters
  ----------
  comp_id : str
    NALDB compound ID (e.g., "APBD10"). It is interpolated into the
    page URL ``http://bsbe.iiti.ac.in/bsbe/naldb/{comp_id}.php``.

  Returns
  -------
  dict
    Keys, in order: ``page_title``, ``target_name``, ``target_sequence``,
    ``ligand_name``, ``binding_detail``, ``pubmed_id``,
    ``canonical_smiles``, ``molecular_formula``, ``molecular_weight``
    and ``img_URL``. Every value read from a ``<td>`` cell has its
    surrounding whitespace stripped.

  Raises
  ------
  requests.HTTPError
    If the server answers with a 4xx/5xx status.
  requests.Timeout
    If the server does not answer within the timeout.
  IndexError
    If the page holds fewer ``<td>`` cells than expected or no ``<img>``.
  """
  page = requests.get(
      f"http://bsbe.iiti.ac.in/bsbe/naldb/{comp_id}.php", timeout=30)

  # Fail here with a clear HTTP error rather than parsing an error page
  # and tripping over a missing <td> cell further down.
  page.raise_for_status()

  soup = BeautifulSoup(page.content, 'html.parser')

  # Extract content from <td> tags
  td_html_tag = soup.select('td')

  # Extract image src from <img> tags
  img_html_tag = soup.select('img')

  # Position of each field among the page's <td> cells.
  # NOTE(review): these positions are tied to the current page layout.
  td_positions = {
    "target_name": 2,
    "target_sequence": 3,
    "ligand_name": 4,
    "binding_detail": 5,
    "pubmed_id": 9,
    "canonical_smiles": 11,
    "molecular_formula": 12,
    "molecular_weight": 13,
  }

  # storing title of the page
  meta_data = {"page_title": soup.title.text}

  # Strip every cell so that stray whitespace from the HTML does not leak
  # into values (notably the SMILES string) passed on to other services.
  for field, position in td_positions.items():
    meta_data[field] = td_html_tag[position].text.strip()

  # The first image on the page is used; its src is relative to the site.
  src = img_html_tag[0].get('src')

  meta_data["img_URL"] = f"http://bsbe.iiti.ac.in/bsbe/naldb/{src}"

  return meta_data

## pubmedSearch(pubmed_id, abstract, PDB_record, authors, title)

In [16]:
def pubmedSearch(pubmed_id,
                 abstract=False, PDB_record=False,
                 authors=False, title=False):
  """
  Fetch the MEDLINE record of a PubMed entry and return one of its fields.

  The first flag that is true, checked in the order ``abstract``,
  ``PDB_record``, ``authors``, ``title``, selects the field.

  Parameters
  ----------
  pubmed_id : str or int
    PubMed ID passed to ``Entrez.efetch``.
  abstract : bool
    Return the 'AB' field of the first record.
  PDB_record : bool
    Return the 'SI' field of the first record (where a PDB ID would
    appear, if the entry has one).
  authors : bool
    Return the 'AU' field of the first record.
  title : bool
    Return the 'TI' field of the first record.

  Returns
  -------
  The requested field, or ``None`` when the field is absent or no record
  was returned. With no flag set, the lazy iterator from ``Medline.parse``
  is returned; it reads from the still-open Entrez handle as it is
  consumed.
  """
  handle = Entrez.efetch(
      db="pubmed",
      id=pubmed_id,
      rettype="medline",
      retmode="text")

  records = Medline.parse(handle)

  if abstract:
    field = 'AB'
  elif PDB_record:
    field = 'SI'
  elif authors:
    field = 'AU'
  elif title:
    field = 'TI'
  else:
    # The caller iterates the records lazily, so the handle must stay open.
    return records

  try:
    # Only the first record is needed; do not materialise the whole stream.
    first_record = next(records, None)
  finally:
    # Release the network handle once the record has been read.
    handle.close()

  if first_record is None:
    return None
  return first_record.get(field)

## PubChemCID(SMILE)

In [17]:
def PubChemCID(SMILE):
  """
  Return the PubChem CID of the compound described by a SMILES string,
  looked up through the pubchempy (pcp) python library.

  Parameters
  ----------
  SMILE : str
    SMILES string of the compound.

  Returns
  -------
  str
    CID of the first compound returned by PubChem, as a string of digits.

  Raises
  ------
  IndexError
    If PubChem returns no compound, or a compound without a CID, for the
    given SMILES. The exception type matches what the lookup raised
    before in this situation.
  """
  compounds = pcp.get_compounds(SMILE, 'smiles')

  # Read the CID from the Compound object itself instead of parsing the
  # digits out of its repr().
  if not compounds or compounds[0].cid is None:
    raise IndexError(f"No PubChem CID found for SMILES {SMILE!r}")

  return str(compounds[0].cid)

## PubChemData(pubchem_cid)

In [None]:
def PubChemData(pubchem_cid):
  """
  Returns PubChem information of given compound using
  PubChem CID through pubchempy (pcp) python library

  Parameters
  ----------
  pubchem_cid : str or int
    PubChem compound ID.

  Returns
  -------
  dict
    Keys: ``xlogp``, ``pubchem_id``, ``molecular_formula``,
    ``molecular_weight``, ``isomeric_smiles``, ``iupac_name``,
    ``synonyms``, ``image_url`` and ``SDF_file``. The last two are
    PubChem PUG REST URLs built from the CID; nothing is downloaded here.
  """
  compound = pcp.Compound.from_cid(pubchem_cid)

  rest_base = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{pubchem_cid}"

  return {
    "xlogp": compound.xlogp,
    # Use the function's own argument; the body previously read a name
    # (`cid_extracted`) that is not defined inside this function.
    "pubchem_id": pubchem_cid,
    "molecular_formula": compound.molecular_formula,
    "molecular_weight": compound.molecular_weight,
    "isomeric_smiles": compound.isomeric_smiles,
    "iupac_name": compound.iupac_name,
    "synonyms": compound.synonyms,
    "image_url": f"{rest_base}/PNG",
    "SDF_file": f"{rest_base}/SDF",
  }

## LigandSearchPDB(SMILE)

In [18]:
def LigandSearchPDB(SMILE):
  """
  Search the RCSB PDB for entries containing a ligand that exactly
  matches the given SMILES string.

  Parameters
  ----------
  SMILE : str
    SMILES descriptor of the ligand.

  Returns
  -------
  list
    The ``identifier`` of every item in the response's ``result_set``;
    an empty list when the service reports no match.

  Raises
  ------
  requests.HTTPError
    If the service answers with a 4xx/5xx status.
  requests.Timeout
    If the service does not answer within the timeout.
  """
  search_request = {
    "query": {
      "type": "terminal",
      "service": "chemical",
      "parameters": {
        "value": SMILE,
        "type": "descriptor",
        "descriptor_type": "SMILES",
        "match_type": "graph-exact"
      }
    },
    "return_type": "entry"
  }

  # NOTE(review): this is the v1 search endpoint; RCSB also publishes a v2
  # Search API -- confirm this URL is still being served.
  search_url = "https://search.rcsb.org/rcsbsearch/v1/query?json="

  response = requests.post(url=search_url, json=search_request, timeout=30)

  # A search without hits is answered with "204 No Content" and an empty
  # body, which cannot be decoded as JSON.
  if response.status_code == 204:
    return []

  response.raise_for_status()

  search_results = response.json()

  return [hit.get('identifier') for hit in search_results['result_set']]

## PubChemSDF( )


In [20]:
"""
from tqdm import tqdm
import requests

response = requests.get(download_url, stream=True)

with open(f"{cid_extracted}.sdf", "wb") as handle:
    for data in tqdm(response.iter_content()):
        handle.write(data)

"""        

'\nfrom tqdm import tqdm\nimport requests\n\nresponse = requests.get(download_url, stream=True)\n\nwith open(f"{cid_extracted}.sdf", "wb") as handle:\n    for data in tqdm(response.iter_content()):\n        handle.write(data)\n\n'

In [21]:
# ! cat /content/238.sdf

# PDB Rest API

In [None]:
# NOTE(review): `naldb_data` is not assigned in any visible cell; presumably
# it is the dict returned by NALDBFetchComp(<compound id>) -- confirm.

# SMILES scraped from NALDB, used for both the PubChem and the PDB lookups.
canonical_smiles = naldb_data['canonical_smiles']

# PubChem: resolve the CID, then pull the compound's metadata.
cid_extracted = PubChemCID(canonical_smiles)
pubchem_data = PubChemData(cid_extracted)
isomeric_smiles = pubchem_data.get('isomeric_smiles')

# RCSB PDB: entries holding a ligand that matches the SMILES exactly.
pdb_ids_list = LigandSearchPDB(canonical_smiles)

# Remaining NALDB fields used in the summary line.
pubmed_id = naldb_data['pubmed_id']
naldb_comp_id = naldb_data['page_title']

In [None]:
# The exact-match ligand search can come back empty; show None in that case
# instead of failing on pdb_ids_list[0].
first_pdb_id = pdb_ids_list[0] if pdb_ids_list else None
print(f"{naldb_comp_id} | {pubmed_id} | {cid_extracted} | {first_pdb_id} | {canonical_smiles}")

In [None]:
# Take the finishing timestamp here; `end` is not assigned anywhere else in
# the visible cells, so the subtraction below would otherwise fail.
end = datetime.now()
print(f"{end-start}")