In [9]:
import ast
import json
import os

import pymol2
import requests
from langchain.chains import OpenAPIEndpointChain
# from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.requests import Requests
from langchain.tools import OpenAPISpec, APIOperation

# SECURITY: the OpenAI key is read from the environment rather than being
# stored in the notebook. Any key that was previously saved here in plain
# text must be treated as leaked and revoked.
# The value is None when the OPENAI_API_KEY environment variable is unset.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

In [2]:
def search_rcsdb(search_text, timeout=30):
    """Run a full-text search against the RCSB PDB search service.

    Parameters
    ----------
    search_text : str
        Free text sent as the ``value`` of a ``full_text`` terminal query.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 30).
        Without a timeout a stalled connection would block the kernel forever.

    Returns
    -------
    list of str
        The ``identifier`` of every item in the response's ``result_set``.
        An empty list is returned for any non-200 status, so callers can
        always iterate over the result.
    """
    url = 'https://search.rcsb.org/rcsbsearch/v2/query'

    query = {
      "query": {
        "type": "terminal",
        "service": "full_text",
        "parameters": {
          "value": search_text
        }
      },
      "return_type": "entry"
    }
    headers = {"Content-Type": "application/json"}

    # POST the query; the headers dict is now actually sent with the request.
    response = requests.post(url, headers=headers, json=query, timeout=timeout)

    if response.status_code == 200:
        data = response.json()
        return [r['identifier'] for r in data.get('result_set', [])]

    # 204 carries no body; the RCSB search API appears to use it for
    # "no hits" (TODO confirm against the API docs). Anything else is
    # reported, keeping the original best-effort behaviour of not raising.
    if response.status_code != 204:
        print(response)
    return []

In [14]:
def _parse_api_response(response_text):
    """Decode the text of an API response into Python objects."""
    try:
        # The Data API answers with JSON, which may contain true/false/null
        # that ast.literal_eval cannot parse.
        return json.loads(response_text)
    except ValueError:
        # Fall back to the original parser so any text it used to accept
        # (Python-literal syntax) is still handled the same way.
        return ast.literal_eval(response_text)


def ask_openAI(ids, path = '/core/entry/{entry_id}', OPENAI_API_KEY=OPENAI_API_KEY):
    """Query the RCSB Data API for each ID through an LLM-driven endpoint chain.

    The OpenAPI spec is downloaded from data.rcsb.org, the GET operation for
    ``path`` is wrapped in an ``OpenAPIEndpointChain`` backed by ``gpt-4``,
    and the chain is invoked once per ID with the prompt
    ``'entry_id for <id>'``.

    Parameters
    ----------
    ids : iterable of str
        Identifiers to look up. ``None`` or an empty iterable gives ``{}``.
    path : str, optional
        Path key in the OpenAPI spec whose GET operation is called.
    OPENAI_API_KEY : str, optional
        Key handed to ``ChatOpenAI``; defaults to the module-level constant
        as it was when this function was defined.

    Returns
    -------
    dict
        Maps each ID to the parsed ``response_text`` of its chain call.

    Raises
    ------
    ValueError
        If ``path`` is not one of the paths declared in the spec.
    """
    spec = OpenAPISpec.from_url('https://data.rcsb.org/redoc/rcsb-restful-api-docs.json')

    # Fail fast with a clear message; previously this was only printed and
    # execution continued into operation lookup with the bad path.
    if path not in spec.paths.keys():
        raise ValueError(f'Error: unrecognized path {path!r}')

    operation = APIOperation.from_openapi_spec(spec, path, "get")

    # The host is prepended to the operation's base_url before calling
    # (presumably the spec only declares a relative server URL — confirm).
    operation.base_url = 'https://data.rcsb.org'+ operation.base_url

    llm = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model = 'gpt-4', temperature=0.0)

    chain = OpenAPIEndpointChain.from_api_operation(
        operation,
        llm,
        requests=Requests(),
        verbose=False,
        return_intermediate_steps=True # Return request and response text
    )

    output = {}
    for r in ids or []:
        steps = chain(f'entry_id for {r}')['intermediate_steps']
        output[r] = _parse_api_response(steps['response_text'])
    return output


In [7]:
def get_seqs(r):
    """Fetch structure ``r`` with PyMOL and return its FASTA text as tokens.

    The structure is loaded into an object named ``'prot'`` inside a
    temporary PyMOL session. The FASTA string for that object is split on
    whitespace, so the returned list interleaves ``>prot_<chain>`` header
    tokens with the sequence lines that follow each header.
    """
    object_name = 'prot'
    with pymol2.PyMOL() as session:
        cmd = session.cmd
        cmd.fetch(r, object_name)
        fasta_text = cmd.get_fastastr(object_name)
        tokens = fasta_text.split()
    return tokens

In [15]:
text = 'How do I express hemoglobin?'

# Data API path template used for each entry lookup. `path` was referenced
# below without ever being defined in the notebook, so a fresh
# Restart & Run All failed with NameError.
path = '/core/entry/{entry_id}'

# This gets the matching entry IDs from PDB (an empty list if the search
# yields nothing usable, so the steps below can still iterate).
ids = search_rcsdb(text) or []

# This asks OpenAI to use the IDs retrieved and download the entry data from PDB
pdb_data = ask_openAI(ids, path)
# Sequences are fetched separately through PyMOL, one entry at a time
sequences = {k:get_seqs(k) for k in ids}


Attempting to load an OpenAPI 3.0.1 spec.  This may result in degraded performance. Convert your OpenAPI spec to 3.1.* spec for better support.
Attempting to load an OpenAPI 3.0.1 spec.  This may result in degraded performance. Convert your OpenAPI spec to 3.1.* spec for better support.


 ExecutiveLoad-Detail: Detected mmCIF
 ExecutiveLoad-Detail: Detected mmCIF
 ExecutiveLoad-Detail: Detected mmCIF
 ExecutiveLoad-Detail: Detected mmCIF
 ExecutiveLoad-Detail: Detected mmCIF
 ExecutiveLoad-Detail: Detected mmCIF
 ExecutiveLoad-Detail: Detected mmCIF
 ExecutiveLoad-Detail: Detected mmCIF
 ExecutiveLoad-Detail: Detected mmCIF
 ExecutiveLoad-Detail: Detected mmCIF


In [16]:
# Show the entry IDs returned by the full-text search.
ids

['3SZK',
 '1KHP',
 '4HRR',
 '4HRT',
 '1KHQ',
 '6A2B',
 '5X2S',
 '5X2T',
 '1ITH',
 '7SQZ']

In [17]:
# Inspect the parsed Data API response for the first ID in `ids`.
pdb_data[ids[0]]

{'audit_author': [{'name': 'Jacques, D.A.', 'pdbx_ordinal': 1},
  {'name': 'Kumar, K.K.', 'pdbx_ordinal': 2},
  {'name': 'Guss, J.M.', 'pdbx_ordinal': 3},
  {'name': 'Gell, D.A.', 'pdbx_ordinal': 4}],
 'cell': {'angle_alpha': 90.0,
  'angle_beta': 90.0,
  'angle_gamma': 90.0,
  'length_a': 65.88,
  'length_b': 123.206,
  'length_c': 143.933,
  'zpdb': 8},
 'citation': [{'country': 'US',
   'id': 'primary',
   'journal_abbrev': 'J.Biol.Chem.',
   'journal_id_astm': 'JBCHA3',
   'journal_id_csd': '0071',
   'journal_id_issn': '0021-9258',
   'journal_volume': '286',
   'page_first': '38439',
   'page_last': '38447',
   'pdbx_database_id_doi': '10.1074/jbc.M111.287300',
   'pdbx_database_id_pub_med': 21917915,
   'rcsb_authors': ['Krishna Kumar, K.',
    'Jacques, D.A.',
    'Pishchany, G.',
    'Caradoc-Davies, T.',
    'Spirig, T.',
    'Malmirchegini, G.R.',
    'Langley, D.B.',
    'Dickson, C.F.',
    'Mackay, J.P.',
    'Clubb, R.T.',
    'Skaar, E.P.',
    'Guss, J.M.',
    'Gell, 

In [18]:
# Inspect the FASTA tokens (headers and sequence lines) for the first ID in `ids`.
sequences[ids[0]]

['>prot_D',
 'VLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHGKKVADALTNAV',
 'AHVDDMPNALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPAEFTPAVHASLDKFLASVSTVLTSKY',
 'R',
 '>prot_B',
 'VHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMGNPKVKAHGKKVLGA',
 'FSDGLAHLDNLKGTFATLSELHCDKLHVDPENFRLLGNVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANA',
 'LAHKYH',
 '>prot_E',
 'VHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMGNPKVKAHGKKVLGA',
 'FSDGLAHLDNLKGTFATLSELHCDKLHVDPENFRLLGNVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANA',
 'LAHKYH',
 '>prot_F',
 'GSSHHHHHHSSGLVPRGSHMADESLKDAIKDPALENKEHDIGPREQVNFQLLDKNNETQYYHFFSIKDPA',
 'DVYYTKKKAEVELDINTASTWKKFEVYENNQKLPVRLVSYSPVPEDHAYIRFPVSDGTQELKIVSSTQID',
 'DGEETNYDYTKLVFAKPIYNDPSL',
 '>prot_A',
 'VLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHGKKVADALTNAV',
 'AHVDDMPNALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPAEFTPAVHASLDKFLASVSTVLTSKY',
 'R',
 '>prot_C',
 'GSSHHHHHHSSGLVPRGSHMADESLKDAIKDPALENKEHDIGPREQVNFQLLDKNNETQYYHFFSIKDPA',
 'DVYYTKKKAEVELDINTASTWKKFEVYENNQKLPVRLV