# RCSB CIF Data Preprocessing

Aggregate the current RCSB CIF data as stored on the DIGs.

Modified from Ivan.

In [None]:
import glob
import json
import numpy as np
from openbabel import openbabel
import gzip

import sys
# Put the na_mpnn project directory on the import path and take the standalone
# cifutils directory off it (presumably so that the `import cifutils` below
# does not pick up /home/akubaney/software/cifutils -- confirm).
# Both steps are guarded so the cell can be re-run safely: an unconditional
# sys.path.remove() raises ValueError once the entry is no longer present,
# and an unconditional append() piles up duplicate entries.
if "/home/akubaney/projects/na_mpnn" not in sys.path:
    sys.path.append("/home/akubaney/projects/na_mpnn")
if "/home/akubaney/software/cifutils" in sys.path:
    sys.path.remove("/home/akubaney/software/cifutils")

import cifutils

import pandas as pd

# Set Open Babel's global error-log output level to 0 to limit the messages it
# prints (presumably 0 = errors only -- confirm against the Open Babel
# OBMessageHandler documentation).
openbabel.obErrorLog.SetOutputLevel(0)


## 1. Compile the database of residues observed in the PDB

First, run `/projects/ml/ligand_datasets/pdb/download.sh` to update the files.

In [None]:
# Collect every `*_model.sdf` file found in the single-character
# subdirectories of the ligands directory, then show how many were found.
sdf_pattern = '/projects/ml/ligand_datasets/pdb/ligands/?/*_model.sdf'
sdfnames = glob.glob(sdf_pattern)
len(sdfnames)

In [None]:
%%time
# Build `ligands`: one record per ligand ID, pairing the SDF text produced by
# Open Babel with the per-atom fields returned by cifutils.ParsePDBLigand for
# the CIF file of the same ligand. Files that cannot be read, cannot be parsed,
# or whose atom counts disagree are reported with "FAILED:" and skipped.
obConversion = openbabel.OBConversion()
obConversion.SetInFormat("sdf")
obConversion.SetOutFormat("sdf")
ligands = {}
for sdfname in sdfnames:
    obmol = openbabel.OBMol()
    # ReadFile signals failure through its return value; skip unreadable
    # files explicitly instead of relying on the atom-count check below.
    if not obConversion.ReadFile(obmol, sdfname):
        print("FAILED:", sdfname)
        continue

    # The CIF file sits next to the SDF file: X_model.sdf -> X.cif
    cifname = sdfname.replace('_model.sdf', '.cif')
    try:
        cif = cifutils.ParsePDBLigand(cifname)
    except Exception:
        # `except Exception` rather than a bare `except`, so that
        # interrupting the kernel (KeyboardInterrupt) stops the loop instead
        # of being reported as a parse failure.
        print("FAILED:", sdfname)
        continue

    # The SDF molecule and the CIF record must have the same number of atoms.
    if obmol.NumAtoms() != cif['xyz'].shape[0]:
        print("FAILED:", sdfname)
        continue

    # NOTE: a stricter coordinate comparison was disabled in the original:
    #   xyz = np.array([(a.x(),a.y(),a.z()) for a in openbabel.OBMolAtomIter(obmol)])
    #   flag = ((xyz-cif['xyz'])[~np.isnan(cif['xyz'])]<1e-3).all()
    # If it is re-enabled, compare np.abs(xyz-cif['xyz']); as written, any
    # negative difference passes the `< 1e-3` test.

    # Ligand ID = CIF file name without its directory and ".cif" suffix.
    ID = cifname.split('/')[-1][:-4]
    ligands[ID] = {
        'sdf': obConversion.WriteString(obmol),
        'atom_id': cif['atom_id'].tolist(),
        'leaving': cif['leaving'].tolist(),
        'pdbx_align': cif['pdbx_align'].tolist(),
    }

In [None]:
# Serialize `ligands` to ligands.json as a single JSON object with one
# tab-indented `"ID" : {...}` entry per line.
# Keys are passed through json.dumps too, so a key containing a quote or
# backslash still yields valid JSON; for plain keys the text is unchanged.
out_strs_lst = [
    '\t%s : %s' % (json.dumps(k), json.dumps(v))
    for k, v in ligands.items()
]

# Joining with ",\n" places separators only between entries, so no trailing
# comma has to be trimmed off afterwards (an empty dict gives "{\n\n}\n").
out_str = '{\n' + ',\n'.join(out_strs_lst) + '\n}\n'

with open("ligands.json", "w") as outfile:
    outfile.write(out_str)

In [None]:
# Shell step 1: delete any existing ligands.json.gz (-f: no error if missing).
!rm -f ligands.json.gz
# Shell step 2: compress ligands.json; gzip by default replaces the input file
# with ligands.json.gz.
!gzip ligands.json

In [None]:
%%time
# Read ligands.json.gz back in text mode, rebinding `ligands` to the parsed
# JSON object, then show the number of entries.
with gzip.open('ligands.json.gz', mode='rt') as handle:
    ligands = json.loads(handle.read())
len(ligands)

## 2. Process the latest PDB

Run `scan_rcsb_cif_database.sh` to process all PDB entries.

In [None]:
# Read every CSV file in ./pdb_content, stack them into a single DataFrame,
# and order the rows by the `label` column.
csv_paths = glob.glob('./pdb_content/*.csv')
df = pd.concat([pd.read_csv(path) for path in csv_paths])
df = df.sort_values(by='label')

In [None]:
def tolist(l):
    """Split the text of a stringified flat list into a list of strings.

    Drops the first and last characters (the enclosing brackets), removes
    every single-quote character, and splits what remains on ", ".

    NOTE: the empty-list text "[]" yields [''] (one empty string), not [].
    """
    inner = l[1:-1]
    unquoted = inner.replace("'", "")
    return unquoted.split(", ")
# Columns holding stringified lists; convert each cell to a Python list
# using `tolist`.
list_columns = ('poly', 'poly_type', 'nonpoly', 'poly_sequence')
for column in list_columns:
    df[column] = df[column].apply(tolist)

In [None]:
# Display the combined table for a visual check.
df

In [None]:
# Show the largest value in the `date` column (the most recent date, assuming
# the column holds datetimes or ISO-formatted strings -- TODO confirm).
df.date.max()

In [None]:
# Write the table to CSV without the index column.
# NOTE: the date in the file name is hard-coded and has to be updated by hand
# for each new snapshot.
df.to_csv('pdb_21Jan2025.csv', index=False)