# Pre-process PDB information

### Load libraries

In [1]:
import pandas as pd
import re

### Load data

In [2]:
# header=1: column names are taken from the second line of the CSV;
# the line above it is skipped by pandas.
pdb_info_df = pd.read_csv(
    "pdb_ligand_info.csv",
    header=1,
)

### Pre-process PDB data to get PDB entries for final dataset

In [3]:
# Pre-process: remove entries with no PDB name (cases where a protein has more
# than one ligand). A row is flagged when its own "Entry ID" is missing or when
# the "Entry ID" of the row directly below it is missing.
entry_ids = pdb_info_df["Entry ID"]
id_missing = entry_ids.isna()

# shift(-1) pairs every row with the missing-flag of the row below it; the last
# row has no successor, so it is filled with False and never flagged for that.
next_id_missing = id_missing.shift(-1, fill_value=False)
flagged = id_missing | next_id_missing

# flagged rows get the marker "remove", all other rows keep their own ID
pdb_info_df["Entry ID modified"] = entry_ids.where(~flagged, "remove")

# plain-list views under their previous names, in case other cells read them
ligand_names = list(entry_ids)
ligand_names_mod = list(pdb_info_df["Entry ID modified"])

# remove proteins with more than 1 ligand
pdb_info_df = pdb_info_df[pdb_info_df["Entry ID modified"] != "remove"]

# sanity check: every remaining row kept its original ID
assert list(pdb_info_df["Entry ID"]) == list(pdb_info_df["Entry ID modified"])

# remove entries with missing ligands
pdb_info_df = pdb_info_df.dropna(how="any", subset=["Ligand Name"])

# remove entries with an ion as ligand
# Match "ION" as a whole word only: a bare substring test would also discard
# ligands whose names merely contain those letters (e.g. "GLUTATHIONE",
# "SELENOMETHIONINE"). na=True keeps the earlier behaviour of dropping rows
# whose name cannot be matched as text.
is_ion = pdb_info_df["Ligand Name"].str.contains(r"\bION\b", regex=True, na=True)
pdb_info_df = pdb_info_df[~is_ion]

# remove cases where ligands are in more than one chain
pdb_info_df = pdb_info_df[~pdb_info_df["Asym ID"].str.contains(",")]

# save pdb IDs (SBDD + DiffDock),
# SMILES structures of ligands (DiffDock)
# and number of atoms in ligands (SBDD) to separate files
_ATOM_COUNT_PATTERN = re.compile(r"\d+")


def _count_ligand_atoms(formula):
    """Return the total number of atoms described by a ligand formula.

    The formula is read as whitespace-separated element terms such as
    "C6 H12 O6". The digits in a term are that element's atom count; a term
    without digits (e.g. "O" in "C2 H6 O") counts as a single atom.
    """
    total = 0
    for term in formula.split():
        counts = _ATOM_COUNT_PATTERN.findall(term)
        # an element occurring only once carries no number in the formula
        total += sum(int(count) for count in counts) if counts else 1
    return total


# The with-statement closes all three files even if a row raises part-way
# through (e.g. a missing SMILES or formula value).
with open("pdb_ids", "w") as pdb_ids_outfile, \
        open("SMILES_ligands", "w") as smiles_outfile, \
        open("atoms_count", "w") as atoms_counts_outfile:
    for _, row in pdb_info_df.iterrows():
        # plain concatenation on purpose: a non-string (missing) value raises
        # here instead of being written out as the text "nan"
        pdb_ids_outfile.write(row["Entry ID"] + "\n")
        smiles_outfile.write(row["Ligand SMILES"] + "\n")

        # total number of atoms in the ligand, one count per line
        atoms_sum = _count_ligand_atoms(row["Ligand Formula"])
        atoms_counts_outfile.write(str(atoms_sum) + "\n")