In [3]:
import pandas as pd

# Read the Affinity TXT file into a dataframe
dataframe = pd.read_csv('PDB_URV_DATABASE/Affinity.txt',sep=r'\s+',header=None)  

# Display the dataframe
#print(dataframe)

# Select the first column
protiens = dataframe.iloc[:, 0]

# Loop over the values in the first column
# for protien in protiens:
#     print(protien)

In [3]:
# preprocess ligand files
import pandas as pd
from rdkit import Chem
from rdkit.Chem import MolFromMol2File
import os
# Folder path
folder_path = 'PDB_URV_DATABASE/Ligand_mol2/'


ligand_df = pd.DataFrame(columns=['Filepath','Id', 'Ligand_state','SMILES', 'Error'])

# Loop over .mol2 files in the folder
for filename in os.listdir(folder_path):
    mol2_file_path = os.path.join(folder_path, filename)
    new_row = {'Filepath': mol2_file_path, 'Id': filename[:filename.index('_')],'Ligand_state': '','Error': '' }
    ligand_df = pd.concat([ligand_df, pd.DataFrame([new_row])], ignore_index=True)

    try:
        
        # Read the .mol2 file
        mol = MolFromMol2File(mol2_file_path, sanitize=False, cleanupSubstructures=False)
        # Check if the molecule was successfully loaded
        if mol is not None:
            
            try:
                # Perform structure validation
                Chem.SanitizeMol(mol)
                ligand_df.loc[ligand_df['Filepath'] == mol2_file_path, 'Ligand_state'] = 'valid'

            except ValueError as e:
                print("Error: " + str(e))
                ligand_df.loc[ligand_df['Filepath'] == mol2_file_path, 'Ligand_state'] = 'invalid'
                ligand_df.loc[ligand_df['Filepath'] == mol2_file_path, 'Error'] = str(e)
            
            # For example, compute 3D coordinates using the AllChem module
            #AllChem.Compute2DCoords(mol)
            
            # Print molecule information
            #print(Chem.MolToMolBlock(mol))

            # Convert molecule to SMILES notation
            smiles = Chem.MolToSmiles(mol)
            ligand_df.loc[ligand_df['Filepath'] == mol2_file_path, 'SMILES'] = smiles
            #print("SMILES notation of ",sdf_file_path," : ", smiles)
        else:
            #print("Failed to load molecule from .mol2 file.")
            ligand_df.loc[ligand_df['Filepath'] == mol2_file_path, 'Ligand_state'] = 'failed to load'
            raise ValueError("Failed to parse Mol2 file. The file may be empty or invalid.")
 
    except FileNotFoundError:
        print("File not found.")
        ligand_df.loc[ligand_df['Filepath'] == mol2_file_path, 'Error'] = "File not found."
    except IOError:
        print("Error reading the file.")
        ligand_df.loc[ligand_df['Filepath'] == mol2_file_path, 'Error'] = "Error reading the file."
    except ValueError as e:
        print("Error:", e)
        ligand_df.loc[ligand_df['Filepath'] == mol2_file_path, 'Error'] = str(e)
    except Exception as e:
        print("An unexpected error occurred:", e)
        ligand_df.loc[ligand_df['Filepath'] == mol2_file_path, 'Error'] = str(e)

print(ligand_df.shape)
ligand_df.to_csv('PDB_URV_Database/logs/ligands_mol2_file.csv', index=False)  # Set index=False to avoid writing row indices
ligand_df.head(53)

[00:02:47] Can't kekulize mol.  Unkekulized atoms: 2 4 11 22 23
[00:02:47] Can't kekulize mol.  Unkekulized atoms: 0 1 17 18 28
[00:02:47] Can't kekulize mol.  Unkekulized atoms: 1 2 16 17 27
[00:02:47] Can't kekulize mol.  Unkekulized atoms: 3 12 16 19 27
[00:02:48] Can't kekulize mol.  Unkekulized atoms: 0 12 14 25
[00:02:48] Can't kekulize mol.  Unkekulized atoms: 0 20 21 26 28
[00:02:48] Can't kekulize mol.  Unkekulized atoms: 2 11 12 17 18
[00:02:48] Can't kekulize mol.  Unkekulized atoms: 24 26 28 29
[00:02:48] Can't kekulize mol.  Unkekulized atoms: 13 14 23 24 26
[00:02:48] Can't kekulize mol.  Unkekulized atoms: 1 20 21 26 34
[00:02:48] Can't kekulize mol.  Unkekulized atoms: 1 8 18 19 24


Error: Can't kekulize mol.  Unkekulized atoms: 2 4 11 22 23
Error: Can't kekulize mol.  Unkekulized atoms: 0 1 17 18 28
Error: Can't kekulize mol.  Unkekulized atoms: 1 2 16 17 27
Error: Can't kekulize mol.  Unkekulized atoms: 3 12 16 19 27
Error: Can't kekulize mol.  Unkekulized atoms: 0 12 14 25
Error: Can't kekulize mol.  Unkekulized atoms: 0 20 21 26 28
Error: Can't kekulize mol.  Unkekulized atoms: 2 11 12 17 18
Error: Can't kekulize mol.  Unkekulized atoms: 24 26 28 29
Error: Can't kekulize mol.  Unkekulized atoms: 13 14 23 24 26
Error: Can't kekulize mol.  Unkekulized atoms: 1 20 21 26 34
Error: Can't kekulize mol.  Unkekulized atoms: 1 8 18 19 24


[00:02:48] Can't kekulize mol.  Unkekulized atoms: 6 7 11 12 26
[00:02:48] Can't kekulize mol.  Unkekulized atoms: 0 7 17 21 26
[00:02:48] Can't kekulize mol.  Unkekulized atoms: 1 3 12 13 17
[00:02:48] Can't kekulize mol.  Unkekulized atoms: 0 14 15 16


Error: Can't kekulize mol.  Unkekulized atoms: 6 7 11 12 26
Error: Can't kekulize mol.  Unkekulized atoms: 0 7 17 21 26
Error: Can't kekulize mol.  Unkekulized atoms: 1 3 12 13 17
Error: Can't kekulize mol.  Unkekulized atoms: 0 14 15 16


[00:02:48] Can't kekulize mol.  Unkekulized atoms: 3 15 16 17
[00:02:48] Can't kekulize mol.  Unkekulized atoms: 0 10 16 17
[00:02:48] Can't kekulize mol.  Unkekulized atoms: 0 10 15 16
[00:02:48] Can't kekulize mol.  Unkekulized atoms: 0 6 19 20
[00:02:48] Can't kekulize mol.  Unkekulized atoms: 2 11 16 17
[00:02:48] Can't kekulize mol.  Unkekulized atoms: 1 8 19 20
[00:02:48] Can't kekulize mol.  Unkekulized atoms: 0 9 16 17
[00:02:48] Can't kekulize mol.  Unkekulized atoms: 0 15 16 17
[00:02:48] Can't kekulize mol.  Unkekulized atoms: 5 14 17 18


Error: Can't kekulize mol.  Unkekulized atoms: 3 15 16 17
Error: Can't kekulize mol.  Unkekulized atoms: 0 10 16 17
Error: Can't kekulize mol.  Unkekulized atoms: 0 10 15 16
Error: Can't kekulize mol.  Unkekulized atoms: 0 6 19 20
Error: Can't kekulize mol.  Unkekulized atoms: 2 11 16 17
Error: Can't kekulize mol.  Unkekulized atoms: 1 8 19 20
Error: Can't kekulize mol.  Unkekulized atoms: 0 9 16 17
Error: Can't kekulize mol.  Unkekulized atoms: 0 15 16 17
Error: Can't kekulize mol.  Unkekulized atoms: 5 14 17 18
Error: Can't kekulize mol.  Unkekulized atoms: 0 10 13 22
Error: Can't kekulize mol.  Unkekulized atoms: 0 10 13 23
Error: Can't kekulize mol.  Unkekulized atoms: 1 19 20 26
(53, 5)


[00:02:48] Can't kekulize mol.  Unkekulized atoms: 0 10 13 22
[00:02:48] Can't kekulize mol.  Unkekulized atoms: 0 10 13 23
[00:02:48] Can't kekulize mol.  Unkekulized atoms: 1 19 20 26


Unnamed: 0,Filepath,Id,Ligand_state,SMILES,Error
0,PDB_URV_DATABASE/Ligand_mol2/6M2N_ligand.mol2,6M2N,valid,[H]Oc1c([O-])c([H])c2oc(-c3c([H])c([H])c([H])c...,
1,PDB_URV_DATABASE/Ligand_mol2/6W63_ligand.mol2,6W63,valid,[H]c1nc([H])c([C@]([H])(C(=O)N([H])C2([H])C([H...,
2,PDB_URV_DATABASE/Ligand_mol2/7AU4_ligand.mol2,7AU4,valid,[H]c1onc(C([H])([H])N2C(=O)N([H])[C@]3(C2=O)c2...,
3,PDB_URV_DATABASE/Ligand_mol2/7B2J_ligand.mol2,7B2J,valid,[H]c1c([H])c([H])c2c(nnn2C([H])([H])C(=O)N2C([...,
4,PDB_URV_DATABASE/Ligand_mol2/7B2U_ligand.mol2,7B2U,valid,[H]c1nc([H])c(N2C(=O)N([H])[C@@]([H])(C([H])([...,
5,PDB_URV_DATABASE/Ligand_mol2/7B5Z_ligand.mol2,7B5Z,valid,[H]C([H])=C1C([H])([H])C([H])([H])N(C(=O)C([H]...,
6,PDB_URV_DATABASE/Ligand_mol2/7B77_ligand.mol2,7B77,valid,[H]c1oc([H])c(C([H])([H])N(C(=O)C([H])([H])n2n...,
7,PDB_URV_DATABASE/Ligand_mol2/7E18_ligand.mol2,7E18,valid,[H]c1c([H])c([H])c2sc(C(=O)[C@@]([H])(N([H])C(...,
8,PDB_URV_DATABASE/Ligand_mol2/7E19_ligand.mol2,7E19,valid,[H]c1c([H])c([H])c(C([H])([H])OC(=O)N([H])[C@]...,
9,PDB_URV_DATABASE/Ligand_mol2/7KX5_ligand.mol2,7KX5,valid,[H]c1nc([H])c([C@]([H])(C(=O)N([H])[C@]([H])(c...,


In [4]:
# preprocess ligand files
import pandas as pd
from rdkit import Chem
from rdkit.Chem import MolFromMol2File
import os
# Folder path
folder_path = 'PDB_URV_DATABASE/Ligand_sdf/'

sdf_ligand_df = pd.DataFrame(columns=['Filepath', 'Id', 'Ligand_state','SMILES', 'Error'])

for filename in os.listdir(folder_path):
    sdf_file_path = os.path.join(folder_path, filename)
    new_row = {'Filepath': sdf_file_path, 'Id': filename[:filename.index('_')], 'Ligand_state': '','Error': '' }
    sdf_ligand_df = pd.concat([sdf_ligand_df, pd.DataFrame([new_row])], ignore_index=True)

    try:
        
        # Read the .mol2 file
        supplier = Chem.SDMolSupplier(sdf_file_path, sanitize = False,removeHs=False, strictParsing=False)

        # Iterate over the molecules in the SDF file
        for mol in supplier:
            # Check if the molecule was successfully loaded
            if mol is not None:
                
                try:
                    # Perform structure validation
                    Chem.SanitizeMol(mol)
                    sdf_ligand_df.loc[sdf_ligand_df['Filepath'] == sdf_file_path, 'Ligand_state'] = 'valid'

                except ValueError as e:
                    print("Error: " + str(e))
                    sdf_ligand_df.loc[sdf_ligand_df['Filepath'] == sdf_file_path, 'Ligand_state'] = 'invalid'
                    sdf_ligand_df.loc[sdf_ligand_df['Filepath'] == sdf_file_path, 'Error'] = str(e)
                
                # For example, compute 3D coordinates using the AllChem module
                #AllChem.Compute2DCoords(mol)
                
                # Print molecule information
                #print(Chem.MolToMolBlock(mol))

                # Convert molecule to SMILES notation
                smiles = Chem.MolToSmiles(mol)
                sdf_ligand_df.loc[sdf_ligand_df['Filepath'] == sdf_file_path, 'SMILES'] = smiles
                print("SMILES notation of ",sdf_file_path," : ", smiles)
            else:
                #print("Failed to load molecule from .mol2 file.")
                sdf_ligand_df.loc[sdf_ligand_df['Filepath'] == sdf_file_path, 'Ligand_state'] = 'failed to load'
                raise ValueError("Failed to parse sdf file. The file may be empty or invalid.")
 
    except FileNotFoundError:
        print("File not found.")
        sdf_ligand_df.loc[sdf_ligand_df['Filepath'] == sdf_file_path, 'Error'] = "File not found."
    except IOError:
        print("Error reading the file.")
        sdf_ligand_df.loc[sdf_ligand_df['Filepath'] == sdf_file_path, 'Error'] = "Error reading the file."
    except ValueError as e:
        print("Error:", e)
        sdf_ligand_df.loc[sdf_ligand_df['Filepath'] == sdf_file_path, 'Error'] = str(e)
    except Exception as e:
        print("An unexpected error occurred:", e)
        sdf_ligand_df.loc[sdf_ligand_df['Filepath'] == sdf_file_path, 'Error'] = str(e)

print(sdf_ligand_df.shape)
sdf_ligand_df.to_csv('PDB_URV_Database/logs/ligands_sdf_file.csv', index=False)  # Set index=False to avoid writing row indices
sdf_ligand_df.head(160)

SMILES notation of  PDB_URV_DATABASE/Ligand_sdf/6M2N_ligand.sdf  :  [H]Oc1c([O-])c([H])c2oc(-c3c([H])c([H])c([H])c([H])c3[H])c([H])c(=O)c2c1O[H]
SMILES notation of  PDB_URV_DATABASE/Ligand_sdf/6W63_ligand.sdf  :  [H]c1nc([H])c([C@]([H])(C(=O)N([H])C2([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C2([H])[H])N(C(=O)c2nc([H])n([H])c2[H])c2c([H])c([H])c(C(C([H])([H])[H])(C([H])([H])[H])C([H])([H])[H])c([H])c2[H])c([H])c1[H]
SMILES notation of  PDB_URV_DATABASE/Ligand_sdf/7AU4_ligand.sdf  :  [H]c1onc(C([H])([H])N2C(=O)N([H])[C@]3(C2=O)c2c([H])c([H])c(Cl)c([H])c2C([H])([H])C3([H])[H])c1[H]
SMILES notation of  PDB_URV_DATABASE/Ligand_sdf/7B2J_ligand.sdf  :  [H]c1c([H])c([H])c2c(nnn2C([H])([H])C(=O)N2C([H])([H])C([H])([H])C([H])(C([H])([H])[H])C([H])([H])C2([H])[H])c1[H]
SMILES notation of  PDB_URV_DATABASE/Ligand_sdf/7B2U_ligand.sdf  :  [H]c1nc([H])c(N2C(=O)N([H])[C@@]([H])(C([H])([H])C3([H])C([H])([H])C([H])([H])C([H])([H])C([H])([H])C3([H])[H])C2=O)c([H])c1F
SMILES notation of  PDB_URV_DA

[00:03:57] Explicit valence for atom # 5 N, 4, is greater than permitted


Error: Explicit valence for atom # 5 N, 4, is greater than permitted
SMILES notation of  PDB_URV_DATABASE/Ligand_sdf/7VU6_ligand.sdf  :  [H]C1=N/C(=C(\[H])N2C(=O)N([H])C(N([H])C3=C(\[H])C4=C([H])N(C([H])([H])[H])N=C4/C([H])=C\3Cl)N(C([H])([H])C3=C(F)C([H])=C(F)C(F)=C3[H])C2=O)N=N1C([H])([H])[H]
SMILES notation of  PDB_URV_DATABASE/Ligand_sdf/7VVP_ligand.sdf  :  [H]C1=C(C([H])([H])[C@@]([H])(C(=O)C([H])([H])OP([O])([O])=O)N([H])C(=O)[C@@]([H])(N([H])C(=O)c2c([H])c3c(OC([H])([H])[H])c([H])c([H])c([H])c3n2[H])C([H])([H])C([H])(C([H])([H])[H])C([H])([H])[H])C(=O)N([H])C1([H])[H]
SMILES notation of  PDB_URV_DATABASE/Ligand_sdf/7VVT_ligand.sdf  :  [H]c1c([H])c(Cl)c([H])c(N([H])C(=O)C([H])([H])[C@]2([H])C(=O)N([H])C([H])([H])C([H])([H])N2C(=O)C([H])([H])[H])c1[H]
SMILES notation of  PDB_URV_DATABASE/Ligand_sdf/7X6K_ligand.sdf  :  [H]C(=O)c1c([H])c2c([H])c([H])c([H])c([H])c2n1[H]
SMILES notation of  PDB_URV_DATABASE/Ligand_sdf/8ACD_ligand.sdf  :  [H]c1sc(C([H])([H])N([H])C(=O)[C@@]2([H])N(c3c(

Unnamed: 0,Filepath,Id,Ligand_state,SMILES,Error
0,PDB_URV_DATABASE/Ligand_sdf/6M2N_ligand.sdf,6M2N,valid,[H]Oc1c([O-])c([H])c2oc(-c3c([H])c([H])c([H])c...,
1,PDB_URV_DATABASE/Ligand_sdf/6W63_ligand.sdf,6W63,valid,[H]c1nc([H])c([C@]([H])(C(=O)N([H])C2([H])C([H...,
2,PDB_URV_DATABASE/Ligand_sdf/7AU4_ligand.sdf,7AU4,valid,[H]c1onc(C([H])([H])N2C(=O)N([H])[C@]3(C2=O)c2...,
3,PDB_URV_DATABASE/Ligand_sdf/7B2J_ligand.sdf,7B2J,valid,[H]c1c([H])c([H])c2c(nnn2C([H])([H])C(=O)N2C([...,
4,PDB_URV_DATABASE/Ligand_sdf/7B2U_ligand.sdf,7B2U,valid,[H]c1nc([H])c(N2C(=O)N([H])[C@@]([H])(C([H])([...,
...,...,...,...,...,...
155,PDB_URV_DATABASE/Ligand_sdf/Mpro-x2649_ligand.sdf,Mpro-x2649,valid,[H]c1nc([H])c(N([H])C(=O)[C@@]([H])(c2c([H])c(...,
156,PDB_URV_DATABASE/Ligand_sdf/Mpro-x2908_ligand.sdf,Mpro-x2908,valid,[H]c1nc([H])c(N([H])C(=O)N([H])c2c([H])c([H])c...,
157,PDB_URV_DATABASE/Ligand_sdf/Mpro-x2910_ligand.sdf,Mpro-x2910,valid,[H]c1c([H])c([H])c(OC([H])([H])C([H])([H])N([H...,
158,PDB_URV_DATABASE/Ligand_sdf/Mpro-x2912_ligand.sdf,Mpro-x2912,valid,[H]c1nc([H])c(N([H])C(=O)[C@@]([H])(c2c([H])c(...,
