In [3]:
#Import necessary Packages
import pandas as pd
import numpy as np
import os
from os.path import dirname, join as pjoin
import sys
from rdkit import Chem
from rdkit.Chem import AllChem
from deepchem.utils.docking_utils import prepare_inputs
from sklearn.model_selection import train_test_split

In [4]:
#Access File Directories and import Ids and labels

def load_data(csv_file='./data.csv',
              base_dir='C:\\Users\\joslynn.deaver\\SoftDrugs\\Sanitization'):
    """
    Loads complex Ids and their associated labels from a csv file.

    The working directory is changed to ``base_dir`` before reading, so a
    relative ``csv_file`` is resolved against it and the process remains in
    ``base_dir`` after the call returns.

    :param csv_file: csv file with 'Ids' and 'Labels' as column headers
    :type csv_file: file name string
    :param base_dir: directory to switch into before reading ``csv_file``
    :type base_dir: directory path string
    :return: Id pandas Series and label numpy array
    :raises KeyError: if the 'Ids' or 'Labels' column is missing
    """
    os.chdir(base_dir)
    loaded_data = pd.read_csv(csv_file)

    # Fail with a message naming the file rather than a bare column KeyError.
    missing = [col for col in ("Ids", "Labels") if col not in loaded_data.columns]
    if missing:
        raise KeyError("%s is missing required column(s): %s"
                       % (csv_file, ", ".join(missing)))

    Ids = loaded_data["Ids"]
    labels = loaded_data["Labels"].to_numpy()
    return Ids, labels

In [5]:
#Access File, Sanitize, Save

def sanitizer(Ids, Labels, data_folder,
              raw_root='C:\\Users\\joslynn.deaver\\Documents\\SoftDrugs Files\\Raw Data',
              save_dir='C:\\Users\\joslynn.deaver\\SoftDrugs\\Sanitization\\Sanitized Data',
              norm_dir='C:\\Users\\joslynn.deaver\\SoftDrugs\\Sanitization'):
    """
    Sanitizes each protein/ligand pair and saves the results as PDB files.

    For every id the function enters ``<raw_root>/<data_folder>/<id>``, reads
    ``<id>_ligand.mol2`` with RDKit, converts it to SMILES and hands
    ``<id>_protein.pdb`` plus that SMILES to ``prepare_inputs``. On success the
    two returned molecules are written to ``save_dir`` as ``<id>.pdb`` and
    ``<id>_ligand.pdb``. Ids that fail at any step are reported with ``print``
    and left out of the result.

    The working directory is changed while processing and is always set to
    ``norm_dir`` before the function exits, including when an error propagates.

    :param Ids: iterable of complex ids (folder names / file prefixes)
    :param Labels: labels indexable by position, aligned with ``Ids``
    :param data_folder: sub-folder of ``raw_root`` that holds one folder per id
    :type data_folder: folder name string
    :param raw_root: directory containing ``data_folder``
    :param save_dir: directory the sanitized PDB files are written to
    :param norm_dir: directory to switch back to when finished
    :return: string numpy array of shape (n_sanitized, 2) holding [id, label]
    """
    data_dir = pjoin(raw_root, data_folder)
    # Collect only successful rows; the previous pre-filled '0' buffer was
    # filtered with `== 0` (int), which never matches a string array.
    kept_rows = []
    try:
        for index, pdbid in enumerate(Ids):
            try:
                os.chdir(pjoin(data_dir, pdbid))
            except Exception as exc:
                print('%s folder access failed' % (pdbid), '(%s)' % (exc,))
                continue

            # File names are relative to the complex folder entered above.
            protein = ('%s_protein.pdb' % (pdbid))
            raw_ligand = ('%s_ligand.mol2' % (pdbid))

            molfile, reason = None, 'no molecule returned'
            try:
                molfile = Chem.rdmolfiles.MolFromMol2File(raw_ligand)
            except Exception as exc:
                reason = exc
            if molfile is None:
                print('%s ligand mol from mol2 file generation failed' % (pdbid),
                      '(%s)' % (reason,))
                continue

            ligand, reason = None, 'empty SMILES'
            try:
                ligand = Chem.rdmolfiles.MolToSmiles(molfile)
            except Exception as exc:
                reason = exc
            if not ligand:
                print('%s ligand SMILES generation failed' % (pdbid),
                      '(%s)' % (reason,))
                continue

            p, m, reason = None, None, 'no molecule returned'
            try:
                p, m = prepare_inputs(protein, ligand)
            except Exception as exc:
                reason = exc
            if p is None or m is None:
                print('%s failed sanitization' % (pdbid), '(%s)' % (reason,))
                continue

            os.chdir(save_dir)
            Chem.rdmolfiles.MolToPDBFile(p, '%s.pdb' % (pdbid))
            Chem.rdmolfiles.MolToPDBFile(m, '%s_ligand.pdb' % (pdbid))
            kept_rows.append([str(pdbid), str(Labels[index])])
    finally:
        # Never leave the process inside a raw-data or save folder.
        os.chdir(norm_dir)

    # reshape keeps the (n, 2) shape even when nothing was sanitized.
    return np.array(kept_rows, dtype=str).reshape(-1, 2)

In [6]:
#join sanitized general and refined dataset labels into one
def join_datasets(general_dataset, refine_dataset):
    """
    Stacks the general and refined [id, label] arrays into one array.

    Rows of ``general_dataset`` come first, followed by the rows of
    ``refine_dataset``; column 0 holds the id and column 1 the label.

    :param general_dataset: array-like of shape (n_general, 2)
    :param refine_dataset: array-like of shape (n_refine, 2)
    :return: numpy array of shape (n_general + n_refine, 2)
    """
    # reshape(-1, 2) lets an empty input participate as a (0, 2) array.
    parts = [np.asarray(dataset).reshape(-1, 2)
             for dataset in (general_dataset, refine_dataset)]
    # Skip empty parts so their dtype cannot influence the stacked result.
    non_empty = [part for part in parts if len(part)]
    return np.concatenate(non_empty or parts[:1], axis=0)

In [None]:
# Run the pipeline: load Ids/labels for both sets, sanitize each, then merge.
general_Ids, general_labels = load_data(csv_file='General Ids and Labels.csv')
refine_Ids, refine_labels = load_data(csv_file='Refined Ids and Labels.csv')

# Each call reports failures via print and returns only the sanitized rows.
sanitized_general = sanitizer(
    Ids=general_Ids, Labels=general_labels, data_folder='v2019-other-PL')
sanitized_refine = sanitizer(
    Ids=refine_Ids, Labels=refine_labels, data_folder='refined-set')

sanitzed_data_directory = join_datasets(
    general_dataset=sanitized_general, refine_dataset=sanitized_refine)

4lh2 failed sanitization


In [None]:
# Row count of the joined array built in the "Run Code" cell above.
len(sanitzed_data_directory)

In [None]:
#split sanitized data
def generate_datasets(sanitzed_data_directory, random_state=None):
    """
    Splits the joined [id, label] array into train, validation and test sets.

    30% of the rows are held out first and that hold-out is then halved, so
    the result is roughly a 70/15/15 train/validation/test split.

    :param sanitzed_data_directory: 2-D array, column 0 = ids, column 1 = labels
    :param random_state: seed forwarded to both ``train_test_split`` calls;
        ``None`` (default) keeps the split unseeded, pass an int to make it
        reproducible
    :return: train_data, validation_data, test_data, each a list [X, y]
    """
    X = sanitzed_data_directory[:, 0]
    y = sanitzed_data_directory[:, 1]
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=0.3, random_state=random_state)
    # Halve the 30% hold-out into validation and test portions.
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, random_state=random_state)
    train_data = [X_train, y_train]
    validation_data = [X_val, y_val]
    test_data = [X_test, y_test]
    return train_data, validation_data, test_data