In [None]:
import pandas as pd
import numpy as np

def load_data(csv_file='./data.csv'):
    """Read the id and label columns from a CSV file.

    Parameters
    ----------
    csv_file : str
        Path of the CSV file to read; it must contain the columns
        ``"Ids"`` and ``"Labels"``.

    Returns
    -------
    tuple
        ``(ids, labels)`` where ``ids`` is the ``"Ids"`` column as a pandas
        Series and ``labels`` is the ``"Labels"`` column converted to a
        NumPy array.
    """
    table = pd.read_csv(csv_file)
    # Ids stay a Series; only the labels are converted to a plain array.
    return table["Ids"], table["Labels"].to_numpy()

In [None]:
import deepchem as dc
import os
from os.path import dirname, join as pjoin
from deepchem.feat.base_classes import MolecularFeaturizer

def featurizing_MACKeys(Id_array, label_array, data_folder_name,
                        base_dir='C:\\Users\\joslynn.deaver\\SoftDrugs\\Sanitization'):
    """Featurize ligand PDB files with MACCS keys and save the result as CSV.

    For every id in ``Id_array`` the file ``<id>_ligand.pdb`` is read from
    ``<base_dir>/<data_folder_name>`` and passed to DeepChem's
    ``MACCSKeysFingerprint``. The table is written to
    ``featured_MACKeys_train_data.csv`` inside ``base_dir``.

    Parameters
    ----------
    Id_array : iterable
        Ligand identifiers, in the same order as ``label_array``.
    label_array : iterable
        One label per identifier; each value must be convertible to float.
    data_folder_name : str
        Folder below ``base_dir`` that holds the ``*_ligand.pdb`` files.
    base_dir : str, optional
        Project directory. Defaults to the location that used to be
        hard-coded in the function body.

    Raises
    ------
    FileNotFoundError
        If ``<base_dir>/<data_folder_name>`` is not an existing directory.

    Notes
    -----
    * The CSV has one row per input id, in input order, with the columns
      ``'Feature Array'`` and ``'Label'``.
    * ``'Feature Array'`` holds the complete feature vector as a Python list
      (stored in the CSV as the text of a list literal).
    * A ligand that cannot be featurized is reported on stdout and gets
      empty cells in both columns, so it cannot be mistaken for a real
      all-zero sample with label 0.
    * On return the current working directory is ``base_dir``.
    """
    # NOTE(review): `Chem` is not imported in the visible notebook cells;
    # presumably `from rdkit import Chem` -- confirm and add it to the
    # imports cell. Resolving it here makes a missing import fail fast
    # instead of being reported as a failure of every single ligand.
    read_pdb = Chem.MolFromPDBFile
    featurizer = dc.feat.MACCSKeysFingerprint()

    data_dir = pjoin(base_dir, data_folder_name)
    if not os.path.isdir(data_dir):
        raise FileNotFoundError('Data folder not found: %s' % data_dir)

    # Plain lists give positional access even when a pandas Series with a
    # non-default index is passed in.
    ids = list(Id_array)
    label_values = list(label_array)
    n_samples = len(ids)

    # A feature vector does not fit into one slot of a 1-D float array, so
    # keep one list per sample; None / NaN mark samples that failed.
    features = [None] * n_samples
    labels = np.full(n_samples, np.nan)

    for idx, pdbid in enumerate(ids):
        ligand = pjoin(data_dir, '%s_ligand.pdb' % (pdbid))
        try:
            mol = read_pdb(ligand)
            if mol is None:
                raise ValueError('could not read molecule from %s' % ligand)
            vector = np.asarray(featurizer.featurize([mol])).flatten()
            # Presumably DeepChem reports a molecule it could not featurize
            # as an empty row -- TODO confirm for the installed version.
            if vector.size == 0:
                raise ValueError('featurizer returned no features')
            label = float(label_values[idx])
        except Exception as exc:
            # Best effort per ligand: report the reason and move on.
            # KeyboardInterrupt is deliberately not caught.
            print('%s Featurization Failed (%s: %s)'
                  % (pdbid, type(exc).__name__, exc))
            continue
        features[idx] = vector.tolist()
        labels[idx] = label

    # Kept for backward compatibility: the function has always left the
    # process in the project directory and written its CSV there.
    os.chdir(base_dir)
    features_and_labels = pd.DataFrame({
        'Feature Array': pd.Series(features, dtype=object),
        'Label': labels,
    })
    features_and_labels.to_csv('featured_MACKeys_train_data.csv')

In [None]:
import deepchem as dc
import os
from os.path import dirname, join as pjoin
from deepchem.feat.base_classes import MolecularFeaturizer

def featurizing_CF(Id_array, label_array, data_folder_name,
                   base_dir='C:\\Users\\joslynn.deaver\\SoftDrugs\\Sanitization'):
    """Featurize ligand PDB files with circular fingerprints and save as CSV.

    For every id in ``Id_array`` the file ``<id>_ligand.pdb`` is read from
    ``<base_dir>/<data_folder_name>`` and passed to DeepChem's
    ``CircularFingerprint`` (default settings). The table is written to
    ``featured_CircularFP_train_data.csv`` inside ``base_dir``.

    Parameters
    ----------
    Id_array : iterable
        Ligand identifiers, in the same order as ``label_array``.
    label_array : iterable
        One label per identifier; each value must be convertible to float.
    data_folder_name : str
        Folder below ``base_dir`` that holds the ``*_ligand.pdb`` files.
    base_dir : str, optional
        Project directory. Defaults to the location that used to be
        hard-coded in the function body.

    Raises
    ------
    FileNotFoundError
        If ``<base_dir>/<data_folder_name>`` is not an existing directory.

    Notes
    -----
    * The CSV has one row per input id, in input order, with the columns
      ``'Feature Array'`` and ``'Label'``.
    * ``'Feature Array'`` holds the complete feature vector as a Python list
      (stored in the CSV as the text of a list literal).
    * A ligand that cannot be featurized is reported on stdout and gets
      empty cells in both columns, so it cannot be mistaken for a real
      all-zero sample with label 0.
    * On return the current working directory is ``base_dir``.
    """
    # NOTE(review): `Chem` is not imported in the visible notebook cells;
    # presumably `from rdkit import Chem` -- confirm and add it to the
    # imports cell. Resolving it here makes a missing import fail fast
    # instead of being reported as a failure of every single ligand.
    read_pdb = Chem.MolFromPDBFile
    featurizer = dc.feat.CircularFingerprint()

    data_dir = pjoin(base_dir, data_folder_name)
    if not os.path.isdir(data_dir):
        raise FileNotFoundError('Data folder not found: %s' % data_dir)

    # Plain lists give positional access even when a pandas Series with a
    # non-default index is passed in.
    ids = list(Id_array)
    label_values = list(label_array)
    n_samples = len(ids)

    # A feature vector does not fit into one slot of a 1-D float array, so
    # keep one list per sample; None / NaN mark samples that failed.
    features = [None] * n_samples
    labels = np.full(n_samples, np.nan)

    for idx, pdbid in enumerate(ids):
        ligand = pjoin(data_dir, '%s_ligand.pdb' % (pdbid))
        try:
            mol = read_pdb(ligand)
            if mol is None:
                raise ValueError('could not read molecule from %s' % ligand)
            vector = np.asarray(featurizer.featurize([mol])).flatten()
            # Presumably DeepChem reports a molecule it could not featurize
            # as an empty row -- TODO confirm for the installed version.
            if vector.size == 0:
                raise ValueError('featurizer returned no features')
            label = float(label_values[idx])
        except Exception as exc:
            # Best effort per ligand: report the reason and move on.
            # KeyboardInterrupt is deliberately not caught.
            print('%s Featurization Failed (%s: %s)'
                  % (pdbid, type(exc).__name__, exc))
            continue
        features[idx] = vector.tolist()
        labels[idx] = label

    # Kept for backward compatibility: the function has always left the
    # process in the project directory and written its CSV there.
    os.chdir(base_dir)
    features_and_labels = pd.DataFrame({
        'Feature Array': pd.Series(features, dtype=object),
        'Label': labels,
    })
    features_and_labels.to_csv('featured_CircularFP_train_data.csv')

In [None]:
import deepchem as dc
import os
from os.path import dirname, join as pjoin
import pubchempy as pcp
from deepchem.feat.base_classes import MolecularFeaturizer

def featurizing_PubFP(Id_array, label_array, data_folder_name,
                      base_dir='C:\\Users\\joslynn.deaver\\SoftDrugs\\Sanitization'):
    """Featurize ligand PDB files with PubChem fingerprints and save as CSV.

    For every id in ``Id_array`` the file ``<id>_ligand.pdb`` is read from
    ``<base_dir>/<data_folder_name>`` and passed to DeepChem's
    ``PubChemFingerprint``. The table is written to
    ``featured_PubChemFP_train_data.csv`` inside ``base_dir``.

    Parameters
    ----------
    Id_array : iterable
        Ligand identifiers, in the same order as ``label_array``.
    label_array : iterable
        One label per identifier; each value must be convertible to float.
    data_folder_name : str
        Folder below ``base_dir`` that holds the ``*_ligand.pdb`` files.
    base_dir : str, optional
        Project directory. Defaults to the location that used to be
        hard-coded in the function body.

    Raises
    ------
    FileNotFoundError
        If ``<base_dir>/<data_folder_name>`` is not an existing directory.

    Notes
    -----
    * The CSV has one row per input id, in input order, with the columns
      ``'Feature Array'`` and ``'Label'``.
    * ``'Feature Array'`` holds the complete feature vector as a Python list
      (stored in the CSV as the text of a list literal).
    * A ligand that cannot be featurized is reported on stdout and gets
      empty cells in both columns, so it cannot be mistaken for a real
      all-zero sample with label 0.
    * On return the current working directory is ``base_dir``.
    """
    # NOTE(review): `Chem` is not imported in the visible notebook cells;
    # presumably `from rdkit import Chem` -- confirm and add it to the
    # imports cell. Resolving it here makes a missing import fail fast
    # instead of being reported as a failure of every single ligand.
    read_pdb = Chem.MolFromPDBFile
    featurizer = dc.feat.PubChemFingerprint()

    data_dir = pjoin(base_dir, data_folder_name)
    if not os.path.isdir(data_dir):
        raise FileNotFoundError('Data folder not found: %s' % data_dir)

    # Plain lists give positional access even when a pandas Series with a
    # non-default index is passed in.
    ids = list(Id_array)
    label_values = list(label_array)
    n_samples = len(ids)

    # A feature vector does not fit into one slot of a 1-D float array, so
    # keep one list per sample; None / NaN mark samples that failed.
    features = [None] * n_samples
    labels = np.full(n_samples, np.nan)

    for idx, pdbid in enumerate(ids):
        ligand = pjoin(data_dir, '%s_ligand.pdb' % (pdbid))
        try:
            mol = read_pdb(ligand)
            if mol is None:
                raise ValueError('could not read molecule from %s' % ligand)
            vector = np.asarray(featurizer.featurize([mol])).flatten()
            # Presumably DeepChem reports a molecule it could not featurize
            # as an empty row -- TODO confirm for the installed version.
            if vector.size == 0:
                raise ValueError('featurizer returned no features')
            label = float(label_values[idx])
        except Exception as exc:
            # Best effort per ligand: report the reason and move on.
            # KeyboardInterrupt is deliberately not caught.
            print('%s Featurization Failed (%s: %s)'
                  % (pdbid, type(exc).__name__, exc))
            continue
        features[idx] = vector.tolist()
        labels[idx] = label

    # Kept for backward compatibility: the function has always left the
    # process in the project directory and written its CSV there.
    os.chdir(base_dir)
    features_and_labels = pd.DataFrame({
        'Feature Array': pd.Series(features, dtype=object),
        'Label': labels,
    })
    features_and_labels.to_csv('featured_PubChemFP_train_data.csv')

In [None]:
import deepchem as dc
import os
from os.path import dirname, join as pjoin
import pubchempy as pcp
from deepchem.feat.base_classes import MolecularFeaturizer

def featurizing_RDKit(Id_array, label_array, data_folder_name,
                      base_dir='C:\\Users\\joslynn.deaver\\SoftDrugs\\Sanitization'):
    """Featurize ligand PDB files with RDKit descriptors and save as CSV.

    For every id in ``Id_array`` the file ``<id>_ligand.pdb`` is read from
    ``<base_dir>/<data_folder_name>`` and passed to DeepChem's
    ``RDKitDescriptors``. The table is written to
    ``featured_RDKit_train_data.csv`` inside ``base_dir``.

    Parameters
    ----------
    Id_array : iterable
        Ligand identifiers, in the same order as ``label_array``.
    label_array : iterable
        One label per identifier; each value must be convertible to float.
    data_folder_name : str
        Folder below ``base_dir`` that holds the ``*_ligand.pdb`` files.
    base_dir : str, optional
        Project directory. Defaults to the location that used to be
        hard-coded in the function body.

    Raises
    ------
    FileNotFoundError
        If ``<base_dir>/<data_folder_name>`` is not an existing directory.

    Notes
    -----
    * The CSV has one row per input id, in input order, with the columns
      ``'Feature Array'`` and ``'Label'``.
    * ``'Feature Array'`` holds the complete feature vector as a Python list
      (stored in the CSV as the text of a list literal).
    * A ligand that cannot be featurized is reported on stdout and gets
      empty cells in both columns, so it cannot be mistaken for a real
      all-zero sample with label 0.
    * On return the current working directory is ``base_dir``.
    """
    # NOTE(review): `Chem` is not imported in the visible notebook cells;
    # presumably `from rdkit import Chem` -- confirm and add it to the
    # imports cell. Resolving it here makes a missing import fail fast
    # instead of being reported as a failure of every single ligand.
    read_pdb = Chem.MolFromPDBFile
    featurizer = dc.feat.RDKitDescriptors()

    data_dir = pjoin(base_dir, data_folder_name)
    if not os.path.isdir(data_dir):
        raise FileNotFoundError('Data folder not found: %s' % data_dir)

    # Plain lists give positional access even when a pandas Series with a
    # non-default index is passed in.
    ids = list(Id_array)
    label_values = list(label_array)
    n_samples = len(ids)

    # A feature vector does not fit into one slot of a 1-D float array, so
    # keep one list per sample; None / NaN mark samples that failed.
    features = [None] * n_samples
    labels = np.full(n_samples, np.nan)

    for idx, pdbid in enumerate(ids):
        ligand = pjoin(data_dir, '%s_ligand.pdb' % (pdbid))
        try:
            mol = read_pdb(ligand)
            if mol is None:
                raise ValueError('could not read molecule from %s' % ligand)
            vector = np.asarray(featurizer.featurize([mol])).flatten()
            # Presumably DeepChem reports a molecule it could not featurize
            # as an empty row -- TODO confirm for the installed version.
            if vector.size == 0:
                raise ValueError('featurizer returned no features')
            label = float(label_values[idx])
        except Exception as exc:
            # Best effort per ligand: report the reason and move on.
            # KeyboardInterrupt is deliberately not caught.
            print('%s Featurization Failed (%s: %s)'
                  % (pdbid, type(exc).__name__, exc))
            continue
        features[idx] = vector.tolist()
        labels[idx] = label

    # Kept for backward compatibility: the function has always left the
    # process in the project directory and written its CSV there.
    os.chdir(base_dir)
    features_and_labels = pd.DataFrame({
        'Feature Array': pd.Series(features, dtype=object),
        'Label': labels,
    })
    features_and_labels.to_csv('featured_RDKit_train_data.csv')

In [None]:
# Work from the project directory so the relative CSV path below resolves.
os.chdir('C:\\Users\\joslynn.deaver\\SoftDrugs\\Sanitization')
Ids, Labels = load_data('train_dataset.csv')

# Run each featurizer in turn on the same ids and labels.
for _featurize in (
    featurizing_MACKeys,
    featurizing_CF,
    featurizing_PubFP,
    featurizing_RDKit,
):
    _featurize(Ids, Labels, 'Sanitized Data')