In [27]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
import time
from multiprocessing import Pool
import multiprocessing as mp

In [2]:
# Load the raw assay table. low_memory=False tells pandas not to parse the file
# in chunks, which avoids mixed-dtype columns.
data = pd.read_csv("data.csv", low_memory=False)

In [3]:
# Load the descriptor table from disk.
# NOTE(review): `descriptors` is not referenced by any later cell shown here.
descriptors = pd.read_csv("descriptors.csv")

# Import SMILES data from a text file

The simplified molecular-input line-entry system (SMILES) is a specification, in the form of a line notation, for describing the structure of chemical species using short ASCII strings. SMILES strings can be imported by most molecule editors for conversion back into two-dimensional drawings or three-dimensional models of the molecules. The SMILES data file lists the substance IDs (SIDs) and their respective structures. It was downloaded from:
https://pubchem.ncbi.nlm.nih.gov/bioassay/624202#section=Top

Click on the "Download" tab -> select "tested substances" -> click "Download service" -> choose "Smiles" as the format and download.

In [4]:
# Read the tab-separated SID -> SMILES export (the file has no header row) and
# give its two columns readable names.
smilesData = pd.read_csv("smilesData.txt", sep="\t", header=None)
smilesData.columns = ['molSID', 'chemStruc']

# Keep only the substance id and activity score, starting at index label 5.
# NOTE(review): presumably rows 0-4 are PubChem metadata rows — confirm.
data = data.loc[5:, ['PUBCHEM_SID', 'PUBCHEM_ACTIVITY_SCORE']]

# Descriptor names: the first element of every entry in rdkit's Descriptors._descList.
descList = [entry[0] for entry in Descriptors._descList]

In [6]:
# Split the assay rows by activity score: exactly 0 is inactive, everything
# else (including missing scores, since NaN != 0) lands in `actives`.
inactiveMask = data.PUBCHEM_ACTIVITY_SCORE == 0
inactives = data.loc[inactiveMask]
actives = data.loc[~inactiveMask]
print("Number of inactives is ", len(inactives))
print("Number of actives is ", len(actives))

Number of inactives is  364035
Number of actives is  13515


In [7]:
# Draw a small working set: 350 inactive and 700 active substances.
# The draws are seeded so that Restart & Run All reproduces the same sample
# (an unseeded DataFrame.sample picks different rows on every run).
SAMPLE_SEED = 42
N_INACTIVES = 350
N_ACTIVES = 700
selectedInactives = inactives.sample(N_INACTIVES, random_state=SAMPLE_SEED)
selectedActives = actives.sample(N_ACTIVES, random_state=SAMPLE_SEED)
# Inactives first, then actives; the original row labels are kept.
smallDataset = pd.concat([selectedInactives, selectedActives])

In [8]:
# Peek at the first rows of the sampled working set.
smallDataset.head()

Unnamed: 0,PUBCHEM_SID,PUBCHEM_ACTIVITY_SCORE
345419,87335589.0,0.0
362606,103050555.0,0.0
93244,17409466.0,0.0
308058,56315486.0,0.0
7386,851101.0,0.0


In [9]:
# Report the (rows, columns) shape of the sampled working set.
print("Shape of small dataset is", smallDataset.shape)

Shape of small dataset is (1050, 2)


In [10]:
# The ids and scores were read as floats; cast both columns to 64-bit integers
# in a single astype call.
smallDataset = smallDataset.astype(
    {'PUBCHEM_SID': np.int64, 'PUBCHEM_ACTIVITY_SCORE': np.int64}
)

In [11]:
# Re-display the first rows to confirm both columns now hold integers.
smallDataset.head()

Unnamed: 0,PUBCHEM_SID,PUBCHEM_ACTIVITY_SCORE
345419,87335589,0
362606,103050555,0
93244,17409466,0
308058,56315486,0
7386,851101,0


# Creating the descriptors Matrix 

In [12]:
# Keep only the SMILES records whose SID appears in the sampled working set.
# Row order follows smilesData, not smallDataset.
sampledSidMask = smilesData['molSID'].isin(smallDataset['PUBCHEM_SID'].values)
smallSmilesData = smilesData.loc[sampledSidMask]

In [13]:
# Peek at the SID / SMILES pairs selected for the working set.
smallSmilesData.head()

Unnamed: 0,molSID,chemStruc
65,124898757,CCN(CC)CCCOC1=C(C=C2C(=C1)N=C(N=C2NC3CCN(CC3)C...
207,124897432,C1=CC(=CC=C1C=O)N(CCCl)CCCl
332,124897072,CN(C)C1=CC=C(C=C1)/C=C/2\CCC3=CC=CC=C32
647,124896036,CC(C)(C)C1=CC=C(C=C1)C(=O)C[N+]2=CC=CC=C2Cl
876,124894670,COC1=CC=CC=C1/C=C/C2=NC3=CC=CC=C3C=C2


In [17]:
# Descriptor matrix: one row per sampled molecule and one zero-initialised
# column per descriptor name, with the SID and SMILES string as the two
# leading columns.
dM = pd.DataFrame(
    np.zeros((len(smallSmilesData), len(descList))),
    columns=descList,
)
dM.insert(0, "molSID", smallSmilesData.molSID.values)
dM.insert(1, "chemStr", smallSmilesData.chemStruc.values)

In [24]:
# Plain NumPy arrays of the identifier columns; molSID is the iterable that
# runParallelProcessing hands to pool.map.
molSID = dM['molSID'].values
chemStr = dM['chemStr'].values


# Multiprocessing module


In [25]:
def simpleCalcDesc(molSID):
    """Compute every descriptor in ``descList`` for one substance and store it in ``dM``.

    Parameters
    ----------
    molSID
        Substance id to process; it is matched against the ``molSID`` column
        of the module-level ``dM`` frame.

    Returns
    -------
    pandas.DataFrame
        The whole ``dM`` frame (not just the updated row). The reduce cell
        picks row ``i`` out of the ``i``-th returned frame, so the full frame
        must be returned.

    Raises
    ------
    IndexError
        If no row of ``dM`` has this ``molSID``.
    ValueError
        If RDKit cannot parse the SMILES string stored for this substance.
    """
    # The row selector does not depend on the descriptor, so build it once
    # instead of once per descriptor.
    rowMask = dM.molSID == molSID
    chemStr = dM.chemStr.values[rowMask][0]
    m = Chem.MolFromSmiles(chemStr)
    if m is None:
        # MolFromSmiles signals a parse failure by returning None; fail with a
        # clear message instead of an opaque error inside a descriptor call.
        raise ValueError(
            "Could not parse SMILES %r for molSID %r" % (chemStr, molSID)
        )
    for val in descList:
        dM.loc[rowMask, val] = getattr(Descriptors, val)(m)
    return dM

In [34]:
def runParallelProcessing(nCores,void):
    """Map ``simpleCalcDesc`` over the module-level ``molSID`` array on a process pool.

    Prints the wall-clock time taken by the map step (pool shutdown is not
    included in the timing).

    Parameters
    ----------
    nCores
        Number of worker processes passed to ``mp.Pool``.
    void
        If truthy, discard the results and return ``None``; use this for
        timing-only runs.

    Returns
    -------
    list or None
        The list produced by ``pool.map`` (one entry per element of
        ``molSID``), or ``None`` when ``void`` is truthy.
    """
    start = time.time()
    pool = mp.Pool(nCores)
    try:
        result = pool.map(simpleCalcDesc, molSID)
        end = time.time()
    finally:
        # Always release the worker processes, even if a task raised.
        pool.close()
        pool.join()
    print("time to complete: ", end-start, " seconds")
    if void:
        return None

    return result

In [32]:
# Time the descriptor calculation on 8 worker processes and keep the results
# for the reduce step.
result = runParallelProcessing(nCores=8, void=False)

time to complete:  155.50537705421448


In [35]:
# Timing-only run on 6 worker processes (results are discarded).
runParallelProcessing(nCores=6, void=True)

time to complete:  158.8592791557312  seconds


In [36]:
# Timing-only run on 4 worker processes (results are discarded).
runParallelProcessing(nCores=4, void=True)

time to complete:  168.8393039703369  seconds


In [37]:
# Timing-only run on 2 worker processes (results are discarded).
runParallelProcessing(nCores=2, void=True)

time to complete:  255.0969340801239  seconds


In [38]:
# Timing-only baseline on a single worker process (results are discarded).
runParallelProcessing(nCores=1, void=True)

time to complete:  459.0300409793854  seconds


# Reduce mapping output from parallel processing

Because the tasks were spread over multiple cores, the result comes back as a list of fragments (one per mapped molecule), so we have to combine the fragments in a reduce step.

Each worker process has its own memory, and during the mapping step each one works on a snapshot of the variables. There are ways to implement memory sharing, but I haven't been able to get that working; therefore I slice the results matrix in the following way.

In [29]:
# Reduce step: the i-th entry of `result` is a full copy of dM in which only
# row i is guaranteed to hold the descriptors computed by that task, so take
# row i from frame i and stack the rows.
# iloc[[i]] (list indexer) keeps a one-row DataFrame with the original column
# dtypes, and a single pd.concat replaces the per-row DataFrame.append calls
# (DataFrame.append no longer exists in pandas 2.x and grew the frame
# quadratically).
rowFrames = [workerFrame.iloc[[i]] for i, workerFrame in enumerate(result)]
if rowFrames:
    descriptorMatrix = pd.concat(rowFrames).sort_index()
else:
    # pd.concat rejects an empty list; keep the empty-frame result instead.
    descriptorMatrix = pd.DataFrame()

In [30]:
# Inspect the first rows of the reduced descriptor matrix.
descriptorMatrix.head()

Unnamed: 0,molSID,chemStr,MinEStateIndex,ExactMolWt,MinPartialCharge,MaxEStateIndex,HeavyAtomMolWt,FpDensityMorgan1,qed,FpDensityMorgan2,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,124898757,CCN(CC)CCCOC1=C(C=C2C(=C1)N=C(N=C2NC3CCN(CC3)C...,0.402237,513.379,-0.492842,6.23833,466.355,0.945946,0.455413,1.7027,...,0,0,0,0,0,0,0,0,1,0
1,124897432,C1=CC(=CC=C1C=O)N(CCCl)CCCl,0.561826,245.037,-0.369144,10.4832,233.033,1.06667,0.567846,1.6,...,0,0,0,0,0,0,0,0,0,0
2,124897072,CN(C)C1=CC=C(C=C1)/C=C/2\CCC3=CC=CC=C32,1.16149,249.152,-0.377661,2.32468,230.205,0.894737,0.76918,1.63158,...,0,0,0,0,0,0,0,0,0,0
3,124896036,CC(C)(C)C1=CC=C(C=C1)C(=O)C[N+]2=CC=CC=C2Cl,0.0588511,288.115,-0.28699,12.2478,269.646,1.1,0.477902,1.75,...,0,0,0,0,0,0,0,0,0,0
4,124894670,COC1=CC=CC=C1/C=C/C2=NC3=CC=CC=C3C=C2,0.865314,261.115,-0.496137,5.33233,246.204,0.85,0.696198,1.65,...,0,0,0,0,0,0,0,0,0,0


# Trying things

In [None]:
# Scratch: create a multiprocessing queue (not referenced by any other cell shown here).
output = mp.Queue()
# Display the CPU count reported by multiprocessing.
mp.cpu_count()

In [None]:
def calculateDescriptor(descriptor, chemStr,molSID):
    """Compute one named RDKit descriptor for a batch of molecules and store it in ``dM``.

    Parameters
    ----------
    descriptor
        Name of a function in ``rdkit.Chem.Descriptors``; also the ``dM``
        column that receives the values.
    chemStr
        Sequence of SMILES strings, aligned position-by-position with ``molSID``.
    molSID
        Sequence of substance ids identifying the ``dM`` rows to update.

    Raises
    ------
    ValueError
        If RDKit cannot parse one of the SMILES strings.
    """
    # Resolve the descriptor function once. The previous version always called
    # Descriptors.MaxAbsPartialCharge, whatever name was passed in.
    descFunc = getattr(Descriptors, descriptor)
    for idx, val in enumerate(molSID):
        m = Chem.MolFromSmiles(chemStr[idx])
        if m is None:
            # MolFromSmiles returns None when the string cannot be parsed.
            raise ValueError(
                "Could not parse SMILES %r for molSID %r" % (chemStr[idx], val)
            )
        dM.loc[dM.molSID == val, descriptor] = descFunc(m)


In [None]:
# Look up the 'MinAbsEStateIndex' value currently stored in dM for SID 124897320.
dM.loc[dM.molSID == 124897320, 'MinAbsEStateIndex']

In [None]:
# simpleCalcDesc takes only the substance id (it reads the SMILES string from
# dM and computes every descriptor), so the old three-argument call raised a
# TypeError. Show just the row that was updated.
# NOTE(review): SID 124897320 must be present in dM for this cell to succeed.
simpleCalcDesc(124897320).loc[dM.molSID == 124897320]

In [None]:
# Inspect the first rows of the working descriptor matrix.
dM.head()

In [None]:
# Scratch: manually overwrite the NumRadicalElectrons entry for SID 124897571.
sidMask = dM.molSID == 124897571
dM.loc[sidMask, 'NumRadicalElectrons'] = 2

In [None]:
def parallelize_dataframe(df, func, num_partitions=None, num_cores=None):
    """Apply ``func`` to row-wise chunks of ``df`` in parallel and concatenate the results.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split along its rows.
    func : callable
        Picklable callable that takes one chunk (a DataFrame) and returns a
        DataFrame.
    num_partitions : int, optional
        Number of chunks to split ``df`` into. Defaults to ``num_cores``.
    num_cores : int, optional
        Number of worker processes. Defaults to ``mp.cpu_count()``.

    Returns
    -------
    pandas.DataFrame
        The per-chunk results concatenated in chunk order.
    """
    # These two values used to be read from globals that no cell defines,
    # which made every call fail with a NameError.
    if num_cores is None:
        num_cores = mp.cpu_count()
    if num_partitions is None:
        num_partitions = num_cores

    # Split row positions rather than the frame itself: np.array_split on a
    # DataFrame depends on pandas internals, while iloc slicing always works
    # and keeps index labels and dtypes.
    positions = np.array_split(np.arange(len(df)), num_partitions)
    df_split = [df.iloc[chunk] for chunk in positions]

    pool = Pool(num_cores)
    try:
        parts = pool.map(func, df_split)
    finally:
        # Always release the worker processes, even if a task raised.
        pool.close()
        pool.join()
    return pd.concat(parts)

In [None]:
# Run calculateDescriptors over exDescM in parallel chunks.
# NOTE: exDescM and calculateDescriptors are defined in cells further down the
# notebook, so this cell only works after those have been executed. It also
# rebinds `result`, replacing the output of the earlier timing run.
result = parallelize_dataframe(df=exDescM, func=calculateDescriptors)

In [None]:
# Scratch probe: compute and print a single descriptor for the first SMILES string.
# The unconditional `break` ends the loop after one iteration, so idx is 0 and
# only descList[0] is evaluated.
for idx,val in enumerate(smallSmilesData.chemStruc.values):
    m = Chem.MolFromSmiles(val)
    # NOTE(review): idx is the molecule's position but is used to index descList;
    # harmless while the loop breaks immediately — confirm intent before removing the break.
    descriptor = descList[idx]
    tempDescVal = getattr(Descriptors,descriptor)(m)
    print(tempDescVal)
    break

In [None]:
# Zero-filled descriptor matrix for the sampled molecules, keyed by SID only
# (this variant has no SMILES column).
descriptorsMatrix = pd.DataFrame(
    np.zeros((len(smallSmilesData), len(descList))),
    columns=descList,
)
descriptorsMatrix.insert(0, "molSID", smallSmilesData.molSID.values)

# Mini dataframe


In [None]:
# First five SID / SMILES rows, used as a tiny fixture for experiments.
extraSmallData = smallSmilesData.iloc[:5]

In [None]:
# Zero-filled descriptor matrix for the five-row fixture, with the SID and
# SMILES string as the two leading columns.
exDescM = pd.DataFrame(
    np.zeros((len(extraSmallData), len(descList))),
    columns=descList,
)
exDescM.insert(0, "molSID", extraSmallData.molSID.values)
exDescM.insert(1, "chemStr", extraSmallData.chemStruc.values)

In [None]:
# Display the five-row fixture matrix.
exDescM

# mini end

# method for calculating descriptors for full matrix

In [None]:
# Scratch probe of the nested loop: both loops break after their first pass, so
# only the first descriptor of the first molecule is written into exDescM.
for i, mol in enumerate(extraSmallData.molSID):
    m= Chem.MolFromSmiles(extraSmallData.chemStruc.values[i])
    for idx, val in enumerate(descList):
        descriptor = descList[idx]
        tempDescVal = getattr(Descriptors,descriptor)(m)
        # Select the row by SID and store the computed value in its descriptor column.
        exDescM.loc[exDescM.molSID == mol,descriptor] = tempDescVal 
        break
    break

In [None]:
def calculateDescriptors(descriptorsMatrix):
    """Fill every descriptor column of ``descriptorsMatrix`` and return the frame.

    Parameters
    ----------
    descriptorsMatrix : pandas.DataFrame
        Frame with a ``molSID`` column, a SMILES column (``chemStr``, the name
        used by the frames built in this notebook, or ``chemStruc``), and one
        column per descriptor name found in ``rdkit.Chem.Descriptors``.

    Returns
    -------
    pandas.DataFrame
        The same frame, updated in place, so that callers such as
        ``parallelize_dataframe`` can concatenate per-chunk results.

    Raises
    ------
    ValueError
        If the ``molSID`` or SMILES column is missing, or a SMILES string
        cannot be parsed by RDKit.
    """
    columns = list(descriptorsMatrix.columns.values)
    if 'molSID' not in columns:
        raise ValueError("descriptorsMatrix needs a 'molSID' column")
    # The old code removed and read a 'chemStruc' column, but the matrices in
    # this notebook name it 'chemStr'; accept either.
    smilesCol = next((c for c in ('chemStr', 'chemStruc') if c in columns), None)
    if smilesCol is None:
        raise ValueError("descriptorsMatrix needs a 'chemStr' or 'chemStruc' column")

    idColumns = ('molSID', 'chemStr', 'chemStruc')
    descList = [c for c in columns if c not in idColumns]

    # Read the SMILES strings from the frame that was passed in (not from the
    # global exDescM) so the function also works on chunks of a larger frame.
    chemStruc = descriptorsMatrix[smilesCol].values
    for i, mol in enumerate(descriptorsMatrix.molSID):
        m = Chem.MolFromSmiles(chemStruc[i])
        if m is None:
            # MolFromSmiles returns None when the string cannot be parsed.
            raise ValueError(
                "Could not parse SMILES %r for molSID %r" % (chemStruc[i], mol)
            )
        rowMask = descriptorsMatrix.molSID == mol  # same for every descriptor
        for descriptor in descList:
            descriptorsMatrix.loc[rowMask, descriptor] = getattr(Descriptors, descriptor)(m)
    return descriptorsMatrix