In [1]:
# import necessary packages 
import pandas  
import numpy as np 
from rdkit import Chem
import networkx
from rdkit.Chem import AllChem
from io import StringIO
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import Descriptors3D
from IPython.display import SVG
from rdkit import Chem
from rdkit.Chem import rdDepictor
from rdkit.Chem.Draw import rdMolDraw2D

In [2]:
# import data frame of all data 
DataMat= pandas.read_csv("MolMatrix.csv", sep= ';')

 # remove erroneous columns if necessary
DataMat = DataMat.loc[:, ~DataMat.columns.str.contains('^Unnamed')]
DataMat.head(20)

Unnamed: 0,CSD_Code,Electronic_E,Dispersion_E,Dipole_M,Metal_q,HL_Gap,HOMO_Energy,LUMO_Energy,Polarizability,SMILES,Stoichiometry,Charge,S,MND
0,ILOJOK,-3314.579807,-0.287378,2.947,1.77729,0.09247,-0.17508,-0.08261,726.729174,[Y]1234(N(c5c(cccc5C)C)[C]5C=CC=C(N15)N(c1c(cc...,C58H51F6N8Y,0,S=0,7
1,OBIQAS,-2182.364646,-0.165425,13.2425,0.95356,0.09408,-0.30051,-0.20643,463.072163,[La]12345(I)(I)[N@@](C[C]6N2C=CN=C6)(C[C]2N3C=...,C26H30I2LaN10(1+),1,S=0,8
2,ZOBPEN,-2252.596426,-0.263845,4.6839,1.92393,0.10349,-0.15361,-0.05012,630.762664,[Y]12(N([C](C)C=C(N1c1c(cccc1)C)C)c1c(cccc1)C)...,C50H60N5Y,0,S=0,5
3,KETHAV,-1085.624629,-0.106185,1.8276,1.89356,0.15642,-0.18356,-0.02714,303.843529,[Y](c1c(OC)cccc1)([C@H]1C(=C(C(=C1C)C)C)C)[C@H...,C25H33OY,0,S=0,12
4,RUXTAH,-3916.912886,-0.290756,3.2622,1.12029,0.12116,-0.17208,-0.05092,681.659139,[Sc]123(S[P]([C@]3([Si](C)(C)C)[C@@H](N2C)c2cc...,C49H70N4PSScSi,0,S=0,5
5,WOLRIA,-1934.263822,-0.143611,3.5856,1.73467,0.13199,-0.17411,-0.04212,381.808553,[C]12[C@@H]([CH]c3c1cccc3)[Y](N(CCN(C)C)[Si](C...,C22H42N3Si3Y,0,S=0,8
6,GACJAW,-1999.074241,-0.205098,7.3734,1.77909,0.12299,-0.18405,-0.06106,511.174137,[Y]12345Oc6c([CH][N@]3CC[N@](CC[N@@]4[CH]c3c(O...,C39H51N4O3Y,0,S=0,7
7,YIXHOE,-3380.95611,-0.180335,4.1099,1.1619,0.12814,-0.18742,-0.05928,570.504772,[Sc]123(OC(=C[C](O1)c1cn(nc1C)C)c1cn(nc1C)C)(O...,C39H45N12O6Sc,0,S=0,6
8,VAQFAX,-3381.587804,-0.262134,10.8247,1.39478,0.11034,-0.2694,-0.15906,638.486122,[Sc@]12([N@]([C](C=C(N1c1c(cccc1C(C)C)C(C)C)C)...,C50H67N2O3PSc(1+),1,S=0,4
9,LURLUG,-2353.577963,-0.122106,4.2626,1.40565,0.14595,-0.19225,-0.0463,355.724755,[Sc](C[Si](C)(C)C)(C[Si](C)(C)C)[C@@]1(C(=C(C(...,C24H41OScSi2,0,S=0,8


In [3]:
# make dataframe containing smiles and CSD code:
Mols = DataMat[['CSD_Code','SMILES']]

In [5]:
# analyze dataframe
Mols.head()
Mols.shape

(68123, 2)

In [6]:
# make a function that takes in SMILES and outputs descriptors
def descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    Descriptor_List=[
    'ExactMolWt',
    'FpDensityMorgan1',
    'FpDensityMorgan2',
    'FpDensityMorgan3',
    'HeavyAtomMolWt',
    'MolWt',
    'NumRadicalElectrons',
    'NumValenceElectrons',
     'Chi0',
     'Chi0n',
     'Chi0v',
     'Chi1',
     'Chi1n',
     'Chi1v',
     'Chi2n',
     'Chi2v',
     'Chi3n',
     'Chi3v',
     'Chi4n',
     'Chi4v',
    'Kappa1',
     'Kappa2',
     'Kappa3']
    Descriptor_values = np.zeros(len(Descriptor_List))
    c = 0
    for item in Descriptor_List:
        desc_call = "Chem.Descriptors.{}(mol)".format(item)
        Descriptor_values[c]=eval(desc_call)
        c+=1
    return Descriptor_values

In [7]:
# create descriptor list
Descriptor_List=[
    'ExactMolWt',
    'FpDensityMorgan1',
    'FpDensityMorgan2',
    'FpDensityMorgan3',
    'HeavyAtomMolWt',
    'MolWt',
    'NumRadicalElectrons',
    'NumValenceElectrons',
     'Chi0',
     'Chi0n',
     'Chi0v',
     'Chi1',
     'Chi1n',
     'Chi1v',
     'Chi2n',
     'Chi2v',
     'Chi3n',
     'Chi3v',
     'Chi4n',
     'Chi4v',
    'Kappa1',
     'Kappa2',
     'Kappa3']

In [8]:
# create description matrix columns
DescMat = pandas.DataFrame(columns=('SMILES','CSD_Code','ExactMolWt',
    'FpDensityMorgan1',
    'FpDensityMorgan2',
    'FpDensityMorgan3',
    'HeavyAtomMolWt',
    'MolWt',
    'NumRadicalElectrons',
    'NumValenceElectrons',
     'Chi0',
     'Chi0n',
     'Chi0v',
     'Chi1',
     'Chi1n',
     'Chi1v',
     'Chi2n',
     'Chi2v',
     'Chi3n',
     'Chi3v',
     'Chi4n',
     'Chi4v',
    'Kappa1',
     'Kappa2',
     'Kappa3'))

In [9]:
# modify description matrix
for i, r in Mols.iterrows():
    print(i)
    D = r['CSD_Code']
    S = r['SMILES']
    desc = descriptors(S)
    desc = desc.tolist()
    base = [S,D]
    for j in desc:
        base.append(j)
    DescMat.loc[len(DescMat)] = base

0
1


ArgumentError: Python argument types in
    rdkit.Chem.rdMolDescriptors.CalcExactMolWt(NoneType)
did not match C++ signature:
    CalcExactMolWt(class RDKit::ROMol mol, bool onlyHeavy=False)

In [None]:
# append to description matrix
test = [S,D]
for i in desc:
    test.append(i)

DescMat.loc[len(DescMat)] = test

In [None]:
# analyze complete description matrix
DescMat

In [None]:
# remove unhandleable data
Datamat = DataMat.iloc[0:14094]

In [None]:
# analyze completed data matrix
Datamat

In [None]:
# merge data matrix with description matrix on CSD Code
ModelMat = pandas.merge(Datamat,DescMat,on = ['CSD_Code','CSD_Code'])

In [None]:
# analyze completed model matrix
ModelMat

In [None]:
# save the matrix to a file 
ModelMat.to_csv("ModelMat.csv",sep= ';')