In [1]:
# import necessary packages
from io import StringIO

import networkx
import numpy as np
import pandas
from IPython.display import SVG
from IPython.display import display
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.Chem import Descriptors3D
from rdkit.Chem import rdDepictor
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem.Draw import rdMolDraw2D

# Import and curate the TM complex data 

In [None]:
# Curate the raw SMILES records for the first half of the TM complexes:
# read the raw file, filter unwanted records and rewrite the rest as CSV.
with open("tmQM_X1_smi.smi", 'r') as f:
    lines = f.readlines()

# CSV header naming the columns of the curated file
curated_rows = ["SMILES,CSD_Code,Charge,S,Stoichiometry,MND\n"]

# (old, new) substitutions, applied in this order, that turn one raw
# record into a comma-separated row; the last one strips every remaining space
replacements = (
    ("|", ","),
    ("\tCSD_code =", ","),
    (" Stoichiometry = ", ""),
    (" MND = ", ""),
    (" q = ", ""),
    (" ", ""),
)

for line in lines:
    # skip records containing "." (fragmented TM complexes) or "@".
    # NOTE: the test is applied to the whole line, not only to the SMILES field.
    if "." in line or "@" in line:
        continue

    row = line.strip()
    for old, new in replacements:
        row = row.replace(old, new)
    curated_rows.append(row + "\n")

# join once instead of growing a string inside the loop
curated_list = "".join(curated_rows)

# the context manager closes the file even if the write fails
with open("tmQM_X1_smi_curated.smi", 'w') as w:
    w.write(curated_list)

In [None]:
# import the curated data and analyze with pandas
# `sep` must be passed by keyword: since pandas 2.0 every read_csv argument
# after the path is keyword-only, so the positional form raises TypeError
df_X1 = pandas.read_csv("tmQM_X1_smi_curated.smi", sep=',')

# a notebook only echoes the last expression of a cell, so the preview
# has to be displayed explicitly
display(df_X1.head())
df_X1.describe()

# Import and curate the second half of the data

In [None]:
# Curate the raw SMILES records for the second half of the TM complexes:
# read the raw file, filter unwanted records and rewrite the rest as CSV.
with open("tmQM_X2_smiles.smi", 'r') as f:
    lines = f.readlines()

# CSV header naming the columns of the curated file
curated_rows = ["SMILES,CSD_Code,Charge,S,Stoichiometry,MND\n"]

# (old, new) substitutions, applied in this order, that turn one raw
# record into a comma-separated row; the last one strips every remaining space
replacements = (
    ("|", ","),
    ("\tCSD_code =", ","),
    (" Stoichiometry = ", ""),
    (" MND = ", ""),
    (" q = ", ""),
    (" ", ""),
)

for line in lines:
    # skip records containing "." (fragmented TM complexes) or "@".
    # NOTE: the test is applied to the whole line, not only to the SMILES field.
    if "." in line or "@" in line:
        continue

    row = line.strip()
    for old, new in replacements:
        row = row.replace(old, new)
    curated_rows.append(row + "\n")

# join once instead of growing a string inside the loop
curated_list = "".join(curated_rows)

# the context manager closes the file even if the write fails
with open("tmQM_X2_smi_curated.smi", 'w') as w:
    w.write(curated_list)

In [None]:
# import the curated data and analyze with pandas
df_X2 = pandas.read_csv("tmQM_X2_smi_curated.smi")

# a notebook only echoes the last expression of a cell, so the preview
# has to be displayed explicitly
display(df_X2.head())
df_X2.describe()

In [None]:
# merge all molecules into one matrix
# concatenate the 2 files containing molecular information.
# ignore_index=True renumbers the rows 0..n-1; without it both halves keep
# their own 0-based labels, so any later label-based operation (for example
# DataFrame.drop) would hit one row from each half at once.
mols = pandas.concat([df_X1, df_X2], ignore_index=True)

# make sure the shapes add up
assert mols.shape[0] == df_X1.shape[0] + df_X2.shape[0]
print(mols.shape, df_X1.shape, df_X2.shape)

In [None]:
# analyze concatenation: preview the first rows of the combined frame
mols.head()

In [None]:
# drop all molecules that don't work with RDKit
def _is_rdkit_readable(smiles):
    """Return True when RDKit can parse `smiles` and compute its exact mass.

    Chem.MolFromSmiles returns None for an unparsable string, so a molecule
    is accepted only if parsing yields an object and the descriptor call
    succeeds. Any ordinary exception counts as "not readable";
    KeyboardInterrupt and SystemExit are deliberately not swallowed.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return False
        Descriptors.ExactMolWt(mol)
    except Exception:
        return False
    return True


# Build a positional boolean mask and filter once. Dropping rows by label
# inside the loop copied the frame for every rejected molecule and removed
# every row sharing that label when the index contained duplicates.
keep_mask = np.array([_is_rdkit_readable(s) for s in mols['SMILES']], dtype=bool)
mols = mols[keep_mask]

In [None]:
# analyze set with dropped molecules
# display() is needed for the shape: a notebook only echoes the last
# expression of a cell, so a bare `mols.shape` here would be discarded
display(mols.shape)
mols.head(20)

# Test RDKit functions for a single molecule 

In [None]:
# Test RDKit on a single molecule (CSD = ILOJOK): parse its SMILES string
# into an RDKit molecule object bound to `mol`.
mol = Chem.MolFromSmiles('[Y]1234(N(c5c(cccc5C)C)[C]5C=CC=C(N15)N(c1c(cccc1C)C)C(=Nc1ccccc21)C(F)(F)F)N(c1c(cccc1C)C)[C]1C=CC=C(N31)N(c1c(cccc1C)C)[C](C(F)(F)F)N4c1ccccc1')

In [None]:
# draw the molecule with the default drawer: a bare name as the last
# expression of a cell is rendered by the notebook
mol

In [None]:
# function to make the drawing look better 
def DrawMol(mol, molSize=(450, 150)):
    """Render a molecule as an inline SVG image in the notebook.

    Parameters
    ----------
    mol : str or rdkit.Chem.Mol
        A SMILES string (the original calling convention) or an already
        constructed RDKit molecule.
    molSize : tuple of int, optional
        (width, height) of the SVG canvas. Defaults to (450, 150).

    Raises
    ------
    ValueError
        If `mol` is None or is a SMILES string RDKit cannot parse.
    """
    # create mol object from smiles string; molecule objects pass through
    parsed = Chem.MolFromSmiles(mol) if isinstance(mol, str) else mol
    if parsed is None:
        # MolFromSmiles signals a parse failure by returning None
        raise ValueError("RDKit could not build a molecule from: {!r}".format(mol))

    # work on a copy so computing coordinates never touches the caller's object
    mc = Chem.Mol(parsed.ToBinary())

    # only generate 2D coordinates when the molecule carries no conformer yet
    if not mc.GetNumConformers():
        rdDepictor.Compute2DCoords(mc)

    drawer = rdMolDraw2D.MolDraw2DSVG(molSize[0], molSize[1])

    # draw the molecule
    drawer.DrawMolecule(mc)
    drawer.FinishDrawing()

    # get the SVG string; the 'svg:' prefix is removed before rendering
    svg = drawer.GetDrawingText()
    display(SVG(svg.replace('svg:', '')))

In [None]:
# SMILES string of the example complex, rendered with the DrawMol helper
mol2 = '[Y]1234(N(c5c(cccc5C)C)[C]5C=CC=C(N15)N(c1c(cccc1C)C)C(=Nc1ccccc21)C(F)(F)F)N(c1c(cccc1C)C)[C]1C=CC=C(N31)N(c1c(cccc1C)C)[C](C(F)(F)F)N4c1ccccc1'
DrawMol(mol2)

# Import and Curate the Modeling endpoints

In [None]:
# location of CSV data for TM complexes (semicolon-separated)
# `sep` must be passed by keyword: since pandas 2.0 every read_csv argument
# after the path is keyword-only, so the positional form raises TypeError
df_y = pandas.read_csv("tmQM_y.csv", sep=';')

In [None]:
# analyze the TM complex data
# a notebook only echoes the last expression of a cell, so the preview
# has to be displayed explicitly
display(df_y.head())
df_y.describe()

# Merge endpoints with molecular structure

In [None]:
# Collect the CSD codes of each source frame and of the combined molecule set.
# The double brackets select a one-column frame, so the two lists hold
# one-element lists and ID_List is a two-dimensional (n, 1) array.
id1_list = df_X1[['CSD_Code']].values.tolist()
id2_list = df_X2[['CSD_Code']].values.tolist()
ID_List = np.asarray(mols[['CSD_Code']])

In [None]:
# show the array of CSD codes taken from the `mols` frame
print(ID_List)

In [None]:
# get the list of indices that need to be dropped

# record indices of endpoint rows whose CSD code has no matching molecule.
# A set lookup through Series.isin replaces the per-row `code not in ID_List`
# test, which scanned the whole array for every row of df_y.
known_codes = set(np.asarray(ID_List).ravel().tolist())
code_column = df_y['CSD_Code']

# isin() matches NaN against NaN, whereas the elementwise `==` used before
# never did; notna() keeps missing codes on the drop list as before
is_known = code_column.isin(known_codes) & code_column.notna()
drop_idx = df_y.index[~is_known].tolist()

In [None]:
# analyze indices of rows to drop
print(drop_idx)

In [None]:
# drop unmatched indices from dataframe
# one drop call removes every label at once instead of copying the frame
# once per label; errors="ignore" keeps the cell re-runnable after the
# rows have already been removed
df_y = df_y.drop(index=drop_idx, errors="ignore")

In [None]:
# compare shapes of curated endpoints (df_y) and curated molecules (mols)
print("Properties:{},Molecules{}".format(df_y.shape, mols.shape))

In [None]:
# merge curated endpoints with curated molecules
# the join key is named once (it was listed twice before); how='inner' is
# the pandas default, spelled out so it is clear that only CSD codes present
# in both frames survive
DataMatrix = pandas.merge(mols, df_y, on='CSD_Code', how='inner')

In [None]:
# analyze merged data
# a notebook only echoes the last expression of a cell, so the preview
# has to be displayed explicitly
display(DataMatrix.head())
DataMatrix.describe()

In [None]:
# Reorder the columns: ID first, then the endpoints, then the structure
# and composition columns.
DataMatrix = DataMatrix[
    [
        "CSD_Code",
        "Electronic_E",
        "Dispersion_E",
        "Dipole_M",
        "Metal_q",
        "HL_Gap",
        "HOMO_Energy",
        "LUMO_Energy",
        "Polarizability",
        "SMILES",
        "Stoichiometry",
        "Charge",
        "S",
        "MND",
    ]
]

In [None]:
# analyze rearranged matrix
# a notebook only echoes the last expression of a cell, so the preview
# has to be displayed explicitly
display(DataMatrix.head())
DataMatrix.shape

In [None]:
# save final data to CSV file for later analysis
# NOTE: the file is semicolon-separated, and because `index` is left at its
# pandas default the row index is written as an extra, unnamed first column
DataMatrix.to_csv("MolMatrix.csv", sep = ';')