In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import AllChem, PandasTools
from rdkit.ML.Descriptors import MoleculeDescriptors

In [3]:
from rdkit import Chem

def is_valid_smiles(smiles):
    """Parse a SMILES string with RDKit and return the resulting molecule.

    Despite the ``is_`` prefix, this returns the parsed molecule itself rather
    than a bool, so callers can reuse the result instead of parsing twice.

    Parameters
    ----------
    smiles : str
        SMILES text to parse.

    Returns
    -------
    object or None
        Whatever ``Chem.MolFromSmiles`` returns for the string, or ``None``
        when ``smiles`` is a missing value (``None`` or a float such as NaN).
    """
    # Missing values read from a table arrive as None or float NaN and can
    # never be a SMILES string; report them as "no molecule" instead of
    # forwarding a non-string argument to RDKit.
    if smiles is None or isinstance(smiles, float):
        return None
    return Chem.MolFromSmiles(smiles)

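# Example usage:
#   is_valid_smiles('CCO')   -> an RDKit molecule object
#   is_valid_smiles('C1CC')  -> None (unclosed ring, RDKit cannot parse it)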




In [4]:
# Read the linker table from disk; the dtypes shown below list the columns
# that the following cells rely on ('OL', 'MOF_name') or drop.
mof_3000 = pd.read_csv(filepath_or_buffer='OL_cooh_3178.csv')
mof_3000.dtypes

Sr No.          int64
OL             object
MOF_name       object
LCD           float64
Unnamed: 4    float64
dtype: object

In [5]:
# Keep only the 'OL' and 'MOF_name' columns. Rebinding the name (instead of
# inplace=True) together with errors='ignore' means re-running this cell after
# the columns are already gone no longer raises a KeyError.
mof_3000 = mof_3000.drop(columns=['LCD', 'Sr No.', 'Unnamed: 4'], errors='ignore')

In [6]:

# Coerce the 'OL' column to Python strings; the next cell feeds each value to
# Chem.MolFromSmiles.
mof_3000 = mof_3000.astype({'OL': str})
mof_3000.dtypes

OL          object
MOF_name    object
dtype: object

In [7]:
# Parse every linker SMILES in 'OL' into an RDKit molecule, one per row and in
# row order. Entries that RDKit cannot parse come back as None and are
# filtered out by the embedding cell further down.
organic_linker_list = [Chem.MolFromSmiles(smile) for smile in mof_3000['OL']]

# Attach the molecules as the 'mol' column by position. Unlike
# pd.concat(..., axis=1), column assignment does not rely on the frame having
# a default 0..n-1 index, and re-running the cell overwrites 'mol' instead of
# appending a second column with the same name.
mof_3000 = mof_3000.assign(mol=organic_linker_list)
mof_3000

Unnamed: 0,OL,MOF_name,mol
0,B(c1c(c(C)c(C(=O)[O])c(C)c1C)C)(c1c(c(c(c(C)c1...,CIMTAV,"<img data-content=""rdkit/molecule"" src=""data:i..."
1,Brc1c(c2ccc(C(=O)[O])cc2)c(c2ccc(C(=O)[O])cc2)...,FUNCEX,"<img data-content=""rdkit/molecule"" src=""data:i..."
2,Brc1c(c(c(Br)c(c1c1ccc(cc1)C(=O)[O])c1ccc(cc1)...,FUNBEW,"<img data-content=""rdkit/molecule"" src=""data:i..."
3,Brc1ccc(C2=NC(=C([N]2)C(=O)[O])C(=O)[O])cc1,PEWKOT,"<img data-content=""rdkit/molecule"" src=""data:i..."
4,Brc1cccc(Br)c1/C/1=C/2\C=CC(=N2)/C(=C/2\[N]/C(...,ADARAA,"<img data-content=""rdkit/molecule"" src=""data:i..."
...,...,...,...
3173,[P@@](=O)([O])(O)c1cc(P(=O)([O])[O])cc(c1)C(=O)O,LAVZOA,"<img data-content=""rdkit/molecule"" src=""data:i..."
3174,P(=O)([O])([O])c1cc(P(=O)([O])[O])cc(c1)C(=O)O,LAVZOA,"<img data-content=""rdkit/molecule"" src=""data:i..."
3175,S1/C(=C\2/SC(=C(S2)c2ccc(C(=O)O)cc2)c2ccc(C(=O...,GUFBOA,"<img data-content=""rdkit/molecule"" src=""data:i..."
3176,[Si](c1ccc(cc1)C(=O)[O])(c1ccc(cc1)C(=O)[O])(c...,ODODES,"<img data-content=""rdkit/molecule"" src=""data:i..."


In [8]:
from rdkit import Chem
from rdkit.Chem import AllChem

# Give every linker a 3D conformer (embed, then force-field optimise) and
# build mof_3000_cleaned from the rows where that succeeded.
problematic_indices = []   # positional row numbers that failed
embedding_failures = {}    # positional row number -> short reason, for inspection

for idx, mol in enumerate(mof_3000['mol']):
    try:
        if mol is None:
            # No molecule was produced for this row's SMILES; flag it
            # explicitly rather than relying on an AttributeError below.
            raise ValueError("Invalid SMILES (no molecule)")
        if mol.GetNumConformers() == 0:
            # First attempt: fixed seed so the result is reproducible.
            result = AllChem.EmbedMolecule(mol, maxAttempts=1000, randomSeed=42)
            if result != 0:
                # NOTE(review): this retry passes no seed, so which molecules
                # it rescues may differ between runs — confirm this is intended.
                result = AllChem.EmbedMolecule(mol, AllChem.ETKDG())
            if result != 0:  # still no conformer after both attempts
                raise ValueError("Embedding failed")
            opt_result = AllChem.UFFOptimizeMolecule(mol, maxIters=2000)
            if opt_result != 0:
                # UFF did not report success; fall back to MMFF.
                opt_result = AllChem.MMFFOptimizeMolecule(mol)
            if opt_result != 0:  # neither force field reported success
                raise ValueError("Optimization failed")
    except Exception as exc:
        # Deliberately best-effort: any failure excludes the row, but the
        # reason is recorded so dropped linkers can be audited afterwards.
        problematic_indices.append(idx)
        embedding_failures[idx] = f"{type(exc).__name__}: {exc}"

# idx values are positions, so translate them to index labels before dropping;
# this stays correct even if mof_3000 does not carry a default 0..n-1 index.
mof_3000_cleaned = (
    mof_3000
    .drop(index=mof_3000.index[problematic_indices])
    .reset_index(drop=True)
)



In [9]:
# Display (rows, columns) of the table after the failed rows were dropped.
mof_3000_cleaned.shape

(3112, 3)

In [10]:
# Descriptor calculators, listed in the order their values appear in each row
# of `des`. Every entry is called with the molecule as its only argument.
descriptor_functions = (
    Chem.GraphDescriptors.BalabanJ,
    Chem.GraphDescriptors.BertzCT,
    Chem.GraphDescriptors.HallKierAlpha,
    Chem.Crippen.MolLogP,
    Chem.Descriptors.ExactMolWt,
    Chem.Descriptors.FpDensityMorgan1,
    Chem.Descriptors.MaxPartialCharge,
    Chem.Lipinski.FractionCSP3,
    Chem.Lipinski.NHOHCount,
    Chem.Lipinski.NOCount,
    Chem.rdMolDescriptors.CalcAsphericity,
    Chem.rdMolDescriptors.CalcChi0n,
    Chem.rdMolDescriptors.CalcChi1n,
    Chem.rdMolDescriptors.CalcChi1v,
    Chem.rdMolDescriptors.CalcEccentricity,
    Chem.rdMolDescriptors.CalcFractionCSP3,
    Chem.rdMolDescriptors.CalcHallKierAlpha,
    Chem.rdMolDescriptors.CalcInertialShapeFactor,
    Chem.rdMolDescriptors.CalcKappa1,
    Chem.rdMolDescriptors.CalcKappa2,
    Chem.rdMolDescriptors.CalcKappa3,
    Chem.rdMolDescriptors.CalcNPR1,
    Chem.rdMolDescriptors.CalcNPR2,
    Chem.rdMolDescriptors.CalcNumAliphaticCarbocycles,
    Chem.rdMolDescriptors.CalcNumAliphaticHeterocycles,
    Chem.rdMolDescriptors.CalcNumAliphaticRings,
    Chem.rdMolDescriptors.CalcNumAmideBonds,
    Chem.rdMolDescriptors.CalcNumAromaticCarbocycles,
    Chem.rdMolDescriptors.CalcNumAromaticHeterocycles,
    Chem.rdMolDescriptors.CalcNumAromaticRings,
    Chem.rdMolDescriptors.CalcNumHBA,
    Chem.rdMolDescriptors.CalcNumHBD,
    rdMolDescriptors.CalcNumHeteroatoms,
    rdMolDescriptors.CalcNumHeterocycles,
    rdMolDescriptors.CalcNumLipinskiHBA,
    rdMolDescriptors.CalcNumLipinskiHBD,
    rdMolDescriptors.CalcNumRings,
    rdMolDescriptors.CalcNumRotatableBonds,
    rdMolDescriptors.CalcNumSaturatedCarbocycles,
    rdMolDescriptors.CalcNumSaturatedHeterocycles,
    rdMolDescriptors.CalcNumSaturatedRings,
    Chem.rdMolDescriptors.CalcPBF,
    Chem.rdMolDescriptors.CalcPMI1,
    Chem.rdMolDescriptors.CalcSpherocityIndex,
    Chem.rdMolDescriptors.CalcTPSA,
)

# One list of descriptor values per molecule, in the row order of
# mof_3000_cleaned['mol'].
des = [
    [calculate(mol) for calculate in descriptor_functions]
    for mol in mof_3000_cleaned['mol']
]




In [11]:
# Column labels for the descriptor table: one label per value in each row of
# `des`, in the same order.
std_column = [
    # graph / physico-chemical descriptors
    'BalabanJ',
    'BertzCT',
    'HallKierAlpha',
    'MolLogP',
    'ExactMolWt',
    'FpDensityMorgan1',
    'MaxPartialCharge',
    'FractionCSP3',
    'NHOHCount',
    'NOCount',
    # rdMolDescriptors: shape and connectivity indices
    'Asphericity',
    'CalcChi0n',
    'CalcChi1n',
    'CalcChi1v',
    'Eccentricity',
    'CalcFractionCSP3',
    'CalcHallKierAlpha',
    'InertialShapeFactor',
    'Kappa1',
    'Kappa2',
    'Kappa3',
    'NPR1',
    'NPR2',
    # ring, bond and atom counts
    'NumAliphaticCarbocycles',
    'NumAliphaticHeterocycles',
    'AliphaticRings',
    'AmideBonds',
    'AromaticCarbocycles',
    'AromaticHeterocycles',
    'AromaticRings',
    'NumHBA',
    'NumHBD',
    'Heteroatoms',
    'Heterocycles',
    'LipinskiHBA',
    'LipinskiHBD',
    'Rings',
    'Rotable bonds',
    'Saturated carbocyles',
    'Saturated Heterocycles',
    'Saturated Rings',
    # remaining shape descriptors and polar surface area
    'PBF',
    'PMI1',
    'SpherocityIndex',
    'TPSA',
]

In [12]:
# Append the descriptor columns to the linker table.
# - Column names come from std_column (defined in the previous cell) instead
#   of a second copy of the same 45-name literal, so the two cannot drift apart.
# - The descriptor frame reuses the table's own index, so rows are paired by
#   position and a length mismatch raises instead of silently padding with NaN.
descriptor_frame = pd.DataFrame(des, columns=std_column, index=mof_3000_cleaned.index)
mof_3000_cleaned = pd.concat([mof_3000_cleaned, descriptor_frame], axis=1)

In [13]:
# Preview the first rows of the table with the descriptor columns attached.
mof_3000_cleaned.head()

Unnamed: 0,OL,MOF_name,mol,BalabanJ,BertzCT,HallKierAlpha,MolLogP,ExactMolWt,FpDensityMorgan1,MaxPartialCharge,...,LipinskiHBD,Rings,Rotable bonds,Saturated carbocyles,Saturated Heterocycles,Saturated Rings,PBF,PMI1,SpherocityIndex,TPSA
0,B(c1c(c(C)c(C(=O)[O])c(C)c1C)C)(c1c(c(c(c(C)c1...,CIMTAV,"<img data-content=""rdkit/molecule"" src=""data:i...",2.729821,1363.259608,-3.865065,4.61594,539.260494,0.325,0.386386,...,0,3,6,0,0,0,1.010173,5634.110407,0.229203,110.91
1,Brc1c(c2ccc(C(=O)[O])cc2)c(c2ccc(C(=O)[O])cc2)...,FUNCEX,"<img data-content=""rdkit/molecule"" src=""data:i...",1.903867,1662.23502,-5.06,8.1624,709.921192,0.318182,0.385831,...,0,5,8,0,0,0,0.621646,6165.977484,0.074981,147.88
2,Brc1c(c(c(Br)c(c1c1ccc(cc1)C(=O)[O])c1ccc(cc1)...,FUNBEW,"<img data-content=""rdkit/molecule"" src=""data:i...",1.903867,1662.23502,-5.06,8.1624,709.921192,0.318182,0.385831,...,0,5,8,0,0,0,0.636607,6176.986136,0.072621,147.88
3,Brc1ccc(C2=NC(=C([N]2)C(=O)[O])C(=O)[O])cc1,PEWKOT,"<img data-content=""rdkit/molecule"" src=""data:i...",2.362448,589.565705,-2.15,0.9395,306.935444,0.944444,0.406614,...,0,2,3,0,0,0,0.112451,483.625706,0.01242,100.4
4,Brc1cccc(Br)c1/C/1=C/2\C=CC(=N2)/C(=C/2\[N]/C(...,ADARAA,"<img data-content=""rdkit/molecule"" src=""data:i...",1.394064,3165.533938,-6.72,10.5596,1095.801462,0.390625,0.385879,...,0,9,8,0,0,0,0.972282,19278.561397,0.152337,200.8


In [14]:
# Remove the RDKit molecule column before the table is written to CSV in the
# next cell. Rebinding with errors='ignore' (instead of inplace=True) keeps
# this cell re-runnable once 'mol' has already been removed.
mof_3000_cleaned = mof_3000_cleaned.drop(columns=['mol'], errors='ignore')

In [15]:
# Write the descriptor table to disk; index=False leaves the row index out of
# the file.
mof_3000_cleaned.to_csv(path_or_buf='mof_3112_std.csv', index=False)

In [16]:
# Display (rows, columns) of the exported table.
mof_3000_cleaned.shape

(3112, 47)