# Imports and helper definitions

In [2]:
import time
import numpy as np
import pandas as pd
import rdkit as rd
from rdkit import Chem
from rdkit.Chem import *
from rdkit.Chem import Descriptors
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
import os

In [17]:
## Function to Gather IUPAC NAMES ##
from urllib.parse import quote

import requests


# URL template for the CACTUS structure-lookup endpoint:
# {0} is the structure identifier (here a SMILES string), {1} the requested representation.
CACTUS = "https://cactus.nci.nih.gov/chemical/structure/{0}/{1}"


def smiles_to_iupac(smiles, timeout=30):
    """Look up the IUPAC name of a molecule through the CACTUS web service.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule. It is percent-encoded before being
        placed in the URL path, because SMILES characters such as ``#``,
        ``[``, ``]``, ``\\`` and ``%`` are not valid in a raw URL (``#`` would
        otherwise cut the request off as a URL fragment).
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled connection cannot hang a whole ``DataFrame.apply`` run.

    Returns
    -------
    str or int
        The response body (the name) on success. If the server answers with
        an HTTP error status, that integer status code is returned instead,
        so callers should check the type of the result.

    Raises
    ------
    requests.exceptions.RequestException
        Non-HTTP failures (connection errors, timeouts) are not caught here
        and propagate to the caller.
    """
    rep = "iupac_name"
    # str() mirrors what str.format did implicitly for non-string inputs.
    url = CACTUS.format(quote(str(smiles)), rep)
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.HTTPError as err:
        return err.response.status_code
    return response.text



In [21]:
# Read the data set, then derive two columns from each 'Solute SMILES' entry:
# 'name' via smiles_to_iupac (one web request per row) and 'mol' via RDKit parsing.
# NOTE(review): the path below is an absolute, machine-specific location.
df1 = pd.read_csv('/Users/bax/SepCon_lesson/DataSet.csv')

df1['name'] = df1['Solute SMILES'].apply(smiles_to_iupac)
df1['mol'] = df1['Solute SMILES'].apply(Chem.MolFromSmiles)

In [20]:
# Display df1 as this cell's output (bare last expression -> rich DataFrame rendering).
df1

Unnamed: 0,Solute SMILES,logS,name
0,CCCCC(CC)CO,-2.137500,2-Ethylhexan-1-ol
1,CCCCCCCCCC,-4.437000,Decane
2,CO,1.544667,methanol
3,CC(C)=O,1.236000,propan-2-one
4,CC(C)O,0.825500,Propan-2-ol
...,...,...,...
111,C[C@H]1CCCC[C@H]1C,-4.271333,"(1S,2R)-1,2-dimethylcyclohexane"
112,CCCC(=O)O,-0.174667,butanoic acid
113,OCCOCCOCCO,0.823000,2-[2-(2-hydroxyethoxy)ethoxy]ethanol
114,OCCOCCOCCOCCO,0.712000,2-[2-[2-(2-hydroxyethoxy)ethoxy]ethoxy]ethanol


In [11]:
# Names of the RDKit descriptors to compute for every solute.
# The order matters: it fixes the column order of the feature table built
# from this list.
fp_names_prop = [
    # charge / electronic
    'BCUT2D_MWLOW',
    'FpDensityMorgan1',
    'MaxAbsPartialCharge',
    'HeavyAtomMolWt',
    'MinPartialCharge',
    'Kappa1',
    # size / weight
    'ExactMolWt',
    'MolWt',
    'NumValenceElectrons',
    'HeavyAtomCount',
    'LabuteASA',
    'MolMR',
    'MolLogP',
    'TPSA',
    # counts
    'NumRotatableBonds',
    'NumHeteroatoms',
    'NumHDonors',
    'NumHAcceptors',
    'MinAbsPartialCharge',
    'NumAromaticHeterocycles',
    'FractionCSP3',
    'NumAliphaticRings',
    'NumSaturatedRings',
]

In [14]:
from rdkit.ML.Descriptors import MoleculeDescriptors

# Calculator that evaluates every descriptor named in fp_names_prop, in list order.
calc = MoleculeDescriptors.MolecularDescriptorCalculator(fp_names_prop)

# Column layout of the rows built below: SMILES, target value, then descriptors.
# This used to be chosen by a try/except around `Y_target.columns`, which depended
# on leftover kernel state (Y_target is either undefined or a Series without
# `.columns` at this point) and could produce a header that did not match the rows.
fp_columns = ['Solute SMILES', 'logS'] + fp_names_prop

X_features = df1['Solute SMILES']
Y_target = df1['logS']

fp_data = []
for solute, y in zip(X_features, Y_target.values):
    try:
        mol = Chem.MolFromSmiles(solute)
        if mol is None:
            # MolFromSmiles signals an unparsable SMILES by returning None;
            # treat that as a failed row instead of computing descriptors on it.
            raise ValueError('could not parse SMILES')
        fp = calc.CalcDescriptors(mol)
    except Exception:
        # Best-effort: report the offending solute and continue with the rest.
        # (The original handler printed an undefined name and raised NameError.)
        print(solute, 'has problem')
        continue
    fp_data.append([solute, y, *fp])

df_props = pd.DataFrame(fp_data, columns=fp_columns)
# Persist the feature table; the DataFrame index is written as the first CSV column.
df_props.to_csv('df_WatSol_features_prop.csv')
df_props

Unnamed: 0,Solute SMILES,logS,BCUT2D_MWLOW,FpDensityMorgan1,MaxAbsPartialCharge,HeavyAtomMolWt,MinPartialCharge,Kappa1,ExactMolWt,MolWt,...,TPSA,NumRotatableBonds,NumHeteroatoms,NumHDonors,NumHAcceptors,MinAbsPartialCharge,NumAromaticHeterocycles,FractionCSP3,NumAliphaticRings,NumSaturatedRings
0,CCCCC(CC)CO,-2.137500,10.008017,1.333333,0.396088,112.087,-0.396088,8.960,130.135765,130.231,...,20.23,5,1,1,1,0.045866,0,1.000000,0,0
1,CCCCCCCCCC,-4.437000,10.092933,0.500000,0.065382,120.110,-0.065382,10.000,142.172151,142.286,...,0.00,7,0,0,0,0.053306,0,1.000000,0,0
2,CO,1.544667,11.774297,1.500000,0.399630,28.010,-0.399630,1.960,32.026215,32.042,...,20.23,0,1,1,1,0.031941,0,1.000000,0,0
3,CC(C)=O,1.236000,10.550822,1.500000,0.300344,52.032,-0.300344,3.670,58.041865,58.080,...,17.07,0,1,0,1,0.126268,0,0.666667,0,0
4,CC(C)O,0.825500,10.503602,1.500000,0.393707,52.032,-0.393707,3.960,60.057515,60.096,...,20.23,0,1,1,1,0.048348,0,1.000000,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111,C[C@H]1CCCC[C@H]1C,-4.271333,9.818666,0.875000,0.062255,96.088,-0.062255,6.125,112.125201,112.216,...,0.00,0,0,0,0,0.041698,0,1.000000,1,1
112,CCCC(=O)O,-0.174667,10.350739,1.833333,0.481231,80.042,-0.481231,5.470,88.052429,88.106,...,37.30,2,2,1,1,0.302829,0,0.750000,0,0
113,OCCOCCOCCO,0.823000,10.686073,0.700000,0.393980,136.062,-0.393980,9.840,150.089209,150.174,...,58.92,7,4,2,4,0.070114,0,1.000000,0,0
114,OCCOCCOCCOCCO,0.712000,10.668862,0.538462,0.393980,176.083,-0.393980,12.800,194.115424,194.227,...,68.15,10,5,2,5,0.070114,0,1.000000,0,0
