# Merging datasets and adding Henry's Law Constants and Melting Points

In [5]:
# importing modules that will be used

import pandas as pd
import numpy as np
import seaborn as sns

from matplotlib import pyplot as plt

In [4]:
# Read both source tables from CSV files in the working directory, each as a
# pandas dataframe:
#   aqsoldb  <- 'aqsoldb.csv'
#   s_and_kh <- 'S_and_KH_data.csv'
aqsoldb, s_and_kh = (
    pd.read_csv(csv_name) for csv_name in ('aqsoldb.csv', 'S_and_KH_data.csv')
)

In [5]:
# preview the first five rows of the aqsoldb dataframe
aqsoldb.head(n=5)

Unnamed: 0,ID,Name,InChI,InChIKey,SMILES,Solubility,SD,Ocurrences,Group,MolWt,...,NumRotatableBonds,NumValenceElectrons,NumAromaticRings,NumSaturatedRings,NumAliphaticRings,RingCount,TPSA,LabuteASA,BalabanJ,BertzCT
0,A-3,"N,N,N-trimethyloctadecan-1-aminium bromide",InChI=1S/C21H46N.BrH/c1-5-6-7-8-9-10-11-12-13-...,SZEMGTQCPRNXEG-UHFFFAOYSA-M,[Br-].CCCCCCCCCCCCCCCCCC[N+](C)(C)C,-3.616127,0.0,1,G1,392.51,...,17.0,142.0,0.0,0.0,0.0,0.0,0.0,158.520601,0.0,210.377334
1,A-4,Benzo[cd]indol-2(1H)-one,InChI=1S/C11H7NO/c13-11-8-5-1-3-7-4-2-6-9(12-1...,GPYLCFQEKPUWLD-UHFFFAOYSA-N,O=C1Nc2cccc3cccc1c23,-3.254767,0.0,1,G1,169.183,...,0.0,62.0,2.0,0.0,1.0,3.0,29.1,75.183563,2.582996,511.229248
2,A-5,4-chlorobenzaldehyde,InChI=1S/C7H5ClO/c8-7-3-1-6(5-9)2-4-7/h1-5H,AVPYQKSLYISFPO-UHFFFAOYSA-N,Clc1ccc(C=O)cc1,-2.177078,0.0,1,G1,140.569,...,1.0,46.0,1.0,0.0,0.0,1.0,17.07,58.261134,3.009782,202.661065
3,A-8,"zinc bis[2-hydroxy-3,5-bis(1-phenylethyl)benzo...",InChI=1S/2C23H22O3.Zn/c2*1-15(17-9-5-3-6-10-17...,XTUPUYCJWKHGSW-UHFFFAOYSA-L,[Zn++].CC(c1ccccc1)c2cc(C(C)c3ccccc3)c(O)c(c2)...,-3.924409,0.0,1,G1,756.226,...,10.0,264.0,6.0,0.0,0.0,6.0,120.72,323.755434,2.322963e-07,1964.648666
4,A-9,4-({4-[bis(oxiran-2-ylmethyl)amino]phenyl}meth...,InChI=1S/C25H30N2O4/c1-5-20(26(10-22-14-28-22)...,FAUAZXVRLVIARB-UHFFFAOYSA-N,C1OC1CN(CC2CO2)c3ccc(Cc4ccc(cc4)N(CC5CO5)CC6CO...,-4.662065,0.0,1,G1,422.525,...,12.0,164.0,2.0,4.0,4.0,6.0,56.6,183.183268,1.084427,769.899934


In [6]:
# preview the first five rows of the s_and_kh dataframe
s_and_kh.head(n=5)

Unnamed: 0,InChI_Key,species,SLN,SMILES,LogS,LogKH,MW,MLOGP
0,AAEVYOVXGOFMJO-UHFFFAOYSA-N,Prometryn,S(CH3)C[6]=NC(=NC(=N@6)NHCH(CH3)CH3)NHCH(CH3)CH3,CSc1nc(NC(C)C)nc(NC(C)C)n1,-4.1,-6.709304,241.41,2.893
1,AEXMKKGTQYQZCS-UHFFFAOYSA-N,"3,3-Dimethylpentane",C(CH2CH3)(CH2CH3)(CH3)CH3,CCC(C)(C)CC,-4.23,12.129112,100.23,3.869
2,AFABGHUZZDYHJO-UHFFFAOYSA-N,2-Methylpentane,CH3CH(CH2CH2CH3)CH3,CCCC(C)C,-3.74,12.040558,86.2,3.516
3,AFBPFSWMIHJQDM-UHFFFAOYSA-N,N-Methylaniline,NH(CH3)C[7]=CHCH=CHCH=CH@7,CNc1ccccc1,-1.28,0.139262,107.17,1.859
4,AFFLGGQVNFXPEV-UHFFFAOYSA-N,1-Decene,CH2(CH2CH2CH2CH3)CH2CH2CH2CH=CH2,C=CCCCCCCCC,-5.51,12.507178,140.3,4.678


In [28]:
# adding LogKH where SMILES values match up
#
# The match is an exact string comparison on the 'SMILES' column; rows of
# aqsoldb whose SMILES does not appear in s_and_kh get NaN in 'LogKH'.
#
# Series.map needs a lookup Series with unique index labels. Exact duplicate
# (SMILES, LogKH) pairs are therefore dropped first, and verify_integrity=True
# makes set_index raise a clear duplicate-key error if one SMILES still
# carries conflicting LogKH values (instead of an opaque reindexing error
# from inside map).
logkh_by_smiles = (
    s_and_kh[['SMILES', 'LogKH']]
    .drop_duplicates()
    .set_index('SMILES', verify_integrity=True)['LogKH']
)
aqsoldb = aqsoldb.assign(LogKH=aqsoldb['SMILES'].map(logkh_by_smiles))

In [29]:
# adding MLOGP where SMILES values match up
#
# The match is an exact string comparison on the 'SMILES' column; rows of
# aqsoldb whose SMILES does not appear in s_and_kh get NaN in 'MLOGP'.
#
# Series.map needs a lookup Series with unique index labels. Exact duplicate
# (SMILES, MLOGP) pairs are therefore dropped first, and verify_integrity=True
# makes set_index raise a clear duplicate-key error if one SMILES still
# carries conflicting MLOGP values (instead of an opaque reindexing error
# from inside map).
mlogp_by_smiles = (
    s_and_kh[['SMILES', 'MLOGP']]
    .drop_duplicates()
    .set_index('SMILES', verify_integrity=True)['MLOGP']
)
aqsoldb = aqsoldb.assign(MLOGP=aqsoldb['SMILES'].map(mlogp_by_smiles))

In [30]:
# preview the first five rows again, now with the LogKH and MLOGP columns appended
aqsoldb.head(n=5)

Unnamed: 0,ID,Name,InChI,InChIKey,SMILES,Solubility,SD,Ocurrences,Group,MolWt,...,NumAromaticRings,NumSaturatedRings,NumAliphaticRings,RingCount,TPSA,LabuteASA,BalabanJ,BertzCT,LogKH,MLOGP
0,A-3,"N,N,N-trimethyloctadecan-1-aminium bromide",InChI=1S/C21H46N.BrH/c1-5-6-7-8-9-10-11-12-13-...,SZEMGTQCPRNXEG-UHFFFAOYSA-M,[Br-].CCCCCCCCCCCCCCCCCC[N+](C)(C)C,-3.616127,0.0,1,G1,392.51,...,0.0,0.0,0.0,0.0,0.0,158.520601,0.0,210.377334,,
1,A-4,Benzo[cd]indol-2(1H)-one,InChI=1S/C11H7NO/c13-11-8-5-1-3-7-4-2-6-9(12-1...,GPYLCFQEKPUWLD-UHFFFAOYSA-N,O=C1Nc2cccc3cccc1c23,-3.254767,0.0,1,G1,169.183,...,2.0,0.0,1.0,3.0,29.1,75.183563,2.582996,511.229248,,
2,A-5,4-chlorobenzaldehyde,InChI=1S/C7H5ClO/c8-7-3-1-6(5-9)2-4-7/h1-5H,AVPYQKSLYISFPO-UHFFFAOYSA-N,Clc1ccc(C=O)cc1,-2.177078,0.0,1,G1,140.569,...,1.0,0.0,0.0,1.0,17.07,58.261134,3.009782,202.661065,,
3,A-8,"zinc bis[2-hydroxy-3,5-bis(1-phenylethyl)benzo...",InChI=1S/2C23H22O3.Zn/c2*1-15(17-9-5-3-6-10-17...,XTUPUYCJWKHGSW-UHFFFAOYSA-L,[Zn++].CC(c1ccccc1)c2cc(C(C)c3ccccc3)c(O)c(c2)...,-3.924409,0.0,1,G1,756.226,...,6.0,0.0,0.0,6.0,120.72,323.755434,2.322963e-07,1964.648666,,
4,A-9,4-({4-[bis(oxiran-2-ylmethyl)amino]phenyl}meth...,InChI=1S/C25H30N2O4/c1-5-20(26(10-22-14-28-22)...,FAUAZXVRLVIARB-UHFFFAOYSA-N,C1OC1CN(CC2CO2)c3ccc(Cc4ccc(cc4)N(CC5CO5)CC6CO...,-4.662065,0.0,1,G1,422.525,...,2.0,4.0,4.0,6.0,56.6,183.183268,1.084427,769.899934,,


In [31]:
# how many have matched?
# show only the rows where the LogKH lookup produced a non-missing value
aqsoldb.loc[aqsoldb['LogKH'].notna()]

# 449 have a match

Unnamed: 0,ID,Name,InChI,InChIKey,SMILES,Solubility,SD,Ocurrences,Group,MolWt,...,NumAromaticRings,NumSaturatedRings,NumAliphaticRings,RingCount,TPSA,LabuteASA,BalabanJ,BertzCT,LogKH,MLOGP
50,A-73,S-ethyl dipropylthiocarbamate,InChI=1S/C9H19NOS/c1-4-7-10(8-5-2)9(11)12-6-3/...,GUVLYNGULCJVDO-UHFFFAOYSA-N,CCCN(CCC)C(=O)SCC,-2.703174,0.018736,2,G3,189.324,...,0.0,0.0,0.0,0.0,20.31,79.703254,3.810795,121.696943,0.579818,1.971
105,A-176,"1,2-dibutyl benzene-1,2-dicarboxylate",InChI=1S/C16H22O4/c1-3-5-11-19-15(17)13-9-7-8-...,DOIRQSBPFJWKBE-UHFFFAOYSA-N,CCCCOC(=O)c1ccccc1C(=O)OCCCC,-4.387683,0.057297,5,G5,278.348,...,1.0,0.0,0.0,1.0,52.60,119.630959,2.712391,398.857942,-2.230014,3.618
122,A-203,pyrene,InChI=1S/C16H10/c1-3-11-7-9-13-5-2-6-14-10-8-1...,BBEAQIROQSPTKN-UHFFFAOYSA-N,c1cc2ccc3cccc4ccc(c1)c2c34,-6.178797,0.058820,3,G5,202.256,...,4.0,0.0,0.0,4.0,0.00,93.455422,2.505956,666.619806,0.287682,4.760
153,A-256,1-chlorohexane,"InChI=1S/C6H13Cl/c1-2-3-4-5-6-7/h2-6H2,1H3",MLRVZFYXUZQSRU-UHFFFAOYSA-N,CCCCCCCl,-3.122389,0.227966,2,G3,120.623,...,0.0,0.0,0.0,0.0,0.00,50.867228,2.447473,23.360990,8.078938,2.957
156,A-262,octan-2-one,"InChI=1S/C8H16O/c1-3-4-5-6-7-8(2)9/h3-7H2,1-2H3",ZPVFWPFBNIEHGJ-UHFFFAOYSA-N,CCCCCCC(C)=O,-2.153696,0.051848,2,G3,128.215,...,0.0,0.0,0.0,0.0,17.07,57.455368,2.828868,76.636821,2.956512,2.129
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9914,H-463,Tetrachloroethylene,InChI=1S/C2Cl4/c3-1(4)2(5)6,CYTYCFOTNPOANT-UHFFFAOYSA-N,ClC(Cl)=C(Cl)Cl,-2.740000,0.189838,4,G5,165.834,...,0.0,0.0,0.0,0.0,0.00,55.627655,3.675949,55.617271,7.385791,2.459
9919,H-496,"1,1,1-Trichloroethane","InChI=1S/C2H3Cl3/c1-2(3,4)5/h1H3",UOCLXMDMGBRAIB-UHFFFAOYSA-N,CC(Cl)(Cl)Cl,-2.140000,0.256999,6,G5,133.405,...,0.0,0.0,0.0,0.0,0.00,46.013992,3.023716,20.364528,7.418581,2.226
9921,H-514,Vernolate,InChI=1S/C10H21NOS/c1-4-7-11(8-5-2)10(12)13-9-...,OKUGPJPKMAEJOE-UHFFFAOYSA-N,CCCSC(=O)N(CCC)CCC,-3.300000,0.027000,2,G3,203.351,...,0.0,0.0,0.0,0.0,20.31,86.068196,3.799339,132.886435,1.139434,2.275
9931,H-556,b-Endosulfan,InChI=1S/C9H6Cl6O3S/c10-5-6(11)8(13)4-2-18-19(...,RDYMFSUJUZBWLH-UHFFFAOYSA-N,O=S1OCC2C(CO1)C1(Cl)C(Cl)=C(Cl)C2(Cl)C1(Cl)Cl,-6.080000,0.090204,4,G5,406.929,...,0.0,2.0,3.0,3.0,35.53,141.790641,2.187726,470.183414,0.061875,3.115
