# Cleaning Henry's Law Constants Dataset

Here, we clean up the csv generated from Tabula and make sure everything is in the right format, and at the end there are 4632 unique species.

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('henrys_law_dataset.csv')

In [3]:
df

Unnamed: 0,0,1,2,3
0,,,Inor,ganic species
1,,,O,xygen (O)
2,,oxygen,1.2×10−5,1700 Warneck and Williams (2012) L
3,,O2,1.3 ×10−5,1500 Sander et al. (2011) L
4,,[7782-44-7],1.3×10−5,1500 Sander et al. (2006) L
...,...,...,...,...
23823,,(methyltriethyl lead),,
23824,,[1762-28-3],,
23825,,tetraethyllead,1.3×10−5,6400 Feldhake and Stevens (1963) M
23826,,C8H20Pb,1.3×10−5,Abraham (1979) ?


In [4]:
# as we can see above the first column contains no data so can be removed, the last column also contains reference data which isn't needed so can also be removed
df = df.drop(df.columns[[0, 3]], axis=1)

In [5]:
# NaN values in column 1 will not correspond to any Henry's law constants or IUPAC names so can be removed
df.dropna(inplace=True)

In [6]:
df

Unnamed: 0,1,2
2,oxygen,1.2×10−5
3,O2,1.3 ×10−5
4,[7782-44-7],1.3×10−5
16,ozone,1.0×10−4
17,O3,1.0 ×10−4
...,...,...
23814,ethyltrimethylplumbane,2.8×10−5
23817,diethyldimethylplumbane,2.1 ×10−5
23821,triethylmethylplumbane,1.6×10−5
23825,tetraethyllead,1.3×10−5


In [7]:
# renaming column names
df.rename(columns={'1': 'Substance', '2': 'Hcp'}, inplace=True)

In [8]:
df.loc[df.Substance == '2-ethyl-1-butanol']

Unnamed: 0,Substance,Hcp
3951,2-ethyl-1-butanol,4.7×10


In [9]:
df

Unnamed: 0,Substance,Hcp
2,oxygen,1.2×10−5
3,O2,1.3 ×10−5
4,[7782-44-7],1.3×10−5
16,ozone,1.0×10−4
17,O3,1.0 ×10−4
...,...,...
23814,ethyltrimethylplumbane,2.8×10−5
23817,diethyldimethylplumbane,2.1 ×10−5
23821,triethylmethylplumbane,1.6×10−5
23825,tetraethyllead,1.3×10−5


In [10]:
# we now need to get rid of rows which have Substance value representing chemical formula and CAS IDs as these represent duplicates
# we know that the IUPAC names all start with a lowercase letter or a digit so we can filter by that (as opposed to an uppercase letter or square bracket as the above would)
df_filtered = df[df.Substance.str.contains('^[0-9a-z]')]

# stereoisomers such as E, Z, S, R, - are written as (E) so these will need to be accounted for as well
df_stereo = df[df.Substance.str.contains('^\([^a-z]+\)')]

# some names start with a bracket followed by a digit or lowercase - how to differentiate this to the 'Other Names'?
# well the IUPAC names will contain a hyphen after any starting bracket
df_brackets = df[df.Substance.str.contains('^\(\S+\)\-')]

# combine two filtered df together
df_clean = pd.concat([df_filtered, df_stereo, df_brackets])

In [11]:
df_clean

Unnamed: 0,Substance,Hcp
2,oxygen,1.2×10−5
16,ozone,1.0×10−4
31,hydrogen atom,2.6 ×10−6
34,hydrogen,7.8×10−6
41,deuterium,7.9 10−6×
...,...,...
17262,"(2,4-dichlorophenoxy)-acetic acid,",1.7×10−1
20397,(bromomethyl)-benzene,1.4 10−3×
20444,(2-bromoethyl)-benzene,6.5×10−3
20939,"(2E)-N,N’-bis(2,4,6-tribromophenyl)-",9.0 ×109


In [12]:
df_clean.Substance.duplicated().sum()

71

In [13]:
# removing duplicates
df_clean = df_clean.drop_duplicates(subset=['Substance'])

In [14]:
# reset index
df_clean.reset_index(drop=True, inplace=True)
df_clean

Unnamed: 0,Substance,Hcp
0,oxygen,1.2×10−5
1,ozone,1.0×10−4
2,hydrogen atom,2.6 ×10−6
3,hydrogen,7.8×10−6
4,deuterium,7.9 10−6×
...,...,...
4706,"(2,4-dichlorophenoxy)-acetic acid 2-",5.5×10−1
4707,"(2,4-dichlorophenoxy)-acetic acid,",1.7×10−1
4708,(bromomethyl)-benzene,1.4 10−3×
4709,(2-bromoethyl)-benzene,6.5×10−3


In [15]:
# at the moment the Hcp values are a bit of a mess - we need to standardise them into a single format and then convert them into floats

# there are a few different situations to deal with - the first is the position of the x being in the wrong position
def correct_x_position(Hcp_value):
    if len(Hcp_value) <= 3:       # accounts for any values which aren't in standard form, e.g. 1.2
        Hcp_value = Hcp_value
    
    elif Hcp_value[3] != '×':
        Hcp_value = Hcp_value.replace('×', '').replace(' ','')    # any × characters at the end of the string are removed and then any whitespaces present as well
        Hcp_value = Hcp_value[0:3] + '×' + Hcp_value[3:]    # × characters placed in correct posittion before 10
    
    return Hcp_value
        
df_clean['Hcp'] = df_clean['Hcp'].apply(correct_x_position)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [16]:
df_clean

Unnamed: 0,Substance,Hcp
0,oxygen,1.2×10−5
1,ozone,1.0×10−4
2,hydrogen atom,2.6×10−6
3,hydrogen,7.8×10−6
4,deuterium,7.9×10−6
...,...,...
4706,"(2,4-dichlorophenoxy)-acetic acid 2-",5.5×10−1
4707,"(2,4-dichlorophenoxy)-acetic acid,",1.7×10−1
4708,(bromomethyl)-benzene,1.4×10−3
4709,(2-bromoethyl)-benzene,6.5×10−3


In [17]:
# next we convert the strings to floats to make it easier to work with
def standard_form(Hcp_value):
    if len(Hcp_value) <= 6:     # accounts for any values which aren't in standard form, e.g. 1.2
        Hcp_value = Hcp_value.replace('×10', 'e')
    
    elif Hcp_value[6] == '−':
        Hcp_value = Hcp_value.replace('−', '-')   # replace any − signs with the correct - sign
        Hcp_value = Hcp_value.replace('×10', 'e')    # converting values into scientific format that is understood by python
    
    elif Hcp_value[6] != '−':
        Hcp_value = Hcp_value[0:6] + '+' + Hcp_value[6:]   # insert + into any values without - sign to distinguish between 104 and 10^4
        Hcp_value = Hcp_value.replace('×10', 'e')    # converting values into scientific format that is understood by python
    
    return Hcp_value
    
df_clean['Hcp'] = df_clean['Hcp'].apply(standard_form)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


In [18]:
df_clean

Unnamed: 0,Substance,Hcp
0,oxygen,1.2e-5
1,ozone,1.0e-4
2,hydrogen atom,2.6e-6
3,hydrogen,7.8e-6
4,deuterium,7.9e-6
...,...,...
4706,"(2,4-dichlorophenoxy)-acetic acid 2-",5.5e-1
4707,"(2,4-dichlorophenoxy)-acetic acid,",1.7e-1
4708,(bromomethyl)-benzene,1.4e-3
4709,(2-bromoethyl)-benzene,6.5e-3


In [19]:
# there are some values which appear as just e rather than e-1 so we fix that here
df_clean['Hcp'] = [x + '-1' if x[-1] == 'e' else x for x in df_clean['Hcp']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [20]:
# converting string values to float 
df_clean['Hcp'] = df_clean['Hcp'].apply(pd.to_numeric, errors='coerce')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [22]:
# clean up the numbers a bit
df_clean.Hcp = df_clean.Hcp.map('{:g}'.format)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [23]:
# reset index
df_clean.reset_index(drop=True, inplace=True)
df_clean

Unnamed: 0,Substance,Hcp
0,oxygen,1.2e-05
1,ozone,0.0001
2,hydrogen atom,2.6e-06
3,hydrogen,7.8e-06
4,deuterium,7.9e-06
...,...,...
4706,"(2,4-dichlorophenoxy)-acetic acid 2-",0.55
4707,"(2,4-dichlorophenoxy)-acetic acid,",0.17
4708,(bromomethyl)-benzene,0.0014
4709,(2-bromoethyl)-benzene,0.0065


As we can see towards the bottom some of the names are missing their endings.

In [28]:
df_clean.loc[df_clean['Hcp'] == 'nan']

Unnamed: 0,Substance,Hcp
35,nitrosyl chloride,
37,chlorine nitrate,
43,hypobromous acid,
45,bromine nitrate,
49,hypoiodous acid,
...,...,...
4215,chlormephos,
4230,chlorphoxim,
4660,(DMSO),
4661,(DMSO2),


Looking back at the original dataset in the pdf we can see that these nan values correspond to uncertain values such as >4.9x10-4 for nitrosyl chloride or infinity for chlorine nitrate

In [29]:
# dropping these values from the dataframe
df_clean = df_clean[df_clean.Hcp != 'nan']

In [31]:
# reset index
df_clean.reset_index(drop=True, inplace=True)
df_clean

Unnamed: 0,Substance,Hcp
0,oxygen,1.2e-05
1,ozone,0.0001
2,hydrogen atom,2.6e-06
3,hydrogen,7.8e-06
4,deuterium,7.9e-06
...,...,...
4634,"(2,4-dichlorophenoxy)-acetic acid 2-",0.55
4635,"(2,4-dichlorophenoxy)-acetic acid,",0.17
4636,(bromomethyl)-benzene,0.0014
4637,(2-bromoethyl)-benzene,0.0065


We are now very close to 4632 unique species

In [None]:
df_clean.to