# Generate HSP of all NLM compounds

### What This Creates

* HSP values in a CSV table that can be merged into the `hsp.sqlite` database.

### Rationale

*  Why This?  This is the final step in converting the group counts provided by `substructure_search.py` into predicted HSP values.  

*  Why Me?  To my knowledge, no one else has done this before.

*  Why Now?  This data will provide a key functionality at a level sufficient for demonstration purposes at the ACS meeting on 26 August.  

### Requirements

* Pandas 0.25.0
* Numpy 1.17.1

### Input / Output

*  The notebook should be in `solubility_parameters/notebooks`.  The input files are expected to be in `solubility_parameters/aprl-ssp`, where the `substructure_search.py` program created them.  Each file is named `ids_groups_{*}.csv` and contains group counts for up to 10,000 compounds (about 75 groups per compound).  The group coefficients are in `solubility_parameters/data_sources/misc/transcribed_fedors_table.csv`.  

* Special note:  the complete Fedors table requires a volume-based correction to delta-p for the ester, ketone (and, by extension, anhydride), and phosphate groups.  This information comes from the original publication (Fedors, R. F., Polym. Eng. Sci., Vol 14, pp 147-154, 472 (1974)) and is not encoded in the spreadsheet.

* The output file will be placed in `solubility_parameters/data_sources/db_file_source` as `computed_hsp.csv`.  

## Import Set-Up

In [1]:
import pandas as pd
import glob
import numpy as np

In [2]:
# Read every per-batch group-count table and stack them into one frame.
# The paths are sorted so the row order is the same on every machine
# (glob returns matches in arbitrary, OS-dependent order).
group_count_files = sorted(glob.glob('../aprl-ssp/ids_groups_*.csv'))
if not group_count_files:
    # Fail here with a clear message instead of carrying an empty frame forward
    raise FileNotFoundError("No group-count files matched '../aprl-ssp/ids_groups_*.csv'")

# A single concat replaces DataFrame.append in a loop, which re-copied the
# accumulated frame on every iteration and no longer exists in pandas >= 2.0.
# Each file's own row index is kept, exactly as append did.
groups_df = pd.concat([pd.read_csv(groups_file) for groups_file in group_count_files])
groups_df.head()

Unnamed: 0,compound,CH3,CH2<,-CH<,>C<,CH2=,-CH=,>C=,CH#,-C#,...,>NH_al,>NH_cycloal,>NH_arom,>NH_amide_al,>NH_amide_cycloal,>NH_amide_arom,=PO2_ester,anhydride_al,anhydride_cycloal,anhydride_arom
0,34742,1,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,35676,7,6,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,38266,9,11,24,7,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,50000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,50011,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Total number of rows loaded across all group-count files
groups_df.shape[0]

107254

In [4]:
# Load the transcribed Fedors table: one row per group with its V, Ed, Ep and Eh coefficients
fedors_table_path = '../data_sources/misc/transcribed_fedors_table.csv'
group_coeffs_df = pd.read_csv(fedors_table_path)
group_coeffs_df.head(5)

Unnamed: 0,group,V,Ed,Ep,Eh
0,CH3,33.5,2468.599679,0.0,0.0
1,CH2<,16.1,4938.3,0.0,0.0
2,-CH<,-1.0,3431.7,0.0,0.0
3,>C<,-19.2,1464.75,0.0,0.0
4,CH2=,28.5,3557.25,104.625,753.3


### Compute HSP

* This is going to be one big-ol' dot product ... 

In [5]:
# Orient both tables for a matrix product:
#   group_matrix_df  -> rows = compounds (nlm_num), columns = groups
#   coeffs_matrix_df -> rows = groups,              columns = V, Ed, Ep, Eh
group_matrix_df = groups_df.rename(columns = {'compound':'nlm_num'}).set_index('nlm_num')
coeffs_matrix_df = group_coeffs_df.set_index('group')

# Index.equals gives a single bool (same labels in the same order). Unlike an
# elementwise ==, it does not raise when the two label sets differ in length.
labels_match = group_matrix_df.columns.equals(coeffs_matrix_df.index)
print(f'KEY TEST:  Group columns == coeffs rows? {labels_match}')
if not labels_match:
    # Stop here rather than rely on someone reading the printed result
    unmatched_labels = group_matrix_df.columns.symmetric_difference(coeffs_matrix_df.index)
    raise ValueError(
        'Group-count columns and coefficient rows do not match (labels or order); '
        f'labels present in only one table: {list(unmatched_labels)}'
    )

KEY TEST:  Group columns == coeffs rows? [ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True]


Proceed if the above generates all 'True' values

In [6]:
# (compounds x groups) @ (groups x coefficients): every V, Ed, Ep, Eh entry is
# the sum over groups of group count times group coefficient for that compound
hsp_matrix_df = group_matrix_df @ coeffs_matrix_df
hsp_matrix_df.head()

Unnamed: 0_level_0,V,Ed,Ep,Eh
nlm_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
34742,177.9,75999.049679,17844.84,11964.915
35676,436.3,179616.347753,14597.28,33429.78
38266,564.2,245131.422111,72141.03,238377.6
50000,0.0,0.0,0.0,0.0
50011,0.0,0.0,0.0,0.0


In [7]:
# Count missing values per column; all zeros means the product produced no NaNs
hsp_matrix_df.isna().sum(axis=0)

V     0
Ed    0
Ep    0
Eh    0
dtype: int64

In [8]:
# Keep only compounds whose summed volume is strictly positive.
# A zero volume means no functional group was identified and would cause a
# division by zero later; the same filter also removes any negative totals.
has_positive_volume = hsp_matrix_df['V'] > 0
hsp_matrix_df = hsp_matrix_df.loc[has_positive_volume]
hsp_matrix_df.shape[0]

104294

In [9]:
# Correction for delta-p that involves volume scaled to 100 cc/mol.
# For each listed group the correction is Ep_coeff * count * (100 / V - 1),
# summed over the groups, where V is the compound's total volume.
#
# NOTE: '>C=0_cycloal' and '>C=0_arom' are spelled with a digit zero (unlike
# '>C=O_al'). They are used as lookup labels in both tables, so the spelling
# is kept byte-for-byte; changing it here alone would break the lookup.
correction_group_list = ['>C=O_al', '>C=0_cycloal','>C=0_arom','-COO_ester_al','-COO_ester_cycloal','-COO_ester_arom',
                         'anhydride_al','anhydride_cycloal','anhydride_arom','=PO2_ester']

# The row lookup below is by nlm_num label, so each compound must appear once
if not hsp_matrix_df.index.is_unique:
    raise ValueError('Duplicate nlm_num values found; the per-compound correction would be ambiguous')

# Vectorized replacement for the per-compound / per-group Python loop:
# one dot product gives sum(count * Ep_coeff) over the corrected groups.
correction_group_ep = coeffs_matrix_df.loc[correction_group_list, 'Ep']
corrected_ep_total = group_matrix_df.loc[hsp_matrix_df.index, correction_group_list].dot(correction_group_ep)

# The volume factor is common to every group of a compound, so it is applied once
volume_scale = 100 / hsp_matrix_df['V'] - 1

# "+ 0.0" turns a negative zero (0.0 * negative scale) into a plain 0.0, matching
# what the accumulate-from-zero loop produced for compounds without these groups.
# assign() builds a new frame instead of writing a column into a filtered frame.
hsp_matrix_df = hsp_matrix_df.assign(correction=corrected_ep_total * volume_scale + 0.0)
hsp_matrix_df.head()

Unnamed: 0_level_0,V,Ed,Ep,Eh,correction
nlm_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
34742,177.9,75999.049679,17844.84,11964.915,-6194.035245
35676,436.3,179616.347753,14597.28,33429.78,0.0
38266,564.2,245131.422111,72141.03,238377.6,-1928.215385
50022,268.8,108745.574037,17276.487252,68089.95,-300.108096
50033,299.3,117199.274037,12354.927252,52354.35,-1878.798538


In [10]:
# Fold the correction into the polar term, then derive each Hansen parameter
# as sqrt(energy term / volume).
# NOTE(review): a negative Ep_corr would give NaN for delta_p -- confirm this cannot occur.
molar_volume = hsp_matrix_df['V']
hsp_matrix_df['Ep_corr'] = hsp_matrix_df['Ep'].add(hsp_matrix_df['correction'])
for delta_column, energy_column in [('delta_d', 'Ed'), ('delta_p', 'Ep_corr'), ('delta_h', 'Eh')]:
    hsp_matrix_df[delta_column] = hsp_matrix_df[energy_column].div(molar_volume).apply(np.sqrt)
hsp_matrix_df.head()

Unnamed: 0_level_0,V,Ed,Ep,Eh,correction,Ep_corr,delta_d,delta_p,delta_h
nlm_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
34742,177.9,75999.049679,17844.84,11964.915,-6194.035245,11650.804755,20.66884,8.092636,8.201
35676,436.3,179616.347753,14597.28,33429.78,0.0,14597.28,20.28992,5.784201,8.753347
38266,564.2,245131.422111,72141.03,238377.6,-1928.215385,70212.814615,20.844091,11.155567,20.554938
50022,268.8,108745.574037,17276.487252,68089.95,-300.108096,16976.379156,20.113663,7.947086,15.915741
50033,299.3,117199.274037,12354.927252,52354.35,-1878.798538,10476.128714,19.788328,5.916257,13.225833


In [11]:
# Export table: the three delta columns plus the volume, with V renamed to 'mol_vol'
export_columns = ['delta_d', 'delta_p', 'delta_h', 'V']
computed_hsp = hsp_matrix_df.loc[:, export_columns].rename(columns={'V': 'mol_vol'})
computed_hsp.head()

Unnamed: 0_level_0,delta_d,delta_p,delta_h,mol_vol
nlm_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
34742,20.66884,8.092636,8.201,177.9
35676,20.28992,5.784201,8.753347,436.3
38266,20.844091,11.155567,20.554938,564.2
50022,20.113663,7.947086,15.915741,268.8
50033,19.788328,5.916257,13.225833,299.3


In [12]:
# Write the export table to disk; the nlm_num index is written as the first column
computed_hsp_path = '../data_sources/db_file_source/computed_hsp.csv'
computed_hsp.to_csv(computed_hsp_path, index=True)