In [1]:
%%HTML
<!-- Notebook layout: set the widths of the notebook, menubar and main-toolbar containers. -->
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [1]:
import numpy as np # linear algebra
from scipy.stats.stats import pearsonr
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm_notebook as tqdm
import seaborn as sns 
import matplotlib.pyplot as plt
sns.set()
import os
import openbabel as ob

In [2]:
from sklearn.metrics import mean_absolute_error

In [3]:
# Directory holding the input files; the CSV paths read later in this
# notebook are all built from it.
file_folder = '../../data/input'
# Last expression of the cell: the directory listing is shown as the cell output.
os.listdir(file_folder)

['sample_submission.csv',
 'magnetic_shielding_tensors.csv',
 'potential_energy.csv',
 'scalar_coupling_contributions.csv',
 'dipole_moments.csv',
 'mulliken_charges.csv',
 'train.csv',
 'test.csv',
 'structures.csv',
 'structures']

In [14]:
def _read_input(table):
    """Read ``<file_folder>/<table>.csv`` into a DataFrame."""
    return pd.read_csv(f'{file_folder}/{table}.csv')


# Coupling pairs (train carries the target column, see the train preview below).
train = _read_input('train')
test = _read_input('test')

# Additional per-molecule / per-atom tables shipped alongside the train set.
magnetic_shielding_tensors = _read_input('magnetic_shielding_tensors')
dipole_moments = _read_input('dipole_moments')
mulliken_charges = _read_input('mulliken_charges')
potential_energy = _read_input('potential_energy')
scalar_coupling_contributions = _read_input('scalar_coupling_contributions')

# Atom table: one row per (molecule_name, atom_index).
structures = _read_input('structures')

In [5]:
# Molecule names in order of first appearance in train / test.
train_molecules = train.molecule_name.unique()
test_molecules  = test.molecule_name.unique()


# Flat list of reference Mulliken charges for the train molecules, one value
# per atom, ordered by train molecule and then by atom_index so it lines up
# row-for-row with the table get_charges_df builds for train_molecules.
mulliken   = []
mulliken_charges_idx = mulliken_charges.set_index(['molecule_name'])
for molecule_name in train_molecules:
    mc = mulliken_charges_idx.loc[molecule_name]
    if isinstance(mc, pd.Series):
        # .loc returns a Series when the molecule has a single row; restore
        # the one-row frame shape so the column access below still works.
        mc = mc.to_frame().T
    # The index is molecule_name (constant within a molecule), so sort_index()
    # cannot order the atoms; sort on atom_index explicitly.
    mc = mc.sort_values('atom_index')
    mulliken.extend(mc.mulliken_charge.values)

In [6]:
# Single converter instance reused for every file read by read_ob_molecule.
obConversion = ob.OBConversion()


def read_ob_molecule(molecule_name, datadir="../../data/input/structures"):
    """Load ``<datadir>/<molecule_name>.xyz`` into a fresh Open Babel molecule.

    Parameters
    ----------
    molecule_name : str
        File stem of the structure to load.
    datadir : str
        Directory containing the ``.xyz`` files. Pass another location to
        override the default, e.g. ``"../input/champs-scalar-coupling/structures"``.

    Returns
    -------
    ob.OBMol
        The molecule populated by ``obConversion.ReadFile``.

    Raises
    ------
    FileNotFoundError
        If ``obConversion.ReadFile`` reports that the file could not be read.
    """
    path = f"{datadir}/{molecule_name}.xyz"
    molecule = ob.OBMol()
    loaded = obConversion.ReadFile(molecule, path)
    if loaded:
        return molecule
    raise FileNotFoundError(f"Could not read molecule {path}")
    

# Names of the Open Babel charge models evaluated by get_charges_df; each name
# becomes one column of its output frame.
ob_methods = [
    "eem", "mmff94", "gasteiger", "qeq", "qtpie",
    "eem2015ha", "eem2015hm", "eem2015hn",
    "eem2015ba", "eem2015bm", "eem2015bn",
]

# Atom table keyed by molecule name so a molecule's rows can be fetched with .loc.
structures_idx = structures.set_index(["molecule_name"])
def get_charges_df(molecule_names):
    """Compute Open Babel partial charges for every atom of the given molecules.

    For each molecule the atom rows are taken from the module-level
    ``structures_idx`` frame, the geometry is loaded with ``read_ob_molecule``
    and every charge model listed in ``ob_methods`` is evaluated on it.

    Parameters
    ----------
    molecule_names : iterable of str
        Molecule names present in ``structures_idx`` and readable by
        ``read_ob_molecule``.

    Returns
    -------
    pandas.DataFrame
        One row per atom with columns ``molecule_name``, ``atom_index``, one
        column per entry of ``ob_methods``, and ``error``. ``error`` is 1 for
        all atoms of a molecule where at least one charge model reported
        failure or returned a charge vector of the wrong length (that model's
        values are then NaN for the molecule), otherwise 0.
        The head of the frame is also displayed as a side effect.

    Raises
    ------
    ValueError
        If a name in ``ob_methods`` is not a charge model known to Open Babel.
    AssertionError
        If the atom count of the loaded molecule differs from ``structures_idx``.
    FileNotFoundError
        Propagated from ``read_ob_molecule``.
    """
    # Resolve every charge model once, before the expensive loop, so a missing
    # model fails immediately with a clear message instead of an
    # AttributeError on None part-way through the run.
    charge_models = []
    for method in ob_methods:
        model = ob.OBChargeModel.FindType(method)
        if model is None:
            raise ValueError(f"Open Babel charge model not available: {method}")
        charge_models.append(model)

    ob_methods_charges = [[] for _ in ob_methods]
    ob_molecule_name = []  # container for output DF
    ob_atom_index    = []  # container for output DF
    ob_error         = []
    for molecule_name in molecule_names:
        # fill data for output DF
        ms = structures_idx.loc[molecule_name]
        if isinstance(ms, pd.Series):
            # .loc yields a Series for a one-row molecule; restore frame shape
            # so len() counts atoms rather than columns.
            ms = ms.to_frame().T
        # The index is molecule_name, so sort_index() would not order the
        # atoms; sort on atom_index explicitly.
        # NOTE(review): assumes atoms in the .xyz file appear in atom_index order.
        ms = ms.sort_values("atom_index")
        natoms = len(ms)
        ob_molecule_name.extend([molecule_name] * natoms)
        ob_atom_index.extend(ms.atom_index.values)

        # calculate open babel charge for each method
        mol = read_ob_molecule(molecule_name)
        assert mol.NumAtoms() == natoms, (
            f"{molecule_name}: structure file has {mol.NumAtoms()} atoms, "
            f"structures table has {natoms}"
        )
        error = 0
        for model, charges in zip(charge_models, ob_methods_charges):
            if not model.ComputeCharges(mol):
                error = 1
            values = list(model.GetPartialCharges())
            if len(values) != natoms:
                # A short/long vector would shift every later row of this
                # column; keep alignment with NaN placeholders and flag it.
                error = 1
                values = [float("nan")] * natoms
            charges.extend(values)
        ob_error.extend([error] * natoms)

    ob_charges = pd.DataFrame({
        'molecule_name' : ob_molecule_name,
        'atom_index'    : ob_atom_index}
    )
    for method, charges in zip(ob_methods, ob_methods_charges):
        ob_charges[method] = charges
    ob_charges["error"] = ob_error
    display(ob_charges.head())
    return ob_charges

In [7]:
# Compute the Open Babel charge table for all train molecules; %time reports
# how long the call took.
%time train_ob_charges = get_charges_df(train_molecules)

Unnamed: 0,molecule_name,atom_index,eem,mmff94,gasteiger,qeq,qtpie,eem2015ha,eem2015hm,eem2015hn,eem2015ba,eem2015bm,eem2015bn,error
0,dsgdb9nsd_000001,0,-0.644531,0.0,-0.077596,3.25114,-3.093807,0.014606,-0.813021,-0.784944,-0.067349,-0.806339,-0.851258,0
1,dsgdb9nsd_000001,1,0.161131,0.0,0.019399,-0.812772,0.773439,-0.003651,0.203254,0.196234,0.016837,0.201583,0.212813,0
2,dsgdb9nsd_000001,2,0.161132,0.0,0.019399,-0.812776,0.773442,-0.003651,0.203254,0.196235,0.016837,0.201584,0.212813,0
3,dsgdb9nsd_000001,3,0.161134,0.0,0.019399,-0.812797,0.773463,-0.003651,0.203256,0.196237,0.016837,0.201586,0.212816,0
4,dsgdb9nsd_000001,4,0.161134,0.0,0.019399,-0.812795,0.773462,-0.003651,0.203256,0.196237,0.016837,0.201586,0.212816,0


CPU times: user 5min 2s, sys: 1.81 s, total: 5min 3s
Wall time: 5min 3s


In [8]:
# Compute the Open Babel charge table for all test molecules; %time reports
# how long the call took.
%time test_ob_charges = get_charges_df(test_molecules)

Unnamed: 0,molecule_name,atom_index,eem,mmff94,gasteiger,qeq,qtpie,eem2015ha,eem2015hm,eem2015hn,eem2015ba,eem2015bm,eem2015bn,error
0,dsgdb9nsd_000004,0,-0.140218,-0.177,-0.195499,0.162134,-0.104823,0.186739,-0.244665,-0.126079,0.188397,-0.232962,-0.126054,0
1,dsgdb9nsd_000004,1,-0.140218,-0.177,-0.195499,0.162134,-0.104823,0.186739,-0.244665,-0.126079,0.188397,-0.232962,-0.126054,0
2,dsgdb9nsd_000004,2,0.140218,0.177,0.195499,-0.162134,0.104823,-0.186739,0.244665,0.126079,-0.188397,0.232962,0.126054,0
3,dsgdb9nsd_000004,3,0.140218,0.177,0.195499,-0.162134,0.104823,-0.186739,0.244665,0.126079,-0.188397,0.232962,0.126054,0
4,dsgdb9nsd_000015,0,-0.200372,0.28,-0.018924,-0.438607,0.348798,0.462935,-0.319256,-0.257585,0.347919,-0.405719,-0.346147,0


CPU times: user 2min 42s, sys: 868 ms, total: 2min 43s
Wall time: 2min 43s


In [27]:
# train_ob_charges has no mulliken_charge column (get_charges_df never adds
# one), so compare against the `mulliken` list instead: it holds the reference
# charges for the same train molecules in the same molecule / atom order.
mean_absolute_error(train_ob_charges.eem2015bn, mulliken)

0.11251960824466618

In [28]:
# train_ob_charges has no mulliken_charge column (get_charges_df never adds
# one), so compare against the `mulliken` list instead: it holds the reference
# charges for the same train molecules in the same molecule / atom order.
mean_absolute_error(train_ob_charges.eem, mulliken)

0.07218768215285239

In [30]:
# train_ob_charges has no mulliken_charge column (get_charges_df never adds
# one), so compare against the `mulliken` list instead: it holds the reference
# charges for the same train molecules in the same molecule / atom order.
mean_absolute_error(train_ob_charges.eem2015hn, mulliken)

0.11394033320654702

In [9]:
# Preview the first rows of the train charge table.
train_ob_charges.head()

Unnamed: 0,molecule_name,atom_index,eem,mmff94,gasteiger,qeq,qtpie,eem2015ha,eem2015hm,eem2015hn,eem2015ba,eem2015bm,eem2015bn,error
0,dsgdb9nsd_000001,0,-0.644531,0.0,-0.077596,3.25114,-3.093807,0.014606,-0.813021,-0.784944,-0.067349,-0.806339,-0.851258,0
1,dsgdb9nsd_000001,1,0.161131,0.0,0.019399,-0.812772,0.773439,-0.003651,0.203254,0.196234,0.016837,0.201583,0.212813,0
2,dsgdb9nsd_000001,2,0.161132,0.0,0.019399,-0.812776,0.773442,-0.003651,0.203254,0.196235,0.016837,0.201584,0.212813,0
3,dsgdb9nsd_000001,3,0.161134,0.0,0.019399,-0.812797,0.773463,-0.003651,0.203256,0.196237,0.016837,0.201586,0.212816,0
4,dsgdb9nsd_000001,4,0.161134,0.0,0.019399,-0.812795,0.773462,-0.003651,0.203256,0.196237,0.016837,0.201586,0.212816,0


In [15]:
def map_atom_info(df, structures, atom_idx, columns=('eem',)):
    """Attach per-atom values to one side of each atom pair in ``df``.

    Left-joins ``structures`` onto ``df`` by matching
    ``(molecule_name, atom_index_<atom_idx>)`` in ``df`` with
    ``(molecule_name, atom_index)`` in ``structures``, drops the joined
    ``atom_index`` column and suffixes the requested value columns with
    ``_<atom_idx>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Pair table containing ``molecule_name`` and ``atom_index_<atom_idx>``.
    structures : pandas.DataFrame
        Per-atom table containing ``molecule_name``, ``atom_index`` and the
        value columns to attach.
    atom_idx : int
        Which atom of the pair to map (used in the key and the suffix).
    columns : iterable of str, optional
        Value columns to rename to ``<name>_<atom_idx>``. Defaults to
        ``('eem',)``, the only column the original implementation renamed.

    Returns
    -------
    pandas.DataFrame
        A new frame with the rows of ``df`` (left join) plus the attached
        columns; pairs without a matching atom get NaN.
    """
    merged = pd.merge(
        df,
        structures,
        how='left',
        left_on=['molecule_name', f'atom_index_{atom_idx}'],
        right_on=['molecule_name', 'atom_index'],
    )
    merged = merged.drop('atom_index', axis=1)
    return merged.rename(columns={col: f'{col}_{atom_idx}' for col in columns})

In [16]:
# Attach the EEM charge of each atom of the pair to train as eem_0 and eem_1.
for atom_idx in (0, 1):
    train = map_atom_info(
        train,
        train_ob_charges[['molecule_name','atom_index','eem']],
        atom_idx,
    )

In [17]:
# Preview train with the eem_0 / eem_1 columns added by map_atom_info.
train.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,eem_0,eem_1
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076,0.161131,-0.644531
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,0.161131,0.161132
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,0.161131,0.161134
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,0.161131,0.161134
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,0.161132,-0.644531


In [18]:
# Attach the EEM charge of each atom of the pair to test as eem_0 and eem_1.
for atom_idx in (0, 1):
    test = map_atom_info(
        test,
        test_ob_charges[['molecule_name','atom_index','eem']],
        atom_idx,
    )

In [19]:
# Persist the EEM features. The key / target columns are dropped so only the
# remaining columns (id, eem_0, eem_1 for train, per the preview above) are stored.
# to_pickle does not create missing directories, so make sure the target
# folder exists before writing instead of failing after the long computation.
os.makedirs('../../data/feature', exist_ok=True)
train.drop(columns=['molecule_name','atom_index_0','atom_index_1','type','scalar_coupling_constant']).to_pickle('../../data/feature/eem_train.pkl')
test.drop(columns=['molecule_name','atom_index_0','atom_index_1','type']).to_pickle('../../data/feature/eem_test.pkl')