In [1]:
import pandas as pd
pd.options.display.max_columns = None

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import tensorflow as tf

                the kernel may be left running.  Please let us know
                about your system (bitness, Python, etc.) at
                ipython-dev@scipy.org
  ipython-dev@scipy.org""")


In [2]:
tf.keras.backend.clear_session()
from tensorflow import keras
from tensorflow.keras import layers

### Retrieve the data 

In [23]:
# Load every competition CSV into a dict keyed by the file's base name.
# The files are expected in a `data` directory located next to this notebook.
CSV_BASE_NAMES = [
    'dipole_moments',
    'magnetic_shielding_tensors',
    'mulliken_charges',
    'potential_energy',
    'sample_submission',
    'scalar_coupling_contributions',
    'structures',
    'train',
    'test',
]

data_load = {
    base_name: pd.read_csv('./data/{}.csv'.format(base_name))
    for base_name in CSV_BASE_NAMES
}

In [30]:
# Coupling rows of the training set, as loaded from train.csv
train_all = data_load['train']

# Largest atom index found in either coupling column, plus one (indices start at 0)
MAX_MOL_ATOMS_NB = train_all[['atom_index_0', 'atom_index_1']].max().max() + 1

# Number of one-hot coupling-type columns
COUPLING_TYPE_NB = 8

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,scalar_coupling_constant,coupling_type_1JHC,coupling_type_1JHN,coupling_type_2JHC,coupling_type_2JHH,coupling_type_2JHN,coupling_type_3JHC,coupling_type_3JHH,coupling_type_3JHN
0,0,dsgdb9nsd_000001,1,0,84.8076,1,0,0,0,0,0,0,0
1,1,dsgdb9nsd_000001,1,2,-11.257,0,0,0,1,0,0,0,0
2,2,dsgdb9nsd_000001,1,3,-11.2548,0,0,0,1,0,0,0,0
3,3,dsgdb9nsd_000001,1,4,-11.2543,0,0,0,1,0,0,0,0
4,4,dsgdb9nsd_000001,2,0,84.8074,1,0,0,0,0,0,0,0


### Optional

In [5]:
# Per-atom structure table (one row per atom of each molecule)
structures = data_load['structures']

# Column names before the one-hot encoding is applied
structures_col = structures.columns


# Create one_hot_encoding for the atom type.
# The dtype is pinned to uint8 (the historical pandas default): since pandas 2.0
# get_dummies returns bool columns, and mixing bool with the float coordinates
# makes DataFrame.to_numpy() return object arrays that TensorFlow cannot ingest.
structures = pd.get_dummies(structures, prefix = 'atom', columns = ['atom'], dtype = np.uint8)
structures.head()

Unnamed: 0,molecule_name,atom_index,x,y,z,atom_C,atom_F,atom_H,atom_N,atom_O
0,dsgdb9nsd_000001,0,-0.012698,1.085804,0.008001,1,0,0,0,0
1,dsgdb9nsd_000001,1,0.00215,-0.006031,0.001976,0,0,1,0,0
2,dsgdb9nsd_000001,2,1.011731,1.463751,0.000277,0,0,1,0,0
3,dsgdb9nsd_000001,3,-0.540815,1.447527,-0.876644,0,0,1,0,0
4,dsgdb9nsd_000001,4,-0.523814,1.437933,0.906397,0,0,1,0,0


In [53]:
# No shuffle to the samples, this will be done later
train_all = data_load['train']

# One-hot encode the coupling type. The dtype is pinned to uint8 (the historical
# pandas default) because pandas >= 2.0 would otherwise emit bool columns, which
# turn mixed float/bool frames into object arrays on .to_numpy().
train_all = pd.get_dummies(train_all, prefix = 'coupling_type', columns = ['type'], dtype = np.uint8)
train_col = train_all.columns

# Add the dipole moments, per molecule (X/Y/Z renamed so they cannot clash with other columns)
dipole_moments = data_load['dipole_moments'].rename(columns = {"X": 'DM_X', "Y": "DM_Y", "Z": "DM_Z" })
train_all = train_all.merge(dipole_moments, on = ['molecule_name'])

# Add the potential energy, per molecule
potential_energy = data_load['potential_energy']
train_all = train_all.merge(potential_energy, on = ['molecule_name'])


# Add the Mulliken charges, per atom.
# The table is merged twice: once for atom_index_0 and once for atom_index_1.
# The second merge suffixes the overlapping columns with _atom_0 / _atom_1,
# after which the two helper `atom_index*` columns are dropped.
mulliken_charges = data_load['mulliken_charges']
train_all = train_all.merge(mulliken_charges,
                            left_on = ['molecule_name', 'atom_index_0'],
                            right_on = ['molecule_name', 'atom_index'] )
train_all = train_all.merge(mulliken_charges,
                            left_on = ['molecule_name', 'atom_index_1'],
                            right_on = ['molecule_name', 'atom_index'], suffixes = ('_atom_0', '_atom_1')
                           ).drop(['atom_index_atom_0', 'atom_index_atom_1'], axis = 1)

# Add the magnetic shielding tensors, per atom (same double merge as for the charges)
magnetic_shielding_tensors = data_load['magnetic_shielding_tensors']
train_all = train_all.merge(magnetic_shielding_tensors,
                            left_on = ['molecule_name', 'atom_index_0'],
                            right_on = ['molecule_name', 'atom_index'] )
train_all = train_all.merge(magnetic_shielding_tensors,
                            left_on = ['molecule_name', 'atom_index_1'],
                            right_on = ['molecule_name', 'atom_index'], suffixes = ('_atom_0', '_atom_1')
                           ).drop(['atom_index_atom_0', 'atom_index_atom_1'], axis = 1)


train_all.head(5)

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,scalar_coupling_constant,coupling_type_1JHC,coupling_type_1JHN,coupling_type_2JHC,coupling_type_2JHH,coupling_type_2JHN,coupling_type_3JHC,coupling_type_3JHH,coupling_type_3JHN,DM_X,DM_Y,DM_Z,potential_energy,mulliken_charge_atom_0,mulliken_charge_atom_1,XX_atom_0,YX_atom_0,ZX_atom_0,XY_atom_0,YY_atom_0,ZY_atom_0,XZ_atom_0,YZ_atom_0,ZZ_atom_0,XX_atom_1,YX_atom_1,ZX_atom_1,XY_atom_1,YY_atom_1,ZY_atom_1,XZ_atom_1,YZ_atom_1,ZZ_atom_1
0,0,dsgdb9nsd_000001,1,0,84.8076,1,0,0,0,0,0,0,0,0.0,0.0,0.0,-40.52368,0.133921,-0.535689,31.341,-1.2317,4.0544,-1.2317,28.9546,-1.7173,4.0546,-1.7173,34.0861,195.315,0.0,-0.0001,0.0,195.317,0.0007,-0.0001,0.0007,195.317
1,4,dsgdb9nsd_000001,2,0,84.8074,1,0,0,0,0,0,0,0,0.0,0.0,0.0,-40.52368,0.133922,-0.535689,31.5814,1.2173,-4.1474,1.2173,28.9036,-1.6036,-4.1476,-1.6036,33.8967,195.315,0.0,-0.0001,0.0,195.317,0.0007,-0.0001,0.0007,195.317
2,7,dsgdb9nsd_000001,3,0,84.8093,1,0,0,0,0,0,0,0,0.0,0.0,0.0,-40.52368,0.133923,-0.535689,31.5172,4.1086,1.2723,4.1088,33.9068,1.695,1.2724,1.6951,28.9579,195.315,0.0,-0.0001,0.0,195.317,0.0007,-0.0001,0.0007,195.317
3,9,dsgdb9nsd_000001,4,0,84.8095,1,0,0,0,0,0,0,0,0.0,0.0,0.0,-40.52368,0.133923,-0.535689,31.4029,-4.0942,-1.1793,-4.0944,34.0776,1.6259,-1.1795,1.626,28.9013,195.315,0.0,-0.0001,0.0,195.317,0.0007,-0.0001,0.0007,195.317
4,1,dsgdb9nsd_000001,1,2,-11.257,0,0,0,1,0,0,0,0,0.0,0.0,0.0,-40.52368,0.133921,0.133922,31.341,-1.2317,4.0544,-1.2317,28.9546,-1.7173,4.0546,-1.7173,34.0861,31.5814,1.2173,-4.1474,1.2173,28.9036,-1.6036,-4.1476,-1.6036,33.8967


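Remark: apart from the features in the train file, these additional data (dipole moments, potential energy, Mulliken charges, magnetic shielding tensors) are not provided for the test dataset.

Should we use them as generative features for the scalar coupling constant?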

## Data preparation

#### Function to prepare the global set

In [73]:
from timeit import default_timer as timer
import re

def create_coupling_per_mol(molecules, coupled_atoms):
    """Group coupling features, targets and atom tables molecule by molecule.

    Parameters
    ----------
    molecules : pandas.DataFrame
        One row per atom with at least `molecule_name`, `atom_index`, `x`, `y`, `z`;
        every other column (e.g. one-hot atom types) is carried through unchanged.
    coupled_atoms : pandas.DataFrame
        One row per coupled atom pair with `molecule_name`, `atom_index_0`,
        `atom_index_1`, `scalar_coupling_constant` and `coupling_type*` columns.
        An `id` column, `*_atom_0` / `*_atom_1` columns and the per-molecule
        columns `DM_X`, `DM_Y`, `DM_Z`, `potential_energy` are used when present.
        The frame is NOT modified.

    Returns
    -------
    tuple of five lists, each ordered by sorted molecule name:
        coupling_per_mol : 2-D arrays, `coupling_type*` columns followed by the
            x/y/z coordinates of atom 0 and of atom 1
        y_per_mol        : 1-D arrays of `scalar_coupling_constant`
        outputs_0        : 2-D arrays, `*_atom_0` columns + per-molecule columns
        outputs_1        : 2-D arrays, `*_atom_1` columns + per-molecule columns
        molecules_output : 2-D arrays, the atom table of the molecule without
            `molecule_name` and `atom_index`

    Notes
    -----
    Both merges are inner joins and the molecule table is grouped separately, so
    the five lists only line up index by index when every molecule of
    `coupled_atoms` has its atoms in `molecules`.
    """

    def _arrays_per_molecule(frame, columns):
        # One numpy array per molecule, in sorted molecule-name order.
        # Iterating the groups (rather than groupby.apply) keeps the grouping
        # column available in every pandas version and also copes with an
        # empty frame (the result is then an empty list).
        return [group[columns].to_numpy()
                for _, group in frame.groupby('molecule_name', sort = True)]

    # 0 Keep only the molecules of the batch (to quicken the merges)
    molecules = molecules.loc[molecules['molecule_name'].isin(coupled_atoms['molecule_name'])]

    # 1 Drop the sample id on a copy: the caller's frame must stay untouched so
    #   the function can be called again on it; a missing `id` column is fine.
    coupled_atoms = coupled_atoms.drop(columns = ['id'], errors = 'ignore')

    # 2 Prepare the feature / target column lists
    coupling_col = [col for col in coupled_atoms.columns if re.match("coupling_type.*", col)]
    outputs_atom_0_col = [col for col in coupled_atoms.columns if re.match(".*_atom_0", col)]
    outputs_atom_1_col = [col for col in coupled_atoms.columns if re.match(".*_atom_1", col)]
    # Per-molecule targets are optional (frames without them get zero-width target arrays)
    outputs_mol_col = [col for col in ('DM_X', 'DM_Y', 'DM_Z', 'potential_energy')
                       if col in coupled_atoms.columns]
    coupling_pos_col = ['x_fa0', 'y_fa0', 'z_fa0', 'x_fa1', 'y_fa1', 'z_fa1']

    # 3 Add type and coordinates for each atom_index_0 (suffix _fa0) and atom_index_1 (suffix _fa1)
    df_tmp = coupled_atoms.merge(molecules, how = 'inner',
                                 left_on = ['molecule_name', 'atom_index_0'],
                                 right_on = ['molecule_name', 'atom_index'])
    df_features_dnn = df_tmp.merge(molecules, how = 'inner',
                                   left_on = ['molecule_name', 'atom_index_1'],
                                   right_on = ['molecule_name', 'atom_index'],
                                   suffixes = ('_fa0', '_fa1')
                                   ).drop(['atom_index_fa0', 'atom_index_fa1'], axis = 1)

    # 4 Split every output per molecule, in molecule order
    coupling_per_mol = _arrays_per_molecule(df_features_dnn, coupling_col + coupling_pos_col)
    outputs_0 = _arrays_per_molecule(df_features_dnn, outputs_atom_0_col + outputs_mol_col)
    outputs_1 = _arrays_per_molecule(df_features_dnn, outputs_atom_1_col + outputs_mol_col)
    y_per_mol = _arrays_per_molecule(df_features_dnn, 'scalar_coupling_constant')

    # 5 Atom table of each molecule, sorted the same way
    molecules_output = [group.drop(columns = ['molecule_name', 'atom_index']).to_numpy()
                        for _, group in molecules.groupby('molecule_name', sort = True)]

    return coupling_per_mol, y_per_mol, outputs_0, outputs_1, molecules_output

### Prepare the global train/eval sets (about 5 minutes)

In [77]:
from timeit import default_timer as timer
from sklearn.utils import shuffle 

validation_ratio = 0.2
# Fixed seed so the per-molecule shuffle is reproducible across runs
SHUFFLE_SEED = 42

# Retrieve the full list of molecules in the train/eval set
molecule_list = train_all['molecule_name'].unique()

# Split the train/eval sets on the molecule list (first 80% / last 20%, in file order)
train_mol, eval_mol = np.split(molecule_list, [int((1-validation_ratio)*len(molecule_list))])
start = timer()

# Select the samples belonging to the molecule train set (WARNING: there is no shuffle there!!)
train_molecules = structures[structures['molecule_name'].isin(train_mol)]
train_coupling, train_y, train_outputs_0, train_outputs_1, train_mols = create_coupling_per_mol(
                                    train_molecules,
                                    train_all.loc[train_all['molecule_name'].isin(train_mol)]
                                    )
# Shuffle the samples by molecule.
# sklearn.utils.shuffle returns shuffled copies (it does not work in place), so the
# results must be re-assigned; all five lists get the same permutation.
train_coupling, train_y, train_outputs_0, train_outputs_1, train_mols = shuffle(
    train_coupling, train_y, train_outputs_0, train_outputs_1, train_mols,
    random_state=SHUFFLE_SEED)

train_stop = timer()

# Select the samples belonging to the molecule eval set (WARNING: there is no shuffle there!!)
eval_molecules = structures[structures['molecule_name'].isin(eval_mol)]
eval_coupling, eval_y, eval_outputs_0, eval_outputs_1, eval_mols = create_coupling_per_mol(
                                    eval_molecules,
                                    train_all.loc[train_all['molecule_name'].isin(eval_mol)]
                                    )

# Shuffle the samples by molecule (same remark as for the train set)
eval_coupling, eval_y, eval_outputs_0, eval_outputs_1, eval_mols = shuffle(
    eval_coupling, eval_y, eval_outputs_0, eval_outputs_1, eval_mols,
    random_state=SHUFFLE_SEED)

stop = timer()

print('Number of samples: \nfor train set:', len(train_coupling), '\nfor eval set:', len(eval_coupling))
print('Time for train / eval:', train_stop-start, '/', stop-train_stop)

Number of samples: 
for train set: 68002 
for eval set: 17001
Time for train / eval: 243.0299084369999 / 60.30858497400004


#### To save the global set of molecules

In [79]:
import pickle

# Destination of the pickled train/eval lists
DUMP_PATH = 'train-eval-dump-2'

# Gather the ten per-molecule lists under explicit names
global_data = dict([
    ('train_coupling', train_coupling),
    ('train_y', train_y),
    ('train_outputs_0', train_outputs_0),
    ('train_outputs_1', train_outputs_1),
    ('train_mols', train_mols),
    ('eval_coupling', eval_coupling),
    ('eval_y', eval_y),
    ('eval_outputs_0', eval_outputs_0),
    ('eval_outputs_1', eval_outputs_1),
    ('eval_mols', eval_mols),
])

# Serialise the whole dictionary in one go
with open(DUMP_PATH, 'wb') as out_file:
    pickle.dump(global_data, out_file)

#### To load the global set of molecules

In [6]:
import pickle

# NOTE(review): this reads 'train-eval-dump' while the save cell writes
# 'train-eval-dump-2', and the cells below index it with 'train_set' /
# 'train_mol' / 'train_labels' style keys - confirm which dump is intended.
# Only unpickle files you produced yourself: pickle can execute arbitrary code.
DUMP_TO_LOAD = 'train-eval-dump'

with open(DUMP_TO_LOAD, 'rb') as in_file:
    global_data = pickle.load(in_file)

### Generation of the dataset

#### Creation of the train data set

In [7]:
# Alternative with masking and large batch (2500)

from timeit import default_timer as timer

# Constant used to pad the per-sample atom sequences up to MAX_MOL_ATOMS_NB rows
MASKING_VALUE = 666

@tf.function
def treat_molecule_set(features, molecules, labels):
    """Build the model inputs for all the coupled pairs of ONE molecule.

    Args:
        features: 2-D array-like, one row per coupled pair. Its last three
            columns are used as a 3-D reference point for the distance
            computation (the variable is named `mean_3d`; presumably a mean
            position of the pair - confirm against the pickled data layout).
        molecules: 2-D array-like, one row per atom of the molecule. Its first
            three columns are used as coordinates (presumably x, y, z - confirm).
        labels: array-like of targets, returned unchanged as a tensor.

    Returns:
        A tuple `(inputs, targets)` of dicts:
        `inputs['input_dnn']` is `features` as a tensor,
        `inputs['input_rnn']` has shape [n_pairs, MAX_MOL_ATOMS_NB, n_molecule_columns]:
        for each pair, the atom rows sorted by decreasing squared distance to the
        reference point, padded with MASKING_VALUE;
        `targets['labels']` is `labels` as a tensor.
    """
    
    coupled_atoms_set = tf.convert_to_tensor(features)
    molecule = tf.convert_to_tensor(molecules)
    labels = tf.convert_to_tensor(labels)
    
    # Number of atoms (rows) of the molecule; the untouched table is kept for the final gather
    mol_atoms_nb = tf.shape(molecule)[0]
    init_molecule = molecule
    
    # Extend the first three columns of the molecule to 3D and transpose them:
    # [n_atoms, 3] -> [n_atoms, 3, 1] -> [1, 3, n_atoms]
    molecule = tf.reshape(molecule[:,:3], [-1, 3, 1])
    molecule = tf.transpose(molecule, [2, 1, 0])
    
    # We repeat the molecule once per sample and the reference point once per atom,
    # so that both have the same [n_samples, 3, n_atoms] shape
    molecule = tf.tile(molecule, [tf.shape(coupled_atoms_set)[0], 1, 1])

    mean_3d = tf.reshape(coupled_atoms_set[:, -3:], [-1, 3, 1])  
    mean_3d = tf.tile(mean_3d, [1, 1, mol_atoms_nb])
    
    # We calculate the squared distance (no square root is taken) between every atom
    # and the reference point of every sample: result is [n_samples, n_atoms]
    distance = tf.reduce_sum(tf.square(tf.subtract(molecule, mean_3d)), 1)
    
    # For each sample, the indices of the atoms of the molecule sorted by decreasing distance
    ordered_idx = tf.argsort(
        distance,
        axis=1,
        direction='DESCENDING',
        stable=False,
        name=None
    )
    
    # We apply this sorting order to the initial molecule to get, for each couple of atoms, 
    # the full atom rows sorted by decreasing distance: [n_samples, n_atoms, n_molecule_columns]
    sort_mol = tf.gather(
        init_molecule,
        ordered_idx
    )
    
    # Pad the atom axis with MASKING_VALUE (666) up to MAX_MOL_ATOMS_NB rows so that every
    # molecule yields sequences of the same length (is that even needed?)
    sort_mol = tf.pad(sort_mol, [[0,0],[0,MAX_MOL_ATOMS_NB-mol_atoms_nb],[0,0]], constant_values=MASKING_VALUE)
    
    # The dict keys are the names under which the dataset exposes inputs and targets
    return { 'input_dnn': coupled_atoms_set, 
          'input_rnn': sort_mol
        }, { 'labels': labels }


# Per-molecule lists of the training split, as stored in the pickled dump
features = global_data['train_set']
molecules = global_data['train_mol']
labels = global_data['train_labels']

def gen_mol():
    """Yield one (features, atom table, labels) triple per training molecule."""
    for mol_idx, coupled_rows in enumerate(features):
        yield coupled_rows, molecules[mol_idx], labels[mol_idx]

# Training pipeline: one element per molecule -> one element per coupled pair
# -> fixed batches of 2500 pairs, repeated forever with one batch prefetched.
dataset = (
    tf.data.Dataset.from_generator(
        gen_mol,
        output_types=(tf.float32, tf.float32, tf.float32),
        output_shapes=(tf.TensorShape([None, 27]), tf.TensorShape([None, 8]), tf.TensorShape([None])),
    )
    .flat_map(lambda pair_rows, atom_rows, targets:
              tf.data.Dataset.from_tensor_slices(treat_molecule_set(pair_rows, atom_rows, targets)))
    .batch(2500, drop_remainder=True)
    .repeat()
    .prefetch(1)
)

dataset.take(1)

W0708 20:52:09.635479  6044 deprecation.py:323] From D:\Program_Files\Anaconda3\envs\tensorflow2\lib\site-packages\tensorflow\python\data\ops\dataset_ops.py:505: py_func (from tensorflow.python.ops.script_ops) is deprecated and will be removed in a future version.
Instructions for updating:
tf.py_func is deprecated in TF V2. Instead, there are two
    options available in V2.
    - tf.py_function takes a python function which manipulates tf eager
    tensors instead of numpy arrays. It's easy to convert a tf eager tensor to
    an ndarray (just call tensor.numpy()) but having access to eager tensors
    means `tf.py_function`s can use accelerators such as GPUs as well as
    being differentiable using a gradient tape.
    - tf.numpy_function maintains the semantics of the deprecated tf.py_func
    (it is not differentiable, and manipulates numpy arrays). It drops the
    stateful argument making all functions stateful.
    


<DatasetV1Adapter shapes: ({input_dnn: (2500, 27), input_rnn: (2500, None, 8)}, {labels: (2500,)}), types: ({input_dnn: tf.float32, input_rnn: tf.float32}, {labels: tf.float32})>

In [8]:
# Evaluation pipeline, built like the training one (batches of 2500 coupled pairs)

eval_features = global_data['eval_set']
eval_molecules = global_data['eval_mol']
eval_labels = global_data['eval_labels']

def eval_gen_mol():
    """Yield one (features, atom table, labels) triple per evaluation molecule."""
    for mol_idx, coupled_rows in enumerate(eval_features):
        yield coupled_rows, eval_molecules[mol_idx], eval_labels[mol_idx]

# The dataset repeats forever: consumers must bound it with an explicit number of steps.
eval_dataset = (
    tf.data.Dataset.from_generator(
        eval_gen_mol,
        output_types=(tf.float32, tf.float32, tf.float32),
        output_shapes=(tf.TensorShape([None, 27]), tf.TensorShape([None, 8]), tf.TensorShape([None])),
    )
    .flat_map(lambda pair_rows, atom_rows, targets:
              tf.data.Dataset.from_tensor_slices(treat_molecule_set(pair_rows, atom_rows, targets)))
    .batch(2500, drop_remainder=True)
    .repeat()
    .prefetch(1)
)

eval_dataset.take(1)

<DatasetV1Adapter shapes: ({input_dnn: (2500, 27), input_rnn: (2500, None, 8)}, {labels: (2500,)}), types: ({input_dnn: tf.float32, input_rnn: tf.float32}, {labels: tf.float32})>

## Create the model

In [10]:
from tensorflow.keras.utils import plot_model
# Padding value to be ignored by the recurrent branch (same value as the one used
# when padding the atom sequences in the dataset cell)
MASKING_VALUE = 666

# Two named inputs; the names match the keys of the input dict produced by the datasets
input_dnn = keras.Input(shape =(27 ,), name='input_dnn' )
input_rnn = keras.Input(shape = ( MAX_MOL_ATOMS_NB, 8), name = 'input_rnn' )

# Dense branch on the per-pair feature vector: two 64-unit ReLU layers with dropout
x = layers.Dropout(0.25)(input_dnn)
x = layers.Dense(64, activation='relu')(x)
x = layers.Dropout(0.25)(x)
x = layers.Dense(64, activation='relu')(x)
x = layers.Dropout(0.25)(x)

# Recurrent branch on the atom sequence: masked timesteps are skipped by the two
# stacked LSTMs, the second of which only returns its last output
y = layers.Masking(mask_value=MASKING_VALUE, input_shape=(MAX_MOL_ATOMS_NB, 8))(input_rnn)
y = layers.LSTM(32, return_sequences=True, dropout = 0.25)(y)
y = layers.LSTM(32, return_sequences=False, dropout = 0.25)(y)

# Merge both branches and regress a single value
both = layers.concatenate([x, y])
both = layers.Dropout(0.5)(both)
both = layers.Dense(16, activation='relu')(both)

# The output layer is named 'labels', like the key of the target dict of the datasets
output = layers.Dense(1, activation='linear', name = 'labels')(both)
model = keras.Model((input_dnn,input_rnn), output, name = 'model')

# Custom loss reproducing the competition score

def custom_loss_wrapper(input_tensor, n_types=8):
    """Build a Keras loss computing the mean over coupling types of log(MAE).

    Args:
        input_tensor: tensor whose first `n_types` columns hold the one-hot
            coupling type of every sample of the batch.
        n_types: number of leading one-hot columns to read (default 8).

    Returns:
        A function `custom_loss(y_true, y_pred)` returning a scalar tensor:
        the mean, over the coupling types present in the batch, of the log of
        the per-type mean absolute error.
    """

    def custom_loss(y_true, y_pred):

        # Coupling one-hot, [batch, n_types]
        type_one_hot = tf.cast(tf.slice(input_tensor, [0, 0], [-1, n_types]), tf.float32)

        # Force both tensors to [batch, 1]: a [batch] target against a [batch, 1]
        # prediction would otherwise broadcast to a [batch, batch] error matrix.
        y_true = tf.reshape(tf.cast(y_true, tf.float32), [-1, 1])
        y_pred = tf.reshape(tf.cast(y_pred, tf.float32), [-1, 1])

        # Absolute error routed to the column of its coupling type, [batch, n_types]
        absolute_error_pred = tf.math.abs(tf.subtract(y_true, y_pred))
        absolute_error_per_type = tf.multiply(type_one_hot, absolute_error_pred)

        # n_t is the number of samples of each type (taken from the one-hot, so a
        # sample with an exactly-zero error is still counted)
        count_per_type = tf.math.reduce_sum(type_one_hot, axis = 0)
        sum_per_type = tf.math.reduce_sum(absolute_error_per_type, axis = 0)

        # Types absent from the batch are left out of the average
        present = tf.greater(count_per_type, 0.0)
        mean_per_type = (tf.boolean_mask(sum_per_type, present)
                         / tf.boolean_mask(count_per_type, present))

        # The metric floors each MAE at 1e-9, which also keeps the log finite
        mean_per_type = tf.math.log(tf.maximum(mean_per_type, 1e-9))

        score = tf.math.reduce_mean(mean_per_type)

        return score
    return custom_loss

model.summary()

W0708 20:52:28.792822  6044 deprecation.py:323] From D:\Program_Files\Anaconda3\envs\tensorflow2\lib\site-packages\tensorflow\python\keras\backend.py:3868: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_dnn (InputLayer)          [(None, 27)]         0                                            
__________________________________________________________________________________________________
dropout (Dropout)               (None, 27)           0           input_dnn[0][0]                  
__________________________________________________________________________________________________
dense (Dense)                   (None, 64)           1792        dropout[0][0]                    
__________________________________________________________________________________________________
input_rnn (InputLayer)          [(None, 29, 8)]      0                                            
______________________________________________________________________________________________

In [13]:
# Restore previously saved weights into the model built above.
# NOTE(review): the file is presumably a checkpoint written by an earlier training
# run (same 'weights-improvement-' prefix as the checkpoint callback) - confirm it exists.
model.load_weights('weights-improvement-201977-10-1.22.hdf5')

In [None]:
from tensorflow.keras.callbacks import Callback

import datetime
now = datetime.datetime.now()

# The loss needs the one-hot coupling types, read from the first columns of input_dnn
adam = keras.optimizers.Adam()
model.compile(loss=custom_loss_wrapper(input_dnn), optimizer=adam)

# checkpoint: keep the weights of every epoch that improves the validation loss.
# The date is zero-padded (YYYYMMDD): concatenating the raw year/month/day numbers
# is ambiguous (2019-1-11 and 2019-11-1 would both give "2019111").
filepath = "weights-improvement-" + now.strftime("%Y%m%d") + "-{epoch:02d}-{val_loss:.2f}.hdf5"
checkpoint = keras.callbacks.ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

# Fit the model. Both datasets repeat forever, so the length of an epoch and of a
# validation pass is set explicitly in batches.
history = model.fit(dataset, epochs=25, validation_data=eval_dataset,
                    steps_per_epoch = 100, validation_steps=200, callbacks = callbacks_list)


Epoch 1/25
Epoch 00001: val_loss improved from inf to 1.31639, saving model to weights-improvement-201978-01-1.32.hdf5
Epoch 2/25
Epoch 00002: val_loss improved from 1.31639 to 1.30549, saving model to weights-improvement-201978-02-1.31.hdf5
Epoch 3/25
Epoch 00003: val_loss improved from 1.30549 to 1.28907, saving model to weights-improvement-201978-03-1.29.hdf5
Epoch 4/25
Epoch 00004: val_loss improved from 1.28907 to 1.25070, saving model to weights-improvement-201978-04-1.25.hdf5
Epoch 5/25
Epoch 00005: val_loss did not improve from 1.25070
Epoch 6/25
Epoch 00006: val_loss did not improve from 1.25070
Epoch 7/25
Epoch 00007: val_loss did not improve from 1.25070
Epoch 8/25
Epoch 00008: val_loss did not improve from 1.25070
Epoch 9/25
Epoch 00009: val_loss did not improve from 1.25070
Epoch 10/25
Epoch 00010: val_loss did not improve from 1.25070
Epoch 11/25
Epoch 00011: val_loss did not improve from 1.25070
Epoch 12/25
Epoch 00012: val_loss did not improve from 1.25070
Epoch 13/25
E

### DNN model

In [None]:
import matplotlib.pyplot as plt

# list all data in history
print(history.history.keys())

# summarize history for loss: one labelled curve per available series, so the
# legend always matches what is actually drawn
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='train')
if 'val_loss' in history.history:
    ax.plot(history.history['val_loss'], label='eval')
ax.set_title('model loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(loc='upper left')
plt.show()

#### Questions: 
* how do we bring the dataset to the model with several inputs, and outputs?

In [None]:
# eval_dataset repeats indefinitely, so the number of batches must be given explicitly
# (200 batches, the same amount as the validation pass during training)
model.evaluate(eval_dataset, steps=200)

## Submissions

Submissions are evaluated on the Log of the Mean Absolute Error, calculated for each scalar coupling type, and then averaged across types, so that a 1% decrease in MAE for one type provides the same improvement in score as a 1% decrease for another type.

$$\text{score} = \frac{1}{T} \sum_{t=1}^{T} \log\left( \frac{1}{n_t} \sum_{i=1}^{n_t} \left| y_i - \hat{y}_i \right| \right)$$

Where:

* $T$ is the number of scalar coupling types
* $n_t$ is the number of observations of type $t$
* $y_i$ is the actual scalar coupling constant for the observation
* $\hat{y}_i$ is the predicted scalar coupling constant for the observation

For this metric, the MAE for any group has a floor of 1e-9, so that the minimum (best) possible score for perfect predictions is approximately -20.7232.

## Annex

#### Features description

* dipole_moments.csv - contains the molecular electric dipole moments. These are three-dimensional vectors that indicate the charge distribution in the molecule. The first column (molecule_name) contains the name of the molecule; the second to fourth columns are the X, Y and Z components of the dipole moment, respectively.
* magnetic_shielding_tensors.csv - contains the magnetic shielding tensors for all atoms in the molecules. The first column (molecule_name) contains the molecule name, the second column (atom_index) contains the index of the atom in the molecule, the  third to eleventh columns contain the XX, YX, ZX, XY, YY, ZY, XZ, YZ and ZZ elements of the tensor/matrix respectively.
* mulliken_charges.csv - contains the mulliken charges for all atoms in the molecules. The first column (molecule_name) contains the name of the molecule, the second column (atom_index) contains the index of the atom in the molecule, the third column (mulliken_charge) contains the mulliken charge of the atom.
* potential_energy.csv - contains the potential energy of the molecules. The first column (molecule_name) contains the name of the molecule, the second column (potential_energy) contains the potential energy of the molecule.
* scalar_coupling_contributions.csv - The scalar coupling constants in train.csv (or corresponding files) are a sum of four terms; scalar_coupling_contributions.csv contains all of these terms. The first column (molecule_name) is the name of the molecule, the second (atom_index_0) and third (atom_index_1) columns are the atom indices of the atom pair, the fourth column indicates the type of coupling, the fifth column (fc) is the Fermi contact contribution, the sixth column (sd) is the spin-dipolar contribution, the seventh column (pso) is the paramagnetic spin-orbit contribution and the eighth column (dso) is the diamagnetic spin-orbit contribution.

### Test set to validate the code

In [None]:
# Small hand-made coupling table: six coupled pairs spread over three molecules
ut_coupling_columns = {
    'molecule_name': ['mol_a', 'mol_b', 'mol_a', 'mol_b', 'mol_b', 'mol_c'],
    'atom_index_0': [0, 0, 1, 1, 1, 0],
    'atom_index_1': [1, 1, 2, 2, 3, 1],
    'scalar_coupling_constant': [5, 7, 6, 8, 9, 10],
}

# One-hot coupling types: row k is of the k-th type below, so the last two
# types (3JHH, 3JHN) are never used by the six rows
ut_type_names = ['1JHC', '1JHN', '2JHC', '2JHH', '2JHN', '3JHC', '3JHH', '3JHN']
for type_pos, type_name in enumerate(ut_type_names):
    ut_coupling_columns['coupling_type_' + type_name] = [
        int(row_pos == type_pos) for row_pos in range(6)
    ]

ut_coupling = pd.DataFrame(ut_coupling_columns)

ut_coupling.head(6)

In [None]:
# Small hand-made structure table: mol_a has 3 atoms, mol_b has 5, mol_c has 2.
# All atoms lie in the y = 0 plane with identical x and z coordinates.
ut_diagonal_coords = [ -0.5, 0.5, 1, -2, -1, 0, 1, 2, 0.5, 1.5 ]

ut_molecules_columns = {
    'molecule_name': ['mol_a'] * 3 + ['mol_b'] * 5 + ['mol_c'] * 2,
    'atom_index': list(range(3)) + list(range(5)) + list(range(2)),
    'x': list(ut_diagonal_coords),
    'y': [0] * 10,
    'z': list(ut_diagonal_coords),
}

# One-hot atom types: the ten rows cycle through C, F, H, N, O
for type_pos, atom_symbol in enumerate(['C', 'F', 'H', 'N', 'O']):
    ut_molecules_columns['atom_' + atom_symbol] = [
        int(row_pos % 5 == type_pos) for row_pos in range(10)
    ]

ut_molecules = pd.DataFrame(ut_molecules_columns)

ut_molecules.head(10)