In [1]:
%%HTML
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [2]:
# import sys
# !{sys.executable} -m pip install lightgbm

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np

import math
import gc
import copy

from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_absolute_error

import matplotlib.pyplot as plt
import seaborn as sns

from lightgbm import LGBMRegressor

In [2]:
DATA_PATH = '../input'
SUBMISSIONS_PATH = './'
# use atomic numbers to recode atomic names
ATOMIC_NUMBERS = {
    'H': 1,
    'C': 6,
    'N': 7,
    'O': 8,
    'F': 9
}

In [3]:
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 120)
pd.set_option('display.max_columns', 120)

In [4]:
DATA_PATH = '../../data/input'

In [5]:
train_dtypes = {
    'molecule_name': 'category',
    'atom_index_0': 'int8',
    'atom_index_1': 'int8',
    'type': 'category',
    'scalar_coupling_constant': 'float32'
}
train_csv = pd.read_csv(f'{DATA_PATH}/train.csv', dtype=train_dtypes)
train_csv['molecule_index'] = train_csv.molecule_name.str.replace('dsgdb9nsd_', '').astype('int32')

scalar_coupling_contributions_csv = pd.read_csv(f'{DATA_PATH}/scalar_coupling_contributions.csv',dtype=train_dtypes)
scalar_coupling_contributions_csv['molecule_index'] = scalar_coupling_contributions_csv.molecule_name.str.replace('dsgdb9nsd_', '').astype('int32')
train_csv = pd.merge(train_csv, scalar_coupling_contributions_csv[['molecule_index', 'atom_index_0', 'atom_index_1', 'fc', 'sd', 'pso', 'dso']], on=['molecule_index', 'atom_index_0', 'atom_index_1'])


train_csv = train_csv[['id', 'molecule_index', 'atom_index_0', 'atom_index_1', 'type', 'scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso']]
train_csv.index = train_csv['id']
train_csv = train_csv.drop(columns=['id'])
train_csv.head(10)

Unnamed: 0_level_0,molecule_index,atom_index_0,atom_index_1,type,scalar_coupling_constant,fc,sd,pso,dso
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,1,1,0,1JHC,84.807602,83.0224,0.254579,1.25862,0.27201
1,1,1,2,2JHH,-11.257,-11.0347,0.352978,2.85839,-3.4336
2,1,1,3,2JHH,-11.2548,-11.0325,0.352944,2.85852,-3.43387
3,1,1,4,2JHH,-11.2543,-11.0319,0.352934,2.85855,-3.43393
4,1,2,0,1JHC,84.807404,83.0222,0.254585,1.25861,0.272013
5,1,2,3,2JHH,-11.2541,-11.0317,0.352932,2.85856,-3.43395
6,1,2,4,2JHH,-11.2548,-11.0324,0.352943,2.85853,-3.43387
7,1,3,0,1JHC,84.809303,83.0241,0.254634,1.25856,0.272012
8,1,3,4,2JHH,-11.2543,-11.0319,0.352943,2.85856,-3.43393
9,1,4,0,1JHC,84.809502,83.0243,0.254628,1.25856,0.272012


In [6]:
# from sklearn.preprocessing import LabelEncoder

# col = 'type'
# le = LabelEncoder()
# le.fit(list(train_csv[col].values) + list(test_csv[col].values))
# print(le.classes_)

In [7]:
print('Shape: ', train_csv.shape)
print('Total: ', train_csv.memory_usage().sum())
train_csv.memory_usage()

Shape:  (4658147, 9)
Total:  237565881


Index                       37265176
molecule_index              18632588
atom_index_0                4658147 
atom_index_1                4658147 
type                        4658531 
scalar_coupling_constant    18632588
fc                          37265176
sd                          37265176
pso                         37265176
dso                         37265176
dtype: int64

In [8]:
submission_csv = pd.read_csv(f'{DATA_PATH}/sample_submission.csv', index_col='id')

  mask |= (ar1 == a)


In [9]:
test_csv = pd.read_csv(f'{DATA_PATH}/test.csv', index_col='id', dtype=train_dtypes)
test_csv['molecule_index'] = test_csv['molecule_name'].str.replace('dsgdb9nsd_', '').astype('int32')
test_csv = test_csv[['molecule_index', 'atom_index_0', 'atom_index_1', 'type']]
test_csv.head(10)

Unnamed: 0_level_0,molecule_index,atom_index_0,atom_index_1,type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4658147,4,2,0,2JHC
4658148,4,2,1,1JHC
4658149,4,2,3,3JHH
4658150,4,3,0,1JHC
4658151,4,3,1,2JHC
4658152,15,3,0,1JHC
4658153,15,3,2,3JHC
4658154,15,3,4,2JHH
4658155,15,3,5,2JHH
4658156,15,4,0,1JHC


In [10]:
structures_dtypes = {
    'molecule_name': 'category',
    'atom_index': 'int8',
    'atom': 'category',
    'x': 'float32',
    'y': 'float32',
    'z': 'float32'
}
structures_csv = pd.read_csv(f'{DATA_PATH}/structures.csv', dtype=structures_dtypes)
structures_csv['molecule_index'] = structures_csv.molecule_name.str.replace('dsgdb9nsd_', '').astype('int32')
structures_csv = structures_csv[['molecule_index', 'atom_index', 'atom', 'x', 'y', 'z']]
structures_csv['atom'] = structures_csv['atom'].replace(ATOMIC_NUMBERS).astype('int8')
structures_csv.head(10)

Unnamed: 0,molecule_index,atom_index,atom,x,y,z
0,1,0,6,-0.012698,1.085804,0.008001
1,1,1,1,0.00215,-0.006031,0.001976
2,1,2,1,1.011731,1.463751,0.000277
3,1,3,1,-0.540815,1.447527,-0.876644
4,1,4,1,-0.523814,1.437933,0.906397
5,2,0,7,-0.040426,1.024108,0.062564
6,2,1,1,0.017257,0.012545,-0.027377
7,2,2,1,0.915789,1.358745,-0.028758
8,2,3,1,-0.520278,1.343532,-0.775543
9,3,0,8,-0.03436,0.97754,0.007602


In [11]:
print('Shape: ', structures_csv.shape)
print('Total: ', structures_csv.memory_usage().sum())
structures_csv.memory_usage()

Shape:  (2358657, 6)
Total:  42455906


Index             80     
molecule_index    9434628
atom_index        2358657
atom              2358657
x                 9434628
y                 9434628
z                 9434628
dtype: int64

In [12]:
def build_type_dataframes(base, structures, coupling_type):
    base = base[base['type'] == coupling_type].drop('type', axis=1).copy()
    base = base.reset_index()
    base['id'] = base['id'].astype('int32')
    structures = structures[structures['molecule_index'].isin(base['molecule_index'])]
    return base, structures

In [13]:
def add_coordinates(base, structures, index):
    df = pd.merge(base, structures, how='inner',
                  left_on=['molecule_index', f'atom_index_{index}'],
                  right_on=['molecule_index', 'atom_index']).drop(['atom_index'], axis=1)
    df = df.rename(columns={
        'atom': f'atom_{index}',
        'x': f'x_{index}',
        'y': f'y_{index}',
        'z': f'z_{index}'
    })
    return df

In [14]:
def add_atoms(base, atoms):
    df = pd.merge(base, atoms, how='inner',
                  on=['molecule_index', 'atom_index_0', 'atom_index_1'])
    return df

In [15]:
def merge_all_atoms(base, structures):
    df = pd.merge(base, structures, how='left',
                  left_on=['molecule_index'],
                  right_on=['molecule_index'])
    df = df[(df.atom_index_0 != df.atom_index) & (df.atom_index_1 != df.atom_index)]
    return df

In [16]:
def add_center(df):
    df['x_c'] = ((df['x_1'] + df['x_0']) * np.float32(0.5))
    df['y_c'] = ((df['y_1'] + df['y_0']) * np.float32(0.5))
    df['z_c'] = ((df['z_1'] + df['z_0']) * np.float32(0.5))

def add_distance_to_center(df):
    df['d_c'] = ((
        (df['x_c'] - df['x'])**np.float32(2) +
        (df['y_c'] - df['y'])**np.float32(2) + 
        (df['z_c'] - df['z'])**np.float32(2)
    )**np.float32(0.5))

def add_distance_between(df, suffix1, suffix2):
    df[f'd_{suffix1}_{suffix2}'] = ((
        (df[f'x_{suffix1}'] - df[f'x_{suffix2}'])**np.float32(2) +
        (df[f'y_{suffix1}'] - df[f'y_{suffix2}'])**np.float32(2) + 
        (df[f'z_{suffix1}'] - df[f'z_{suffix2}'])**np.float32(2)
    )**np.float32(0.5))

In [17]:
def add_distances(df):
    n_atoms = 1 + max([int(c.split('_')[1]) for c in df.columns if c.startswith('x_')])
    
    for i in range(1, n_atoms):
        for vi in range(min(4, i)):
            add_distance_between(df, i, vi)

In [18]:
def add_n_atoms(base, structures):
    dfs = structures['molecule_index'].value_counts().rename('n_atoms').to_frame()
    return pd.merge(base, dfs, left_on='molecule_index', right_index=True)

In [19]:
def build_couple_dataframe(some_csv, structures_csv, coupling_type, n_atoms=10):
    base, structures = build_type_dataframes(some_csv, structures_csv, coupling_type)
    base = add_coordinates(base, structures, 0)
    base = add_coordinates(base, structures, 1)
    
    base = base.drop(['atom_0', 'atom_1'], axis=1)
    atoms = base.drop('id', axis=1).copy()
    for y_i in ['scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso']:
        if y_i in some_csv:
            atoms = atoms.drop([y_i], axis=1)
        
    add_center(atoms)
    atoms = atoms.drop(['x_0', 'y_0', 'z_0', 'x_1', 'y_1', 'z_1'], axis=1)

    atoms = merge_all_atoms(atoms, structures)
    
    add_distance_to_center(atoms)
    
    atoms = atoms.drop(['x_c', 'y_c', 'z_c', 'atom_index'], axis=1)
    atoms.sort_values(['molecule_index', 'atom_index_0', 'atom_index_1', 'd_c'], inplace=True)
    atom_groups = atoms.groupby(['molecule_index', 'atom_index_0', 'atom_index_1'])
    atoms['num'] = atom_groups.cumcount() + 2
    atoms = atoms.drop(['d_c'], axis=1)
    atoms = atoms[atoms['num'] < n_atoms]

    atoms = atoms.set_index(['molecule_index', 'atom_index_0', 'atom_index_1', 'num']).unstack()
    atoms.columns = [f'{col[0]}_{col[1]}' for col in atoms.columns]
    atoms = atoms.reset_index()
    
    # downcast back to int8
    for col in atoms.columns:
        if col.startswith('atom_'):
            atoms[col] = atoms[col].fillna(0).astype('int8')
            
    atoms['molecule_index'] = atoms['molecule_index'].astype('int32')
    
    full = add_atoms(base, atoms)
    add_distances(full)
    
    full.sort_values('id', inplace=True)
    
    return full

In [20]:
# from numpy import (array, dot, arccos, clip)
# from numpy.linalg import norm

# u = array([1.,2,3,4])
# v = ...
# c = dot(u,v)/norm(u)/norm(v) # -> cosine of the angle
# angle = arccos(clip(c, -1, 1)) # if you really want the angle

In [41]:
def take_n_atoms(df, n_atoms, four_start=4, include_id=True, y_name = 'scalar_coupling_constant'):
    labels = []
    for i in range(2, n_atoms):
        label = f'atom_{i}'
        labels.append(label)
    
    
    for i in range(n_atoms):
        num = min(i, 4) if i < four_start else 4
        for j in range(num):
            labels.append(f'd_{i}_{j}')
    if y_name in df:
        labels.append(y_name)
    if include_id:
        labels = labels + ['id']
    return df[labels]

In [22]:
# %%time
# full = build_couple_dataframe(train_csv, structures_csv, '1JHN', n_atoms=10)
# print(full.shape)

In [141]:
full.columns

Index(['id', 'molecule_index', 'atom_index_0', 'atom_index_1',
       'scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso', 'x_0', 'y_0',
       'z_0', 'x_1', 'y_1', 'z_1', 'atom_2', 'atom_3', 'atom_4', 'atom_5',
       'atom_6', 'atom_7', 'atom_8', 'atom_9', 'x_2', 'x_3', 'x_4', 'x_5',
       'x_6', 'x_7', 'x_8', 'x_9', 'y_2', 'y_3', 'y_4', 'y_5', 'y_6', 'y_7',
       'y_8', 'y_9', 'z_2', 'z_3', 'z_4', 'z_5', 'z_6', 'z_7', 'z_8', 'z_9',
       'd_1_0', 'd_2_0', 'd_2_1', 'd_3_0', 'd_3_1', 'd_3_2', 'd_4_0', 'd_4_1',
       'd_4_2', 'd_4_3', 'd_5_0', 'd_5_1', 'd_5_2', 'd_5_3', 'd_6_0', 'd_6_1',
       'd_6_2', 'd_6_3', 'd_7_0', 'd_7_1', 'd_7_2', 'd_7_3', 'd_8_0', 'd_8_1',
       'd_8_2', 'd_8_3', 'd_9_0', 'd_9_1', 'd_9_2', 'd_9_3'],
      dtype='object')

In [23]:
column_list=[{'columns': ['cos_center1', 'adC3', 'dist_H_4_y', 'inv_dist1R', 'eem_1', 'dist_C_0_y', 'dist_to_type_std', 'cos_f0', 'molecule_dist_min', 'inv_dist1', 'eem_0', 'molecule_atom_index_0_dist_max_div', 'molecule_type_dist_std_diff', 'tertiary_angle_0', 'tertiary_distance_5', 'dist_O_1_x', 'tertiary_distance_3', 'tertiary_distance_2', 'dist_C_4_y', 'dist_to_type_mean', 'inv_distPR', 'inv_dist1E', 'adC1', 'dist_C_0_x', 'dist_H_1_x', 'dist_O_0_x', 'adN1', 'dist_to_type_1_mean', 'atom_index_1_cycle_size_mean', 'dist_N_0_y', 'dist_H_3_x', 'tertiary_angle_1', 'linkM0', 'tertiary_angle_4', 'dist_C_1_x', 'distance_farthest_0', 'molecule_type_dist_mean_diff', 'dist_to_type_0_mean', 'dist_C_3_x', 'tertiary_distance_4', 'max_molecule_atom_0_dist_xyz', 'inv_dist0R', 'dist_H_0_y', 'tertiary_distance_0', 'dist_C_2_x', 'tertiary_angle_5', 'dist_H_0_x', 'coulomb_H.y', 'tertiary_distance_1', 'dist_H_2_y', 'dist_O_1_y', 'atom_1_n_bonds', 'atom_1_bond_lengths_std', 'dist_O_0_y', 'dist_C_3_y', 'atom_1_bond_lengths_mean', 'distC1', 'dist_xyz', 'dist_C_2_y', 'tertiary_angle_2', 'tertiary_atom_0', 'dist_H_3_y', 'tertiary_atom_1', 'tertiary_angle_3', 'max_molecule_atom_1_dist_xyz', 'dist_C_1_y', 'atom_1_bond_lengths_max', 'distC0', 'vander_H.x', 'dist_H_2_x', 'adC2', 'yukawa_H.y', 'molecule_atom_index_1_dist_min_diff', 'cos_center0', 'dist_C_4_x', 'dist_N_0_x', 'atom_index_farthest_0', 'dist_H_1_y'], 'cv': {'cls': 'KFold', 'init': {'n_splits': 5, 'shuffle': True, 'random_state': 42}}, 'scaler': {'cls': 'StandardScaler', 'init': {}, 'fit': {}}, 'model': {'cls': 'lgb.LGBMRegressor', 'init': {'learning_rate': 0.2833769330240482, 'feature_fraction': 0.8818248470204605, 'bagging_fraction': 0.8205197060908092, 'min_data_in_leaf': 202, 'lambda_l1': 0.017039063121824582, 'lambda_l2': 0.8318702431636841, 'max_bin': 100, 'num_leaves': 255, 'random_state': 3895, 'n_jobs': 16}, 'fit': {}}, 'metric': 'mean_absolute_error'},
{'columns': ['min_molecule_atom_1_dist_xyz', 'tertiary_distance_1', 'adC3', 'dist_H_2_y', 'inv_dist1R', 'dist_C_0_y', 'cos_f0', 'atom_1_bond_lengths_std', 'inv_dist1', 'eem_0', 'dist_O_0_y', 'molecule_atom_index_0_dist_max_div', 'cos_center0_center1', 'tertiary_angle_0', 'tertiary_distance_5', 'tertiary_distance_3', 'tertiary_distance_2', 'dist_C_3_y', 'atom_1_bond_lengths_mean', 'dist_C_2_y', 'distance_center0', 'distN0', 'vander_N.x', 'inv_distPR', 'tertiary_angle_2', 'tertiary_atom_0', 'dist_H_3_y', 'adC4', 'dist_C_0_x', 'dist_H_1_x', 'tertiary_angle_3', 'dist_O_0_x', 'dist_C_1_y', 'atom_1_bond_lengths_max', 'molecule_atom_index_1_dist_max_div', 'atom_index_1_cycle_size_mean', 'dist_N_0_y', 'tertiary_angle_1', 'dist_C_1_x', 'vander_H.x', 'dist_to_type_0_mean', 'inv_distP', 'dist_C_3_x', 'tertiary_distance_4', 'bond_atom', 'max_molecule_atom_0_dist_xyz', 'inv_dist0R', 'vander_H.y', 'dist_N_1_x', 'dist_H_0_y', 'yukawa_H.y', 'tertiary_distance_0', 'yukawa_H.x', 'dist_C_2_x', 'linkN', 'dist_N_0_x', 'dist_H_0_x', 'dist_H_1_y'], 'cv': {'cls': 'KFold', 'init': {'n_splits': 5, 'shuffle': True, 'random_state': 42}}, 'scaler': {'cls': 'StandardScaler', 'init': {}, 'fit': {}}, 'model': {'cls': 'lgb.LGBMRegressor', 'init': {'learning_rate': 0.2833769330240482, 'feature_fraction': 0.8818248470204605, 'bagging_fraction': 0.8205197060908092, 'min_data_in_leaf': 202, 'lambda_l1': 0.017039063121824582, 'lambda_l2': 0.8318702431636841, 'max_bin': 100, 'num_leaves': 255, 'random_state': 3895, 'n_jobs': 16}, 'fit': {}}, 'metric': 'mean_absolute_error'},
{'columns': ['tertiary_distance_6', 'adC3', 'inv_dist1R', 'eem_1', 'cos_f0', 'dist_C_0_y', 'cos_c1', 'yukawa_O.x', 'inv_dist1', 'tertiary_atom_2', 'molecule_atom_index_0_dist_max_div', 'tertiary_distance_2', 'tertiary_angle_0', 'dist_O_1_x', 'tertiary_distance_3', 'mean_molecule_atom_0_dist_xyz', 'inv_distPR', 'min_molecule_atom_0_dist_xyz', 'molecule_type_dist_max', 'dist_C_0_x', 'dist_H_1_x', 'adC1', 'dist_O_0_x', 'molecule_atom_index_0_dist_max_diff', 'adN1', 'atom_index_1_cycle_size_mean', 'dist_N_0_y', 'tertiary_angle_1', 'tertiary_angle_4', 'dist_C_1_x', 'distance_farthest_0', 'inv_dist0', 'dist_C_3_x', 'atom_0_bond_lengths_max', 'atom_1_bond_lengths_min', 'bond_atom', 'inv_dist0R', 'dist_H_0_y', 'molecule_atom_index_1_dist_min_div', 'dist_C_2_x', 'cos_c0', 'dist_H_0_x', 'tertiary_distance_1', 'dist_H_2_y', 'tertiary_atom_3', 'atom_1_n_bonds', 'atom_1_bond_lengths_std', 'dist_O_0_y', 'vander_O.y', 'dist_C_3_y', 'atom_1_bond_lengths_mean', 'dist_C_2_y', 'yukawa_O.y', 'molecule_atom_index_0_dist_min_div', 'tertiary_angle_2', 'tertiary_atom_0', 'tertiary_atom_1', 'tertiary_angle_3', 'max_molecule_atom_1_dist_xyz', 'atom_1_bond_lengths_max', 'dist_C_1_y', 'distC0', 'adC2', 'tertiary_atom_4', 'cos_c0_c1', 'atom_index_1_n_cycle', 'dist_N_0_x', 'dist_H_1_y'], 'cv': {'cls': 'KFold', 'init': {'n_splits': 5, 'shuffle': True, 'random_state': 42}}, 'scaler': {'cls': 'StandardScaler', 'init': {}, 'fit': {}}, 'model': {'cls': 'lgb.LGBMRegressor', 'init': {'learning_rate': 0.2833769330240482, 'feature_fraction': 0.8818248470204605, 'bagging_fraction': 0.8205197060908092, 'min_data_in_leaf': 202, 'lambda_l1': 0.017039063121824582, 'lambda_l2': 0.8318702431636841, 'max_bin': 100, 'num_leaves': 255, 'random_state': 3895, 'n_jobs': 16}, 'fit': {}}, 'metric': 'mean_absolute_error'},
{'columns': ['tertiary_distance_2', 'dist_C_0_y', 'dist_H_1_x', 'molecule_atom_index_0_dist_min_diff', 'molecule_atom_index_0_dist_max_div', 'adC3', 'dist_C_3_y', 'tertiary_angle_1', 'yukawa_H.y', 'cos_f0_f1', 'dist_C_1_y', 'dist_to_type_1_mean', 'dist_O_0_y', 'cos_c1', 'adC2', 'dist_C_0_x', 'molecule_atom_index_0_dist_min_div', 'dist_to_type_std', 'adC1', 'tertiary_distance_1', 'dist_H_0_y', 'molecule_dist_min', 'max_distance_y', 'inv_distPE', 'dist_xyz', 'eem_0', 'dist_O_0_x', 'dist_to_type_mean', 'cos_c0_c1', 'cos_c0', 'adN1', 'tertiary_angle_0', 'tertiary_distance_4', 'dist_H_0_x', 'dist_C_1_x', 'inv_distP', 'molecule_atom_index_0_dist_mean_diff', 'tertiary_atom_1', 'tertiary_angle_2', 'mean_molecule_atom_0_dist_xyz', 'dist_C_2_y', 'dist_H_1_y', 'dist_C_3_x', 'dist_H_2_y', 'dist_H_3_y', 'link0', 'yukawa_H.x', 'dist_C_2_x', 'dist_N_0_y', 'dist_to_type_0_mean', 'dist_N_0_x', 'eem_1', 'tertiary_angle_3', 'distance_c1', 'dist_H_3_x', 'tertiary_distance_3', 'cos_f0', 'cos_f1', 'tertiary_atom_2'], 'cv': {'cls': 'KFold', 'init': {'n_splits': 5, 'shuffle': True, 'random_state': 42}}, 'scaler': {'cls': 'StandardScaler', 'init': {}, 'fit': {}}, 'model': {'cls': 'lgb.LGBMRegressor', 'init': {'learning_rate': 0.2833769330240482, 'feature_fraction': 0.8818248470204605, 'bagging_fraction': 0.8205197060908092, 'min_data_in_leaf': 202, 'lambda_l1': 0.017039063121824582, 'lambda_l2': 0.8318702431636841, 'max_bin': 100, 'num_leaves': 255, 'random_state': 3895, 'n_jobs': 16}, 'fit': {}}, 'metric': 'mean_absolute_error'},
{'columns': ['tertiary_distance_2', 'dist_C_0_y', 'dist_H_1_x', 'molecule_atom_index_0_dist_min_diff', 'molecule_atom_index_0_dist_max_div', 'adC3', 'dist_C_3_y', 'tertiary_angle_1', 'yukawa_H.y', 'cos_f0_f1', 'dist_C_1_y', 'dist_to_type_1_mean', 'dist_O_0_y', 'cos_c1', 'adC2', 'dist_C_0_x', 'molecule_atom_index_0_dist_min_div', 'dist_to_type_std', 'adC1', 'tertiary_distance_1', 'dist_H_0_y', 'molecule_dist_min', 'max_distance_y', 'inv_distPE', 'dist_xyz', 'eem_0', 'dist_O_0_x', 'dist_to_type_mean', 'cos_c0_c1', 'cos_c0', 'adN1', 'tertiary_angle_0', 'tertiary_distance_4', 'dist_H_0_x', 'dist_C_1_x', 'inv_distP', 'molecule_atom_index_0_dist_mean_diff', 'tertiary_atom_1', 'tertiary_angle_2', 'mean_molecule_atom_0_dist_xyz', 'dist_C_2_y', 'dist_H_1_y', 'dist_C_3_x', 'dist_H_2_y', 'dist_H_3_y', 'link0', 'yukawa_H.x', 'dist_C_2_x', 'dist_N_0_y', 'dist_to_type_0_mean', 'dist_N_0_x', 'eem_1', 'tertiary_angle_3', 'distance_c1', 'dist_H_3_x', 'tertiary_distance_3', 'cos_f0', 'cos_f1', 'tertiary_atom_2'], 'cv': {'cls': 'KFold', 'init': {'n_splits': 5, 'shuffle': True, 'random_state': 42}}, 'scaler': {'cls': 'StandardScaler', 'init': {}, 'fit': {}}, 'model': {'cls': 'lgb.LGBMRegressor', 'init': {'learning_rate': 0.2833769330240482, 'feature_fraction': 0.8818248470204605, 'bagging_fraction': 0.8205197060908092, 'min_data_in_leaf': 202, 'lambda_l1': 0.017039063121824582, 'lambda_l2': 0.8318702431636841, 'max_bin': 100, 'num_leaves': 255, 'random_state': 3895, 'n_jobs': 16}, 'fit': {}}, 'metric': 'mean_absolute_error'},
{'columns': ['tertiary_distance_2', 'dist_C_0_y', 'dist_H_1_x', 'molecule_atom_index_0_dist_min_diff', 'molecule_atom_index_0_dist_max_div', 'adC3', 'dist_C_3_y', 'tertiary_angle_1', 'yukawa_H.y', 'cos_f0_f1', 'dist_C_1_y', 'dist_to_type_1_mean', 'dist_O_0_y', 'cos_c1', 'adC2', 'dist_C_0_x', 'molecule_atom_index_0_dist_min_div', 'dist_to_type_std', 'adC1', 'tertiary_distance_1', 'dist_H_0_y', 'molecule_dist_min', 'max_distance_y', 'inv_distPE', 'dist_xyz', 'eem_0', 'dist_O_0_x', 'dist_to_type_mean', 'cos_c0_c1', 'cos_c0', 'adN1', 'tertiary_angle_0', 'tertiary_distance_4', 'dist_H_0_x', 'dist_C_1_x', 'inv_distP', 'molecule_atom_index_0_dist_mean_diff', 'tertiary_atom_1', 'tertiary_angle_2', 'mean_molecule_atom_0_dist_xyz', 'dist_C_2_y', 'dist_H_1_y', 'dist_C_3_x', 'dist_H_2_y', 'dist_H_3_y', 'link0', 'yukawa_H.x', 'dist_C_2_x', 'dist_N_0_y', 'dist_to_type_0_mean', 'dist_N_0_x', 'eem_1', 'tertiary_angle_3', 'distance_c1', 'dist_H_3_x', 'tertiary_distance_3', 'cos_f0', 'cos_f1', 'tertiary_atom_2'], 'cv': {'cls': 'KFold', 'init': {'n_splits': 5, 'shuffle': True, 'random_state': 42}}, 'scaler': {'cls': 'StandardScaler', 'init': {}, 'fit': {}}, 'model': {'cls': 'lgb.LGBMRegressor', 'init': {'learning_rate': 0.2833769330240482, 'feature_fraction': 0.8818248470204605, 'bagging_fraction': 0.8205197060908092, 'min_data_in_leaf': 202, 'lambda_l1': 0.017039063121824582, 'lambda_l2': 0.8318702431636841, 'max_bin': 100, 'num_leaves': 255, 'random_state': 3895, 'n_jobs': 16}, 'fit': {}}, 'metric': 'mean_absolute_error'},
{'columns': ['tertiary_distance_2', 'dist_C_0_y', 'dist_H_1_x', 'molecule_atom_index_0_dist_min_diff', 'molecule_atom_index_0_dist_max_div', 'adC3', 'dist_C_3_y', 'tertiary_angle_1', 'yukawa_H.y', 'cos_f0_f1', 'dist_C_1_y', 'dist_to_type_1_mean', 'dist_O_0_y', 'cos_c1', 'adC2', 'dist_C_0_x', 'molecule_atom_index_0_dist_min_div', 'dist_to_type_std', 'adC1', 'tertiary_distance_1', 'dist_H_0_y', 'molecule_dist_min', 'max_distance_y', 'inv_distPE', 'dist_xyz', 'eem_0', 'dist_O_0_x', 'dist_to_type_mean', 'cos_c0_c1', 'cos_c0', 'adN1', 'tertiary_angle_0', 'tertiary_distance_4', 'dist_H_0_x', 'dist_C_1_x', 'inv_distP', 'molecule_atom_index_0_dist_mean_diff', 'tertiary_atom_1', 'tertiary_angle_2', 'mean_molecule_atom_0_dist_xyz', 'dist_C_2_y', 'dist_H_1_y', 'dist_C_3_x', 'dist_H_2_y', 'dist_H_3_y', 'link0', 'yukawa_H.x', 'dist_C_2_x', 'dist_N_0_y', 'dist_to_type_0_mean', 'dist_N_0_x', 'eem_1', 'tertiary_angle_3', 'distance_c1', 'dist_H_3_x', 'tertiary_distance_3', 'cos_f0', 'cos_f1', 'tertiary_atom_2'], 'cv': {'cls': 'KFold', 'init': {'n_splits': 5, 'shuffle': True, 'random_state': 42}}, 'scaler': {'cls': 'StandardScaler', 'init': {}, 'fit': {}}, 'model': {'cls': 'lgb.LGBMRegressor', 'init': {'learning_rate': 0.2833769330240482, 'feature_fraction': 0.8818248470204605, 'bagging_fraction': 0.8205197060908092, 'min_data_in_leaf': 202, 'lambda_l1': 0.017039063121824582, 'lambda_l2': 0.8318702431636841, 'max_bin': 100, 'num_leaves': 255, 'random_state': 3895, 'n_jobs': 16}, 'fit': {}}, 'metric': 'mean_absolute_error'},
{'columns': ['max_molecule_atom_0_dist_xyz', 'tertiary_distance_2', 'dist_C_0_y', 'dist_H_1_x', 'molecule_atom_index_0_dist_min_diff', 'molecule_atom_index_1_dist_min_diff', 'dist_C_4_x', 'adC3', 'distN0', 'atom_0_bond_lengths_max', 'dist_C_3_y', 'tertiary_angle_1', 'atom_1_bond_lengths_mean', 'dist_C_1_y', 'dist_to_type_1_mean', 'dist_O_0_y', 'cos_c1', 'adC2', 'molecule_atom_index_1_dist_max_div', 'molecule_atom_index_0_dist_min_div', 'dist_C_0_x', 'adC1', 'tertiary_distance_1', 'atom_1_bond_lengths_std', 'dist_H_0_y', 'dist_xyz', 'dist_O_0_x', 'inv_distPR', 'cos_c0_c1', 'cos_c0', 'tertiary_angle_0', 'tertiary_distance_4', 'dist_H_0_x', 'dist_C_1_x', 'tertiary_atom_1', 'tertiary_angle_2', 'mean_molecule_atom_0_dist_xyz', 'dist_C_2_y', 'dist_C_3_x', 'atom_1_n_bonds', 'bond_atom', 'dist_C_2_x', 'dist_N_0_y', 'dist_N_0_x', 'atom_index_1_cycle_size_mean', 'min_molecule_atom_0_dist_xyz', 'tertiary_angle_3', 'tertiary_distance_3', 'cos_f0'], 'cv': {'cls': 'KFold', 'init': {'n_splits': 5, 'shuffle': True, 'random_state': 42}}, 'scaler': {'cls': 'StandardScaler', 'init': {}, 'fit': {}}, 'model': {'cls': 'lgb.LGBMRegressor', 'init': {'learning_rate': 0.2833769330240482, 'feature_fraction': 0.8818248470204605, 'bagging_fraction': 0.8205197060908092, 'min_data_in_leaf': 202, 'lambda_l1': 0.017039063121824582, 'lambda_l2': 0.8318702431636841, 'max_bin': 100, 'num_leaves': 255, 'random_state': 3895, 'n_jobs': 16}, 'fit': {}}, 'metric': 'mean_absolute_error'}]

In [24]:
unique_columns = []
for col in column_list:
    unique_columns +=col['columns'][:30]

In [25]:
unique_columns = list(set(unique_columns))

In [26]:
# df = take_n_atoms(full, 7)
# # LightGBM performs better with 0-s then with NaN-s
# df = df.fillna(0)
# df.columns

In [27]:
file_folder =  '../../data/feature'
df_data = pd.read_pickle(f'{file_folder}/df_train.gzde', compression='gzip')
df_data = df_data.rename(columns={'index':'id'})
df_data = df_data[unique_columns+['id']]

In [28]:
file_folder =  '../../data/feature'
df_data_test = pd.read_pickle(f'{file_folder}/df_test.gzde', compression='gzip')
df_data_test = df_data_test.rename(columns={'index':'id'})
df_data_test = df_data_test[unique_columns+['id']]

In [None]:
# df = pd.merge(df, df_data[unique_columns+['id']], on='id')

In [100]:
# X_data = df.drop(['scalar_coupling_constant', 'id'], axis=1).values.astype('float32')
# y_data = df['scalar_coupling_constant'].values.astype('float32')

# X_train, X_val, y_train, y_val = train_test_split(X_data, y_data, test_size=0.2, random_state=128)
# X_train.shape, X_val.shape, y_train.shape, y_val.shape

((34690, 77), (8673, 77), (34690,), (8673,))

In [29]:
# configuration params are copied from @artgor kernel:
# https://www.kaggle.com/artgor/brute-force-feature-engineering
LGB_PARAMS = {
    'objective': 'regression',
    'metric': 'mae',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'learning_rate': 0.2,
    'num_leaves': 256,
    'min_child_samples': 79,
    'max_depth': 9,
    'subsample_freq': 1,
    'subsample': 0.9,
    'bagging_seed': 11,
    'reg_alpha': 0.1,
    'reg_lambda': 0.3,
    'colsample_bytree': 1.0
}

In [32]:
# model = LGBMRegressor(**LGB_PARAMS, n_estimators=1500, n_jobs = -1)
# model.fit(X_train, y_train, 
#         eval_set=[(X_train, y_train), (X_val, y_val)], eval_metric='mae',
#         verbose=100, early_stopping_rounds=200)

# y_pred = model.predict(X_val)
# np.log(mean_absolute_error(y_val, y_pred))

In [34]:
# mean_absolute_error(y_val, y_pred)

In [35]:
# cols = df.columns.drop(['id', 'scalar_coupling_constant'])
# df_importance = pd.DataFrame({'feature': cols, 'importance': model.feature_importances_})
# sns.barplot(x="importance", y="feature", data=df_importance.sort_values('importance', ascending=False).head(20));

In [205]:
def build_x_y_data(some_csv, coupling_type, n_atoms, is_train=True, y_name='scalar_coupling_constant', oof_train=None, oof_test=None):
    full = build_couple_dataframe(some_csv, structures_csv, coupling_type, n_atoms=n_atoms)
    
    df = take_n_atoms(full, n_atoms, y_name = y_name)
    df = df.fillna(0)
    print(df.columns)
    id_index = df.id
    if is_train:
        df = pd.merge(df, df_data[unique_columns+['id']], on='id')
        if type(oof_train) != type(None):
            df = pd.merge(df, oof_train, on='id')
        df = df.drop(columns=['id'])
    else:
        df = pd.merge(df, df_data_test[unique_columns+['id']], on='id')
        if type(oof_test) != type(None):
            df = pd.merge(df, oof_test, on='id')
        df = df.drop(columns=['id'])
        
    if y_name in df:
        X_data = df.drop([y_name], axis=1).values.astype('float32')
        y_data = df[y_name].values.astype('float32')
    else:
        X_data = df.values.astype('float32')
        y_data = None
    
    return X_data, y_data, id_index

In [104]:
# oof = pd.DataFrame(np.zeros((train_csv.shape[0],1)), columns=['value'])
# oof.index = train_csv.index

In [181]:
# oof.loc[oof.index[[2,3]], 'value']=9

In [200]:
df_oof_test.head()

Unnamed: 0_level_0,oof_scalar_coupling_constant,oof_fc,oof_sd,oof_pso,oof_dso,id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
4658147,9.149982,12.249523,0.906575,5.359693,-1.267781,4658147
4658148,179.073181,180.952896,0.900566,-0.84744,0.303198,4658148
4658149,12.809299,13.23485,0.012303,2.276132,-3.326517,4658149
4658150,179.713409,181.207138,0.916309,-0.84393,0.297011,4658150
4658151,8.526583,12.077095,0.901909,5.355651,-1.267441,4658151


In [201]:
# df_oof_train['id'] = df_oof_train.index
# df_oof_test['id'] = df_oof_test.index

df_oof_train = df_oof_train.reset_index(drop=True)
df_oof_test = df_oof_test.reset_index(drop=True)

In [208]:
def train_and_predict_for_one_coupling_type(trial, coupling_type, submission, n_atoms, n_folds=5, n_splits=5, random_state=128, oof=None, y_name='scalar_coupling_constant'):
    print(f'*** Training Model for {coupling_type} ***')
    
    X_data, y_data, X_data_index  = build_x_y_data(train_csv, coupling_type, n_atoms, y_name=y_name, oof_train=df_oof_train, oof_test=df_oof_test)
    X_test, _, _  = build_x_y_data(test_csv, coupling_type, n_atoms, is_train=False, oof_train=df_oof_train, oof_test=df_oof_test)
    y_pred = np.zeros(X_test.shape[0], dtype='float32')

    cv_score = 0
    
    if n_folds > n_splits:
        n_splits = n_folds
    
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    for fold, (train_index, val_index) in enumerate(kfold.split(X_data, y_data)):
        if fold >= n_folds:
            break

        X_train, X_val = X_data[train_index], X_data[val_index]
        y_train, y_val = y_data[train_index], y_data[val_index]

        model = LGBMRegressor(**LGB_PARAMS, n_estimators=1500, n_jobs = -1)
        model.fit(X_train, y_train, 
            eval_set=[(X_train, y_train), (X_val, y_val)], eval_metric='mae',
            early_stopping_rounds=200, verbose=False)

        y_val_pred = model.predict(X_val)
        
        if type(oof)!=type(None):
            oof.loc[X_data_index.values[val_index],f'oof_{y_name}'] = y_val_pred
            trial.append({'X_data_index':X_data_index, 'val_index':val_index, 'y_val_pred':y_val_pred})
        
        val_score = np.log(mean_absolute_error(y_val, y_val_pred))
        print(f'{coupling_type} Fold {fold}, logMAE: {val_score}')
        
        cv_score += val_score / n_folds
        y_pred += model.predict(X_test) / n_folds
        
        
    submission.loc[test_csv['type'] == coupling_type, 'scalar_coupling_constant'] = y_pred
    return cv_score

In [None]:
def train_and_predict_for_one_coupling_type_xgboost(trial, coupling_type, submission, n_atoms, n_folds=5, n_splits=5, random_state=128, oof=None, y_name='scalar_coupling_constant'):
    print(f'*** Training Model for {coupling_type} ***')
    
    X_data, y_data, X_data_index  = build_x_y_data(train_csv, coupling_type, n_atoms, y_name=y_name, oof_train=df_oof_train, oof_test=df_oof_test)
    X_test, _, _  = build_x_y_data(test_csv, coupling_type, n_atoms, is_train=False, oof_train=df_oof_train, oof_test=df_oof_test)
    y_pred = np.zeros(X_test.shape[0], dtype='float32')

    cv_score = 0
    
    if n_folds > n_splits:
        n_splits = n_folds
    
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    for fold, (train_index, val_index) in enumerate(kfold.split(X_data, y_data)):
        if fold >= n_folds:
            break

        X_train, X_val = X_data[train_index], X_data[val_index]
        y_train, y_val = y_data[train_index], y_data[val_index]

        model = LGBMRegressor(**LGB_PARAMS, n_estimators=1500, n_jobs = -1)
        model.fit(X_train, y_train, 
            eval_set=[(X_train, y_train), (X_val, y_val)], eval_metric='mae',
            early_stopping_rounds=200, verbose=False)

        y_val_pred = model.predict(X_val)
        
        if type(oof)!=type(None):
            oof.loc[X_data_index.values[val_index],f'oof_{y_name}'] = y_val_pred
            trial.append({'X_data_index':X_data_index, 'val_index':val_index, 'y_val_pred':y_val_pred})
        
        val_score = np.log(mean_absolute_error(y_val, y_val_pred))
        print(f'{coupling_type} Fold {fold}, logMAE: {val_score}')
        
        cv_score += val_score / n_folds
        y_pred += model.predict(X_test) / n_folds
        
        
    submission.loc[test_csv['type'] == coupling_type, 'scalar_coupling_constant'] = y_pred
    return cv_score

In [87]:
# def train_and_predict_for_one_coupling_type2(trial, coupling_type, submission, n_atoms, n_folds=5, n_splits=5, random_state=128, oof=None, y_name='scalar_coupling_constant'):
#     print(f'*** Training Model for {coupling_type} ***')
    
#     X_data, y_data, X_data_index  = build_x_y_data(train_csv, coupling_type, n_atoms, y_name)
#     X_test, _, _  = build_x_y_data(test_csv, coupling_type, n_atoms, is_train=False)
#     y_pred = np.zeros(X_test.shape[0], dtype='float32')
#     print(X_data.shape)
#     cv_score = 0
    
#     if n_folds > n_splits:
#         n_splits = n_folds
    
#     kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

#     for fold, (train_index, val_index) in enumerate(kfold.split(X_data, y_data)):
#         if fold >= n_folds:
#             break

        
#         if type(oof)!=type(None):
#             tmp = ['1JHC', '2JHH', '1JHN', '2JHN', '2JHC', '3JHH', '3JHC', '3JHN'].index(coupling_type)
#             print(coupling_type, tmp)
#             trial.append({'X_data_index':X_data_index, 'val_index':val_index, 'tmp':tmp, 'fold':fold})
#             oof.loc[X_data_index[val_index],f'oof_{y_name}'] = tmp*100+fold
            
#     return cv_score

In [138]:
model_params = {
    '1JHN': 7,
    '1JHC': 10,
    '2JHH': 9,
    '2JHN': 9,
    '2JHC': 9,
    '3JHH': 9,
    '3JHC': 10,
    '3JHN': 10
}
N_FOLDS = 5


y_name = 'scalar_coupling_constant'

submission = submission_csv.copy()
oof_df = pd.DataFrame(np.zeros((train_csv.shape[0],1)), columns=[f'oof_{y_name}'])
oof_df.index = train_csv.index

trial = []
cv_scores = {}
for coupling_type in model_params.keys():
    cv_score = train_and_predict_for_one_coupling_type(trial, coupling_type, submission, n_atoms=model_params[coupling_type], n_folds=N_FOLDS, oof=oof_df, y_name=y_name)
    cv_scores[coupling_type] = cv_score

*** Training Model for 1JHN ***
Index(['atom_2', 'atom_3', 'atom_4', 'atom_5', 'atom_6', 'd_1_0', 'd_2_0',
       'd_2_1', 'd_3_0', 'd_3_1', 'd_3_2', 'd_4_0', 'd_4_1', 'd_4_2', 'd_4_3',
       'd_5_0', 'd_5_1', 'd_5_2', 'd_5_3', 'd_6_0', 'd_6_1', 'd_6_2', 'd_6_3',
       'scalar_coupling_constant', 'id'],
      dtype='object')
Index(['atom_2', 'atom_3', 'atom_4', 'atom_5', 'atom_6', 'd_1_0', 'd_2_0',
       'd_2_1', 'd_3_0', 'd_3_1', 'd_3_2', 'd_4_0', 'd_4_1', 'd_4_2', 'd_4_3',
       'd_5_0', 'd_5_1', 'd_5_2', 'd_5_3', 'd_6_0', 'd_6_1', 'd_6_2', 'd_6_3',
       'id'],
      dtype='object')
1JHN Fold 0, logMAE: -1.124728668711976
1JHN Fold 1, logMAE: -1.1514029443479736
1JHN Fold 2, logMAE: -1.1340268510358888
1JHN Fold 3, logMAE: -1.1651947909527416
1JHN Fold 4, logMAE: -1.1357569127902596
*** Training Model for 1JHC ***
Index(['atom_2', 'atom_3', 'atom_4', 'atom_5', 'atom_6', 'atom_7', 'atom_8',
       'atom_9', 'd_1_0', 'd_2_0', 'd_2_1', 'd_3_0', 'd_3_1', 'd_3_2', 'd_4_0',
       'd

In [176]:
list_ = []

In [177]:
# list_ = []
for y_name in ['scalar_coupling_constant','fc', 'sd', 'pso', 'dso']:
    submission = submission_csv.copy()
    submission = submission.rename(columns={'scalar_coupling_constant':y_name})
    oof_df = pd.DataFrame(np.zeros((train_csv.shape[0],1)), columns=[f'oof_{y_name}'])
    oof_df.index = train_csv.index

    trial = []
    cv_scores = {}
    for coupling_type in model_params.keys():
        cv_score = train_and_predict_for_one_coupling_type(trial, coupling_type, submission, n_atoms=model_params[coupling_type], n_folds=N_FOLDS, oof=oof_df, y_name=y_name)
        cv_scores[coupling_type] = cv_score
    list_.append({'y_name':y_name, 'submission':submission, 'oof':oof_df, 'cv_scores':cv_scores})

*** Training Model for 1JHN ***
Index(['atom_2', 'atom_3', 'atom_4', 'atom_5', 'atom_6', 'd_1_0', 'd_2_0',
       'd_2_1', 'd_3_0', 'd_3_1', 'd_3_2', 'd_4_0', 'd_4_1', 'd_4_2', 'd_4_3',
       'd_5_0', 'd_5_1', 'd_5_2', 'd_5_3', 'd_6_0', 'd_6_1', 'd_6_2', 'd_6_3',
       'scalar_coupling_constant', 'id'],
      dtype='object')
Index(['atom_2', 'atom_3', 'atom_4', 'atom_5', 'atom_6', 'd_1_0', 'd_2_0',
       'd_2_1', 'd_3_0', 'd_3_1', 'd_3_2', 'd_4_0', 'd_4_1', 'd_4_2', 'd_4_3',
       'd_5_0', 'd_5_1', 'd_5_2', 'd_5_3', 'd_6_0', 'd_6_1', 'd_6_2', 'd_6_3',
       'id'],
      dtype='object')
1JHN Fold 0, logMAE: -1.124728668711976
1JHN Fold 1, logMAE: -1.1514029443479736
1JHN Fold 2, logMAE: -1.1340268510358888
1JHN Fold 3, logMAE: -1.1651947909527416
1JHN Fold 4, logMAE: -1.1357569127902596
*** Training Model for 1JHC ***
Index(['atom_2', 'atom_3', 'atom_4', 'atom_5', 'atom_6', 'atom_7', 'atom_8',
       'atom_9', 'd_1_0', 'd_2_0', 'd_2_1', 'd_3_0', 'd_3_1', 'd_3_2', 'd_4_0',
       'd

Index(['atom_2', 'atom_3', 'atom_4', 'atom_5', 'atom_6', 'd_1_0', 'd_2_0',
       'd_2_1', 'd_3_0', 'd_3_1', 'd_3_2', 'd_4_0', 'd_4_1', 'd_4_2', 'd_4_3',
       'd_5_0', 'd_5_1', 'd_5_2', 'd_5_3', 'd_6_0', 'd_6_1', 'd_6_2', 'd_6_3',
       'id'],
      dtype='object')
1JHN Fold 0, logMAE: -1.091296086177825
1JHN Fold 1, logMAE: -1.1464416846309393
1JHN Fold 2, logMAE: -1.130463000997343
1JHN Fold 3, logMAE: -1.1388098457511384
1JHN Fold 4, logMAE: -1.1203175036412554
*** Training Model for 1JHC ***
Index(['atom_2', 'atom_3', 'atom_4', 'atom_5', 'atom_6', 'atom_7', 'atom_8',
       'atom_9', 'd_1_0', 'd_2_0', 'd_2_1', 'd_3_0', 'd_3_1', 'd_3_2', 'd_4_0',
       'd_4_1', 'd_4_2', 'd_4_3', 'd_5_0', 'd_5_1', 'd_5_2', 'd_5_3', 'd_6_0',
       'd_6_1', 'd_6_2', 'd_6_3', 'd_7_0', 'd_7_1', 'd_7_2', 'd_7_3', 'd_8_0',
       'd_8_1', 'd_8_2', 'd_8_3', 'd_9_0', 'd_9_1', 'd_9_2', 'd_9_3', 'fc',
       'id'],
      dtype='object')
Index(['atom_2', 'atom_3', 'atom_4', 'atom_5', 'atom_6', 'atom_7', 'a

1JHN Fold 0, logMAE: -5.807197008095338
1JHN Fold 1, logMAE: -5.797200722150968
1JHN Fold 2, logMAE: -5.8227775533592085
1JHN Fold 3, logMAE: -5.797562349181874
1JHN Fold 4, logMAE: -5.809263532833999
*** Training Model for 1JHC ***
Index(['atom_2', 'atom_3', 'atom_4', 'atom_5', 'atom_6', 'atom_7', 'atom_8',
       'atom_9', 'd_1_0', 'd_2_0', 'd_2_1', 'd_3_0', 'd_3_1', 'd_3_2', 'd_4_0',
       'd_4_1', 'd_4_2', 'd_4_3', 'd_5_0', 'd_5_1', 'd_5_2', 'd_5_3', 'd_6_0',
       'd_6_1', 'd_6_2', 'd_6_3', 'd_7_0', 'd_7_1', 'd_7_2', 'd_7_3', 'd_8_0',
       'd_8_1', 'd_8_2', 'd_8_3', 'd_9_0', 'd_9_1', 'd_9_2', 'd_9_3', 'sd',
       'id'],
      dtype='object')
Index(['atom_2', 'atom_3', 'atom_4', 'atom_5', 'atom_6', 'atom_7', 'atom_8',
       'atom_9', 'd_1_0', 'd_2_0', 'd_2_1', 'd_3_0', 'd_3_1', 'd_3_2', 'd_4_0',
       'd_4_1', 'd_4_2', 'd_4_3', 'd_5_0', 'd_5_1', 'd_5_2', 'd_5_3', 'd_6_0',
       'd_6_1', 'd_6_2', 'd_6_3', 'd_7_0', 'd_7_1', 'd_7_2', 'd_7_3', 'd_8_0',
       'd_8_1', 'd_8_2', 

Index(['atom_2', 'atom_3', 'atom_4', 'atom_5', 'atom_6', 'atom_7', 'atom_8',
       'atom_9', 'd_1_0', 'd_2_0', 'd_2_1', 'd_3_0', 'd_3_1', 'd_3_2', 'd_4_0',
       'd_4_1', 'd_4_2', 'd_4_3', 'd_5_0', 'd_5_1', 'd_5_2', 'd_5_3', 'd_6_0',
       'd_6_1', 'd_6_2', 'd_6_3', 'd_7_0', 'd_7_1', 'd_7_2', 'd_7_3', 'd_8_0',
       'd_8_1', 'd_8_2', 'd_8_3', 'd_9_0', 'd_9_1', 'd_9_2', 'd_9_3', 'id'],
      dtype='object')
1JHC Fold 0, logMAE: -4.272384582468315
1JHC Fold 1, logMAE: -4.2814265471565935
1JHC Fold 2, logMAE: -4.27600257903718
1JHC Fold 3, logMAE: -4.277348131716801
1JHC Fold 4, logMAE: -4.283201526166264
*** Training Model for 2JHH ***
Index(['atom_2', 'atom_3', 'atom_4', 'atom_5', 'atom_6', 'atom_7', 'atom_8',
       'd_1_0', 'd_2_0', 'd_2_1', 'd_3_0', 'd_3_1', 'd_3_2', 'd_4_0', 'd_4_1',
       'd_4_2', 'd_4_3', 'd_5_0', 'd_5_1', 'd_5_2', 'd_5_3', 'd_6_0', 'd_6_1',
       'd_6_2', 'd_6_3', 'd_7_0', 'd_7_1', 'd_7_2', 'd_7_3', 'd_8_0', 'd_8_1',
       'd_8_2', 'd_8_3', 'pso', 'id'],
 

1JHC Fold 0, logMAE: -5.178088813996847
1JHC Fold 1, logMAE: -5.175524224229746
1JHC Fold 2, logMAE: -5.173648244373126
1JHC Fold 3, logMAE: -5.183028392213993
1JHC Fold 4, logMAE: -5.178086670030704
*** Training Model for 2JHH ***
Index(['atom_2', 'atom_3', 'atom_4', 'atom_5', 'atom_6', 'atom_7', 'atom_8',
       'd_1_0', 'd_2_0', 'd_2_1', 'd_3_0', 'd_3_1', 'd_3_2', 'd_4_0', 'd_4_1',
       'd_4_2', 'd_4_3', 'd_5_0', 'd_5_1', 'd_5_2', 'd_5_3', 'd_6_0', 'd_6_1',
       'd_6_2', 'd_6_3', 'd_7_0', 'd_7_1', 'd_7_2', 'd_7_3', 'd_8_0', 'd_8_1',
       'd_8_2', 'd_8_3', 'dso', 'id'],
      dtype='object')
Index(['atom_2', 'atom_3', 'atom_4', 'atom_5', 'atom_6', 'atom_7', 'atom_8',
       'd_1_0', 'd_2_0', 'd_2_1', 'd_3_0', 'd_3_1', 'd_3_2', 'd_4_0', 'd_4_1',
       'd_4_2', 'd_4_3', 'd_5_0', 'd_5_1', 'd_5_2', 'd_5_3', 'd_6_0', 'd_6_1',
       'd_6_2', 'd_6_3', 'd_7_0', 'd_7_1', 'd_7_2', 'd_7_3', 'd_8_0', 'd_8_1',
       'd_8_2', 'd_8_3', 'id'],
      dtype='object')
2JHH Fold 0, logMAE: -4.0

In [181]:
for item in list_:
#     cv_scores = item['cv_scores']
#     df_res = pd.DataFrame({'type': list(cv_scores.keys()), 'cv_score': list(cv_scores.values())})
#     df_res['int_type'] = df_res['type'].apply(lambda x:['1JHC','1JHN','2JHC','2JHH','2JHN','3JHC','3JHH','3JHN'].index(x))
#     df_res['mae'] = df_res['cv_score'].apply(lambda x : np.exp(x))
    print(item['y_name'], mean_absolute_error(train_csv[item['y_name']].values, item['oof'].values))

scalar_coupling_constant 0.2893108003340329
fc 0.29082081949745403
sd 0.003091299517806892
pso 0.011193640391564577
dso 0.00822388551264211


In [228]:
# import pickle
# with open('distance-is-all-you-need_list_.pickle', 'wb') as f:
#     pickle.dump(list_, f)

In [190]:
df_oof_train = pd.DataFrame()
df_oof_test = pd.DataFrame()
for item in list_:
    y_name = item['y_name']
    df_oof_train[f'oof_{y_name}'] = item['oof'][f'oof_{y_name}']
    df_oof_test[f'oof_{y_name}'] = item['submission']['scalar_coupling_constant']

In [191]:
df_oof_train.head()

Unnamed: 0_level_0,oof_scalar_coupling_constant,oof_fc,oof_sd,oof_pso,oof_dso
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,85.004863,83.077955,0.252714,1.252832,0.2841
1,-11.306925,-10.940348,0.354127,2.869538,-3.420384
2,-11.029535,-10.544198,0.354161,2.860473,-3.412776
3,-11.197567,-10.71694,0.357179,2.850868,-3.44212
4,84.605554,83.714519,0.262712,1.246529,0.282191


In [192]:
df_oof_test.head()

Unnamed: 0_level_0,oof_scalar_coupling_constant,oof_fc,oof_sd,oof_pso,oof_dso
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
4658147,9.149982,12.249523,0.906575,5.359693,-1.267781
4658148,179.073181,180.952896,0.900566,-0.84744,0.303198
4658149,12.809299,13.23485,0.012303,2.276132,-3.326517
4658150,179.713409,181.207138,0.916309,-0.84393,0.297011
4658151,8.526583,12.077095,0.901909,5.355651,-1.267441


In [188]:
for item in list_:
    print(item['submission'].shape, item['submission']['scalar_coupling_constant'].mean(), item['submission']['scalar_coupling_constant'].std())

(2505542, 1) 15.869698436540466 34.838989370302954
(2505542, 1) 15.641493823034676 34.298174175301305
(2505542, 1) 0.08206719641686595 0.13818854651377852
(2505542, 1) 0.3745125150202676 0.7424503400277441
(2505542, 1) -0.23308052856791028 0.9292101164285638


In [189]:
for col in ['scalar_coupling_constant','fc','sd','pso','dso']:
    print(train_csv[col].mean(), train_csv[col].std())

15.89904499053955 34.7704963684082
15.697763344513131 34.41460738906247
0.0823363734693203 0.13865160387829645
0.37516193045108553 0.7422132894555231
-0.23361170378048032 0.929603586096539


In [209]:
y_name = 'scalar_coupling_constant'

submission = submission_csv.copy()
trial = []
cv_scores = {}
for coupling_type in model_params.keys():
    cv_score = train_and_predict_for_one_coupling_type(trial, coupling_type, submission, n_atoms=model_params[coupling_type], n_folds=N_FOLDS, oof=None, y_name=y_name)
    cv_scores[coupling_type] = cv_score

*** Training Model for 1JHN ***
Index(['atom_2', 'atom_3', 'atom_4', 'atom_5', 'atom_6', 'd_1_0', 'd_2_0',
       'd_2_1', 'd_3_0', 'd_3_1', 'd_3_2', 'd_4_0', 'd_4_1', 'd_4_2', 'd_4_3',
       'd_5_0', 'd_5_1', 'd_5_2', 'd_5_3', 'd_6_0', 'd_6_1', 'd_6_2', 'd_6_3',
       'scalar_coupling_constant', 'id'],
      dtype='object')
Index(['atom_2', 'atom_3', 'atom_4', 'atom_5', 'atom_6', 'd_1_0', 'd_2_0',
       'd_2_1', 'd_3_0', 'd_3_1', 'd_3_2', 'd_4_0', 'd_4_1', 'd_4_2', 'd_4_3',
       'd_5_0', 'd_5_1', 'd_5_2', 'd_5_3', 'd_6_0', 'd_6_1', 'd_6_2', 'd_6_3',
       'id'],
      dtype='object')
1JHN Fold 0, logMAE: -1.1425606359443712
1JHN Fold 1, logMAE: -1.199826531874221
1JHN Fold 2, logMAE: -1.1767419509858865
1JHN Fold 3, logMAE: -1.1960522875714708
1JHN Fold 4, logMAE: -1.1741968667294234
*** Training Model for 1JHC ***
Index(['atom_2', 'atom_3', 'atom_4', 'atom_5', 'atom_6', 'atom_7', 'atom_8',
       'atom_9', 'd_1_0', 'd_2_0', 'd_2_1', 'd_3_0', 'd_3_1', 'd_3_2', 'd_4_0',
       'd

In [221]:
df_trial = pd.DataFrame(trial)

In [212]:
cv_scores = item['cv_scores']
df_res = pd.DataFrame({'type': list(cv_scores.keys()), 'cv_score': list(cv_scores.values())})
df_res['int_type'] = df_res['type'].apply(lambda x:['1JHC','1JHN','2JHC','2JHH','2JHN','3JHC','3JHH','3JHN'].index(x))
df_res['mae'] = df_res['cv_score'].apply(lambda x : np.exp(x))
df_res.mae.mean(), df_res.cv_score.mean()

(0.007527959126488897, -5.280117448334904)

In [213]:
df_res

Unnamed: 0,type,cv_score,int_type,mae
0,1JHN,-6.201646,1,0.002026
1,1JHC,-5.177675,0,0.005641
2,2JHH,-4.018574,3,0.017979
3,2JHN,-6.364669,4,0.001721
4,2JHC,-5.136394,2,0.005879
5,3JHH,-3.953908,6,0.01918
6,3JHC,-5.138408,5,0.005867
7,3JHN,-6.249665,7,0.001931


In [210]:
submission.head(10)

Unnamed: 0_level_0,scalar_coupling_constant
id,Unnamed: 1_level_1
4658147,9.495814
4658148,190.056198
4658149,12.912923
4658150,190.100845
4658151,9.446241
4658152,90.104355
4658153,2.683022
4658154,-7.708962
4658155,-9.749232
4658156,90.090927


In [215]:
submission.head()

Unnamed: 0_level_0,scalar_coupling_constant
id,Unnamed: 1_level_1
4658147,9.495814
4658148,190.056198
4658149,12.912923
4658150,190.100845
4658151,9.446241


In [216]:
submission['id'] =submission.index

In [225]:
submission.to_csv('../../data/submission/submission_+distance-is-all-you-need-lb-1-481_data1_oof_.csv', index=False)