In [1]:
%%HTML
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [3]:
import os
import pandas as pd
import numpy as np

In [11]:
def reduce_mem_usage(df, verbose=True, use_float16=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are touched; every other column is left as it is. The frame is modified
    in place and also returned, so both ``reduce_mem_usage(df)`` and
    ``df = reduce_mem_usage(df)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink (mutated in place).
    verbose : bool, default True
        Print the final memory footprint and the relative reduction.
    use_float16 : bool, default True
        Allow float columns to be stored as float16. float16 keeps only about
        three significant decimal digits, so pass ``False`` to stop at float32
        when the values need more precision.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    numerics = {'int16', 'int32', 'int64', 'float16', 'float32', 'float64'}
    # Candidate dtypes, narrowest first; the first one whose range fits wins.
    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float_targets = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if str(col_type) not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type).startswith('int'):
            for target in int_targets:
                limits = np.iinfo(target)
                # Inclusive bounds: a column spanning exactly [-128, 127] still fits int8.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
            # No break (e.g. min/max are NaN for an empty column): dtype is left unchanged.
        else:
            for target in float_targets:
                limits = np.finfo(target)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
            else:
                # Range does not fit a narrower float, or min/max are NaN so
                # every comparison above was False.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [5]:
# Folder with the input files, given relative to the notebook's working directory.
# Later cells build their CSV paths from this variable.
file_folder =  '../../data/input'
# Last expression of the cell, so the directory listing is displayed as the cell output.
os.listdir(file_folder)

['sample_submission.csv',
 'magnetic_shielding_tensors.csv',
 'potential_energy.csv',
 'scalar_coupling_contributions.csv',
 'dipole_moments.csv',
 'mulliken_charges.csv',
 'train.csv',
 'test.csv',
 'structures.csv',
 'structures']

In [6]:
# Load every CSV table from `file_folder` into its own notebook-level DataFrame.
# Of these, the cells visible in this notebook go on to use `train`, `test` and `structures`.
train = pd.read_csv(f'{file_folder}/train.csv')
test = pd.read_csv(f'{file_folder}/test.csv')
magnetic_shielding_tensors = pd.read_csv(f'{file_folder}/magnetic_shielding_tensors.csv')
dipole_moments = pd.read_csv(f'{file_folder}/dipole_moments.csv')
mulliken_charges = pd.read_csv(f'{file_folder}/mulliken_charges.csv')
potential_energy = pd.read_csv(f'{file_folder}/potential_energy.csv')
scalar_coupling_contributions = pd.read_csv(f'{file_folder}/scalar_coupling_contributions.csv')
# Joined onto train/test by map_atom_info (keys: molecule_name + atom_index).
structures = pd.read_csv(f'{file_folder}/structures.csv')

In [7]:
%%time
def map_atom_info(df, atom_idx, structures_df=None):
    """Attach the structure columns of one atom of each pair to ``df``.

    Left-joins ``df`` with the structures table on
    (``molecule_name``, ``atom_index_{atom_idx}``) == (``molecule_name``, ``atom_index``),
    drops the helper ``atom_index`` column and renames ``atom``/``x``/``y``/``z``
    to ``atom_{atom_idx}``/``x_{atom_idx}``/``y_{atom_idx}``/``z_{atom_idx}``.

    Parameters
    ----------
    df : pandas.DataFrame
        Pair table containing ``molecule_name`` and ``atom_index_{atom_idx}``.
    atom_idx : int
        Which atom of the pair to look up; used as the column suffix.
    structures_df : pandas.DataFrame, optional
        Table with ``molecule_name``, ``atom_index``, ``atom``, ``x``, ``y``, ``z``.
        Defaults to the notebook-level ``structures`` frame.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows as ``df`` plus the four renamed columns.

    Raises
    ------
    pandas.errors.MergeError
        If (``molecule_name``, ``atom_index``) is not unique in the structures
        table, which would otherwise silently duplicate rows of ``df``.
    """
    if structures_df is None:
        # Fall back to the frame loaded from structures.csv earlier in the notebook.
        structures_df = structures
    merged = pd.merge(
        df,
        structures_df,
        how='left',
        left_on=['molecule_name', f'atom_index_{atom_idx}'],
        right_on=['molecule_name', 'atom_index'],
        validate='many_to_one',  # each pair row must match at most one atom
    )
    merged = merged.drop('atom_index', axis=1)
    return merged.rename(columns={
        'atom': f'atom_{atom_idx}',
        'x': f'x_{atom_idx}',
        'y': f'y_{atom_idx}',
        'z': f'z_{atom_idx}',
    })

def add_pair_distance_features(df):
    """Join both atoms' coordinates onto ``df`` and derive the distance features.

    Adds, in this order: ``dist`` (Euclidean distance between the two atoms),
    ``dist_x``/``dist_y``/``dist_z`` (squared per-axis coordinate differences),
    ``type_0``/``type_1`` (first character / remainder of ``type``),
    ``dist_to_type_mean``/``dist_to_type_0_mean``/``dist_to_type_1_mean``
    (``dist`` divided by the mean ``dist`` of the row's group) and
    ``molecule_type_dist_mean`` (mean ``dist`` per molecule and type).

    Group means are computed within ``df`` only, so train and test each use
    their own statistics.

    Returns a new frame; the coordinate arrays are kept local to this function
    instead of being left behind as large notebook-level variables.
    """
    df = map_atom_info(df, 0)
    df = map_atom_info(df, 1)

    p_0 = df[['x_0', 'y_0', 'z_0']].values
    p_1 = df[['x_1', 'y_1', 'z_1']].values
    df['dist'] = np.linalg.norm(p_0 - p_1, axis=1)

    # Despite the names these are squared differences along each axis.
    for axis in ('x', 'y', 'z'):
        df[f'dist_{axis}'] = (df[f'{axis}_0'] - df[f'{axis}_1']) ** 2

    # Split e.g. '1JHC' into '1' and 'JHC' with the vectorised string accessor.
    df['type_0'] = df['type'].str[0]
    df['type_1'] = df['type'].str[1:]

    for key in ('type', 'type_0', 'type_1'):
        df[f'dist_to_{key}_mean'] = df['dist'] / df.groupby(key)['dist'].transform('mean')
    df['molecule_type_dist_mean'] = df.groupby(['molecule_name', 'type'])['dist'].transform('mean')
    return df


# NOTE: this cell rebinds `train`/`test`; re-running it without reloading the
# CSVs would merge the structure columns a second time.
train = add_pair_distance_features(train)
test = add_pair_distance_features(test)

In [9]:
# Display the (rows, columns) shape of both frames as a tuple.
train.shape, test.shape

((4658147, 24), (2505542, 23))

In [8]:
# Preview the first rows of the train frame.
train.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_0,y_0,z_0,...,dist,dist_x,dist_y,dist_z,type_0,type_1,dist_to_type_mean,dist_to_type_0_mean,dist_to_type_1_mean,molecule_type_dist_mean
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076,H,0.00215,-0.006031,0.001976,...,1.091953,0.00022,1.192105,3.6e-05,1,JHC,0.999134,1.003367,0.463061,1.09195
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,H,0.00215,-0.006031,0.001976,...,1.78312,1.019253,2.160261,3e-06,2,JHH,1.004634,0.852949,0.761935,1.783146
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,H,0.00215,-0.006031,0.001976,...,1.783147,0.294812,2.112831,0.771973,2,JHH,1.004649,0.852963,0.761947,1.783146
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,H,0.00215,-0.006031,0.001976,...,1.783157,0.276638,2.085032,0.817978,2,JHH,1.004655,0.852967,0.761951,1.783146
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,H,1.011731,1.463751,0.000277,...,1.091952,1.049455,0.142844,6e-05,1,JHC,0.999133,1.003365,0.463061,1.09195


In [10]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,atom_0,x_0,y_0,z_0,atom_1,...,dist,dist_x,dist_y,dist_z,type_0,type_1,dist_to_type_mean,dist_to_type_0_mean,dist_to_type_1_mean,molecule_type_dist_mean
0,4658147,dsgdb9nsd_000004,2,0,2JHC,H,-1.661639,0.0,1.0,C,...,2.261178,5.112926,0.0,0.0,2,JHC,1.032563,1.081848,0.958712,2.261178
1,4658148,dsgdb9nsd_000004,2,1,1JHC,H,-1.661639,0.0,1.0,C,...,1.062099,1.128054,0.0,0.0,1,JHC,0.971801,0.976079,0.450317,1.062099
2,4658149,dsgdb9nsd_000004,2,3,3JHH,H,-1.661639,0.0,1.0,H,...,3.323277,11.044171,0.0,0.0,3,JHH,1.230386,1.115759,1.420469,3.323277
3,4658150,dsgdb9nsd_000004,3,0,1JHC,H,1.661639,0.0,1.0,C,...,1.062099,1.128054,0.0,0.0,1,JHC,0.971801,0.976079,0.450317,1.062099
4,4658151,dsgdb9nsd_000004,3,1,2JHC,H,1.661639,0.0,1.0,C,...,2.261178,5.112926,0.0,0.0,2,JHC,1.032563,1.081848,0.958712,2.261178


In [12]:
# Downcast the numeric columns of `train`; the function mutates the frame in place,
# returns it, and prints the resulting memory footprint.
# NOTE(review): float columns whose range fits are stored as float16, which loses precision.
train = reduce_mem_usage(train)

Mem. usage decreased to 408.70 Mb (54.0% reduction)


In [15]:
# Same downcast for `test` (in place, returns the frame, prints the footprint).
# NOTE(review): float columns whose range fits are stored as float16, which loses precision.
test = reduce_mem_usage(test)

Mem. usage decreased to 215.05 Mb (53.1% reduction)


In [18]:
# Drop the pair-key columns plus `scalar_coupling_constant` (a column the test frame
# does not have — presumably the prediction target) and pickle the remaining columns.
# NOTE(review): assumes ../../data/feats already exists — confirm before a fresh run.
train.drop(columns=['molecule_name','atom_index_0','atom_index_1','type','scalar_coupling_constant']).to_pickle('../../data/feats/molecular-properties-eda-and-models_train.pkl')

In [19]:
# Drop the same pair-key columns as for train and pickle the remaining test columns.
# NOTE(review): assumes ../../data/feats already exists — confirm before a fresh run.
test.drop(columns=['molecule_name','atom_index_0','atom_index_1','type']).to_pickle('../../data/feats/molecular-properties-eda-and-models_test.pkl')