In [1]:
%%HTML
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [1]:
import sys
sys.path.append("../..") # Adds higher directory to python modules path.
from utilities import aggregate_feature_calculators
from utilities import aggregate_feature_calculators_setting as aggcal
from utilities.parallel import Parallel

In [2]:
import os
import pandas as pd
import numpy as np
import math
import psutil

from tqdm import tqdm_notebook, tqdm

In [3]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [4]:
file_folder =  '../../data/input'
os.listdir(file_folder)

['test.csv',
 'structures',
 'sample_submission.csv',
 'magnetic_shielding_tensors.csv',
 'mulliken_charges.csv',
 'potential_energy.csv',
 'scalar_coupling_contributions.csv',
 'dipole_moments.csv',
 'structures.csv',
 'train.csv']

In [5]:
df_train = pd.read_csv('{}/train.csv'.format(file_folder))
df_test = pd.read_csv('{}/test.csv'.format(file_folder))
# df_train_sub_tensor = pd.read_csv('{}/magnetic_shielding_tensors.csv'.format(file_folder))
# dipole_moments = pd.read_csv('{}/dipole_moments.csv'.format(file_folder))
# df_train_sub_charge = pd.read_csv('{}/mulliken_charges.csv'.format(file_folder))
# potential_energy = pd.read_csv('{}/potential_energy.csv'.format(file_folder))
# scalar_coupling_contributions = pd.read_csv('{}/scalar_coupling_contributions.csv'.format(file_folder))
df_struct = pd.read_csv('{}/structures.csv'.format(file_folder))

In [6]:
# spherical_list = []
# for idx, row in df_struct.iterrows():
#     spherical_list.append({'r':np.sqrt(row['x']**2+row['y']**2+row['z']**2) , 'delt':np.arctan((np.sqrt(row['x']**2+row['y']**2)/row['z'])), 'fi':np.arctan((row['y']/row['x']))})

In [10]:
df_struct.head()

Unnamed: 0,molecule_name,atom_index,atom,x,y,z,r,delt,fi
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001,1.085908,1.563428,-1.559102
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976,0.006701,1.271456,-1.228306
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277,1.779373,1.570641,0.966007
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644,1.776603,-1.054758,-1.213242
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397,1.778648,1.036077,-1.221455


In [8]:
df_struct['r'] = df_struct.apply(lambda row : np.sqrt(row['x']**2+row['y']**2+row['z']**2), axis=1)
df_struct['delt'] = df_struct.apply(lambda row : np.arctan((np.sqrt(row['x']**2+row['y']**2)/row['z'])), axis=1)
df_struct['fi'] = df_struct.apply(lambda row : np.arctan((row['y']/row['x'])), axis=1)

  


In [20]:
df_struct.head()

Unnamed: 0,molecule_name,atom_index,atom,x,y,z,r,delt,fi
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001,1.085908,1.563428,-1.559102
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976,0.006701,1.271456,-1.228306
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277,1.779373,1.570641,0.966007
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644,1.776603,-1.054758,-1.213242
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397,1.778648,1.036077,-1.221455


In [12]:
molecule_lvl_prop = []
for c, g in df_struct.groupby(by=['molecule_name']):
    
    d_ = {}
    d_['molecule_name'] = c
    d_['r_mol_max'] = g['r'].max()
    d_['r_mol_min'] = g['r'].min()
    d_['r_mol_mean'] = g['r'].mean()
    d_['r_mol_std'] = g['r'].std()
    
    d_['delt_mol_max'] = g['delt'].max()
    d_['delt_mol_min'] = g['delt'].min()
    d_['delt_mol_mean'] = g['delt'].mean()
    d_['delt_mol_std'] = g['delt'].std()
    
    d_['fi_mol_max'] = g['fi'].max()
    d_['fi_mol_min'] = g['fi'].min()
    d_['fi_mol_mean'] = g['fi'].mean()
    d_['fi_mol_std'] = g['fi'].std()
    
    molecule_lvl_prop.append(d_)

In [15]:
df_molecule_lvl_prop = pd.DataFrame(molecule_lvl_prop)
df_molecule_lvl_prop.head()

Unnamed: 0,delt_mol_max,delt_mol_mean,delt_mol_min,delt_mol_std,fi_mol_max,fi_mol_mean,fi_mol_min,fi_mol_std,molecule_name,r_mol_max,r_mol_mean,r_mol_min,r_mol_std
0,1.570641,0.877369,-1.054758,1.102827,0.966007,-0.851219,-1.559102,1.02637,dsgdb9nsd_000001,1.779373,1.285446,0.006701,0.775153
1,1.509828,-0.445602,-1.553247,1.353522,0.977736,-0.281589,-1.531342,1.267823,dsgdb9nsd_000002,1.638806,1.084138,0.034709,0.756537
2,1.570354,1.560532,1.548217,0.011277,0.980352,-0.082584,-1.535661,1.302589,dsgdb9nsd_000003,1.565912,0.870686,0.067973,0.754732
3,1.029043,0.784562,0.540081,0.282302,0.0,0.0,0.0,0.0,dsgdb9nsd_000004,1.939341,1.552647,1.165954,0.446515
4,1.56436,1.53301,1.47118,0.053548,-1.450766,-1.52265,-1.559031,0.062255,dsgdb9nsd_000005,2.199171,1.117046,0.019394,1.089971


In [6]:
# df_graph_feats_structures = pd.read_pickle('graph_feats_structures.pkl', compression='gzip')[['molecule_name', 'atom_index', 'electronegativity', 'charge', 'etat',
#        'masse', 'volume', 'rayon_am', 'rayon_ac', 'rayon_c', 'rayon_i',
#        'rayon_vdw', 'fusion', 'ebulution_min', 'enthalpie_fusion',
#        'enthalpie_vaporisation', 'capacite_thermique',
#        'conductivite_thermique', 'isotopes', 'isotopes_emeteurs']]

In [16]:
for atom_idx in [0,1]:
    df_train = pd.merge(df_train, df_struct[['molecule_name','atom_index','r','delt','fi']], how='left', left_on  = ['molecule_name', f'atom_index_{atom_idx}'], right_on = ['molecule_name',  'atom_index'])
    rename_dict = {}
    for col in df_struct.columns.drop(['molecule_name',  'atom_index']):
        rename_dict[col] = f'{col}_atom_index_{atom_idx}'
    df_train = df_train.rename(columns = rename_dict)
    df_train = df_train.drop(columns=['atom_index'])
# print(rename_dict)

In [17]:
df_train.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,r_atom_index_0,delt_atom_index_0,fi_atom_index_0,r_atom_index_1,delt_atom_index_1,fi_atom_index_1
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076,0.006701,1.271456,-1.228306,1.085908,1.563428,-1.559102
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,0.006701,1.271456,-1.228306,1.779373,1.570641,0.966007
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,0.006701,1.271456,-1.228306,1.776603,-1.054758,-1.213242
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,0.006701,1.271456,-1.228306,1.778648,1.036077,-1.221455
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,1.779373,1.570641,0.966007,1.085908,1.563428,-1.559102


In [18]:
df_train['r_diff'] = df_train['r_atom_index_0'] - df_train['r_atom_index_1']
df_train['fi_diff'] = df_train['fi_atom_index_0'] - df_train['fi_atom_index_1']
df_train['delt_diff'] = df_train['delt_atom_index_0'] - df_train['delt_atom_index_1']

df_train['abs_r_diff'] = np.abs(df_train['r_atom_index_0'] - df_train['r_atom_index_1'])
df_train['abs_fi_diff'] = np.abs(df_train['fi_atom_index_0'] - df_train['fi_atom_index_1'])
df_train['abs_delt_diff'] = np.abs(df_train['delt_atom_index_0'] - df_train['delt_atom_index_1'])

In [21]:
df_train.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,r_atom_index_0,delt_atom_index_0,fi_atom_index_0,r_atom_index_1,delt_atom_index_1,fi_atom_index_1,r_diff,fi_diff,delt_diff,abs_r_diff,abs_fi_diff,abs_delt_diff
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076,0.006701,1.271456,-1.228306,1.085908,1.563428,-1.559102,-1.079207,0.330797,-0.291973,1.079207,0.330797,0.291973
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,0.006701,1.271456,-1.228306,1.779373,1.570641,0.966007,-1.772671,-2.194313,-0.299185,1.772671,2.194313,0.299185
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,0.006701,1.271456,-1.228306,1.776603,-1.054758,-1.213242,-1.769902,-0.015064,2.326214,1.769902,0.015064,2.326214
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,0.006701,1.271456,-1.228306,1.778648,1.036077,-1.221455,-1.771946,-0.006851,0.235378,1.771946,0.006851,0.235378
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,1.779373,1.570641,0.966007,1.085908,1.563428,-1.559102,0.693465,2.525109,0.007213,0.693465,2.525109,0.007213


In [22]:
molecule_lvl_prop2 = []
for c, g in df_train.groupby(by=['molecule_name']):
    d_ = {}
    d_['molecule_name'] =  c
    for col in ['r_diff','fi_diff','delt_diff','abs_r_diff','abs_fi_diff','abs_delt_diff']:
        d_[f'{col}_min'] = g[col].min()
        d_[f'{col}_max'] = g[col].max()
        d_[f'{col}_std'] = g[col].std()
        d_[f'{col}_mean'] = g[col].mean()
    
    molecule_lvl_prop2.append(d_)    
    
        

In [23]:
df_molecule_lvl_prop2 = pd.DataFrame(molecule_lvl_prop2)
df_molecule_lvl_prop2.head()

Unnamed: 0,abs_delt_diff_max,abs_delt_diff_mean,abs_delt_diff_min,abs_delt_diff_std,abs_fi_diff_max,abs_fi_diff_mean,abs_fi_diff_min,abs_fi_diff_std,abs_r_diff_max,abs_r_diff_mean,...,delt_diff_std,fi_diff_max,fi_diff_mean,fi_diff_min,fi_diff_std,molecule_name,r_diff_max,r_diff_mean,r_diff_min,r_diff_std
0,2.625399,1.15563,0.007213,1.104021,2.525109,1.013057,0.006851,1.095742,1.772671,0.847616,...,1.643965,2.525109,0.569811,-2.194313,1.407198,dsgdb9nsd_000001,0.693465,-0.431538,-1.772671,1.062231
1,3.063076,1.600702,0.414987,1.149849,2.509079,1.559525,0.33001,0.9691,1.604097,0.903617,...,1.66354,2.509079,1.44314,-0.349155,1.165553,dsgdb9nsd_000002,0.611993,-0.495622,-1.604097,1.038685
2,0.022137,0.022137,0.022137,,0.672794,0.672794,0.672794,,1.497939,1.497939,...,,-0.672794,-0.672794,-0.672794,,dsgdb9nsd_000003,-1.497939,-1.497939,-1.497939,
3,0.093181,0.047026,0.000871,0.065273,0.107387,0.054133,0.000878,0.075314,2.179777,1.623187,...,0.065273,0.000878,-0.053255,-0.107387,0.076555,dsgdb9nsd_000005,2.179777,1.623187,1.066596,0.787138
4,2.73172,1.200191,0.000603,0.957907,2.655428,1.071285,0.004275,0.80022,2.17515,0.844546,...,1.532905,2.655428,0.51895,-1.694903,1.245984,dsgdb9nsd_000007,2.17515,0.748922,-0.431234,0.759435


In [24]:
df_train = pd.merge(df_train, df_molecule_lvl_prop, how='left', on='molecule_name')

In [25]:
df_train = pd.merge(df_train, df_molecule_lvl_prop2, how='left', on='molecule_name')

In [27]:
df_train = df_train.fillna(0)

In [29]:
df_train.to_pickle(f'../../data/feature/knn_.pkl', compression='gzip')

In [30]:
df_train.drop(columns=['molecule_name', 'atom_index_0', 'atom_index_1', 'type', 'scalar_coupling_constant']).to_pickle(f'../../data/feature/knn_.pkl', compression='gzip')

In [9]:
weight = 'charge'

In [10]:
df_struct['x'] = df_struct['x']*df_struct[weight]
df_struct['y'] = df_struct['y']*df_struct[weight]
df_struct['z'] = df_struct['z']*df_struct[weight]


In [11]:
df_struct = df_struct[['molecule_name','atom_index','atom','x','y','z']]

In [41]:
''' 
Map atom info from the structures.csv into the train/test files
'''


def map_atom_info(df_1,df_2, atom_idx):
    print('Mapping...', df_1.shape, df_2.shape, atom_idx)
    
    df = pd.merge(df_1, df_2.drop_duplicates(subset=['molecule_name', 'atom_index']), how = 'left', left_on  = ['molecule_name', 'atom_index_{}'.format(atom_idx)],
                  right_on = ['molecule_name',  'atom_index'])
    
    df = df.drop('atom_index', axis=1)

    return df

def show_ram_usage():
    py = psutil.Process(os.getpid())
    print('RAM usage: {} GB'.format(py.memory_info()[0]/2. ** 30))

show_ram_usage()

for atom_idx in [0,1]:
    df_train = map_atom_info(df_train,df_struct, atom_idx)
    df_train = df_train.rename(columns={'atom': 'atom_{}'.format(atom_idx),
                                        'x': 'x_{}'.format(atom_idx),
                                        'y': 'y_{}'.format(atom_idx),
                                        'z': 'z_{}'.format(atom_idx)})
    df_test = map_atom_info(df_test,df_struct, atom_idx)
    df_test = df_test.rename(columns={'atom': 'atom_{}'.format(atom_idx),
                                'x': 'x_{}'.format(atom_idx),
                                'y': 'y_{}'.format(atom_idx),
                                'z': 'z_{}'.format(atom_idx)})
    
    df_struct['c_x']=df_struct.groupby('molecule_name')['x'].transform('mean')
    df_struct['c_y']=df_struct.groupby('molecule_name')['y'].transform('mean')
    df_struct['c_z']=df_struct.groupby('molecule_name')['z'].transform('mean')
#     df_struct['atom_n']=df_struct.groupby('molecule_name')['atom_index'].transform('max')
    
    show_ram_usage()
    print(df_train.shape, df_test.shape)

RAM usage: 2.1337432861328125 GB
Mapping... (4658147, 6) (2358657, 9) 0
Mapping... (2505542, 5) (2358657, 9) 0
RAM usage: 2.9105873107910156 GB
(4658147, 13) (2505542, 12)
Mapping... (4658147, 13) (2358657, 12) 1
Mapping... (2505542, 12) (2358657, 12) 1
RAM usage: 3.4430923461914062 GB
(4658147, 23) (2505542, 22)


In [42]:
df_train.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_0,y_0,z_0,...,atom_1,x_1,y_1,z_1,r_y,delt_y,fi_y,c_x,c_y,c_z
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076,H,0.00215,-0.006031,0.001976,...,C,-0.012698,1.085804,0.008001,1.085908,1.563428,-1.559102,-0.012689,1.085797,0.008001
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,H,0.00215,-0.006031,0.001976,...,H,1.011731,1.463751,0.000277,1.779373,1.570641,0.966007,-0.012689,1.085797,0.008001
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,H,0.00215,-0.006031,0.001976,...,H,-0.540815,1.447527,-0.876644,1.776603,-1.054758,-1.213242,-0.012689,1.085797,0.008001
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,H,0.00215,-0.006031,0.001976,...,H,-0.523814,1.437933,0.906397,1.778648,1.036077,-1.221455,-0.012689,1.085797,0.008001
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,H,1.011731,1.463751,0.000277,...,C,-0.012698,1.085804,0.008001,1.085908,1.563428,-1.559102,-0.012689,1.085797,0.008001


In [44]:
df_train = df_train.rename(columns={'r_x':'r_0', 'delt_x':'delt_0', 'fi_x':'fi_0', 'r_y':'r_1', 'delt_y':'delt_1', 'fi_y':'fi_1'})

In [45]:
df_train.columns

Index(['id', 'molecule_name', 'atom_index_0', 'atom_index_1', 'type',
       'scalar_coupling_constant', 'atom_0', 'x_0', 'y_0', 'z_0', 'r_0',
       'delt_0', 'fi_0', 'atom_1', 'x_1', 'y_1', 'z_1', 'r_1', 'delt_1',
       'fi_1', 'c_x', 'c_y', 'c_z'],
      dtype='object')

In [49]:
for c, g in df_struct.groupby(by=['molecule_name']):
    break
#     df_train['diff'] = 

In [55]:
df_train['r_diff'] = df_train['r_0'] - df_train['r_1']
df_train['delt_diff'] = df_train['delt_0'] - df_train['delt_1']
df_train['fi_diff'] = df_train['fi_0'] - df_train['fi_1']

In [60]:
df_struct['r_mol_mean'] = df_struct.groupby(by=['molecule_name'])['r'].transform('mean')
df_struct['r_mol_std'] = df_struct.groupby(by=['molecule_name'])['r'].transform('std')

In [62]:
df_struct['delt_mol_mean'] = df_struct.groupby(by=['molecule_name'])['delt'].transform('mean')
df_struct['delt_mol_std'] = df_struct.groupby(by=['molecule_name'])['delt'].transform('std')

df_struct['fi_mol_std'] = df_struct.groupby(by=['molecule_name'])['fi'].transform('mean')
df_struct['fi_mol_std'] = df_struct.groupby(by=['molecule_name'])['fi'].transform('std')

In [63]:
df_struct.columns

Index(['molecule_name', 'atom_index', 'atom', 'x', 'y', 'z', 'r', 'delt', 'fi',
       'c_x', 'c_y', 'c_z', 'r_mol_mean', 'r_mol_std', 'delt_mol_mean',
       'delt_mol_std'],
      dtype='object')

In [13]:
def make_features(df):
    df['dx']=df['x_1']-df['x_0']
    df['dy']=df['y_1']-df['y_0']
    df['dz']=df['z_1']-df['z_0']
    df['distance']=(df['dx']**2+df['dy']**2+df['dz']**2)**(1/2)
    return df

df_train=make_features(df_train)
df_test=make_features(df_test) 
#df_train = reduce_mem_usage(df_train)
#df_test = reduce_mem_usage(df_test)
test_prediction=np.zeros(len(df_test))
show_ram_usage()
print(df_train.shape, df_test.shape)

def get_dist(df):
    df_temp=df.loc[:,["molecule_name","atom_index_0","atom_index_1","distance","x_0","y_0","z_0","x_1","y_1","z_1"]].copy()
    df_temp_=df_temp.copy()
    df_temp_= df_temp_.rename(columns={'atom_index_0': 'atom_index_1',
                                       'atom_index_1': 'atom_index_0',
                                       'x_0': 'x_1',
                                       'y_0': 'y_1',
                                       'z_0': 'z_1',
                                       'x_1': 'x_0',
                                       'y_1': 'y_0',
                                       'z_1': 'z_0'})
    df_temp_all=pd.concat((df_temp,df_temp_),axis=0)

    df_temp_all["min_distance"]=df_temp_all.groupby(['molecule_name', 'atom_index_0'])['distance'].transform('min')
    df_temp_all["max_distance"]=df_temp_all.groupby(['molecule_name', 'atom_index_0'])['distance'].transform('max')
    
    df_temp= df_temp_all[df_temp_all["min_distance"]==df_temp_all["distance"]].copy()
    df_temp=df_temp.drop(['x_0','y_0','z_0','min_distance'], axis=1)
    df_temp= df_temp.rename(columns={'atom_index_0': 'atom_index',
                                         'atom_index_1': 'atom_index_closest',
                                         'distance': 'distance_closest',
                                         'x_1': 'x_closest',
                                         'y_1': 'y_closest',
                                         'z_1': 'z_closest'})
    
    
    for atom_idx in [0,1]:
        df = map_atom_info(df,df_temp, atom_idx)
        df = df.rename(columns={'atom_index_closest': f'atom_index_closest_{atom_idx}',
                                        'distance_closest': f'distance_closest_{atom_idx}',
                                        'x_closest': f'x_closest_{atom_idx}',
                                        'y_closest': f'y_closest_{atom_idx}',
                                        'z_closest': f'z_closest_{atom_idx}'})
        
    df_temp= df_temp_all[df_temp_all["max_distance"]==df_temp_all["distance"]].copy()
    df_temp=df_temp.drop(['x_0','y_0','z_0','max_distance'], axis=1)
    df_temp= df_temp.rename(columns={'atom_index_0': 'atom_index',
                                         'atom_index_1': 'atom_index_farthest',
                                         'distance': 'distance_farthest',
                                         'x_1': 'x_farthest',
                                         'y_1': 'y_farthest',
                                         'z_1': 'z_farthest'})
        
    for atom_idx in [0,1]:
        df = map_atom_info(df,df_temp, atom_idx)
        df = df.rename(columns={'atom_index_farthest': f'atom_index_farthest_{atom_idx}',
                                        'distance_farthest': f'distance_farthest_{atom_idx}',
                                        'x_farthest': f'x_farthest_{atom_idx}',
                                        'y_farthest': f'y_farthest_{atom_idx}',
                                        'z_farthest': f'z_farthest_{atom_idx}'})
    return df

df_test=(get_dist(df_test))    
df_train=(get_dist(df_train)) 
print(df_train.shape, df_test.shape)
show_ram_usage()

RAM usage: 2.088642120361328 GB
(4658147, 22) (2505542, 21)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Mapping... (2505542, 21) (3052975, 8) 0
Mapping... (2505542, 27) (3052975, 8) 1
Mapping... (2505542, 33) (2407533, 8) 0
Mapping... (2505542, 39) (2407533, 8) 1
Mapping... (4658147, 22) (5676534, 8) 0
Mapping... (4658147, 28) (5676534, 8) 1
Mapping... (4658147, 34) (4475344, 8) 0
Mapping... (4658147, 40) (4475344, 8) 1
(4658147, 46) (2505542, 45)
RAM usage: 3.558849334716797 GB


In [14]:
def add_features(df):
    
    df["distance_center0"]=((df['x_0']-df['c_x'])**2+(df['y_0']-df['c_y'])**2+(df['z_0']-df['c_z'])**2)**(1/2)
    df["distance_center1"]=((df['x_1']-df['c_x'])**2+(df['y_1']-df['c_y'])**2+(df['z_1']-df['c_z'])**2)**(1/2)
    df["distance_c0"]=((df['x_0']-df['x_closest_0'])**2+(df['y_0']-df['y_closest_0'])**2+(df['z_0']-df['z_closest_0'])**2)**(1/2)
    df["distance_c1"]=((df['x_1']-df['x_closest_1'])**2+(df['y_1']-df['y_closest_1'])**2+(df['z_1']-df['z_closest_1'])**2)**(1/2)
    df["distance_f0"]=((df['x_0']-df['x_farthest_0'])**2+(df['y_0']-df['y_farthest_0'])**2+(df['z_0']-df['z_farthest_0'])**2)**(1/2)
    df["distance_f1"]=((df['x_1']-df['x_farthest_1'])**2+(df['y_1']-df['y_farthest_1'])**2+(df['z_1']-df['z_farthest_1'])**2)**(1/2)
    df["vec_center0_x"]=(df['x_0']-df['c_x'])/(df["distance_center0"]+1e-10)
    df["vec_center0_y"]=(df['y_0']-df['c_y'])/(df["distance_center0"]+1e-10)
    df["vec_center0_z"]=(df['z_0']-df['c_z'])/(df["distance_center0"]+1e-10)
    df["vec_center1_x"]=(df['x_1']-df['c_x'])/(df["distance_center1"]+1e-10)
    df["vec_center1_y"]=(df['y_1']-df['c_y'])/(df["distance_center1"]+1e-10)
    df["vec_center1_z"]=(df['z_1']-df['c_z'])/(df["distance_center1"]+1e-10)
    df["vec_c0_x"]=(df['x_0']-df['x_closest_0'])/(df["distance_c0"]+1e-10)
    df["vec_c0_y"]=(df['y_0']-df['y_closest_0'])/(df["distance_c0"]+1e-10)
    df["vec_c0_z"]=(df['z_0']-df['z_closest_0'])/(df["distance_c0"]+1e-10)
    df["vec_c1_x"]=(df['x_1']-df['x_closest_1'])/(df["distance_c1"]+1e-10)
    df["vec_c1_y"]=(df['y_1']-df['y_closest_1'])/(df["distance_c1"]+1e-10)
    df["vec_c1_z"]=(df['z_1']-df['z_closest_1'])/(df["distance_c1"]+1e-10)
    df["vec_f0_x"]=(df['x_0']-df['x_farthest_0'])/(df["distance_f0"]+1e-10)
    df["vec_f0_y"]=(df['y_0']-df['y_farthest_0'])/(df["distance_f0"]+1e-10)
    df["vec_f0_z"]=(df['z_0']-df['z_farthest_0'])/(df["distance_f0"]+1e-10)
    df["vec_f1_x"]=(df['x_1']-df['x_farthest_1'])/(df["distance_f1"]+1e-10)
    df["vec_f1_y"]=(df['y_1']-df['y_farthest_1'])/(df["distance_f1"]+1e-10)
    df["vec_f1_z"]=(df['z_1']-df['z_farthest_1'])/(df["distance_f1"]+1e-10)
    df["vec_x"]=(df['x_1']-df['x_0'])/df["distance"]
    df["vec_y"]=(df['y_1']-df['y_0'])/df["distance"]
    df["vec_z"]=(df['z_1']-df['z_0'])/df["distance"]
    df["cos_c0_c1"]=df["vec_c0_x"]*df["vec_c1_x"]+df["vec_c0_y"]*df["vec_c1_y"]+df["vec_c0_z"]*df["vec_c1_z"]
    df["cos_f0_f1"]=df["vec_f0_x"]*df["vec_f1_x"]+df["vec_f0_y"]*df["vec_f1_y"]+df["vec_f0_z"]*df["vec_f1_z"]
    df["cos_center0_center1"]=df["vec_center0_x"]*df["vec_center1_x"]+df["vec_center0_y"]*df["vec_center1_y"]+df["vec_center0_z"]*df["vec_center1_z"]
    df["cos_c0"]=df["vec_c0_x"]*df["vec_x"]+df["vec_c0_y"]*df["vec_y"]+df["vec_c0_z"]*df["vec_z"]
    df["cos_c1"]=df["vec_c1_x"]*df["vec_x"]+df["vec_c1_y"]*df["vec_y"]+df["vec_c1_z"]*df["vec_z"]
    df["cos_f0"]=df["vec_f0_x"]*df["vec_x"]+df["vec_f0_y"]*df["vec_y"]+df["vec_f0_z"]*df["vec_z"]
    df["cos_f1"]=df["vec_f1_x"]*df["vec_x"]+df["vec_f1_y"]*df["vec_y"]+df["vec_f1_z"]*df["vec_z"]
    df["cos_center0"]=df["vec_center0_x"]*df["vec_x"]+df["vec_center0_y"]*df["vec_y"]+df["vec_center0_z"]*df["vec_z"]
    df["cos_center1"]=df["vec_center1_x"]*df["vec_x"]+df["vec_center1_y"]*df["vec_y"]+df["vec_center1_z"]*df["vec_z"]
    df=df.drop(['vec_c0_x','vec_c0_y','vec_c0_z','vec_c1_x','vec_c1_y','vec_c1_z',
                'vec_f0_x','vec_f0_y','vec_f0_z','vec_f1_x','vec_f1_y','vec_f1_z',
                'vec_center0_x','vec_center0_y','vec_center0_z','vec_center1_x','vec_center1_y','vec_center1_z',
                'vec_x','vec_y','vec_z'], axis=1)
    return df
    
df_train=add_features(df_train)
df_test=add_features(df_test)
print(df_train.shape, df_test.shape)
show_ram_usage()

(4658147, 61) (2505542, 60)
RAM usage: 4.257698059082031 GB


In [15]:
df_train = df_train.drop(columns=['molecule_name','atom_index_0','atom_index_1','type','scalar_coupling_constant', 'atom_0', 'atom_1'])

In [16]:
df_test = df_test.drop(columns=['molecule_name','atom_index_0','atom_index_1','type', 'atom_0', 'atom_1'])

In [19]:
df_train.head()

Unnamed: 0,id,x_0,y_0,z_0,x_1,y_1,z_1,c_x,c_y,c_z,...,distance_f1,cos_c0_c1,cos_f0_f1,cos_center0_center1,cos_c0,cos_c1,cos_f0,cos_f1,cos_center0,cos_center1
0,0,0.0,-0.0,0.0,-0.012698,1.085804,0.008001,-0.00254,0.217161,0.0016,...,1.085908,0.0,-1.0,-1.0,0.0,1.0,-1.0,1.0,-1.0,1.0
1,1,0.0,-0.0,0.0,0.0,0.0,0.0,-0.00254,0.217161,0.0016,...,1.085908,0.0,1.0,1.0,,,,,,
2,2,0.0,-0.0,0.0,-0.0,0.0,-0.0,-0.00254,0.217161,0.0016,...,1.085908,0.0,1.0,1.0,,,,,,
3,3,0.0,-0.0,0.0,-0.0,0.0,0.0,-0.00254,0.217161,0.0016,...,1.085908,0.0,1.0,1.0,,,,,,
4,4,0.0,0.0,0.0,-0.012698,1.085804,0.008001,-0.00254,0.217161,0.0016,...,1.085908,0.0,-1.0,-1.0,0.0,1.0,-1.0,1.0,-1.0,1.0


In [20]:
df_train.columns = [f'{weight}_{c}' for c in df_train.columns]

In [21]:
df_test.columns = [f'{weight}_{c}' for c in df_test.columns]

In [22]:
df_train = reduce_mem_usage(df_train)

Mem. usage decreased to 501.99 Mb (74.3% reduction)


In [23]:
df_test = reduce_mem_usage(df_test)

Mem. usage decreased to 308.24 Mb (71.7% reduction)


In [24]:
df_train.to_pickle(f'../../data/feature/keras-neural-net-for-champs_train_{weight}.pkl', compression='gzip')

In [25]:
df_test.to_pickle(f'../../data/feature/keras-neural-net-for-champs_test_{weight}.pkl', compression='gzip')

In [26]:
f'../../data/feature/keras-neural-net-for-champs_train_{weight}.pkl'

'../../data/feature/keras-neural-net-for-champs_train_charge.pkl'