In [2]:
%%HTML
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [14]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from tqdm import tqdm_notebook

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor

In [2]:
import sys
sys.path.append("../..") # Adds higher directory to python modules path.
from utilities import aggregate_feature_calculators
from utilities import aggregate_feature_calculators_setting as aggcal
from utilities.parallel import Parallel

In [3]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the smallest dtype that fits their values.

    The frame is modified in place (columns are reassigned) and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose int16/int32/int64 and float16/float32/float64 columns are
        candidates for downcasting. Columns of any other dtype are left alone.
    verbose : bool, default True
        When true, print the final memory footprint and the relative reduction.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.

    Notes
    -----
    Float columns may be converted to float16, which keeps only about three
    significant decimal digits, so the float downcast is lossy.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Bounds are inclusive: a value equal to the dtype's min or max
                # is still representable, so e.g. [-128, 127] fits in int8.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No match (e.g. min/max are NaN for an empty column): keep the dtype.
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max are NaN/inf.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-sized starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [4]:
# Directory that holds the input data files, relative to this notebook's location.
file_folder =  '../../data/input'
# Trailing expression: the notebook shows the directory listing as the cell output.
os.listdir(file_folder)

['sample_submission.csv',
 'magnetic_shielding_tensors.csv',
 'potential_energy.csv',
 'scalar_coupling_contributions.csv',
 'dipole_moments.csv',
 'mulliken_charges.csv',
 'train.csv',
 'test.csv',
 'structures.csv',
 'structures']

In [5]:
# Load every CSV from the input folder into its own DataFrame.
# Pair-level tables: one row per atom pair of a molecule.
train = pd.read_csv(f'{file_folder}/train.csv')
test = pd.read_csv(f'{file_folder}/test.csv')
# Auxiliary tables. NOTE(review): none of these five frames is referenced by the
# cells shown here; confirm whether they are needed before keeping them in memory.
magnetic_shielding_tensors = pd.read_csv(f'{file_folder}/magnetic_shielding_tensors.csv')
dipole_moments = pd.read_csv(f'{file_folder}/dipole_moments.csv')
mulliken_charges = pd.read_csv(f'{file_folder}/mulliken_charges.csv')
potential_energy = pd.read_csv(f'{file_folder}/potential_energy.csv')
scalar_coupling_contributions = pd.read_csv(f'{file_folder}/scalar_coupling_contributions.csv')
# Per-atom table; the feature code below reads its atom, atom_index and x/y/z columns.
structures = pd.read_csv(f'{file_folder}/structures.csv')

In [6]:
def get_dist_matrix(df_structures_idx, molecule):
    """Return the matrix of squared pairwise distances between the atoms of one molecule.

    Note that no square root is taken: entry ``[i, j]`` is the *squared*
    Euclidean distance between the i-th and j-th structure rows of the molecule.

    Parameters
    ----------
    df_structures_idx : pandas.DataFrame
        Structures table indexed by molecule name, with ``x``, ``y`` and ``z``
        coordinate columns.
    molecule : hashable
        Index label of the molecule to look up.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(num_atoms, num_atoms)`` with zeros on the diagonal.
    """
    df_temp = df_structures_idx.loc[molecule]
    # A molecule with a single structure row comes back from ``.loc`` as a
    # Series (usually object dtype), so its coordinates form a flat length-3
    # vector. Promote it to one row of shape (1, 3) instead of letting it be
    # mistaken for three atoms.
    locs = np.atleast_2d(df_temp[['x', 'y', 'z']].values)
    if locs.dtype == object:
        locs = locs.astype(float)
    # Broadcasting: deltas[i, j, :] = locs[j] - locs[i].
    deltas = locs[np.newaxis, :, :] - locs[:, np.newaxis, :]
    dist_mat = (deltas ** 2).sum(axis=-1)
    return dist_mat

In [7]:
def assign_atoms_index(df_idx, molecule):
    """Return the sorted, de-duplicated atom indices that appear in a molecule's pairs.

    Parameters
    ----------
    df_idx : pandas.DataFrame
        Pair table indexed by molecule name, with ``atom_index_0`` and
        ``atom_index_1`` columns.
    molecule : hashable
        Index label of the molecule to look up.

    Returns
    -------
    numpy.ndarray
        1-D array of the distinct values found in either index column, ascending.
    """
    rows = df_idx.loc[molecule]
    # When the molecule has a single pair, ``.loc`` yields one row and the
    # column lookups return scalars. ``atleast_1d`` handles that case for any
    # integer type, not only ``np.int64``.
    idx_0 = np.atleast_1d(np.asarray(rows['atom_index_0']))
    idx_1 = np.atleast_1d(np.asarray(rows['atom_index_1']))
    # np.unique returns the distinct values already sorted.
    assign_idx = np.unique(np.concatenate([idx_0, idx_1]))
    return assign_idx

In [8]:
def get_pickup_dist_matrix(df_idx, df_structures_idx, molecule, num_pickup=5, atoms=('H', 'C', 'N', 'O', 'F')):
    """Build, for each paired atom of a molecule, its nearest-neighbour features per element.

    For every atom index returned by ``assign_atoms_index`` the other atoms of
    the molecule are ordered from nearest to farthest. For each element in
    ``atoms`` the reciprocals of the first ``num_pickup`` entries of
    ``get_dist_matrix`` are kept (that matrix holds squared distances, so the
    features are 1 / d**2). Missing neighbours are left as zero.

    Parameters
    ----------
    df_idx : pandas.DataFrame
        Pair table indexed by molecule name (``atom_index_0``/``atom_index_1``).
    df_structures_idx : pandas.DataFrame
        Structures table indexed by molecule name, with ``atom``,
        ``atom_index`` and ``x``/``y``/``z`` columns.
    molecule : hashable
        Index label of the molecule to process.
    num_pickup : int, default 5
        Number of nearest neighbours kept per element.
    atoms : sequence of str
        Element symbols, in the order their feature groups are laid out.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(assigned atoms), len(atoms) * num_pickup)``. Row
        order follows the sorted atom indices; within a row the features are
        grouped by element, nearest neighbour first.
    """
    assigned_idxs = assign_atoms_index(df_idx, molecule)
    dist_mat = get_dist_matrix(df_structures_idx, molecule)

    # These lookups do not depend on the atom being processed, so do them once.
    mol_struct = df_structures_idx.loc[molecule]
    atoms_mole = mol_struct['atom'].values            # e.g. ['O', 'C', 'C', 'N', 'H', 'H', 'H']
    atoms_mole_idx = mol_struct['atom_index'].values  # e.g. [0, 1, 2, 3, 4, 5, 6]

    # Preallocate the result instead of stacking one row at a time.
    features = np.zeros([len(assigned_idxs), len(atoms), num_pickup])
    for row, idx in enumerate(assigned_idxs):
        # Exclude the atom itself so that its zero self-distance is never inverted.
        others = atoms_mole_idx != idx
        other_atoms = atoms_mole[others]
        # dist_mat is indexed by position; this assumes atom_index equals the
        # row position within the molecule (TODO confirm).
        other_dist = dist_mat[idx][others]

        nearest_first = np.argsort(other_dist)
        sorted_atoms = other_atoms[nearest_first]
        sorted_inv_dist = 1 / other_dist[nearest_first]

        for a, atom in enumerate(atoms):
            # Slicing caps the count at num_pickup; shorter groups stay zero-padded.
            pickup_dist = sorted_inv_dist[sorted_atoms == atom][:num_pickup]
            features[row, a, :len(pickup_dist)] = pickup_dist

    # Flatten each (element, rank) grid row-major into one feature row.
    return features.reshape(len(assigned_idxs), len(atoms) * num_pickup)

In [9]:
# Alias the loaded frames under df_* names; no copy is made, both names refer to the same object.
df_structures, df_train, df_test = structures, train, test

In [10]:
# Index each frame by molecule name so the rows of one molecule can be fetched with `.loc[molecule]`.
df_structures_idx, df_train_idx, df_test_idx = (
    frame.set_index('molecule_name') for frame in (df_structures, df_train, df_test)
)

In [12]:
def gen_df_distance(mols, df_train_idx, df_structures_idx, num_pickup=5, atoms=('H', 'C', 'N', 'O', 'F')):
    """Assemble the per-atom neighbour-distance feature table for a set of molecules.

    Parameters
    ----------
    mols : iterable
        Molecule names to process, in output order.
    df_train_idx : pandas.DataFrame
        Pair table indexed by molecule name. Despite the parameter name, the
        notebook passes the test pairs here as well.
    df_structures_idx : pandas.DataFrame
        Structures table indexed by molecule name.
    num_pickup : int, default 5
        Number of nearest neighbours kept per element.
    atoms : sequence of str
        Element symbols that define the feature groups and column names.

    Returns
    -------
    pandas.DataFrame
        One row per (molecule, paired atom) with columns ``molecule_name``,
        ``atom_index`` and ``dist_<element>_<rank>`` for each element and rank.
    """
    atoms = list(atoms)

    # Collect per-molecule pieces in lists and concatenate once at the end.
    # Growing numpy arrays inside the loop copies everything gathered so far
    # on each iteration.
    name_chunks = []
    index_chunks = []
    feature_chunks = []

    for mol in tqdm_notebook(mols):
        assigned_idxs = assign_atoms_index(df_train_idx, mol)
        dist_mat_mole = get_pickup_dist_matrix(
            df_train_idx, df_structures_idx, mol, num_pickup=num_pickup, atoms=atoms
        )

        name_chunks.extend([mol] * len(assigned_idxs))
        index_chunks.append(assigned_idxs)
        feature_chunks.append(dist_mat_mole)

    if feature_chunks:
        atoms_idx = np.concatenate(index_chunks)
        dist_mat = np.vstack(feature_chunks)
    else:
        # No molecules: keep the correctly shaped empty outputs.
        atoms_idx = np.zeros([0], dtype=np.int32)
        dist_mat = np.zeros([0, len(atoms) * num_pickup])

    # Column order matches the row-major (element, rank) layout of each feature row.
    col_name_list = ['dist_{}_{}'.format(a, n) for a in atoms for n in range(num_pickup)]

    se_mole = pd.Series(name_chunks, name='molecule_name')
    se_atom_idx = pd.Series(atoms_idx, name='atom_index')
    df_dist = pd.DataFrame(dist_mat, columns=col_name_list)
    df_distance = pd.concat([se_mole, se_atom_idx, df_dist], axis=1)

    return df_distance

In [15]:
# Build the per-atom distance features for every molecule in the training pairs.
# The function loops over the molecules one by one (85003 in the recorded run), so this can take a while.
df_distance_train = gen_df_distance(df_train['molecule_name'].unique(), df_train_idx, df_structures_idx)

HBox(children=(IntProgress(value=0, max=85003), HTML(value='')))




In [16]:
# Same feature build for the test pairs (45772 molecules in the recorded run).
df_distance_test = gen_df_distance(df_test['molecule_name'].unique(), df_test_idx, df_structures_idx)

HBox(children=(IntProgress(value=0, max=45772), HTML(value='')))




In [18]:
# Preview the first rows of the training feature table (rendered as the cell output).
df_distance_train.head()

Unnamed: 0,molecule_name,atom_index,dist_H_0,dist_H_1,dist_H_2,dist_H_3,dist_H_4,dist_C_0,dist_C_1,dist_C_2,...,dist_O_0,dist_O_1,dist_O_2,dist_O_3,dist_O_4,dist_F_0,dist_F_1,dist_F_2,dist_F_3,dist_F_4
0,dsgdb9nsd_000001,0,0.838682,0.83868,0.838674,0.838672,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,dsgdb9nsd_000001,1,0.314513,0.314503,0.3145,0.0,0.0,0.838672,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,dsgdb9nsd_000001,2,0.314513,0.314503,0.3145,0.0,0.0,0.838674,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,dsgdb9nsd_000001,3,0.314503,0.314503,0.3145,0.0,0.0,0.838682,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,dsgdb9nsd_000001,4,0.314503,0.314503,0.3145,0.0,0.0,0.83868,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# Preview the first rows of the test feature table (rendered as the cell output).
df_distance_test.head()

Unnamed: 0,molecule_name,atom_index,dist_H_0,dist_H_1,dist_H_2,dist_H_3,dist_H_4,dist_C_0,dist_C_1,dist_C_2,...,dist_O_0,dist_O_1,dist_O_2,dist_O_3,dist_O_4,dist_F_0,dist_F_1,dist_F_2,dist_F_3,dist_F_4
0,dsgdb9nsd_000004,0,0.886482,0.195583,0.0,0.0,0.0,0.695512,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,dsgdb9nsd_000004,1,0.886482,0.195583,0.0,0.0,0.0,0.695512,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,dsgdb9nsd_000004,2,0.090546,0.0,0.0,0.0,0.0,0.886482,0.195583,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,dsgdb9nsd_000004,3,0.090546,0.0,0.0,0.0,0.0,0.886482,0.195583,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,dsgdb9nsd_000015,0,0.837293,0.822961,0.822959,0.143484,0.143451,0.182995,0.0,0.0,...,0.505799,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
def merge_atom(df, df_distance):
    """Attach the per-atom distance features of both pair endpoints to ``df``.

    ``df_distance`` is joined twice with the default (inner) merge: first on
    ``atom_index_0``, then on ``atom_index_1``. Because the feature columns
    collide in the second join, pandas suffixes them ``_x`` (features of
    ``atom_index_0``) and ``_y`` (features of ``atom_index_1``). The two
    leftover ``atom_index`` join columns are removed from the result.
    """
    feature_keys = ['molecule_name', 'atom_index']
    merged = df
    for endpoint in ('atom_index_0', 'atom_index_1'):
        merged = merged.merge(
            df_distance,
            left_on=['molecule_name', endpoint],
            right_on=feature_keys,
        )
    return merged.drop(columns=['atom_index_x', 'atom_index_y'])

In [21]:
# Join the per-atom features onto every training pair, once for each endpoint of the pair.
df_train_dist = merge_atom(df_train, df_distance_train)

In [22]:
# Same join for the test pairs.
df_test_dist = merge_atom(df_test, df_distance_test)

In [23]:
# Check the merged shapes. In the recorded run train has one more column than test,
# presumably the scalar_coupling_constant target that only train carries.
print(df_train_dist.shape, df_test_dist.shape)

(4658147, 56) (2505542, 55)


In [24]:
# Persist the engineered features, dropping the pair keys, the type and (train only) the target.
# `drop` returns a new frame, so df_train_dist / df_test_dist themselves are left unchanged.
# NOTE(review): the ../../data/feature directory is presumably expected to exist already; confirm.
df_train_dist.drop(columns=['molecule_name','atom_index_0','atom_index_1','type','scalar_coupling_constant']).to_pickle('../../data/feature/coulomb-interaction-speed-up_train.pkl')
df_test_dist.drop(columns=['molecule_name','atom_index_0','atom_index_1','type']).to_pickle('../../data/feature/coulomb-interaction-speed-up_test.pkl')