In [1]:
%%HTML
<style>
   /* Override the widths of the notebook page, menu bar and toolbar containers
      so the wide feature tables below have more horizontal room.
      NOTE(review): these element ids look like the classic Notebook UI - confirm
      they still match the front end in use. */
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [2]:
import sys
sys.path.append("../..") # Adds higher directory to python modules path.
from utilities import aggregate_feature_calculators
from utilities import aggregate_feature_calculators_setting as aggcal
from utilities.parallel import Parallel

In [3]:
import os
import pandas as pd
import numpy as np
import math

from tqdm import tqdm_notebook, tqdm

import openbabel

In [4]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    Each column whose dtype is one of int16/int32/int64/float16/float32/float64
    is inspected; its min and max decide the smallest integer (int8..int64) or
    float (float16..float64) dtype whose representable range contains them.
    The range check is inclusive, so a column whose values sit exactly on a
    dtype's limits (e.g. -128..127) is still stored in that dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place (columns are reassigned).
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.

    Notes
    -----
    Only the value *range* is checked for floats, not precision: a float64
    column whose values fit the float16 range is stored as float16 and loses
    significant digits.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate dtypes, narrowest first.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # If min/max are NaN (empty column) no candidate matches and the
            # column is left untouched.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            else:
                # Out of float32 range, or min/max are NaN/inf.
                target = np.float64
            df[col] = df[col].astype(target)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Avoid dividing by zero when the frame reports no memory at all.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [5]:
# Folder holding the input files, given relative to this notebook's location.
file_folder =  '../../data/input'
# Show the folder contents as a quick check of which inputs are present.
os.listdir(file_folder)

['structures',
 'magnetic_shielding_parameters.csv',
 'train.csv',
 'dipole_moments.csv',
 'magnetic_shielding_tensors.csv',
 'mulliken_charges.csv',
 'potential_energy.csv',
 'sample_submission.csv',
 'scalar_coupling_contributions.csv',
 'structures.csv',
 'test.csv']

In [6]:
# Load every CSV table from the input folder into its own DataFrame.
# NOTE(review): only train, test and structures are referenced by the cells
# visible below; the other tables may be unused here - confirm before keeping
# them in memory.
train = pd.read_csv(f'{file_folder}/train.csv')
test = pd.read_csv(f'{file_folder}/test.csv')
magnetic_shielding_tensors = pd.read_csv(f'{file_folder}/magnetic_shielding_tensors.csv')
dipole_moments = pd.read_csv(f'{file_folder}/dipole_moments.csv')
mulliken_charges = pd.read_csv(f'{file_folder}/mulliken_charges.csv')
potential_energy = pd.read_csv(f'{file_folder}/potential_energy.csv')
scalar_coupling_contributions = pd.read_csv(f'{file_folder}/scalar_coupling_contributions.csv')
structures = pd.read_csv(f'{file_folder}/structures.csv')

In [7]:
# Per-molecule atom count: the largest atom_index in `structures` plus one.
# The result keeps the name `x` because the test merge cell reuses it.
x = (
    structures
    .groupby('molecule_name')['atom_index']
    .max()
    .add(1)
    .rename('totalatoms')
    .reset_index()
)
# Attach the atom count to every training row of the same molecule.
# Re-running this cell merges again and duplicates the totalatoms column.
train = train.merge(x, on='molecule_name')
# Debug aid: restrict to a single molecule.
# train = train[train.molecule_name=='dsgdb9nsd_000001']

In [8]:
# Attach the per-molecule atom count held in `x` (column totalatoms) to the test rows.
test = test.merge(x,on='molecule_name')

In [9]:
# Parse every .xyz structure file once into an Open Babel molecule.
# `mols` holds the parsed molecules and `mols_index` maps a file name to its
# position in `mols`; _worker looks molecules up as mols[mols_index[name + '.xyz']].
obConversion = openbabel.OBConversion()
obConversion.SetInFormat("xyz")
structdir=f'{file_folder}/structures/'
# Only .xyz files are parsed, in sorted order, so the position assigned to each
# file does not depend on directory listing order or on dict iteration order.
mols_files=sorted(f for f in os.listdir(structdir) if f.endswith('.xyz'))
mols=[]
mols_index={}
for idx, f in enumerate(mols_files):
    mol = openbabel.OBMol()
    # ReadFile reports failure through its return value; stop here rather than
    # keep an empty molecule that would only fail later during atom lookups.
    if not obConversion.ReadFile(mol, structdir+f):
        raise IOError('Could not read structure file: ' + structdir + f)
    mols_index[f] = idx
    mols.append(mol)

In [10]:
def _worker(item):
    """Compute distance/angle features for every coupling row of one molecule.

    Parameters
    ----------
    item : tuple
        ``(molecule_name, group_dataframe)``, as produced by iterating a
        ``groupby('molecule_name')``. The frame must provide the columns
        ``id``, ``totalatoms``, ``atom_index_0``, ``atom_index_1`` and ``type``;
        ``scalar_coupling_constant`` is copied through when present.

    Returns
    -------
    list of dict
        One dict per row. Besides the copied identifiers it holds
        ``bond_distance`` / ``bond_atom`` for the atom pair and, for every other
        atom of the molecule ranked by distance from the first atom,
        ``tertiary_atom_k``, ``tertiary_distance_k`` and ``tertiary_angle_k``.

    Notes
    -----
    Reads the module-level ``mols`` list and ``mols_index`` mapping to find the
    parsed molecule for ``molecule_name + '.xyz'``.
    """
    m = item[0]
    groupdf = item[1]
    mol = mols[mols_index[m+'.xyz']]
    has_target = 'scalar_coupling_constant' in groupdf.columns
    list_ = []
    for i in groupdf.index.values:
        # Fetch the row once instead of re-running the .loc lookup per field.
        row = groupdf.loc[i]

        id_ = int(row['id'])
        totalatoms = row['totalatoms']
        firstatomid = int(row['atom_index_0'])
        secondatomid = int(row['atom_index_1'])
        entrystats = {}
        entrystats['id'] = id_
        entrystats['totalatoms'] = totalatoms
        if has_target:
            entrystats['scalar_coupling_constant'] = float(row['scalar_coupling_constant'])
        entrystats['type'] = row['type']
        a = mol.GetAtomById(firstatomid)
        b = mol.GetAtomById(secondatomid)
        entrystats['molecule_name'] = m
        entrystats['atom_index_0'] = firstatomid
        entrystats['atom_index_1'] = secondatomid
        entrystats['bond_distance'] = a.GetDistance(b)
        entrystats['bond_atom'] = b.GetType()

        # Collect (distance, type, angle) for every atom other than the pair.
        # A list plus a stable sort keeps atoms with identical distances in
        # atom-index order. Nudging the distance by 1e-15 to make it a unique
        # dict key can loop forever for distances of about 16 or more, where
        # the increment is rounded away, and it alters the stored distance.
        tertiarystats = []
        for c in range(int(totalatoms)):
            if c == firstatomid or c == secondatomid:
                continue
            tertiaryatom = mol.GetAtomById(c)
            tp = tertiaryatom.GetType()
            dist = a.GetDistance(tertiaryatom)
            # Angle from GetAngle scaled by pi/180 (degrees to radians,
            # assuming Open Babel reports degrees).
            ang = a.GetAngle(b,tertiaryatom)*math.pi/180
            tertiarystats.append((dist, tp, ang))

        # Order the tertiary atoms by distance from the first atom.
        tertiarystats.sort(key=lambda entry: entry[0])

        for k, (dist, tp, ang) in enumerate(tertiarystats):
            entrystats['tertiary_atom_'+str(k)] = tp
            entrystats['tertiary_distance_'+str(k)] = dist
            entrystats['tertiary_angle_'+str(k)] = ang
        list_.append(entrystats)
    return list_


def extract_features(df):
    """Build the per-coupling feature table for all molecules in ``df``.

    The unique molecule names are processed in roughly 100 segments. For each
    segment the matching rows are grouped by molecule and handed to
    ``Parallel(_worker, {}).run(..., n_jobs=16)``; the returned records become
    one DataFrame per segment. All segment frames are concatenated once at the
    end and ordered by ``id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Coupling rows; must contain ``molecule_name`` plus the columns that
        ``_worker`` reads.

    Returns
    -------
    pandas.DataFrame
        One row per record returned by the workers, sorted by ``id``. An empty
        DataFrame is returned when ``df`` contains no molecules.
    """
    molecule_names = df.molecule_name.unique().tolist()
    N_ = len(molecule_names)
    if N_ == 0:
        # Nothing to process; sort_values(by=['id']) on a frame with no columns would fail.
        return pd.DataFrame()
    segment = N_//100 if N_ > 100 else 1
    frames = []
    for start in tqdm_notebook(range(0, N_, segment)):
        # Slicing past the end is safe, so no explicit clamp of the end index.
        segment_names = molecule_names[start:start+segment]
        res = Parallel(_worker,{}).run(df[df['molecule_name'].isin(segment_names)].groupby('molecule_name'), n_jobs=16)
        frames.append(pd.DataFrame(res))
    # Concatenate once rather than growing a frame inside the loop, which
    # re-copies all rows each iteration. sort=True keeps the alphabetical
    # column order of the previously recorded output and silences the pandas
    # "sort" FutureWarning.
    df_ = pd.concat(frames, axis=0, sort=True)
    df_ = df_.sort_values(by=['id'])
    return df_


In [11]:
# Build the distance/angle feature table for the training couplings (long-running).
df_train = extract_features(train)

HBox(children=(IntProgress(value=0, max=101), HTML(value='')))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.







In [12]:
# Sanity check: the feature table should have the same number of rows as train.
print(df_train.shape, train.shape)
df_train.head()

(4658147, 90) (4658147, 7)


Unnamed: 0,atom_index_0,atom_index_1,bond_atom,bond_distance,id,molecule_name,scalar_coupling_constant,tertiary_angle_0,tertiary_angle_1,tertiary_angle_10,...,tertiary_distance_26,tertiary_distance_3,tertiary_distance_4,tertiary_distance_5,tertiary_distance_6,tertiary_distance_7,tertiary_distance_8,tertiary_distance_9,totalatoms,type
0,1,0,C3,1.091953,0,dsgdb9nsd_000001,84.8076,1.910584,1.910635,,...,,,,,,,,,5,1JHC
1,1,2,H,1.78312,1,dsgdb9nsd_000001,-11.257,0.615505,1.047203,,...,,,,,,,,,5,2JHH
2,1,3,H,1.783147,2,dsgdb9nsd_000001,-11.2548,0.615481,1.047176,,...,,,,,,,,,5,2JHH
3,1,4,H,1.783157,3,dsgdb9nsd_000001,-11.2543,0.615474,1.047176,,...,,,,,,,,,5,2JHH
4,2,0,C3,1.091952,4,dsgdb9nsd_000001,84.8074,1.910584,1.910637,,...,,,,,,,,,5,1JHC


In [13]:
# Build the same feature table for the test couplings (long-running).
df_test = extract_features(test)

HBox(children=(IntProgress(value=0, max=101), HTML(value='')))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.







In [19]:
# Compare the test and train feature table shapes (rows, columns).
print(df_test.shape, df_train.shape)

(2505542, 89) (4658147, 90)


In [20]:
# Columns present in the train features but missing from the test features.
set(df_train.columns.tolist()) - set(df_test.columns.tolist())

{'scalar_coupling_constant'}

In [21]:
# Downcast numeric columns to shrink memory; floats may end up as float16 (reduced precision).
df_train = reduce_mem_usage(df_train)

Mem. usage decreased to 1630.34 Mb (49.6% reduction)


In [22]:
# Apply the same numeric downcast to the test features.
df_test = reduce_mem_usage(df_test)

Mem. usage decreased to 872.16 Mb (49.3% reduction)


In [23]:
# Persist the train features without the identifier/target columns listed here;
# `id` is not dropped, so it stays in the pickle.
df_train.drop(columns=['molecule_name','atom_index_0','atom_index_1','type','scalar_coupling_constant']).to_pickle('../../data/feature/angles-and-distances_train.pkl')

In [24]:
# Persist the test features the same way (there is no target column to drop here).
df_test.drop(columns=['molecule_name','atom_index_0','atom_index_1','type']).to_pickle('../../data/feature/angles-and-distances_test.pkl')