In [1]:
%%HTML
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [2]:
import os
import pandas as pd
import openbabel
import numpy as np
from tqdm import tqdm

In [3]:
file_folder = '../../data/input'
os.listdir(file_folder)

['sample_submission.csv',
 'magnetic_shielding_tensors.csv',
 'potential_energy.csv',
 'scalar_coupling_contributions.csv',
 'dipole_moments.csv',
 'mulliken_charges.csv',
 'train.csv',
 'test.csv',
 'structures.csv',
 'structures']

In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir(file_folder))

# Any results you write to the current directory are saved as output.

['sample_submission.csv', 'magnetic_shielding_tensors.csv', 'potential_energy.csv', 'scalar_coupling_contributions.csv', 'dipole_moments.csv', 'mulliken_charges.csv', 'train.csv', 'test.csv', 'structures.csv', 'structures']


In [6]:
import scipy as sp #collection of functions for scientific computing and advance mathematics
from scipy import stats
from scipy.stats import norm, skew
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax


# warnings mute
import warnings
warnings.simplefilter('ignore')

In [7]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [9]:
train = pd.read_csv(f'{file_folder}/train.csv')
test = pd.read_csv(f'{file_folder}/test.csv')
magnetic_shielding_tensors = pd.read_csv(f'{file_folder}/magnetic_shielding_tensors.csv')
dipole_moments = pd.read_csv(f'{file_folder}/dipole_moments.csv')
mulliken_charges = pd.read_csv(f'{file_folder}/mulliken_charges.csv')
potential_energy = pd.read_csv(f'{file_folder}/potential_energy.csv')
contributions = pd.read_csv(f'{file_folder}/scalar_coupling_contributions.csv')
structures = pd.read_csv(f'{file_folder}/structures.csv')

structures_notreal = structures

In [10]:
#electronegativity pauling
electronegativity = {'H':2.2, 'C':2.55, 'N':3.04 , 'O':3.44, 'F':3.98 }
#Type de charge 
charge ={'H':0, 'C':1, 'N':1 , 'O':3.44, 'F':3.98 }
#etat
etat ={'H':0, 'C':1, 'N':0, 'O':3.44, 'F':3.98 }
#Masse kg/m^3
masse = {'H':76, 'C':3513, 'N':1026, 'O':3.44, 'F':3.98 }
#volume cm^3/mole
volume = {'H':13.26, 'C':3.42, 'N':13.65, 'O':3.44, 'F':3.98 }
#Rayon atomique (mesuré)
rayon_am = {'H':25, 'C':70, 'N':65, 'O':3.44, 'F':3.98 }
#Rayon atomique (calculé)
rayon_ac = {'H':53, 'C':67, 'N':56 ,'O':0.73, 'F':0.71}
#Rayon covalent
rayon_c = {'H':38, 'C':77, 'N':75, 'O':3.44, 'F':3.98 }
#Rayon ionique
rayon_i = {'H':-3, 'C':4, 'N':-3, 'O':3.44, 'F':3.98 }
#Rayon de Van der Waals
rayon_vdw = {'H':120, 'C':170, 'N':155, 'O':3.44, 'F':3.98 }

In [11]:
#Point de fusion
fusion = {'H':-259.1, 'C':3546.9, 'N':-209.9, 'O':3.44, 'F':3.98 }
#seuil d'ébulution minimal (celcius)
ebulution_min = {'H':-252.9, 'C':4826.9, 'N':-195.8, 'O':3.44, 'F':3.98 }
#Enthalpie de fusion ΔHf (kj/mol)
enthalpie_fusion = {'H':0.12, 'C':105, 'N':0.72, 'O':3.44, 'F':3.98 }
#Enthalpie de vaporisation ΔHv
enthalpie_vaporisation = {'H':0.46, 'C':710.9, 'N':5.58, 'O':3.44, 'F':3.98 }
#Capacité thermique
capacite_thermique = {'H':14.3, 'C':0.71, 'N':1.04, 'O':3.44, 'F':3.98 }
#Conductivité thermique
conductivite_thermique = {'H':0.18, 'C':990, 'N':0.03, 'O':3.44, 'F':3.98 }

In [12]:
#Nb isotopes
isotopes = {'H':3, 'C':12, 'N':12, 'O':3.44, 'F':3.98 }
#Isotopes emeteurs
isotopes_emeteurs = {'H':0, 'C':3, 'N':3, 'O':3.44, 'F':3.98 }

In [13]:
dico_chemical_elements = {'electronegativity':electronegativity ,
                         'charge':charge,
                          'etat':etat,
                          'masse':masse,
                          'volume':volume,
                          'rayon_am':rayon_am,
                          'rayon_ac':rayon_ac,
                          'rayon_c':rayon_c,
                          'rayon_i':rayon_i,
                          'rayon_vdw':rayon_vdw,
                          'fusion':fusion,
                          'ebulution_min':ebulution_min,
                          'enthalpie_fusion':enthalpie_fusion,
                          'enthalpie_vaporisation':enthalpie_vaporisation,
                          'capacite_thermique':capacite_thermique,
                          'conductivite_thermique':conductivite_thermique,
                          'isotopes':isotopes,
                          'isotopes_emeteurs':isotopes_emeteurs
                         }

In [14]:
def dico_todf(list_dicos, df):
    for k,v in list_dicos.items():
        df[k] = df['atom'].apply(lambda x : v[x])
    return df 

In [15]:
def map_atom_info(df, atom_idx):
    df = pd.merge(df, structures, how = 'left',
                  left_on  = ['molecule_name', f'atom_index_{atom_idx}'],
                  right_on = ['molecule_name',  'atom_index'])
    
    df = df.drop('atom_index', axis=1)
    df = df.rename(columns={'atom': f'atom_{atom_idx}',
                            'x': f'x_{atom_idx}',
                            'y': f'y_{atom_idx}',
                            'z': f'z_{atom_idx}'})
    return df

In [16]:
def dihedral_angle(data): 
        
    vals = np.array(data[:, 3:6], dtype=np.float64)
    mol_names = np.array(data[:, 0], dtype=np.str)
 
    result = np.zeros((data.shape[0], 2), dtype=object)
    # use every 4 rows to compute the dihedral angle
    for idx in range(0, vals.shape[0] - 4, 4):

        a0 = vals[idx]
        a1 = vals[idx + 1]
        a2 = vals[idx + 2]
        a3 = vals[idx + 3]
        
        b0 = a0 - a1
        b1 = a2 - a1
        b2 = a3 - a2
        
        # normalize b1 so that it does not influence magnitude of vector
        # rejections that come next
        b1 /= np.linalg.norm(b1)
    
        # vector rejections
        # v = projection of b0 onto plane perpendicular to b1
        #   = b0 minus component that aligns with b1
        # w = projection of b2 onto plane perpendicular to b1
        #   = b2 minus component that aligns with b1

        v = b0 - np.dot(b0, b1) * b1
        w = b2 - np.dot(b2, b1) * b1

        # angle between v and w in a plane is the torsion angle
        # v and w may not be normalized but that's fine since tan is y/x
        x = np.dot(v, w)
        y = np.dot(np.cross(b1, v), w)
       
        # We want all 4 first rows for every molecule to have the same value
        # (in order to have the same length as the dataframe)
        result[idx:idx + 4] = [mol_names[idx], np.degrees(np.arctan2(y, x))]
        
    return result

In [17]:
from datetime import datetime
startTime = datetime.now()
dihedral = dihedral_angle(structures[structures.groupby('molecule_name')['atom_index'].transform('count').ge(4)].groupby('molecule_name').head(4).values)
print('Time elapsed (hh:mm:ss.ms) {}'.format(datetime.now() - startTime))

Time elapsed (hh:mm:ss.ms) 0:00:12.771621


In [18]:
themap = {k:v for k, v in dihedral if k}
# Add diehral and cos diehral angle to features 
structures['dihedral'] = structures['molecule_name'].map(themap)
structures['cosdihedral'] = structures['dihedral'].map(np.cos)

In [19]:
atoms = structures['atom'].values
atoms_en = [electronegativity[x] for x in (atoms)]
atoms_rad = [rayon_ac[x] for x in (atoms)]

structures['EN'] = atoms_en
structures['rad'] = atoms_rad

In [20]:
#Add bonds to features
i_atom = structures['atom_index'].values
p = structures[['x', 'y', 'z']].values
p_compare = p
m = structures['molecule_name'].values
m_compare = m
r = structures['rad'].values
r_compare = r

source_row = np.arange(len(structures))
max_atoms = 28

bonds = np.zeros((len(structures)+1, max_atoms+1), dtype=np.int8)
bond_dists = np.zeros((len(structures)+1, max_atoms+1), dtype=np.float32)


for i in (range(max_atoms-1)):
    p_compare = np.roll(p_compare, -1, axis=0)
    m_compare = np.roll(m_compare, -1, axis=0)
    r_compare = np.roll(r_compare, -1, axis=0)
    
    mask = np.where(m == m_compare, 1, 0) #Are we still comparing atoms in the same molecule?
    dists = np.linalg.norm(p - p_compare, axis=1) * mask
    r_bond = r + r_compare
    
    bond = np.where(np.logical_and(dists > 0.0001, dists < r_bond), 1, 0)
    
    source_row = source_row
    target_row = source_row + i + 1 #Note: Will be out of bounds of bonds array for some values of i
    target_row = np.where(np.logical_or(target_row > len(structures), mask==0), len(structures), target_row) #If invalid target, write to dummy row
    
    source_atom = i_atom
    target_atom = i_atom + i + 1 #Note: Will be out of bounds of bonds array for some values of i
    target_atom = np.where(np.logical_or(target_atom > max_atoms, mask==0), max_atoms, target_atom) #If invalid target, write to dummy col
    
    bonds[(source_row, target_atom)] = bond
    bonds[(target_row, source_atom)] = bond
    bond_dists[(source_row, target_atom)] = dists
    bond_dists[(target_row, source_atom)] = dists

bonds = np.delete(bonds, axis=0, obj=-1) #Delete dummy row
bonds = np.delete(bonds, axis=1, obj=-1) #Delete dummy col
bond_dists = np.delete(bond_dists, axis=0, obj=-1) #Delete dummy row
bond_dists = np.delete(bond_dists, axis=1, obj=-1) #Delete dummy col


bonds_numeric = [[i for i,x in enumerate(row) if x] for row in (bonds)]
bond_lengths = [[dist for i,dist in enumerate(row) if i in bonds_numeric[j]] for j,row in enumerate((bond_dists))]
bond_lengths_mean = [ np.mean(x) for x in bond_lengths]
n_bonds = [len(x) for x in bonds_numeric]

bond_data = {'n_bonds':n_bonds, 'bond_lengths_mean': bond_lengths_mean }
bond_df = pd.DataFrame(bond_data)
structures = structures.join(bond_df)

In [21]:
df_struct_aux= structures.groupby(['molecule_name'])['atom'].agg([('bonds_distc',lambda x : len(set(x.tolist())))]).reset_index()
structures=pd.merge(structures,df_struct_aux)

In [22]:
structures=dico_todf(dico_chemical_elements,structures)

In [23]:
structures_aux_xbary=structures.groupby(['molecule_name'])[['x','y','z']].agg({'avg':np.average}).reset_index()
structures_aux_xbary_=pd.DataFrame(structures_aux_xbary.get_values())
structures_aux_xbary_.columns=['molecule_name','x_bar','y_bar','z_bar']
structures_bary = pd.merge(structures,structures_aux_xbary_)
del structures_bary['atom']
structures=structures_bary

In [24]:
train = map_atom_info(train, 0)
train = map_atom_info(train, 1)
test = map_atom_info(test, 0)
test = map_atom_info(test, 1)
del train['x_bar_y']
del train['y_bar_y']
del train['z_bar_y']
del test['x_bar_y']
del test['y_bar_y']
del test['z_bar_y']

In [26]:
train.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,x_0,y_0,z_0,dihedral_x,...,rayon_i_y,rayon_vdw_y,fusion_y,ebulution_min_y,enthalpie_fusion_y,enthalpie_vaporisation_y,capacite_thermique_y,conductivite_thermique_y,isotopes_y,isotopes_emeteurs_y
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076,0.00215,-0.006031,0.001976,-35.263968,...,4.0,170.0,3546.9,4826.9,105.0,710.9,0.71,990.0,12.0,3.0
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,0.00215,-0.006031,0.001976,-35.263968,...,-3.0,120.0,-259.1,-252.9,0.12,0.46,14.3,0.18,3.0,0.0
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,0.00215,-0.006031,0.001976,-35.263968,...,-3.0,120.0,-259.1,-252.9,0.12,0.46,14.3,0.18,3.0,0.0
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,0.00215,-0.006031,0.001976,-35.263968,...,-3.0,120.0,-259.1,-252.9,0.12,0.46,14.3,0.18,3.0,0.0
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,1.011731,1.463751,0.000277,-35.263968,...,4.0,170.0,3546.9,4826.9,105.0,710.9,0.71,990.0,12.0,3.0


In [27]:
df_train = reduce_mem_usage(train)

Memory usage of dataframe is 2345.56 MB
Memory usage after optimization is: 638.97 MB
Decreased by 72.8%


In [28]:
df_test = reduce_mem_usage(test)

Memory usage of dataframe is 1242.52 MB
Memory usage after optimization is: 338.53 MB
Decreased by 72.8%


In [29]:
df_train.to_pickle('../../data/feature/feature-engineering-physical-chemical-measurement_train.pkl')

In [30]:
df_test.to_pickle('../../data/feature/feature-engineering-physical-chemical-measurement_test.pkl')