In [1]:
%%HTML
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [1]:
import os
import pandas as pd
import openbabel
import numpy as np
from tqdm import tqdm

In [2]:
file_folder = '../../data/input'
os.listdir(file_folder)

['sample_submission.csv',
 'magnetic_shielding_tensors.csv',
 'potential_energy.csv',
 'scalar_coupling_contributions.csv',
 'dipole_moments.csv',
 'mulliken_charges.csv',
 'train.csv',
 'test.csv',
 'structures.csv',
 'structures']

In [3]:
train = pd.read_csv(f'{file_folder}/train.csv')
test = pd.read_csv(f'{file_folder}/test.csv')
magnetic_shielding_tensors = pd.read_csv(f'{file_folder}/magnetic_shielding_tensors.csv')
dipole_moments = pd.read_csv(f'{file_folder}/dipole_moments.csv')
mulliken_charges = pd.read_csv(f'{file_folder}/mulliken_charges.csv')
potential_energy = pd.read_csv(f'{file_folder}/potential_energy.csv')
scalar_coupling_contributions = pd.read_csv(f'{file_folder}/scalar_coupling_contributions.csv')
structures = pd.read_csv(f'{file_folder}/structures.csv')

In [4]:
obConversion = openbabel.OBConversion()
obConversion.SetInFormat("xyz")
xyz_path = f'{file_folder}/structures/'

In [5]:
def cis_trans_bond_indices(molecule_name):
    mol = openbabel.OBMol()
    obConversion.ReadFile(mol, f'{xyz_path}/{molecule_name}.xyz')
    obs = openbabel.OBStereoFacade(mol)
    has_ct = [obs.HasCisTransStereo(n) for n in range(mol.NumBonds())]
    return [i for i, x in enumerate(has_ct) if x == True] if has_ct else []

In [6]:
df = pd.DataFrame(structures.molecule_name.unique(), columns=['molecule_name'])
df.head()

Unnamed: 0,molecule_name
0,dsgdb9nsd_000001
1,dsgdb9nsd_000002
2,dsgdb9nsd_000003
3,dsgdb9nsd_000004
4,dsgdb9nsd_000005


In [7]:
df['bond_indices'] = df.molecule_name.apply(lambda x: cis_trans_bond_indices(x))
df['len_bond_indices'] = df.bond_indices.apply(lambda x:len(x))

In [8]:
train = pd.merge(train, df, how='left', on='molecule_name')
test = pd.merge(test, df, how='left', on='molecule_name')

In [12]:
def is_cis_trans(molecule_name, bond_indices, atom_index_0, atom_index_1):
    if len(bond_indices) == 0:
        return pd.Series([0,0])

    mol = openbabel.OBMol()
    obConversion.ReadFile(mol, f'{xyz_path}/{molecule_name}.xyz')
    obs = openbabel.OBStereoFacade(mol)
    
    is_cis   = [obs.GetCisTransStereo(i).IsCis(atom_index_0, atom_index_1) for i in bond_indices]
    is_trans = [obs.GetCisTransStereo(i).IsTrans(atom_index_0, atom_index_1) for i in bond_indices]
    return pd.Series([int(True in is_cis), int(True in is_trans)])

In [10]:
train[['is_cis','is_trans']] = train.apply(lambda x: is_cis_trans(x.molecule_name, x.bond_indices, x.atom_index_0, x.atom_index_1), axis=1)

In [15]:
test[['is_cis','is_trans']] = test.apply(lambda x: is_cis_trans(x.molecule_name, x.bond_indices, x.atom_index_0, x.atom_index_1), axis=1)

In [16]:
angles = pd.read_csv(f'../../data/temp/angles.csv')

In [17]:
angles.head()

Unnamed: 0,molecule_name,atom_index_0,atom_index_1,shortest_path_atoms,shortest_path_n_bonds,cosinus,dihedral
0,dsgdb9nsd_000001,1,0,,1,,
1,dsgdb9nsd_000001,1,2,C,2,-0.33329,
2,dsgdb9nsd_000001,1,3,C,2,-0.33333,
3,dsgdb9nsd_000001,1,4,C,2,-0.33335,
4,dsgdb9nsd_000001,2,0,,1,,


In [18]:
train = pd.merge(train,  angles[['molecule_name','atom_index_0','atom_index_1','dihedral']], how='left', on=['molecule_name','atom_index_0','atom_index_1'])

In [19]:
test = pd.merge(test,  angles[['molecule_name','atom_index_0','atom_index_1','dihedral']], how='left', on=['molecule_name','atom_index_0','atom_index_1'])

In [20]:
train.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,bond_indices,len_bond_indices,is_cis,is_trans,dihedral
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076,[],0,0,0,
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,[],0,0,0,
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,[],0,0,0,
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,[],0,0,0,
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,[],0,0,0,


In [21]:
test.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,bond_indices,len_bond_indices,is_cis,is_trans,dihedral
0,4658147,dsgdb9nsd_000004,2,0,2JHC,[],0,0,0,
1,4658148,dsgdb9nsd_000004,2,1,1JHC,[],0,0,0,
2,4658149,dsgdb9nsd_000004,2,3,3JHH,[],0,0,0,0.0
3,4658150,dsgdb9nsd_000004,3,0,1JHC,[],0,0,0,
4,4658151,dsgdb9nsd_000004,3,1,2JHC,[],0,0,0,


In [24]:
train.bond_indices.values[23233]

[]

In [25]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [26]:
df_train = train[['id', 'is_cis','is_trans','dihedral']]

In [27]:
df_train.head()

Unnamed: 0,id,is_cis,is_trans,dihedral
0,0,0,0,
1,1,0,0,
2,2,0,0,
3,3,0,0,
4,4,0,0,


In [28]:
df_test = test[['id', 'is_cis','is_trans','dihedral']]

In [29]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [30]:
df_train = reduce_mem_usage(df_train)

Mem. usage decreased to 71.08 Mb (60.0% reduction)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [31]:
df_test = reduce_mem_usage(df_test)

Mem. usage decreased to 38.23 Mb (60.0% reduction)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [34]:
df_train.to_pickle('../../data/feature/cis-trans-isomerism-feature_train.pkl')

In [35]:
df_test.to_pickle('../../data/feature/cis-trans-isomerism-feature_test.pkl')