In [2]:
import openbabel
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
# tqdm.pandas()

In [6]:
# Root folder holding the competition input files; every later path is derived from it.
base_path = '../../data/input'
# List the folder through base_path so the listing cannot drift from the path actually used below.
print(os.listdir(base_path))

['sample_submission.csv', 'magnetic_shielding_tensors.csv', 'potential_energy.csv', 'scalar_coupling_contributions.csv', 'dipole_moments.csv', 'mulliken_charges.csv', 'train.csv', 'test.csv', 'structures.csv', 'structures']


In [21]:
# Atom-pair tables that will receive the cis/trans features.
train = pd.read_csv(f'{base_path}/train.csv')
test = pd.read_csv(f'{base_path}/test.csv')

# Structures table; used below only as the source of the molecule names.
structures = pd.read_csv(f'{base_path}/structures.csv')

In [22]:

# Shared Open Babel converter used by every function that parses an .xyz file.
obConversion = openbabel.OBConversion()
# SetInFormat reports failure through its return value; stop here rather than
# letting every later ReadFile call quietly produce empty molecules.
if not obConversion.SetInFormat("xyz"):
    raise RuntimeError('Open Babel could not select the "xyz" input format')
# Folder with one <molecule_name>.xyz file per molecule. No trailing slash:
# callers append '/<molecule_name>.xyz' themselves.
xyz_path = f'{base_path}/structures'

def cis_trans_bond_indices(molecule_name):
    """Return the bond indices of a molecule that carry cis/trans stereo.

    The molecule is parsed from ``<xyz_path>/<molecule_name>.xyz`` with the
    module-level ``obConversion``.

    Parameters
    ----------
    molecule_name : str
        File stem of the .xyz file to read.

    Returns
    -------
    list of int
        Every ``n`` in ``range(mol.NumBonds())`` for which
        ``OBStereoFacade.HasCisTransStereo(n)`` is true; empty if there is none.
    """
    mol = openbabel.OBMol()
    # NOTE(review): ReadFile's return value is not checked; presumably an
    # unreadable file leaves `mol` empty and yields [] - confirm if that matters.
    obConversion.ReadFile(mol, os.path.join(xyz_path, f'{molecule_name}.xyz'))
    facade = openbabel.OBStereoFacade(mol)
    return [bond_id for bond_id in range(mol.NumBonds())
            if facade.HasCisTransStereo(bond_id)]

In [23]:
# One row per distinct molecule, in order of first appearance in `structures`.
unique_names = structures['molecule_name'].unique()
df = pd.DataFrame({'molecule_name': unique_names})
df.head()

Unnamed: 0,molecule_name
0,dsgdb9nsd_000001
1,dsgdb9nsd_000002
2,dsgdb9nsd_000003
3,dsgdb9nsd_000004
4,dsgdb9nsd_000005


In [24]:
# Per molecule: which bonds have cis/trans stereo, and how many there are.
df['bond_indices'] = df['molecule_name'].apply(cis_trans_bond_indices)
df['len_bond_indices'] = df['bond_indices'].apply(len)

In [25]:
# Distinct counts of cis/trans bonds seen across all molecules.
df['len_bond_indices'].unique()

array([0, 1, 2, 3, 4])

In [26]:
# Preview the molecules that have at least one cis/trans bond.
df.loc[df['len_bond_indices'].ne(0)].head()

Unnamed: 0,molecule_name,bond_indices,len_bond_indices
114,dsgdb9nsd_000120,[8],1
150,dsgdb9nsd_000156,[10],1
151,dsgdb9nsd_000157,[8],1
152,dsgdb9nsd_000158,[6],1
161,dsgdb9nsd_000167,[1],1


In [27]:
# Attach the per-molecule bond information to every atom pair.
# validate='many_to_one' makes pandas raise if `df` ever holds a molecule twice,
# which would otherwise silently duplicate train/test rows.
train = pd.merge(train, df, how='left', on='molecule_name', validate='many_to_one')
test = pd.merge(test, df, how='left', on='molecule_name', validate='many_to_one')

In [28]:
def is_cis_trans(molecule_name, bond_indices, atom_index_0, atom_index_1):
    """Flag whether an atom pair is cis and/or trans across any listed bond.

    Parameters
    ----------
    molecule_name : str
        File stem of the .xyz file, read from ``xyz_path`` with the
        module-level ``obConversion``.
    bond_indices : sequence of int
        Bond indices to inspect, as produced by ``cis_trans_bond_indices``.
    atom_index_0, atom_index_1 : int
        The two atoms passed to ``IsCis`` / ``IsTrans``.

    Returns
    -------
    pandas.Series
        Two integers ``[is_cis, is_trans]``; each is 1 if the corresponding
        check is true for at least one bond in ``bond_indices``, else 0.
        ``[0, 0]`` is returned without touching the file when ``bond_indices``
        is empty.
    """
    if len(bond_indices) == 0:
        return pd.Series([0, 0])

    mol = openbabel.OBMol()
    obConversion.ReadFile(mol, os.path.join(xyz_path, f'{molecule_name}.xyz'))
    facade = openbabel.OBStereoFacade(mol)

    # Plain Python ints keep the SWIG-wrapped calls happy even when the caller
    # passes numpy integer scalars.
    first, second = int(atom_index_0), int(atom_index_1)

    cis = trans = False
    for bond_id in bond_indices:
        # Fetch the stereo object once per bond and reuse it for both checks.
        stereo = facade.GetCisTransStereo(bond_id)
        cis = cis or stereo.IsCis(first, second)
        trans = trans or stereo.IsTrans(first, second)
    return pd.Series([int(cis), int(trans)])

In [29]:
# Rows whose molecule has no cis/trans bond are (0, 0) by definition of
# is_cis_trans, so fill those directly and run the slow row-wise Open Babel
# lookup only on the rows that can be non-zero.
train['is_cis'] = 0
train['is_trans'] = 0
has_ct_bonds = train['len_bond_indices'] != 0
if has_ct_bonds.any():
    ct_flags = train.loc[has_ct_bonds].apply(
        lambda row: is_cis_trans(row.molecule_name,
                                 row.bond_indices,
                                 row.atom_index_0,
                                 row.atom_index_1),
        axis=1)
    # .values drops the 0/1 column labels so the assignment is positional.
    train.loc[has_ct_bonds, ['is_cis', 'is_trans']] = ct_flags.values

In [30]:
# Preview the first rows of train, now including the is_cis / is_trans columns.
train.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,bond_indices,len_bond_indices,is_cis,is_trans
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076,[],0,0,0
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,[],0,0,0
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,[],0,0,0
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,[],0,0,0
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,[],0,0,0


In [32]:
# Rows whose molecule has no cis/trans bond are (0, 0) by definition of
# is_cis_trans, so fill those directly and run the slow row-wise Open Babel
# lookup only on the rows that can be non-zero.
test['is_cis'] = 0
test['is_trans'] = 0
has_ct_bonds = test['len_bond_indices'] != 0
if has_ct_bonds.any():
    ct_flags = test.loc[has_ct_bonds].apply(
        lambda row: is_cis_trans(row.molecule_name,
                                 row.bond_indices,
                                 row.atom_index_0,
                                 row.atom_index_1),
        axis=1)
    # .values drops the 0/1 column labels so the assignment is positional.
    test.loc[has_ct_bonds, ['is_cis', 'is_trans']] = ct_flags.values

In [33]:
# Preview the first rows of test, now including the is_cis / is_trans columns.
test.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,bond_indices,len_bond_indices,is_cis,is_trans
0,4658147,dsgdb9nsd_000004,2,0,2JHC,[],0,0,0
1,4658148,dsgdb9nsd_000004,2,1,1JHC,[],0,0,0
2,4658149,dsgdb9nsd_000004,2,3,3JHH,[],0,0,0
3,4658150,dsgdb9nsd_000004,3,0,1JHC,[],0,0,0
4,4658151,dsgdb9nsd_000004,3,1,2JHC,[],0,0,0


In [34]:
# Persist both frames in the same format. train was previously pickled into a
# file named *.csv, which a CSV reader cannot load; write it as CSV like test.
# NOTE(review): both files are gzip-compressed despite the plain .csv name, so
# readers must pass compression='gzip'. The file names are kept unchanged so
# existing downstream paths still resolve.
train.to_csv('train_cis_trans.csv', compression='gzip')
test.to_csv('test_cis_trans.csv', compression='gzip')