In [None]:
import pandas as pd
import numpy as np

from rdkit import Chem
# from rdkit.Chem import Draw
# from rdkit.Chem import PandasTools
# from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import AllChem
from rdkit.Chem import rdFingerprintGenerator
from rdkit import DataStructs
from rdkit.Chem import MACCSkeys
from rdkit.Chem.Fingerprints import FingerprintMols

# MakeGenerator is required by the RDKit2D descriptor section below.
from descriptastorus.descriptors.DescriptorGenerator import MakeGenerator
#https://github.com/bp-kelley/descriptastorus

from mordred import Calculator, descriptors

In [None]:
# Read the train and test splits; the CASRN column becomes each frame's row index.
train, test = (
    pd.read_csv(csv_path, index_col='CASRN')
    for csv_path in (
        '../data/train_test_sets/train.csv',
        '../data/train_test_sets/test.csv',
    )
)

In [None]:
# Parse every SMILES string into an RDKit molecule and store it in a 'ROMol'
# column, which the fingerprint and descriptor cells below read.
# Chem.MolFromSmiles is called directly: the PandasTools import at the top of
# the notebook is commented out, so PandasTools is not available here.
for _split_name, _split_frame in (('train', train), ('test', test)):
    _split_frame['ROMol'] = _split_frame['SMILES'].map(Chem.MolFromSmiles)
    # MolFromSmiles returns None for SMILES it cannot parse; fail here with the
    # offending identifiers instead of deep inside a later fingerprint call.
    _unparsed = _split_frame.index[_split_frame['ROMol'].isna()].tolist()
    if _unparsed:
        raise ValueError(
            f"{len(_unparsed)} SMILES in {_split_name} could not be parsed: {_unparsed[:10]}"
        )

## ECFP6 Bits

In [None]:
# Morgan bit-vector fingerprints (radius 3), one per molecule in 'ROMol'.
# NOTE(review): the bit length is left at the library default, while the column
# names built later in this section assume 2048 bits — confirm they agree.
train_ECFP6 = [
    AllChem.GetMorganFingerprintAsBitVect(mol, 3)
    for mol in train['ROMol']
]
test_ECFP6 = [
    AllChem.GetMorganFingerprintAsBitVect(mol, 3)
    for mol in test['ROMol']
]

In [None]:
# Expand each fingerprint object into a plain Python list of its elements.
train_ecfp6_lists = list(map(list, train_ECFP6))
test_ecfp6_lists = list(map(list, test_ECFP6))

In [None]:
# Column labels Bit_0 ... Bit_2047, one per fingerprint position.
ecfp6_name = [f'Bit_{bit}' for bit in range(2048)]

In [None]:
# One row per compound (same index as the input frames), one column per bit.
train_ecfp6_df = pd.DataFrame(
    data=train_ecfp6_lists,
    index=train.index,
    columns=ecfp6_name,
)
test_ecfp6_df = pd.DataFrame(
    data=test_ecfp6_lists,
    index=test.index,
    columns=ecfp6_name,
)

In [None]:
# Display (rows, columns) for both ECFP6 bit tables.
(train_ecfp6_df.shape, test_ecfp6_df.shape)

In [None]:
# Persist the ECFP6 bit tables (index included) as CSV files.
train_ecfp6_df.to_csv(path_or_buf='../data/Descriptors/train_ecfp6_bits.csv')
test_ecfp6_df.to_csv(path_or_buf='../data/Descriptors/test_ecfp6_bits.csv')

## ECFP6 counts

In [None]:
# Hashed Morgan count fingerprints (radius 3), one per molecule in 'ROMol'.
# NOTE(review): the vector length is left at the library default, while the
# column names reused for these tables assume 2048 entries — confirm they agree.
train_ECFP6_counts = [
    AllChem.GetHashedMorganFingerprint(mol, 3)
    for mol in train['ROMol']
]
test_ECFP6_counts = [
    AllChem.GetHashedMorganFingerprint(mol, 3)
    for mol in test['ROMol']
]

In [None]:
# Expand each count fingerprint into a plain Python list of its elements.
# (The double underscore in the test variable name is kept as-is because the
# DataFrame cell that follows reads that exact name.)
train_ecfp6_counts_lists = list(map(list, train_ECFP6_counts))
test_ecfp6__counts_lists = list(map(list, test_ECFP6_counts))

In [None]:
# One row per compound, one column per hashed position; reuses the Bit_<i> labels.
train_ecfp6_counts_df = pd.DataFrame(
    data=train_ecfp6_counts_lists,
    index=train.index,
    columns=ecfp6_name,
)
test_ecfp6_counts_df = pd.DataFrame(
    data=test_ecfp6__counts_lists,
    index=test.index,
    columns=ecfp6_name,
)

In [None]:
# Display (rows, columns) for both ECFP6 count tables.
(train_ecfp6_counts_df.shape, test_ecfp6_counts_df.shape)

In [None]:
# Persist the ECFP6 count tables (index included) as CSV files.
train_ecfp6_counts_df.to_csv(path_or_buf='../data/Descriptors/train_ecfp6_counts.csv')
test_ecfp6_counts_df.to_csv(path_or_buf='../data/Descriptors/test_ecfp6_counts.csv')

## MACCS keys

In [None]:
# MACCS key fingerprints, one per molecule in 'ROMol'.
train_maccs = [
    MACCSkeys.GenMACCSKeys(mol)
    for mol in train['ROMol']
]
test_maccs = [
    MACCSkeys.GenMACCSKeys(mol)
    for mol in test['ROMol']
]

In [None]:
# Expand each MACCS fingerprint into a plain Python list of its elements.
train_maccs_lists = list(map(list, train_maccs))
test_maccs_lists = list(map(list, test_maccs))

In [None]:
# Column labels Bit_0 ... Bit_166 for the MACCS table.
maccs_name = [f'Bit_{bit}' for bit in range(167)]

In [None]:
# One row per compound (same index as the input frames), one column per MACCS bit.
train_maccs_df = pd.DataFrame(
    data=train_maccs_lists,
    index=train.index,
    columns=maccs_name,
)
test_maccs_df = pd.DataFrame(
    data=test_maccs_lists,
    index=test.index,
    columns=maccs_name,
)

In [None]:
# Display (rows, columns) for both MACCS tables.
(train_maccs_df.shape, test_maccs_df.shape)

In [None]:
# Persist the MACCS tables (index included) as CSV files.
train_maccs_df.to_csv(path_or_buf='../data/Descriptors/train_maccs.csv')
test_maccs_df.to_csv(path_or_buf='../data/Descriptors/test_maccs.csv')

## RDKit 2D descriptors

In [None]:
# RDKit2D descriptors, computed from the SMILES strings by the generator.
generator = MakeGenerator(("RDKit2D",))

# Each row drops element 0 of the generator output, so a row is one entry
# shorter than the generator's column list (the column-name cell that follows
# skips the first generator column in the same way).
_rdkit2d_width = len(generator.GetColumns()) - 1


def _rdkit2d_row(smiles):
    """Return the RDKit2D values for one SMILES with the leading element removed.

    If ``generator.process`` returns None for this SMILES, a row of NaN of the
    same width is returned instead, so the resulting table stays aligned with
    the input index rather than failing on ``None[1:]``.
    """
    values = generator.process(smiles)
    if values is None:
        return [np.nan] * _rdkit2d_width
    return values[1:]


train_rdkit2d = [_rdkit2d_row(smiles) for smiles in train['SMILES']]
test_rdkit2d = [_rdkit2d_row(smiles) for smiles in test['SMILES']]

In [None]:
# Collect the column names reported by the generator, discarding the paired type.
rdkit2d_name = [column_name for column_name, _ in generator.GetColumns()]

In [None]:
# The first generator column is skipped here, matching the rows, which had
# their first element dropped when they were computed.
train_rdkit2d_df = pd.DataFrame(
    data=train_rdkit2d,
    index=train.index,
    columns=rdkit2d_name[1:],
)
test_rdkit2d_df = pd.DataFrame(
    data=test_rdkit2d,
    index=test.index,
    columns=rdkit2d_name[1:],
)

In [None]:
# Display (rows, columns) for both RDKit2D tables.
(train_rdkit2d_df.shape, test_rdkit2d_df.shape)

In [None]:
# Persist the RDKit2D tables (index included) as CSV files.
train_rdkit2d_df.to_csv(path_or_buf='../data/Descriptors/train_rdkit2d.csv')
test_rdkit2d_df.to_csv(path_or_buf='../data/Descriptors/test_rdkit2d.csv')

## Mordred descriptors

In [None]:
# Mordred descriptors; 3D descriptors are skipped because they can't be
# computed without an sdf or mol file.
mordred_calc = Calculator(descriptors, ignore_3D=True)

train_mordred = mordred_calc.pandas(list(train['ROMol']))
test_mordred = mordred_calc.pandas(list(test['ROMol']))

In [None]:
# Display (rows, columns) of the Mordred tables before any filtering.
(train_mordred.shape, test_mordred.shape)

In [None]:
# Remove non-numerical features.
# A column can have a numeric dtype in train but not in test, so picking the
# columns from train alone would let non-numeric test columns through to the
# saved CSV. Keep only the columns that are numeric in BOTH splits, in train's
# column order, so the two tables end up with identical, numeric-only columns.
_numeric_dtypes = ['float64', 'int64', 'float']
_train_numeric = train_mordred.select_dtypes(include=_numeric_dtypes)
_test_numeric = test_mordred.select_dtypes(include=_numeric_dtypes)
_test_numeric_cols = set(_test_numeric.columns)
_shared_numeric = [col for col in _train_numeric.columns if col in _test_numeric_cols]
train_mordred = _train_numeric[_shared_numeric]
test_mordred = _test_numeric[_shared_numeric]
train_mordred.shape, test_mordred.shape

In [None]:
# Peek at the first row of the filtered test table.
test_mordred.iloc[:1]

In [None]:
# True when both tables carry the same columns in the same order.
list(train_mordred.columns) == list(test_mordred.columns)

In [None]:
# Label the Mordred rows with the index of the frames they were computed from.
# NOTE(review): assumes the calculator returns rows in input order — confirm.
train_mordred.index, test_mordred.index = train.index, test.index

In [None]:
# Display (rows, columns) of the Mordred tables that are about to be saved.
(train_mordred.shape, test_mordred.shape)

In [None]:
# Persist the Mordred tables (index included) as CSV files.
train_mordred.to_csv(path_or_buf='../data/Descriptors/train_mordred.csv')
test_mordred.to_csv(path_or_buf='../data/Descriptors/test_mordred.csv')