In [1]:
import re

import pandas as pd
# Turn on the pandas "mode.copy_on_write" option for the rest of the session.
pd.set_option("mode.copy_on_write", True)
import numpy as np

In [2]:
# Load the raw antibiotics table; fields in this file are separated by ';'.
df = pd.read_csv('chembl_antibiotics.csv', sep=';')

In [3]:
# Print the frame summary (column names, non-null counts, dtypes) to see
# which columns contain missing values before cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226 entries, 0 to 225
Data columns (total 36 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   ChEMBL ID                        226 non-null    object 
 1   Name                             226 non-null    object 
 2   Synonyms                         226 non-null    object 
 3   Type                             226 non-null    object 
 4   Max Phase                        226 non-null    float64
 5   Molecular Weight                 222 non-null    float64
 6   Targets                          206 non-null    float64
 7   Bioactivities                    206 non-null    float64
 8   AlogP                            210 non-null    float64
 9   Polar Surface Area               210 non-null    float64
 10  HBA                              210 non-null    float64
 11  HBD                              210 non-null    float64
 12  #RO5 Violations       

In [4]:
# Step 1: keep only rows that have both features of interest, and discard
# the two pKa columns, which have too many missing entries to be usable.
df = (
    df
    .dropna(subset=['Targets', 'Bioactivities'])
    .drop(columns=['CX Acidic pKa', 'CX Basic pKa'])
)

# Step 2: remove rows of unknown type, then any row that still has a null.
is_known_type = df['Type'] != 'Unknown'
df = df.loc[is_known_type].dropna()

# Step 3: the literal string 'None' is a placeholder for a missing value;
# blank those cells out and drop the rows that contained them.
is_none_placeholder = df.eq('None')
df = df.mask(is_none_placeholder).dropna()

# Step 4: discard columns that are constant across the remaining rows,
# followed by the alternative identifier columns.
df = (
    df
    .drop(columns=['Type', 'Structure Type', 'Inorganic Flag', 'Orphan'])
    .drop(columns=['ChEMBL ID', 'Synonyms'])
)

In [5]:
# Parse chemical compound and determine the number of a given constituent element
def count_element(formula, element):
    """Return how many atoms of ``element`` appear in a molecular formula.

    The formula is split into ``(symbol, count)`` tokens, where a symbol is an
    uppercase letter optionally followed by one lowercase letter, and the
    count is the run of digits directly after it (no digits means 1).

    Only tokens whose symbol equals ``element`` exactly are counted, so asking
    for ``'C'`` never matches the ``C`` of ``'Cl'`` and asking for ``'N'``
    never matches the ``N`` of ``'Na'``. Counts are summed over every
    occurrence, so multi-fragment formulas such as ``'C22H24N2O8.ClH'``
    report the total for the whole formula.

    Parameters
    ----------
    formula : str
        Molecular formula, e.g. ``'C16H18N2O4S'``.
    element : str
        Element symbol to count, e.g. ``'C'`` or ``'Cl'``.

    Returns
    -------
    int
        Total number of atoms of ``element``; 0 if the element is absent.
    """
    total = 0
    # With two groups in the pattern, findall yields (symbol, digits) tuples.
    for symbol, digits in re.findall(r'([A-Z][a-z]?)(\d*)', formula):
        if symbol == element:
            # An element written without a number occurs exactly once.
            total += int(digits) if digits else 1
    return total

In [6]:
# Create columns to count constituent elements
formulas = df['Molecular Formula']

# Collect every element symbol (an uppercase letter plus an optional lowercase
# letter) seen in any formula. dict.fromkeys removes duplicates while keeping
# first-seen order, so the new columns appear in a stable order.
all_atomic_constituents = list(dict.fromkeys(
    symbol
    for formula in formulas
    for symbol in re.findall('[A-Z][a-z]?', formula)
))

# Build each count column with a single whole-column assignment. Writing one
# cell at a time with `df.loc[i, element] = ...` is slow and first creates the
# column filled with NaN, which turns the integer counts into floats.
for element in all_atomic_constituents:
    df[element] = [count_element(formula, element) for formula in formulas]

In [7]:
# With the per-element count columns in place, the monoisotopic molecular
# weight is a linear combination of them, so it is removed as redundant.
df = df.drop(columns=['Molecular Weight (Monoisotopic)'])

In [8]:
# Additional feature engineering: append base-10 logarithms of the two
# count-like features as new columns.
df = df.assign(**{
    'Log Targets': np.log10(df['Targets']),
    'Log Bioactivities': np.log10(df['Bioactivities']),
})

In [9]:
# Write the cleaned frame to disk; index=False leaves the row index out of
# the CSV file.
df.to_csv('chembl_antibiotics_cleaned.csv', index=False)