In [148]:
import pandas as pd
import numpy as np
from collections import Counter
from pprint import pprint
from sklearn.model_selection import train_test_split

In [53]:
dili_df = pd.read_csv("D:\Datasets\CAMDA-DILI\DILI.csv", delimiter=',')
dili_df = dili_df[~pd.isna(dili_df['MOL_FORMULA'])]

In [54]:
dili_df.columns

Index(['LTKB_ID', 'COMPOUND_NAME', 'MOL_FORMULA', 'SIMILARITY', 'CASN',
       'DRUGBANK_ID', 'ATC_CODE', 'VERIFIED_DILI_CONCERN', 'DILI_CONCERN',
       'LABEL_SECTION', 'SEVERITY_CLASS', 'DILI_DESCRIPTION',
       'GREENE_ANNOTATION', 'XU_ANNOTATION', 'OBRIEN_ANNOTATION',
       'SUZUKI_ANNOTATION', 'GUO_ANNOTATION', 'SAKATIS_ANNOTATION',
       'VER_DISP'],
      dtype='object')

## Check for formulas duplicates

In [55]:
print(f"All formulas count: {len(dili_df)}")
print(f"Unique formulas count: {len(set(dili_df['MOL_FORMULA']))}")

All formulas count: 380
Unique formulas count: 370


In [56]:
duplicates = [(item, count) for item, count in Counter(dili_df['MOL_FORMULA']).most_common(20) if count > 1]
pprint(duplicates)

[('C16 H19 Cl N2', 2),
 ('C7 H15 Cl2 N2 O2 P', 2),
 ('C22 H24 N2 O8', 2),
 ('C18 H21 Cl N2', 2),
 ('C27 H29 N O11', 2),
 ('C26 H28 Cl N O', 2),
 ('C20 H28 O2', 2),
 ('C11 H9 I3 N2 O4', 2),
 ('C7 H7 N O3', 2),
 ('C12 H16 O2', 2)]


We've found a few formula duplicates. That is a bit weird, but is there any difference in DILI concern?

In [57]:
errors = []
for formula, _ in duplicates:
    results = dili_df[dili_df['MOL_FORMULA']==formula]
    if results['DILI_CONCERN'].values[0]!=results['DILI_CONCERN'].values[1]:
        print(formula)
        print(f"{results['COMPOUND_NAME'].values[0]}: {results['DILI_CONCERN'].values[0]}")
        print(f"{results['COMPOUND_NAME'].values[1]}: {results['DILI_CONCERN'].values[1]}")
        errors.append(formula)

C18 H21 Cl N2
chlorcyclizine: No-DILI-Concern
clomacran: Most-DILI-Concern
C20 H28 O2
isotretinoin: Most-DILI-Concern
tretinoin: Less-DILI-Concern
C7 H7 N O3
mesalazine: Less-DILI-Concern
aminosalicylic acid: Most-DILI-Concern
C12 H16 O2
sacrosidase: Less-DILI-Concern
ibufenac: Most-DILI-Concern


This is quite worrying, but let's just drop the data and pretend we didn't see that.

In [58]:
for formula in errors:
    dili_df = dili_df[dili_df['MOL_FORMULA']!=formula]
print(f"All formulas count: {len(dili_df)}")
print(f"Unique formulas count: {len(set(dili_df['MOL_FORMULA']))}")

All formulas count: 372
Unique formulas count: 366


## Check for number of different tokens in the formulas

In [99]:
flattened = dili_df['MOL_FORMULA'].values.flatten()
joined = ' '.join(flattened)
joined = joined.replace('.', '')
for i in range(10):
    joined = joined.replace(str(i), '')
    
counted = Counter(joined.split(' '))

In [100]:
counted

Counter({'C': 374,
         'H': 381,
         'N': 315,
         'S': 86,
         'O': 340,
         'Cl': 61,
         'P': 5,
         'F': 35,
         '': 41,
         'I': 4,
         'Pt': 2,
         'Na': 6,
         'Br': 6,
         'Al': 1,
         'Cu': 1,
         'La': 1,
         'Gd': 1,
         'Fe': 2,
         'Ca': 1,
         'Si': 1})

In [101]:
tokens = []
for token, count in counted.most_common():
    if count>10 and token!='':
        tokens.append(token)
print(tokens)

['H', 'C', 'O', 'N', 'S', 'Cl', 'F']


## A naive formula-based classifier

### Preprocessing

In [119]:
import re
def parse_formula(formula):
    res = []
    for token in tokens:
        match = re.search(f"({token}\d+)", formula)
        try:
            count = match.group(1)[len(token):]
            res.append(int(count))
        except AttributeError:
            if token in formula:
                res.append(1)
            else:
                res.append(0)
    return res

parse_formula("C27 H29 N O11")

[29, 27, 11, 1, 0, 0, 0]

In [131]:
X = []
for formula in dili_df['MOL_FORMULA'].values:
    X.append(parse_formula(formula))
X = np.array(X)

In [270]:
y = []
for concern in dili_df['VERIFIED_DILI_CONCERN'].values:
    if concern=='vMost-DILI-Concern':
        y.append(1)
    else:
        y.append(0)
y = np.array(y)

In [161]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [162]:
print(len(X_train), len(X_test))

260 112


### Modelling

In [204]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import confusion_matrix, roc_curve, auc

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [269]:
clf = RandomForestClassifier(n_estimators=10, max_depth=7)
clf.fit(X_train, y_train)

y_pred = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
print(auc(fpr, tpr))

y_pred = clf.predict_proba(X_train)[:, 1]
fpr, tpr, thresholds = roc_curve(y_train, y_pred)
print(auc(fpr, tpr))

0.6607211354046797
0.9576212203232543
