In [7]:
import pandas as pd
import numpy as np
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import label_ranking_loss, multilabel_confusion_matrix
import pickle

### Load data and classifier

In [8]:
# Input locations, all relative to the notebook's working directory.
# CSV of fingerprint indicator columns (MACCS, per the file name), one row per compound.
FINGERPRINTS_PATH = "./embeddings/tms_maccs_fingerprint.csv"
# CSV of spec2vec embedding vectors (per the file name).
SPEC2VEC_PATH = "./embeddings/tms_spec2vec_embeddings.csv"
# Pickled classifier evaluated below (one-vs-rest, per the file name).
CLASSIFIER_PATH = "./models/tms/tms_one_vs_rest_classifier.pkl"

In [9]:
### Load and parse data
# Read the fingerprint table, normalise the identifier column names, drop the
# descriptive "name"/"inchi" columns when they exist, and index rows by inchikey.
fingerprints = (
    pd.read_csv(FINGERPRINTS_PATH)
    .rename(columns={"InChIKey": "inchikey", "Name": "name", "InChI": "inchi"})
    .drop(columns=["name", "inchi"], errors="ignore")
    .set_index("inchikey")
)
# Count missing values BEFORE the boolean cast: astype(bool) turns NaN into True,
# so an isna() check made afterwards always reports 0 and missing fingerprint
# bits would silently be counted as set indicators.
fingerprint_nan_count = fingerprints.isna().sum().sum()
fingerprints = fingerprints.astype(bool)
print(fingerprints.shape)
fingerprints.head()
# Simple indicator analysis
# Number of True indicators per row (per inchikey), computed once and reused.
true_counts = fingerprints.sum(axis=1)
print("Mean true indicators: ", true_counts.mean())
print("Std true indicators: ", true_counts.std())
print("Min true indicators: ", true_counts.min())
print("Max true indicators: ", true_counts.max())
# Weight of the True class = average fraction of False indicators per row
# (and vice versa), so the two weights sum to 1.
true_class_weight = 1 - true_counts.mean() / fingerprints.shape[1]
false_class_weight = 1 - true_class_weight
true_class_weight, false_class_weight
# Validate fingerprint
# Reports the NaN count of the raw table as read from disk (see note above).
print("Nan values: ", fingerprint_nan_count)
# Load the spec2vec table in one chain: normalise the identifier column names,
# discard the "name" column if the file has one, index rows by inchikey and
# cast every remaining column to float.
spec2vec = (
    pd.read_csv(SPEC2VEC_PATH)
    .rename(columns={"InChI Key": "inchikey", "Name": "name"})
    .drop(columns=["name"], errors="ignore")
    .set_index("inchikey")
    .astype(float)
)
print(spec2vec.shape)
spec2vec.head()
# Validate embeddings
# Total number of missing values across the whole embedding table.
print("Nan values: ", spec2vec.isna().sum().sum())
# In both frames, replace non-breaking spaces (\xa0) in the index with regular
# spaces and strip leading/trailing whitespace, so the keys can be matched.
for frame in (spec2vec, fingerprints):
    frame.index = frame.index.str.replace("\xa0", " ").str.strip()

fingerprint_keys = set(fingerprints.index)
spec2vec_keys = set(spec2vec.index)
# Inchikeys that have a fingerprint but no spec2vec embedding
fingerprint_keys - spec2vec_keys
# Inchikeys that have a spec2vec embedding but no fingerprint
spec2vec_keys - fingerprint_keys

# Inner-join the two tables on their index. The "_x"/"_y" suffixes tag each
# column with its origin so features and labels can be separated again below.
embedding_part = spec2vec.add_suffix("_x")
fingerprint_part = fingerprints.add_suffix("_y")
merged = pd.merge(embedding_part, fingerprint_part, how="inner", left_index=True, right_index=True)
print(merged.shape)
merged.head()

# X: the embedding columns (suffix _x); y: the fingerprint columns (suffix _y) as integers.
feature_columns = merged.filter(regex="_x$")
label_columns = merged.filter(regex="_y$")
X = feature_columns.to_numpy()
y = label_columns.to_numpy().astype(int)
X.shape, y.shape

(105, 192)
Mean true indicators:  36.23809523809524
Std true indicators:  17.869464613349937
Min true indicators:  21
Max true indicators:  192
Nan values:  0
(3144, 300)
Nan values:  0
(3082, 492)


((3082, 300), (3082, 192))

In [10]:
# Load the pickled classifier. The context manager closes the file handle as
# soon as loading finishes instead of leaving it open.
# NOTE: pickle.load can execute arbitrary code while deserialising; only load
# model files that come from a trusted source.
with open(CLASSIFIER_PATH, "rb") as classifier_file:
    classifier = pickle.load(classifier_file)

### Evaluations

In [11]:
# Predict the label matrix for every row of X.
# NOTE(review): if the classifier was fitted on these same rows, the numbers
# below are in-sample scores -- confirm against the training notebook.
y_pred = classifier.predict(X)
# One 2x2 confusion matrix per label column; per the scikit-learn docs each is
# laid out as [[TN, FP], [FN, TP]].
mcm = multilabel_confusion_matrix(y, y_pred)

In [18]:
# count unique values in y
unique, counts = np.unique(y, return_counts=True)
print(f"Positive ratio {counts[1] / counts.sum()}, Negative ratio {counts[0] / counts.sum()}")

Positive ratio 0.18844128542072247, Negative ratio 0.8115587145792775


In [19]:
# Labels of the first column of y only (all rows, column 0), used to inspect
# the class balance of a single label.
first_column_y = y[:, 0]

In [22]:
# Count how often each distinct value occurs in the first label column.
unique, counts = np.unique(first_column_y, return_counts=True)
# Look the counts up by label value instead of by position: a single column can
# easily contain only one class, in which case np.unique returns one entry and
# counts[1] raises IndexError.
count_by_value = dict(zip(unique.tolist(), counts.tolist()))
total = counts.sum()
print(f"Positive ratio {count_by_value.get(1, 0) / total}, Negative ratio {count_by_value.get(0, 0) / total}")

Positive ratio 0.00973393900064893, Negative ratio 0.9902660609993511


In [23]:
# Pool the per-label 2x2 matrices into a single one by summing over the label
# axis. Positions used here: [0, 0]=TN, [0, 1]=FP, [1, 0]=FN, [1, 1]=TP.
pooled = mcm.sum(axis=0)
TN_sum, FP_sum = pooled[0]
FN_sum, TP_sum = pooled[1]

# Layout of cm puts the positives first: [[TP, FP], [FN, TN]]
# (this differs from the [[TN, FP], [FN, TP]] layout of the per-label matrices).
cm = np.array([[TP_sum, FP_sum], [FN_sum, TN_sum]])
cm

array([[105412,    776],
       [  6097, 479459]], dtype=int64)