In [None]:
import gc
import torch
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
from mordred import Calculator, descriptors
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from computer_ontology.featurizer import get_morgan
from computer_ontology.custom_funcs import x_y_split
from sklearn.model_selection import RandomizedSearchCV
from computer_ontology.config import computer_dataset_path
from skmultilearn.model_selection import IterativeStratification
from sklearn.metrics import make_scorer, roc_auc_score, precision_score, f1_score, recall_score
from torchmetrics.classification import MultilabelF1Score, MultilabelAUROC, MultilabelPrecision, MultilabelRecall

In [None]:
def x_y_split(df):
  """
  Splits the incoming dataset into X and
  y for classification.

  X keeps the ``IsomericSMILES`` and ``CID`` columns. y is what is left of
  ``df`` after dropping ``IsomericSMILES``, ``Descriptors``, ``CID`` and,
  when it is present, ``Descriptor Count``. Both results are copies, so the
  input frame is not modified.

  Note: this definition shadows the ``x_y_split`` imported from
  ``computer_ontology.custom_funcs`` in the import cell.

  :param df: A molecular dataset for odor prediction
  :type df: pandas Dataframe
  :return: A tuple ``(x, y)``: the identifier columns and the label columns.
  :rtype: tuple of pandas dataframes
  :raises KeyError: if ``IsomericSMILES``, ``CID`` or ``Descriptors`` is
    missing from ``df``.
  """
  x = df[['IsomericSMILES', 'CID']].copy()

  # These three columns are mandatory: a missing one still raises KeyError.
  y = df.drop(columns=['IsomericSMILES', 'Descriptors', 'CID'])
  # 'Descriptor Count' is optional. Ignoring only that specific absence
  # replaces the former bare ``except:``, which swallowed every exception.
  y = y.drop(columns=['Descriptor Count'], errors='ignore').copy()

  return x, y

def get_morgan(df):
  """
  This function takes in a dataframe and returns
  a featurized dataframe with morgan fingerprints.

  Each SMILES string is parsed with ``Chem.MolFromSmiles`` and turned into
  a 2048-bit Morgan fingerprint (radius 4, ``useFeatures`` and
  ``useChirality`` enabled). The result has one row per input row, in the
  same positional order, one column per bit, and a fresh 0..n-1 index.

  Side effect: the columns ``molecule`` and ``MorganFP`` are added to
  ``df`` in place (kept for backward compatibility with existing callers).

  Note: this definition shadows the ``get_morgan`` imported from
  ``computer_ontology.featurizer`` in the import cell.

  NOTE(review): a SMILES string that RDKit cannot parse presumably yields
  a ``None`` molecule and makes the fingerprint call fail -- confirm and
  filter upstream if the dataset may contain such rows.

  :param df: A molecular dataset for odor prediction with SMILES strings
  :type df: pandas Dataframe
  :return: A featurized dataframe.
  :rtype: pandas dataframes
  """
  n_bits = 2048

  df['molecule'] = df['IsomericSMILES'].apply(lambda x: Chem.MolFromSmiles(x))
  df['MorganFP'] = df['molecule'].apply(lambda x: rdMolDescriptors.GetMorganFingerprintAsBitVect(x,radius=4,nBits=n_bits,useFeatures=True,useChirality=True))

  # Iterate over the values, not over index labels: ``df['MorganFP'][i]``
  # is a label lookup and breaks (or silently reorders rows) whenever the
  # frame's index is not exactly 0..n-1, e.g. after filtering or shuffling.
  bit_rows = [np.array(fp) for fp in df['MorganFP']]

  if not bit_rows:
    # Nothing to stack; return an empty frame with the expected columns
    # instead of failing on an empty concatenation.
    return pd.DataFrame(np.zeros((0, n_bits), dtype=int))

  # One stack instead of one single-row DataFrame per molecule.
  morganfp = pd.DataFrame(np.vstack(bit_rows))

  return morganfp

def iterative_train_test_split(X, y, test_size):
  """
  Train-test split based on second order
  iterative stratification.

  A two-fold ``IterativeStratification`` is built with the per-fold
  sample distribution ``[test_size, 1.0 - test_size]``; the first pair of
  index arrays it yields is used to slice ``X`` and ``y`` positionally.

  :param X: Feature dataframe for a multilabel machine learning task
  :type X: pandas Dataframe
  :param y: Label dataframe, row-aligned with ``X`` by position
  :type y: pandas Dataframe
  :param test_size: Share passed as the first entry of the fold distribution
  :type test_size: float
  :return: ``X_train, y_train, X_test, y_test`` (note the order)
  :rtype: pandas dataframes
  """
  fold_shares = [test_size, 1.0-test_size]
  splitter = IterativeStratification(n_splits=2, order=2, sample_distribution_per_fold=fold_shares)

  # Only the first (train, test) pair produced by the splitter is needed.
  train_rows, test_rows = next(splitter.split(X, y))

  return X.iloc[train_rows], y.iloc[train_rows], X.iloc[test_rows], y.iloc[test_rows]

In [None]:
# Load the dataset from the CSV path exported by computer_ontology.config.
dataset = pd.read_csv(computer_dataset_path)

In [None]:
# X holds the IsomericSMILES/CID columns, y the remaining label columns.
X, y = x_y_split(dataset)

In [None]:
# Featurize the SMILES strings; get_morgan also adds the 'molecule' and
# 'MorganFP' columns to X in place.
morgan = get_morgan(X)

In [None]:
# Iteratively stratified split with test_size=0.3; the helper returns
# (X_train, y_train, X_test, y_test), which matches this unpacking order.
train_x, train_y, test_x, test_y = iterative_train_test_split(morgan, y, 0.3)

In [None]:
# Baseline: a random forest fitted on the Morgan fingerprints, scored on the
# held-out split with macro- and micro-averaged multilabel metrics.
clf = RandomForestClassifier(random_state=0)
clf.fit(train_x, train_y)

# Hard 0/1 decisions feed the threshold-based metrics (F1, precision, recall).
y_hat = clf.predict(test_x)

# AUROC is a ranking metric: scoring it on hard 0/1 predictions collapses the
# ROC curve to a single operating point. Use the positive-class probability
# of every label instead. For multi-output targets predict_proba returns one
# (n_samples, n_classes) array per label, with columns ordered as in the
# matching entry of clf.classes_.
y_score = np.zeros((len(test_x), len(train_y.columns)), dtype=float)
for label_idx, (label_classes, label_proba) in enumerate(zip(clf.classes_, clf.predict_proba(test_x))):
  positive_col = np.flatnonzero(label_classes == 1)
  # A label never positive in the training split has no class-1 column;
  # its score stays at 0.
  if positive_col.size:
    y_score[:, label_idx] = label_proba[:, positive_col[0]]

# Build each tensor once instead of once per metric.
num_labels = len(train_y.columns)
hard_preds = torch.tensor(y_hat, dtype=torch.float)
soft_preds = torch.tensor(y_score, dtype=torch.float)
target = torch.tensor(test_y.values, dtype=torch.long)

f1score_macro = MultilabelF1Score(num_labels=num_labels, average="macro")(hard_preds, target)
auroc_macro = MultilabelAUROC(num_labels=num_labels, average="macro")(soft_preds, target)
precision_macro = MultilabelPrecision(num_labels=num_labels, average="macro")(hard_preds, target)
recall_macro = MultilabelRecall(num_labels=num_labels, average="macro")(hard_preds, target)

f1score_micro = MultilabelF1Score(num_labels=num_labels, average="micro")(hard_preds, target)
auroc_micro = MultilabelAUROC(num_labels=num_labels, average="micro")(soft_preds, target)
precision_micro = MultilabelPrecision(num_labels=num_labels, average="micro")(hard_preds, target)
recall_micro = MultilabelRecall(num_labels=num_labels, average="micro")(hard_preds, target)

print(f"F1 Score Macro: {f1score_macro}")
print(f"AUROC Macro: {auroc_macro}")
print(f"Precision Macro: {precision_macro}")
print(f"Recall Macro: {recall_macro}")
print(f"F1 Score Micro: {f1score_micro}")
print(f"AUROC Micro: {auroc_micro}")
print(f"Precision Micro: {precision_micro}")
print(f"Recall Micro: {recall_micro}")
print("=====================================")