# Dataset Download

In [1]:
import polaris as po
import datamol as dm
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the "polaris/pkis1-kit-wt-mut-c-1" benchmark through Polaris and
# retrieve the train/test split that the benchmark itself provides.
benchmark = po.load_benchmark("polaris/pkis1-kit-wt-mut-c-1")
train, test = benchmark.get_train_test_split()

[32m2024-06-20 12:55:23.826[0m | [1mINFO    [0m | [36mpolaris._artifact[0m:[36m_validate_version[0m:[36m66[0m - [1mThe version of Polaris that was used to create the artifact (0.0.0) is different from the currently installed version of Polaris (dev).[0m
[32m2024-06-20 12:55:23.840[0m | [1mINFO    [0m | [36mpolaris._artifact[0m:[36m_validate_version[0m:[36m66[0m - [1mThe version of Polaris that was used to create the artifact (0.0.0) is different from the currently installed version of Polaris (dev).[0m


# Dataset Analysis

In [3]:
# Peek at the first five training inputs (SMILES strings, per the output below)
train.X[:5]

array(['O=C(Nc1n[nH]c2cc(-c3ccc(F)cc3)ccc12)C1CC1',
       'CCn1c(-c2nonc2N)nc2c(C#CC(C)(C)O)ncc(OC3CCNCC3)c21',
       'CN(C)c1cc2c(Nc3ccc4c(cnn4Cc4ccccc4)c3)ncnc2cn1',
       'NS(=O)(=O)c1cccc(-c2ccc3c(NC(=O)C4CC4)n[nH]c3c2)c1',
       'Cc1nn(C)c2cc(N(C)c3ccnc(Nc4cccc(S(N)(=O)=O)c4)n3)ccc12'],
      dtype='<U77')

In [4]:
# The training labels support `.keys()`; the output below shows one key per target.
ys = train.y
ys.keys()

dict_keys(['CLASS_KIT_(T6701_mutant)', 'CLASS_KIT_(V560G_mutant)', 'CLASS_KIT'])

In [5]:
# Stack the per-target label arrays column-wise into a single 2-D array whose
# columns follow the order of `benchmark.target_cols`.
# The labels are read from `train.y` rather than from `ys` itself so that this
# cell is idempotent: indexing `ys` and then rebinding `ys` to the stacked array
# would make a second execution index an ndarray with target names and fail.
ys = np.stack([train.y[target] for target in benchmark.target_cols], axis=1)
ys.shape

(277, 3)

In [6]:
# Find the position of the first row that has a NaN in any target column
rows_with_nan = np.isnan(ys).any(axis=1)
nan_idx = np.flatnonzero(rows_with_nan)[0]
nan_idx

112

In [7]:
# Inspect the datapoint that contains the NaN label
train[nan_idx]

('CC(C)(C)c1ccc(Oc2nccc(-c3c(-c4ccc(F)cc4)ncn3C3CCNCC3)n2)cc1',
 {'CLASS_KIT_(T6701_mutant)': 0.0,
  'CLASS_KIT_(V560G_mutant)': 0.0,
  'CLASS_KIT': nan})

This training point has valid labels for two of the three targets, so we could still use it for those two. However, keeping it is not that important, because for both of those targets it belongs to the highly represented class. Had it been part of the minority class, we would have wanted to keep it.

In [8]:
# Boolean row mask: True for rows where no target label is NaN
mask = np.logical_not(np.isnan(ys).any(axis=1))

# Class imbalance

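As we saw above, the targets are returned to us as a dictionary keyed by target name, which we have stacked into a single array. Let's train a multi-task model on this data! We first preprocess the data into a tabular format that we can use with scikit-learn, keeping only the datapoints whose labels are all present.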

In [9]:
# Tabulate the rows selected by `mask`: column 0 holds the inputs and
# tg1/tg2/tg3 hold the first, second and third label columns of `ys`.
df = pd.DataFrame(train.X[mask]).assign(
    tg1=ys[mask, 0],
    tg2=ys[mask, 1],
    tg3=ys[mask, 2],
)

In [10]:
# Show the first rows of the assembled table as a sanity check
df.head()

Unnamed: 0,0,tg1,tg2,tg3
0,O=C(Nc1n[nH]c2cc(-c3ccc(F)cc3)ccc12)C1CC1,0.0,0.0,0.0
1,CCn1c(-c2nonc2N)nc2c(C#CC(C)(C)O)ncc(OC3CCNCC3...,0.0,0.0,0.0
2,CN(C)c1cc2c(Nc3ccc4c(cnn4Cc4ccccc4)c3)ncnc2cn1,0.0,0.0,0.0
3,NS(=O)(=O)c1cccc(-c2ccc3c(NC(=O)C4CC4)n[nH]c3c...,0.0,0.0,0.0
4,Cc1nn(C)c2cc(N(C)c3ccnc(Nc4cccc(S(N)(=O)=O)c4)...,0.0,0.0,1.0
