# Janssen AMES Dataset Analysis


Import required packages

In [20]:
import os
import numpy as np
import pandas as pd
from matplotlib.pyplot import imshow
%matplotlib inline

Set up paths

In [2]:
# Directory, relative to the working directory, that holds the dataset files.
data_path = "datasets"

In [3]:
# Make sure the dataset directory exists. makedirs(..., exist_ok=True) is a
# no-op when the directory is already there and also creates missing parents,
# so no separate isdir() check is needed.
os.makedirs(data_path, exist_ok=True)
# TODO: load data from the google drive -- this cell does not fetch anything,
# the dataset files have to be placed in `data_path` by other means.

In [4]:
def load_npz(path, file_name):
    """Load every array stored in a NumPy ``.npz`` archive and print a summary.

    Args:
        path: Directory that contains the archive.
        file_name: Name of the ``.npz`` file inside ``path``.

    Returns:
        list: The arrays of the archive, in the order of the archive's
        ``files`` attribute.

    Raises:
        OSError: Propagated from ``np.load`` when the file does not exist or
            cannot be read.
    """
    # Honour the `path` argument instead of reading a module-level global.
    full_path = os.path.join(path, file_name)
    data_arrays = []
    # np.load raises on failure rather than returning None, so the archive
    # handle is always valid inside the `with` block.
    with np.load(full_path) as archive:
        print("{} contains the following files: {}".format(file_name, archive.files))
        for _name in archive.files:
            data = archive[_name]
            data_arrays.append(data)
            # ndarray.min()/max() raise on zero-size arrays; report None there.
            data_min = data.min() if data.size else None
            data_max = data.max() if data.size else None
            print(" - {} contains the data of the following shape: {}, type: {}, min: {}, max: {}".format(_name, data.shape, data.dtype, data_min, data_max))
    return data_arrays

In [43]:
# Load the three feature matrices. Each archive holds a single array
# ('arr_0'), so take element 0 of the list returned by load_npz.
ames_mrgne = load_npz(data_path, "ames_mrgne.npz")[0]
ames_mrgnf = load_npz(data_path, "ames_mrgnf.npz")[0]
# NOTE(review): the recorded output reports min/max = nan for this matrix,
# i.e. it appears to contain NaNs -- handle them before using it as features.
ames_rdkit2d = load_npz(data_path, "ames_rdkit2d.npz")[0]

# pd.read_csv accepts a path and either returns a DataFrame or raises, so no
# explicit open() or None check is required.
ames_data = pd.read_csv(os.path.join(data_path, "ames_data.csv"))
# The last column of the CSV is treated as the label column.
_label_name = ames_data.columns[-1]
print("ames_data.csv contains the following fields: {}".format(ames_data.columns))
print(" - {} column contains the following data: n records: {}, type: {}, min: {}, max: {}".format(
    _label_name, ames_data[_label_name].size, ames_data[_label_name].dtype, ames_data[_label_name].min(), ames_data[_label_name].max()))

# np.float was only an alias of the builtin float and has been removed from
# NumPy; np.float64 is the same dtype and works on old and new versions.
data = ames_mrgne.astype(np.float64)
labels = ames_data['ames'].astype(np.float64)

ames_mrgne.npz contains the following files: ['arr_0']
 - arr_0 contains the data of the following shape: (4324, 2048), type: int64, min: 0, max: 34
ames_mrgnf.npz contains the following files: ['arr_0']
 - arr_0 contains the data of the following shape: (4324, 4096), type: int64, min: 0, max: 1
ames_rdkit2d.npz contains the following files: ['arr_0']
 - arr_0 contains the data of the following shape: (4324, 200), type: float64, min: nan, max: nan
ames_data.csv contains the following fields: Index(['smiles', 'ames'], dtype='object')
 - ames column contains the following data: n records: 4324, type: int64, min: 0, max: 1


In [50]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [52]:
# Candidate values for LogisticRegression's C: powers of ten from 1e-4 to 1e3.
c_exponents = range(-4, 4)
parameters = {'C': [10**exponent for exponent in c_exponents]}

# Search the grid with cv=5 using a default-configured logistic regression.
base_estimator = LogisticRegression()
clf = GridSearchCV(base_estimator, parameters, cv=5)

# Keep fit() as the last expression so the fitted search object is displayed.
clf.fit(data, labels)





GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [53]:
# Display the grid search's per-candidate cross-validation results
# (fit/score timings, the parameter settings and per-split test scores).
clf.cv_results_

{'mean_fit_time': array([0.06727333, 0.0415894 , 0.04504728, 0.09632921, 0.15182891,
        0.1865149 , 0.23999629, 0.20121002]),
 'std_fit_time': array([0.03849794, 0.00249738, 0.00243122, 0.038865  , 0.04762564,
        0.05878523, 0.09084995, 0.02805021]),
 'mean_score_time': array([0.00270314, 0.00258355, 0.00222697, 0.00372362, 0.00378346,
        0.00220566, 0.00407386, 0.00212345]),
 'std_score_time': array([7.63662523e-04, 5.36906071e-04, 1.47260918e-04, 1.36029753e-03,
        1.41127629e-03, 1.55486018e-04, 3.06691387e-03, 9.12192849e-05]),
 'param_C': masked_array(data=[0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 0.0001},
  {'C': 0.001},
  {'C': 0.01},
  {'C': 0.1},
  {'C': 1},
  {'C': 10},
  {'C': 100},
  {'C': 1000}],
 'split0_test_score': array([0.65011547, 0.72632794, 0.80138568, 0.81639723, 0.79792148,
        0.77829099, 0