# Part 0: Import Statements

In [2]:
# import stats + sklearn classifiers + regressors
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import sklearn
from sklearn import metrics
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import LinearSVR

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

#!pip3 install pydot
import pydot
from sklearn import *

# Part 1: Read and parse data

In [3]:
# Load the drug property table from CSV, keeping columns 1-15 and skipping
# column 0 (presumably a saved row index -- TODO confirm), then preview the
# first rows and report the table's dimensions.
X = pd.read_csv('drug_properties_final.csv', header=0, usecols=range(1, 16))
print(X.head(5))
nsamples = X.shape[0]  # number of rows (one per drug)
nfts = X.shape[1]      # number of feature columns
print(nsamples)
print(nfts)

     0    1    2      3    4           5    6    7     8    9     10  \
0  1.0  0.0  0.0  734.0  1.0  344.089603  7.0  3.0  25.0  0.0  344.3   
1  1.0  0.0  0.0  633.0  1.0  354.183109  4.0  1.0  26.0  0.0  354.4   
2  1.0  0.0  0.0  497.0  1.0  292.121178  3.0  1.0  22.0  0.0  292.3   
3  0.0  0.0  0.0  780.0  1.0  518.194068  8.0  6.0  38.0  0.0  518.6   
4  2.0  0.0  0.0  415.0  1.0  301.179027  3.0  3.0  22.0  0.0  301.4   

           11   12     13   14  
0  344.089603  2.0  121.0  2.1  
1  354.183109  8.0   71.4  4.4  
2  292.121178  1.0   52.9  2.4  
3  518.194068  5.0  156.0  6.9  
4  301.179027  2.0   68.4  2.3  
1784
15


In [4]:
# Names of the drugs under study: only column 1 of the CSV is read, giving a
# one-column DataFrame.
drugs = pd.read_csv('drug_name_list_final.csv', header=0, usecols=[1])

In [5]:
# Not every drug could be found in PubChem, so the set of drugs that have
# max-response and AUC data is larger than `drugs`. Load that bigger list
# (column 1 of the CSV only).
drugs_with_resp_auc_data = pd.read_csv('drugs.csv', usecols=[1])

In [6]:
# Row positions (0-based) in drugs_with_resp_auc_data whose drug name also
# appears in the PubChem-backed `drugs` list. These positions are later used to
# pull the matching rows out of the AUC / max-response tables.
# The lookup set is built once so each membership test is O(1) instead of
# rebuilding and scanning a list on every iteration.
# NOTE(review): this keeps the order of drugs_with_resp_auc_data; it assumes
# the rows of the feature table follow that same order -- TODO confirm.
pubchem_drugs = set(drugs.iloc[:, 0])
indexes = [
    position
    for position, drug in enumerate(drugs_with_resp_auc_data.iloc[:, 0])
    if drug in pubchem_drugs
]

In [7]:
# AUC table: read the CSV and keep columns 1-6 (column 0 is dropped).
auc_values = pd.read_csv('auc.csv').iloc[:, 1:7]
print(auc_values.head(5))

# Max-response table: same layout, same column selection.
maxresp_values = pd.read_csv('max_resp.csv').iloc[:, 1:7]
print(maxresp_values.head(5))

         0        1        2        3        4        5
0  428.792  460.119  277.164  443.183  408.479  408.348
1  466.491  494.689  477.969  580.252  445.541  490.083
2  413.997  261.922  438.649  360.282  417.510  322.348
3  473.658  514.225  391.226  472.357  455.033  483.530
4  499.085  454.862  284.317  498.955  490.830  436.067
         0        1       2        3       4        5
0   31.627   50.723  39.010   62.477  53.749   73.943
1  104.528  106.693  94.311  120.259  79.373  105.277
2   94.854   15.955  91.379   82.129  87.158   66.432
3   63.828   74.488  49.955  101.986  88.269   92.542
4    5.796    8.253  14.334   89.583  72.228   47.565


In [8]:
# Restrict the drug names and both response tables to the matched row
# positions collected in `indexes`.
final_drugs = drugs_with_resp_auc_data.iloc[:, 0].iloc[indexes]

final_auc = auc_values.iloc[indexes]
print(final_auc)

final_maxresp = maxresp_values.iloc[indexes]
print(final_maxresp)

            0        1        2        3        4        5
1     466.491  494.689  477.969  580.252  445.541  490.083
2     413.997  261.922  438.649  360.282  417.510  322.348
3     473.658  514.225  391.226  472.357  455.033  483.530
4     499.085  454.862  284.317  498.955  490.830  436.067
5     439.192  464.651  441.677  496.490  482.966  438.622
6     444.042  507.567  487.697  473.677  337.470  371.668
7     453.739  466.528  386.022  412.972  410.137  422.828
8     248.701  322.929  362.261  299.505  290.917  273.682
9     468.376  456.636  207.428  425.188  439.731  444.202
10    479.920  458.372  234.410  453.171  445.012  474.465
11    508.541  501.774  476.485  556.192  478.471  528.478
12    460.677  481.198  486.335  474.319  432.789  458.532
13    558.791  529.796  526.594  572.759  547.693  529.938
15    480.426  496.298  475.607  581.248  548.532  409.708
18    285.296  321.411  320.884  356.004  298.973  286.425
19    471.813  514.940  478.022  561.303  495.328  498.6

In [9]:
# Sanitize the feature matrix: every NaN, +inf, -inf and every value whose
# magnitude reaches the float32 limit is replaced by 0, so the result is a
# plain float ndarray containing only finite, float32-representable numbers.
# Handling -inf and oversized values explicitly matters because
# np.nan_to_num alone would turn infinities into huge finite numbers that
# still overflow float32.
# np.asarray accepts both the DataFrame loaded earlier and the ndarray this
# cell produces, so re-running the cell is safe.
X_values = np.asarray(X, dtype=np.float64)
float32_limit = np.finfo(np.float32).max
with np.errstate(invalid='ignore'):  # comparisons against NaN are expected here
    unusable = ~np.isfinite(X_values) | (np.abs(X_values) >= float32_limit)
X = np.where(unusable, 0.0, X_values)

  This is separate from the ipykernel package so we can avoid doing imports until


In [10]:
# Sanity check after cleaning: print, per column, how many entries are NaN,
# how many exceed 10000, and how many are infinite. Each printed vector is
# expected to be all zeros.
for flagged in (np.isnan(X), X > 10000, np.isinf(X)):
    print(sum(flagged))

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [11]:
# Replace NaNs in the AUC table with 0 (np.nan_to_num also maps infinities to
# large finite numbers) and print the per-column NaN count as a check.
# NOTE(review): a missing AUC becomes indistinguishable from a real 0 --
# confirm this is the intended treatment for a prediction target.
final_auc = np.nan_to_num(final_auc)
auc_nan_counts = sum(np.isnan(final_auc))
print(auc_nan_counts)

[0 0 0 0 0 0]


In [12]:
# Replace NaNs in the max-response table with 0 (np.nan_to_num also maps
# infinities to large finite numbers) and print the per-column NaN count.
# NOTE(review): a missing response becomes indistinguishable from a real 0 --
# confirm this is the intended treatment for a prediction target.
final_maxresp = np.nan_to_num(final_maxresp)
maxresp_nan_counts = sum(np.isnan(final_maxresp))
print(maxresp_nan_counts)

[0 0 0 0 0 0]


In [13]:
# Names of the drug features: column 1 of val_names.csv, kept as a Series.
fts = pd.read_csv('val_names.csv').iloc[:, 1]
print(fts)

0         atom_stereo_count
1         bond_stereo_count
2                    charge
3                complexity
4       covalent_unit_count
5                exact_mass
6     h_bond_acceptor_count
7        h_bond_donor_count
8          heavy_atom_count
9        isotope_atom_count
10         molecular_weight
11        monoisotopic_mass
12       rotable_bond_count
13                     tpsa
14                    xlogp
Name: 0, dtype: object


# Part 2: Exploration

In [16]:
# Quick look at the cleaned inputs: the first five feature rows, then the AUC
# array, the max-response array and the drug name table.
for preview in (X[:5], final_auc, final_maxresp, drugs):
    print(preview)

[[  1.           0.           0.         734.           1.
  344.08960285   7.           3.          25.           0.
  344.3        344.08960285   2.         121.           2.1       ]
 [  1.           0.           0.         633.           1.
  354.18310932   4.           1.          26.           0.
  354.4        354.18310932   8.          71.4          4.4       ]
 [  1.           0.           0.         497.           1.
  292.12117776   3.           1.          22.           0.
  292.3        292.12117776   1.          52.9          2.4       ]
 [  0.           0.           0.         780.           1.
  518.19406792   8.           6.          38.           0.
  518.6        518.19406792   5.         156.           6.9       ]
 [  2.           0.           0.         415.           1.
  301.17902699   3.           3.          22.           0.
  301.4        301.17902699   2.          68.4          2.3       ]]
[[466.491 494.689 477.969 580.252 445.541 490.083]
 [413.997 261.922 

In [None]:
# Scatter feature column 3 of X (labelled 'Complexity' on the x axis) against
# each of the six AUC columns, one colour per column.
# NOTE(review): `lines` supplies the legend labels but is not defined in any
# cell visible here -- presumably a list of six names; confirm it is
# defined before this cell runs.
series_colors = ['blue', 'red', 'orange', 'green', 'purple', 'pink']
for column, shade in enumerate(series_colors):
    plt.scatter(X[:,3], final_auc[:,column], color = shade, label = lines[column])
plt.legend()
plt.xlabel('Complexity')
plt.ylabel('AUC')