In [32]:
import zipfile
import pandas as pd
import numpy as np
import glob
import scipy.sparse
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from tempfile import TemporaryFile
%matplotlib inline

In [33]:
# Paths of all per-part descriptor CSV files of AID 2289.
# NOTE(review): glob.glob yields matches in arbitrary, OS-dependent order, while a later
# cell picks an entry of this list by position — confirm the order is the expected one.
molecule_descriptor_files = [
    csv_path
    for csv_path in glob.glob('./Dataset/AID_2289_descriptors_part[0-9]*.csv')
]

#### Use the counter-screen (AID: 588342) to filter out false positives

Counter-screen: A screen performed in parallel with or after the primary screen. The assay used in the counter-screen is developed to identify compounds that have the potential to interfere with the assay used in the primary screen (the primary assay).

Identify the CIDs that are active in both AID 2289 and the counter-screen AID 588342: 2,399 molecules

In [34]:
# Load the active-compound tables of both assays; the CID column is read as text.
active_588342, active_2289 = (
    pd.read_csv(csv_path, dtype={'CID': 'str'})
    for csv_path in ('./Dataset/active_588342.csv', './Dataset/active_2289.csv')
)

In [35]:
# CIDs that appear in both active tables are treated as false positives.
cids_2289 = set(active_2289['CID'])
cids_588342 = set(active_588342['CID'])
false_positive_cid = list(cids_2289.intersection(cids_588342))

In [36]:
# Number of distinct CIDs found in both active tables (shown as the cell output).
len(false_positive_cid)

2399

In [37]:
# only one file contains active cases
# NOTE(review): position 9 depends on the order in which the file list was built —
# confirm it really points at the part that holds the active compounds.
id_dtypes = {'CID': 'str', 'SID': 'str'}
molecule_matrix = pd.read_csv(molecule_descriptor_files[9], dtype=id_dtypes)
# feature names except the molecule ID (column positions 2 through 180)
features = molecule_matrix.columns[2:181]

In [38]:
# Positional indices of the rows whose CID is a known false positive.
# A vectorised membership test is used instead of fetching every row with .iloc and
# scanning the false-positive list once per row, which is needlessly slow on a large table.
is_false_positive = molecule_matrix['CID'].isin(false_positive_cid).to_numpy()
# Kept as a plain list of ints so it can be used as a positional index afterwards.
filterout_index = np.flatnonzero(is_false_positive).tolist()

In [39]:
# Row keep-mask: True everywhere except at the positions collected in filterout_index.
ind = np.ones(molecule_matrix.shape[0], dtype=bool)
ind[filterout_index] = False
# Drop the flagged rows, then continue with a plain NumPy array instead of a DataFrame.
kept_rows = molecule_matrix.iloc[ind, :]
molecule_matrix = np.array(kept_rows)

In [40]:
# Append the first nine descriptor files below the rows kept so far; rows containing NA
# are dropped from each file. The parts are read into a list and stacked once, because
# calling np.vstack inside the loop copies the whole, ever-growing matrix on every pass.
# NOTE: this cell extends molecule_matrix, so running it twice appends the files twice.
other_parts = [
    np.array(pd.read_csv(file, dtype={'CID': 'str', 'SID': 'str'}).dropna(axis=0))
    for file in molecule_descriptor_files[:9]
]
molecule_matrix = np.vstack([molecule_matrix, *other_parts])

In [41]:
# The outcome is stored in the last column. The slice is a view, so the recoding below
# also overwrites that column inside molecule_matrix.
label = molecule_matrix[:, -1]
for outcome, code in (('active', 1), ('inactive', 0)):
    label[label == outcome] = code
# Any value other than the two outcomes above is left as-is and must be int-convertible here.
label = label.astype(int)

In [42]:
# Class balance: number of molecules per label (1 = active, 0 = inactive).
print(Counter(label))

Counter({0: 304501, 1: 884})


In [43]:
# Molecule identifiers, taken from the first column of the assembled array.
# NOTE(review): presumably the CID column comes first in the CSV files — verify against the header.
moleculeCID = molecule_matrix[:,0]

#### Construct the whole matrix

In [44]:
# Keep only column positions 2 through 180 (the same 2:181 slice used for `features`),
# dropping the first two columns and everything from position 181 onward.
# NOTE: this rebinds molecule_matrix, so the cell must not be run a second time.
molecule_matrix = molecule_matrix[:,2:181]

In [45]:
# Sanity check: (rows, columns) of the descriptor matrix, shown as the cell output.
molecule_matrix.shape

(305385, 179)

In [46]:
# Cast every descriptor value to float. The array was stacked together with string
# columns, so it is presumably object-typed up to this point — TODO confirm.
molecule_matrix = molecule_matrix.astype('float')

In [47]:
# Report the sizes of the ID vector, the label vector, the feature-name list and the matrix.
for caption, value in (
    ('Number of molecule ID:', len(moleculeCID)),
    ('Number of label:', len(label)),
    ('Number of features:', len(features)),
    ('Shape:', molecule_matrix.shape),
):
    print(caption, value)

Number of molecule ID: 305385
Number of label: 305385
Number of features: 179
Shape: (305385, 179)


#### Remove attributes with no variation

In [48]:
# Indices of the columns that hold one single value for every molecule.
# A column is constant exactly when its minimum equals its maximum. Comparing the column
# sum with 0 or with the number of rows is only valid for strictly 0/1 data: it misses
# other constants (e.g. a column of all 2s) and flags any column whose values merely
# happen to add up to 0 or to the row count.
col_min = molecule_matrix.min(axis=0)
col_max = molecule_matrix.max(axis=0)
# Plain list of ints, one entry per constant column.
remove_col = np.flatnonzero(col_min == col_max).tolist()

In [49]:
# Column keep-mask (despite its name): True for columns to keep, False for the constant
# columns listed in remove_col.
remove = np.ones(molecule_matrix.shape[1], dtype=bool)
# Build the index with an explicit integer dtype: np.array([]) defaults to float64, which
# NumPy rejects as an index, so an empty remove_col would otherwise raise IndexError.
remove[np.asarray(remove_col, dtype=int)] = False
molecule_matrix = molecule_matrix[:,remove]
# NOTE(review): `features` is not filtered with this mask in the visible cells, so the
# feature names no longer line up with the matrix columns — confirm this is handled later.

In [50]:
# Shape of the matrix once the constant columns have been dropped.
print('After removing attributes without any variation:', molecule_matrix.shape)

After removing attributes without any variation: (305385, 158)


In [51]:
# Final consistency check: the matrix rows, the labels and the IDs must all have the same length.
for caption, value in (
    ("Matrix's shape:", molecule_matrix.shape),
    ('Length of label:', len(label)),
    ('Length of molecule ID:', len(moleculeCID)),
):
    print(caption, value)

Matrix's shape: (305385, 158)
Length of label: 305385
Length of molecule ID: 305385


In [29]:
# Wrap the array in a DataFrame (default integer column names) and write it to CSV.
# index_label=False leaves out the header field for the index column; the row index
# itself is still written.
# NOTE(review): this cell's execution count (In [29]) is lower than those of the cells
# above, i.e. the saved run is out of order — re-run the notebook top to bottom before
# trusting the file on disk.
molecule_matrix = pd.DataFrame(molecule_matrix)
molecule_matrix.to_csv('Molecule_Matrix.csv',index_label=False)

In [31]:
# Persist the labels and the molecule IDs as single-column CSV files, written the same
# way as the matrix (row index written, no header field for the index column).
# NOTE: both names are rebound from arrays to DataFrames here.
label = pd.DataFrame(label)
label.to_csv('label.csv', index_label = False)
moleculeCID = pd.DataFrame(moleculeCID)
moleculeCID.to_csv('moleculeCID.csv',index_label = False)