In [1]:
import numpy as np
import pandas as pd
import autokeras as ak

from sklearn.decomposition import PCA

In [2]:
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [3]:
PATH = './DrugCell/data/'

t = pd.read_table(
    PATH + 'train_rcell_over50_not_equal.txt', 
    header=None
    )
t.columns = ['CELL_LINE', 'SMILES', 'label']
t = reduce_mem_usage(t)

tmp = pd.read_csv(PATH + 'SMILES_from_PubchemID.txt', header=None, sep='\t')
mfp = pd.read_csv(
    './DrugCell/data/mfp.txt',
    header=None,
)
mfp = reduce_mem_usage(mfp)
pca = PCA(n_components=10)
mfp = pca.fit_transform(mfp)
mfp = pd.DataFrame(mfp)
mfp['SMILES'] = tmp[1]

cell_line = pd.read_csv('./DrugCell/data/cell2mutation.txt', header=None)
cell_line = reduce_mem_usage(cell_line)
cell_line = pca.fit_transform(cell_line)
cell_line = pd.DataFrame(cell_line)
cell_line['CELL_LINE'] = list(pd.read_csv('./DrugCell/data/cell2ind.txt', sep='\t', header=None)[1])

t = t.merge(mfp, on='SMILES')    
train = t.merge(cell_line, on='CELL_LINE')

Memory usage of dataframe is 7.42 MB
Memory usage after optimization is: 1.88 MB
Decreased by 74.7%
Memory usage of dataframe is 258.16 MB
Memory usage after optimization is: 32.27 MB
Decreased by 87.5%
Memory usage of dataframe is 28.11 MB
Memory usage after optimization is: 3.51 MB
Decreased by 87.5%


In [4]:
reg = ak.StructuredDataRegressor(
    overwrite=True, 
    max_trials=100
) 

In [5]:
t = pd.read_table(
    PATH + 'val_rcell_over50_not_equal.txt', 
    header=None
    )
t.columns = ['CELL_LINE', 'SMILES', 'label']
t = reduce_mem_usage(t)
t = t.merge(mfp, on='SMILES')    
t = t.merge(cell_line, on='CELL_LINE')

Memory usage of dataframe is 2.47 MB
Memory usage after optimization is: 0.85 MB
Decreased by 65.8%


In [6]:
reg.fit(
    train.drop('label', axis=1),
    train['label'],
    validation_data=(t.drop('label', axis=1), t['label']),
    epochs=300,
)

Trial 68 Complete [00h 06m 34s]
val_loss: 0.9114857316017151

Best val_loss So Far: 0.9109929203987122
Total elapsed time: 06h 40m 30s
INFO:tensorflow:Oracle triggered exit
Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoc

<keras.callbacks.History at 0x7fa6fa8f6400>

In [7]:
t = pd.read_table(
    PATH + 'test_rcell_over50_not_equal.txt', 
    header=None
    )
t.columns = ['CELL_LINE', 'SMILES', 'label']
t = reduce_mem_usage(t)

t = t.merge(mfp, on='SMILES')    
t = t.merge(cell_line, on='CELL_LINE')

Memory usage of dataframe is 2.47 MB
Memory usage after optimization is: 0.85 MB
Decreased by 65.8%


In [12]:
predicted_y = reg.predict(t.drop('label', axis=1))



In [16]:
np.corrcoef(pd.DataFrame(predicted_y)[0], t['label'])

array([[ 1.        , -0.02717375],
       [-0.02717375,  1.        ]])