In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

import pyodbc
import pandas as pd
import time
import pickle as pkl
import numpy as np
import matplotlib.pyplot as plt
import json
from pathlib import Path
from keras import backend as K
import datetime
K.clear_session()

%matplotlib inline

with open ('../params.json') as f:
    params = json.load(f)
    
table_prefix = params['table_prefix']
diseases = params['diseases']
case_limit = params['case_limit']
control_limit = params['control_limit']
min_enrollment = params['enrollment']
user = params['user']

chunk = True


# run_id = table_prefix_ + str(case_limit) + '_' + datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
# print(run_id)

creds_file = "/home/" + user + "/creds.txt" 
creds = lines = [line.rstrip('\n') for line in open(creds_file)]

connection_string = ("Driver={ODBC Driver 17 for SQL Server};" + 
                     "server=" + creds[0] + ";" +
                     "domain=" + creds[1] + ";" +  
                     "database=" + creds[2] + ";" +
                     "uid=" + creds[3]  + ";" +
                     "pwd=" + creds[4] + ";" +
                     "ssl=require;")

cn = pyodbc.connect(connection_string, autocommit=True)
cursor = cn.cursor()

directory = '../../data/diseaes_replaced' + str(table_prefix) + '_' + str(case_limit)
output_dir = '../../outputs/' + str(table_prefix) + _ + str(case_limit) + '_' +\
             'gd'

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

def get_data(path):
    ''' Returns dataframe with columns: 'path', 'word'.'''
    datadir = Path(path)
    files = [(str(f), f.parts[-1]) for f in datadir.glob('*.csv.gz') if f]
    df = pd.DataFrame(files, columns=['path', 'word'])
    
    return df

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
back_window = 25
train = get_data(directory)
X = train.path
print(X)
X.to_csv(directory + '/files_df.csv')

0       ../../data/diseaes_replacedtest_10000/seq_1842...
1       ../../data/diseaes_replacedtest_10000/seq_1734...
2       ../../data/diseaes_replacedtest_10000/seq_1151...
3       ../../data/diseaes_replacedtest_10000/seq_1865...
4       ../../data/diseaes_replacedtest_10000/seq_1788...
5       ../../data/diseaes_replacedtest_10000/seq_8770...
6       ../../data/diseaes_replacedtest_10000/seq_8740...
7       ../../data/diseaes_replacedtest_10000/seq_1540...
8       ../../data/diseaes_replacedtest_10000/seq_8500...
9       ../../data/diseaes_replacedtest_10000/seq_1131...
10      ../../data/diseaes_replacedtest_10000/seq_3560...
11      ../../data/diseaes_replacedtest_10000/seq_1888...
12      ../../data/diseaes_replacedtest_10000/seq_1666...
13      ../../data/diseaes_replacedtest_10000/seq_5430...
14      ../../data/diseaes_replacedtest_10000/seq_1964...
15      ../../data/diseaes_replacedtest_10000/seq_7350...
16      ../../data/diseaes_replacedtest_10000/seq_1196...
17      ../../

In [3]:
from keras.utils import to_categorical
import pickle as pkl

def get_seqs(path, vocab_size):
    #print(path)
    seq = pd.read_csv(path)
    #print(seq.shape)
    
    X = []
    decoder_target = []
    y_time = []
    
    for index, row in seq.iterrows():
        if row['fromIndex'] >= 0 and index < seq.shape[0] - 1:
            X.append(to_categorical(seq.iloc[index+1-back_window:index+1]['InputCode'].values, 
                                    num_classes=vocab_size))
            decoder_target.append(to_categorical(seq.iloc[index+2-back_window:index+2]['InputCode'].values, 
                                 num_classes=vocab_size))
            y_time.append(seq.iloc[index+1]['fromIndex'])
    
    
    # encoder input, decoder input, decoder target, y_time
    return (np.asarray(X), np.asarray(X),
            np.asarray(decoder_target), np.asarray(y_time))

    
def batch_generator(X, batch_size=32):
    vocab_dict = pkl.load(open(directory + '/input_token_index.pkl', 'rb'))
    vocab_size = len(vocab_dict)
    while True:
        # choose batch_size random images / labels from the data
        idx = np.random.randint(0, X.shape[0], batch_size)
        x_file = X[idx]
        
        if str(x_file.values[0]) == 'nan':
            continue
        encoder_input, decoder_input, decoder_target, y_time = get_seqs(str(x_file.values[0]), len(vocab_dict))
        
        for i in range(encoder_input.shape[0] / batch_size):
            if (encoder_input.ndim == 3) and (((i+1)*batch_size)-(i*batch_size)) > 1:
                yield ([encoder_input[(i*batch_size):(i+1)*batch_size],
                        decoder_input[(i*batch_size):(i+1)*batch_size]], 
                       [decoder_target[(i*batch_size):(i+1)*batch_size],
                        y_time[(i*batch_size):(i+1)*batch_size,]])
            else:
                pass 
#                 print(encoder_input.shape, decoder_input.shape, decoder_target.shape, y_time.shape)
#                 print(X[idx])
#                 raise Exception('stop')

In [4]:
# print(get_seqs('../../data/test_1000/seq_1200_1300.csv.gz'))
%time [ei_batch, di_batch], [dt_batch, y_time_batch] = batch_generator(X).next()
print('shapes: ', ei_batch.shape, di_batch.shape, dt_batch.shape, y_time_batch.shape)

CPU times: user 9.17 s, sys: 3.12 s, total: 12.3 s
Wall time: 12.4 s
('shapes: ', (32, 25, 1436), (32, 25, 1436), (32, 25, 1436), (32,))


In [5]:
from keras.models import Model
from keras.utils import multi_gpu_model
from keras.layers import Input, LSTM, Dense, Lambda, GRU, Embedding, GaussianNoise
from keras import backend as K
from keras.callbacks import EarlyStopping, TensorBoard
from keras_exp.multigpu import print_mgpu_modelsummary

def get_model(num_tokens, latent_dim):
    encoder_inputs = Input(shape=(None, num_tokens), name='encoder_input')
    encoder = GRU(latent_dim, return_state=True, name='encoded')
    encoder_outputs, state_h = encoder(encoder_inputs)
                                       
    # Set up the decoder, using `encoder_states` as initial state.
    decoder_inputs = Input(shape=(None, num_tokens), name='decoder_input')
    noisy_inputs = GaussianNoise(0.2)(decoder_inputs)
    decoder_gru = GRU(latent_dim, return_sequences=True, return_state=True)
    decoder_outputs, _ = decoder_gru(noisy_inputs, initial_state=state_h)
    decoder_dense = Dense(num_tokens, activation='softmax', name='ae')
    decoder_outputs = decoder_dense(decoder_outputs)

    time_internal = Dense(100, activation='relu')(encoder_outputs)
    time_dense = Dense(1, activation='relu', name='time')
    time_output = time_dense(time_internal)

    model = Model([encoder_inputs, decoder_inputs], [decoder_outputs, time_output])

    model.summary()
    return model

NCCL support available


In [6]:
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.callbacks import ModelCheckpoint

print(tf.__version__)
    
config = tf.ConfigProto()
config.gpu_options.allow_growth = True  # dynamically grow the memory used on the GPU
config.log_device_placement = True  # to log device placement (on which device the operation ran)
                                    # (nothing gets printed in Jupyter, only if you run it standalone)
sess = tf.Session(config=config)
K.set_session(sess)  # set this TensorFlow session as the default session for Keras

print(type(X), X.shape[0]*0.8)
Xt = X[int(X.shape[0]*0.8):]
X = X[:int(X.shape[0]*0.8)]

Xt = Xt.reset_index(drop=True)

print(len(X), len(Xt)) 

fixed_length = 25
fixed_vocab = len(pkl.load(open(directory + '/input_token_index.pkl', 'rb')))
# epochs = 100
latent_dim = 10

model = get_model(fixed_vocab, latent_dim)

losses = {'ae':'categorical_crossentropy', 'time':'mse'}
loss_weights = {'ae':1, 'time':2}
earlystopper = EarlyStopping(monitor='loss', patience=100, verbose=1)
tbCallback = TensorBoard(log_dir='../../logs/gd', write_graph=True)

filepath = output_dir + "/weights.{epoch:02d}-{val_loss:.2f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=False, mode='min')

model.compile(optimizer='adam', loss=losses, loss_weights=loss_weights)
model.save(output_dir + '/empty_model.model')

1.8.0
(<class 'pandas.core.series.Series'>, 1600.0)
(1600, 400)
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
decoder_input (InputLayer)      (None, None, 1436)   0                                            
__________________________________________________________________________________________________
encoder_input (InputLayer)      (None, None, 1436)   0                                            
__________________________________________________________________________________________________
gaussian_noise_1 (GaussianNoise (None, None, 1436)   0           decoder_input[0][0]              
__________________________________________________________________________________________________
encoded (GRU)                   [(None, 10), (None,  43410       encoder_input[0][0]              
_____________________________________________

  '. They will not be included '


In [7]:
batch_size = 1024
train_gen = batch_generator(X, batch_size=batch_size)
valid_gen = batch_generator(Xt, batch_size=batch_size)
print(len(X), len(Xt))

(1600, 400)


In [8]:
## import time
start_time = time.time()
model.fit_generator(
    generator=train_gen,
    validation_data=valid_gen,
    validation_steps=10,
    epochs=2000,
    steps_per_epoch=100, #X.shape[0], - we do smaller steps per epoch to see training progress
    use_multiprocessing = True,
    workers = 8,
    callbacks=[earlystopper, checkpoint, tbCallback])
print(time.time()-start_time)

model.save(output_dir + '/trained_model_weights.model')



Epoch 1/2000




Epoch 00001: saving model to ../../outputs/test10000_gd/weights.01-1379612.71.hdf5
Epoch 2/2000

Epoch 00002: saving model to ../../outputs/test10000_gd/weights.02-1348884.83.hdf5
Epoch 3/2000

Epoch 00003: saving model to ../../outputs/test10000_gd/weights.03-890498.77.hdf5
Epoch 4/2000

Epoch 00004: saving model to ../../outputs/test10000_gd/weights.04-812559.93.hdf5
Epoch 5/2000

Epoch 00005: saving model to ../../outputs/test10000_gd/weights.05-488228.60.hdf5
Epoch 6/2000

Epoch 00006: saving model to ../../outputs/test10000_gd/weights.06-721262.95.hdf5
Epoch 7/2000

Epoch 00007: saving model to ../../outputs/test10000_gd/weights.07-627573.37.hdf5
Epoch 8/2000

Epoch 00008: saving model to ../../outputs/test10000_gd/weights.08-650588.84.hdf5
Epoch 9/2000

Epoch 00009: saving model to ../../outputs/test10000_gd/weights.09-453785.07.hdf5
Epoch 10/2000

Epoch 00010: saving model to ../../outputs/test10000_gd/weights.10-436924.12.hdf5
Epoch 11/2000

Epoch 00011: saving model to ../../


Epoch 00057: saving model to ../../outputs/test10000_gd/weights.57-609217.40.hdf5
Epoch 58/2000

Epoch 00058: saving model to ../../outputs/test10000_gd/weights.58-470564.64.hdf5
Epoch 59/2000

Epoch 00059: saving model to ../../outputs/test10000_gd/weights.59-650148.47.hdf5
Epoch 60/2000

Epoch 00060: saving model to ../../outputs/test10000_gd/weights.60-478541.89.hdf5
Epoch 61/2000

Epoch 00061: saving model to ../../outputs/test10000_gd/weights.61-409501.32.hdf5
Epoch 62/2000

Epoch 00062: saving model to ../../outputs/test10000_gd/weights.62-490919.67.hdf5
Epoch 63/2000

Epoch 00063: saving model to ../../outputs/test10000_gd/weights.63-631539.70.hdf5
Epoch 64/2000

Epoch 00064: saving model to ../../outputs/test10000_gd/weights.64-413699.80.hdf5
Epoch 65/2000

Epoch 00065: saving model to ../../outputs/test10000_gd/weights.65-480752.59.hdf5
Epoch 66/2000

Epoch 00066: saving model to ../../outputs/test10000_gd/weights.66-520484.55.hdf5
Epoch 67/2000

Epoch 00067: saving model to 


Epoch 00113: saving model to ../../outputs/test10000_gd/weights.113-470264.75.hdf5
Epoch 114/2000

Epoch 00114: saving model to ../../outputs/test10000_gd/weights.114-436523.40.hdf5
Epoch 115/2000

Epoch 00115: saving model to ../../outputs/test10000_gd/weights.115-539267.47.hdf5
Epoch 116/2000

Epoch 00116: saving model to ../../outputs/test10000_gd/weights.116-368031.48.hdf5
Epoch 117/2000

Epoch 00117: saving model to ../../outputs/test10000_gd/weights.117-401375.52.hdf5
Epoch 118/2000

Epoch 00171: saving model to ../../outputs/test10000_gd/weights.171-515382.67.hdf5
Epoch 172/2000

Epoch 00172: saving model to ../../outputs/test10000_gd/weights.172-570534.46.hdf5
Epoch 173/2000

Epoch 00173: saving model to ../../outputs/test10000_gd/weights.173-330755.55.hdf5
Epoch 174/2000

Epoch 00174: saving model to ../../outputs/test10000_gd/weights.174-501326.16.hdf5
Epoch 175/2000

Epoch 00175: saving model to ../../outputs/test10000_gd/weights.175-328682.66.hdf5
Epoch 176/2000

Epoch 001


Epoch 00194: saving model to ../../outputs/test10000_gd/weights.194-430953.26.hdf5
Epoch 195/2000

Epoch 00195: saving model to ../../outputs/test10000_gd/weights.195-557420.67.hdf5
Epoch 196/2000

Epoch 00196: saving model to ../../outputs/test10000_gd/weights.196-507968.62.hdf5
Epoch 197/2000

Epoch 00197: saving model to ../../outputs/test10000_gd/weights.197-460534.83.hdf5
Epoch 198/2000

Epoch 00198: saving model to ../../outputs/test10000_gd/weights.198-387659.99.hdf5
Epoch 199/2000

Epoch 00199: saving model to ../../outputs/test10000_gd/weights.199-572179.43.hdf5
Epoch 200/2000

Epoch 00200: saving model to ../../outputs/test10000_gd/weights.200-382838.97.hdf5
Epoch 201/2000

Epoch 00201: saving model to ../../outputs/test10000_gd/weights.201-340296.27.hdf5
Epoch 202/2000

Epoch 00202: saving model to ../../outputs/test10000_gd/weights.202-438146.95.hdf5
Epoch 203/2000

Epoch 00203: saving model to ../../outputs/test10000_gd/weights.203-389959.61.hdf5
Epoch 204/2000

Epoch 002


Epoch 00222: saving model to ../../outputs/test10000_gd/weights.222-357360.03.hdf5
Epoch 223/2000

Epoch 00223: saving model to ../../outputs/test10000_gd/weights.223-367672.62.hdf5
Epoch 224/2000

Epoch 00224: saving model to ../../outputs/test10000_gd/weights.224-501397.53.hdf5
Epoch 225/2000

Epoch 00225: saving model to ../../outputs/test10000_gd/weights.225-440027.11.hdf5
Epoch 226/2000

Epoch 00226: saving model to ../../outputs/test10000_gd/weights.226-387034.82.hdf5
Epoch 227/2000

Epoch 00227: saving model to ../../outputs/test10000_gd/weights.227-342703.94.hdf5
Epoch 228/2000

Epoch 00228: saving model to ../../outputs/test10000_gd/weights.228-515563.96.hdf5
Epoch 229/2000

Epoch 00229: saving model to ../../outputs/test10000_gd/weights.229-427755.57.hdf5
Epoch 230/2000

Epoch 00230: saving model to ../../outputs/test10000_gd/weights.230-364209.25.hdf5
Epoch 231/2000

Epoch 00231: saving model to ../../outputs/test10000_gd/weights.231-287186.21.hdf5
Epoch 232/2000

Epoch 002