In [None]:
from modified_vCNN import *
import matplotlib.pyplot as plt

In [20]:
# Search space passed as `hyperparameters=` to train_vCNN_hyperparameter_tuning
# in the training cells. Every entry maps a setting name to the list of
# candidate values for that setting.
# NOTE(review): how the tuner combines these lists (full grid vs. sampling) is
# decided inside modified_vCNN — confirm there.
hyperparameters = dict(
    number_of_kernel=[64],
    max_ker_len=[50],
    kernel_init_size=list(range(6, 17, 2)),  # 6, 8, ..., 16
    lr=[1, 0.8, 0.6],
    rho=[0.99],
    epsilon=[1.0e-8],
    mu=[0.0015, 0.002, 0.0025, 0.003, 0.0035],
)

## vCNN on Xylella Nanopore sequencing data

In [18]:
# Build the list of Xylella peak datasets and the FASTA path of each one.
# Every sample directory under `samples_dir` contributes one dataset per
# suffix in `peak_suffixes`; the FASTA file is looked up at
#   <samples_dir>/<sample>/peaks/<sample><suffix>.fasta
samples_dir = '../../../samples/'
peak_suffixes = [
    '_peaks_uBH_0.001',
    '_peaks_uBH_0.001_peak_dist_2_min_cov_20_min_dist_20',
    '_peaks_uBH_0.001_peak_dist_2_min_cov_20_min_dist_20_k_3_kmer_quantile_0.25',
]

dataname_list = list()
filePath_dict = dict()
# os.listdir returns entries in arbitrary order; sort them so the datasets are
# always processed in the same order from run to run.
for data in sorted(os.listdir(samples_dir)):
    # A sample must be a directory (it has to contain 'peaks/'); skip stray
    # files so they do not produce paths that can never exist.
    if not os.path.isdir(os.path.join(samples_dir, data)):
        continue
    for suffix in peak_suffixes:
        dataname = data + suffix
        dataname_list.append(dataname)
        filePath_dict[dataname] = samples_dir + data + '/peaks/' + dataname + '.fasta'

In [None]:
# For every dataset in `dataname_list`: encode its FASTA file, run the vCNN
# hyperparameter search, plot the training curves and write one PWM text file
# per retained kernel under training/<dataname>/recover_PWM_500/.
for dataname in dataname_list:
    print(dataname)
    filePath = filePath_dict[dataname]
    OutputDir = 'training/' + dataname

    # data
    # NOTE(review): SaveData=False is passed, so presumably nothing is written
    # to OutputDirHdf5 — confirm in GeneRateOneHotMatrix.runSimple.
    GeneRateOneHotMatrixTest = GeneRateOneHotMatrix(padding=15)
    OutputDirHdf5 = OutputDir + "/Hdf5/"
    GeneRateOneHotMatrixTest.runSimple(filePath, OutputDirHdf5, SaveData=False)
    data_set = [[GeneRateOneHotMatrixTest.TrainX, GeneRateOneHotMatrixTest.TrainY],
                [GeneRateOneHotMatrixTest.TestX, GeneRateOneHotMatrixTest.TestY]]
    input_shape = GeneRateOneHotMatrixTest.TestX[0].shape

    # train model (search space comes from the `hyperparameters` dict)
    modelsave_output_prefix = OutputDir + "/ModelParameter/"
    model, test_auc, modelsave = train_vCNN_hyperparameter_tuning(
        input_shape, modelsave_output_prefix, data_set, random_seed=233,
        batch_size=100, epoch_scheme=100, hyperparameters=hyperparameters)

    # plot loss / accuracy curves
    history = model.history.history
    plt.plot(history['acc'])
    plt.plot(history['val_acc'])
    plt.title('model accuracy')
    plt.ylabel('acc')
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='upper left')
    plt.show()
    plt.plot(history['loss'])
    plt.plot(history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='upper right')
    plt.show()

    # get kernels: keep the first-axis indices of layer 4's weights whose
    # absolute value exceeds (mean - std) of all absolute weights.
    # NOTE(review): layers[4] is presumably the dense layer — confirm against
    # the model definition.
    DenseWeights = K.get_value(model.layers[4].kernel)
    absWeights = np.abs(DenseWeights)  # computed once, reused below
    meanValue = np.mean(absWeights)
    std = np.std(absWeights)
    workWeightsIndex = np.where(absWeights > meanValue - std)[0]
    kernels = recover_ker(model, "vCNN", workWeightsIndex)

    # build one PWM per kernel (KernelSeqDive_top is called with top=500)
    PwmWorklist = []
    for kernel in kernels:
        KernelSeqs, KSconvValue, seqinfo = KernelSeqDive_top(
            kernel, GeneRateOneHotMatrixTest.seq_pos_matrix_out, top=500)
        KernelSeqs = np.asarray(KernelSeqs)
        PwmWork = NormPwm(KernelSeqs, True)
        PwmWorklist.append(PwmWork)

    # save PWMs as <index>.txt; the directory is created once, and since
    # pwm_save_dir already ends with '/', no extra separator is inserted.
    pwm_save_dir = OutputDir + "/recover_PWM_500/"
    mkdir(pwm_save_dir)
    for i, PwmWork in enumerate(PwmWorklist):
        np.savetxt(pwm_save_dir + str(i) + ".txt", PwmWork)

## vCNN on artificial datasets with controlled TPR

In [22]:
# Collect the artificial Riv19 datasets: every '<name>.fasta' file directly
# inside `artificial_dir` is one dataset called '<name>'.
artificial_dir = '../../../artificial_Riv19/'

dataname_list = list()
filePath_dict = dict()
# os.listdir returns entries in arbitrary order; sort for a reproducible
# processing order.
for entry in sorted(os.listdir(artificial_dir)):
    # The dataset name is the file name without its extension. Appending
    # '.fasta' to the raw directory entry would turn a listed 'X.fasta' into
    # the non-existent 'X.fasta.fasta', so only FASTA files are kept and the
    # path points at the listed file itself.
    dataname, extension = os.path.splitext(entry)
    if extension != '.fasta':
        continue
    dataname_list.append(dataname)
    filePath_dict[dataname] = artificial_dir + dataname + '.fasta'

In [None]:
# For every dataset in `dataname_list`: encode its FASTA file, run the vCNN
# hyperparameter search, plot the training curves and write one PWM text file
# per retained kernel under training/<dataname>/recover_PWM_500/.
for dataname in dataname_list:
    print(dataname)
    filePath = filePath_dict[dataname]
    OutputDir = 'training/' + dataname

    # data
    # NOTE(review): SaveData=False is passed, so presumably nothing is written
    # to OutputDirHdf5 — confirm in GeneRateOneHotMatrix.runSimple.
    GeneRateOneHotMatrixTest = GeneRateOneHotMatrix(padding=15)
    OutputDirHdf5 = OutputDir + "/Hdf5/"
    GeneRateOneHotMatrixTest.runSimple(filePath, OutputDirHdf5, SaveData=False)
    data_set = [[GeneRateOneHotMatrixTest.TrainX, GeneRateOneHotMatrixTest.TrainY],
                [GeneRateOneHotMatrixTest.TestX, GeneRateOneHotMatrixTest.TestY]]
    input_shape = GeneRateOneHotMatrixTest.TestX[0].shape

    # train model (search space comes from the `hyperparameters` dict)
    modelsave_output_prefix = OutputDir + "/ModelParameter/"
    model, test_auc, modelsave = train_vCNN_hyperparameter_tuning(
        input_shape, modelsave_output_prefix, data_set, random_seed=233,
        batch_size=100, epoch_scheme=100, hyperparameters=hyperparameters)

    # plot loss / accuracy curves
    history = model.history.history
    plt.plot(history['acc'])
    plt.plot(history['val_acc'])
    plt.title('model accuracy')
    plt.ylabel('acc')
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='upper left')
    plt.show()
    plt.plot(history['loss'])
    plt.plot(history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='upper right')
    plt.show()

    # get kernels: keep the first-axis indices of layer 4's weights whose
    # absolute value exceeds (mean - std) of all absolute weights.
    # NOTE(review): layers[4] is presumably the dense layer — confirm against
    # the model definition.
    DenseWeights = K.get_value(model.layers[4].kernel)
    absWeights = np.abs(DenseWeights)  # computed once, reused below
    meanValue = np.mean(absWeights)
    std = np.std(absWeights)
    workWeightsIndex = np.where(absWeights > meanValue - std)[0]
    kernels = recover_ker(model, "vCNN", workWeightsIndex)

    # build one PWM per kernel (KernelSeqDive_top is called with top=500)
    PwmWorklist = []
    for kernel in kernels:
        KernelSeqs, KSconvValue, seqinfo = KernelSeqDive_top(
            kernel, GeneRateOneHotMatrixTest.seq_pos_matrix_out, top=500)
        KernelSeqs = np.asarray(KernelSeqs)
        PwmWork = NormPwm(KernelSeqs, True)
        PwmWorklist.append(PwmWork)

    # save PWMs as <index>.txt; the directory is created once, and since
    # pwm_save_dir already ends with '/', no extra separator is inserted.
    pwm_save_dir = OutputDir + "/recover_PWM_500/"
    mkdir(pwm_save_dir)
    for i, PwmWork in enumerate(PwmWorklist):
        np.savetxt(pwm_save_dir + str(i) + ".txt", PwmWork)