In [1]:
from __future__ import print_function
from tensorflow.python.client import device_lib
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
from sklearn.model_selection import train_test_split
import csv


In [2]:
os.environ["CUDA_VISIBLE_DEVICES"]="1"

config = tf.ConfigProto(device_count = {'GPU': 2})
config.gpu_options.per_process_gpu_memory_fraction = 0.5
config.gpu_options.allow_growth = True

In [3]:
def generateMoreData(dataframe, X, y, label, multiply):
    # EG. 55x6=330
    pos_list = [i for i, x in enumerate(dataframe.label) if x == label]
    
    more_data_x_pos = []

    for x in range(multiply):
        for pos in pos_list:
            more_data_x_pos.append(X[pos])
    
    more_data_y_pos = [1 for i in range(len(more_data_x_pos))]
    
    y = np.concatenate((y, more_data_y_pos), axis=None)

    for x in more_data_x_pos:
        X.append(x)
        
    return [X, y]

In [4]:
data_corpus_path = "/datb/aphasia/languagedata/corpus/dataset/datasetboundary_distance_4ms_v1.csv"

df_corpus_data = pd.read_csv(data_corpus_path, sep=',', skiprows=1,
                             names=['region', 'label', 'sample_rate', 'begin', 'end', 'audiopath'])

# Voor het weghalen van 'nan' rows in de column "region"
df_corpus = df_corpus_data.dropna(subset=['region'])

In [5]:
df_corpus_data.shape


(1996214, 6)

In [6]:
# # Get input data
X = []

for x in df_corpus.region:
    trans = [float(y) for y in x.split('|')] # 0,1
    X.append(trans)

# # Get labels
y = [int(x) for x in df_corpus.label]

print('Before generating more 1 labels data')
print('X:{}'.format(len(X)))
print('y:{}'.format(len(y)))

more_data = generateMoreData(df_corpus, X, y, 1, 9)

X_more = more_data[0]
y_more = more_data[1]

print('After generating more 1 labels data')
print('X:{}'.format(len(X_more)))
print('y:{}'.format(len(y_more)))

Before generating more 1 labels data
X:1996214
y:1996214
After generating more 1 labels data
X:3629480
y:3629480


In [7]:
# Snij de dataset
import collections

def generateBalancedData(X_more, y_more, size):
    pos_list = [i for i, x in enumerate(y_more) if x == 1][:int(size/2)]
    neg_list = [i for i, x in enumerate(y_more) if x == 0][:int(size/2)]

    balancedDataX = np.concatenate(([X_more[pos] for pos in pos_list], [X_more[neg] for neg in neg_list]), axis=0)
    
    balancedDataY = np.concatenate(([y_more[pos] for pos in pos_list], [y_more[neg] for neg in neg_list]), axis=0)
        
    return [balancedDataX, balancedDataY]



dataSize = 300000

balancedX, balancedY = generateBalancedData(X_more, y_more, dataSize)

X_more_cut = balancedX
y_more_cut = balancedY

print('X more cut size: {}'.format(len(X_more_cut)))
print('Y more cut size: {}'.format(len(y_more_cut)))

collections.Counter(y_more_cut)

X more cut size: 300000
Y more cut size: 300000


Counter({1: 150000, 0: 150000})

In [8]:
assert len(X_more) == len(y_more)
assert len(X_more_cut) == len(y_more_cut)

In [22]:
# Parameters
LEARNING_RATE = 0.005 # probeer 0.030, 0.010, 0.003, 0.001
N_INSTANCES = len(X_more_cut)
TEST_SIZE = 0.2
TRAIN_SIZE = int(N_INSTANCES * (1 - TEST_SIZE)) 
BATCH_SIZE = 100
ACTIVATION_FUNCTION_SIGMOID = tf.nn.sigmoid
STDDEV = 0.1
RANDOM_STATE = 42

# Network Parameters
# hidden_nodes = 100
hidden_nodes = 10
num_classes = 2
num_features = len(X_more_cut[0])

In [10]:
# One hot encoding for labels
labels_ = np.zeros((N_INSTANCES, num_classes))
labels_[np.arange(N_INSTANCES), y_more_cut] = 1

X_train, X_test, y_train, y_test = train_test_split(X_more_cut, labels_,
                                                    test_size=TEST_SIZE,
                                                    random_state=RANDOM_STATE)

In [11]:
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

In [12]:
print('X_train:{}'.format(X_train.shape))
print('X_test:{}'.format(X_test.shape))
print('y_train:{}'.format(y_train.shape))
print('y_test:{}'.format(y_test.shape))

X_train:(240000, 52)
X_test:(60000, 52)
y_train:(240000, 2)
y_test:(60000, 2)


In [19]:
n_hidden_1 = hidden_nodes # 1st layer number of neurons
n_hidden_2 = hidden_nodes # 2nd layer number of neurons
n_hidden_3 = hidden_nodes # 2nd layer number of neurons
n_hidden_4 = hidden_nodes # 2nd layer number of neurons
n_hidden_5 = hidden_nodes # 2nd layer number of neurons
# n_hidden_3 = hidden_nodes
n_input = num_features # CORPUS data input (audio region shape: 65)
n_classes = num_classes # CORPUS total classes (0-1 labels)
    
# placeholders for training pairs (x, y)
X = tf.placeholder(tf.float32, shape=[None, n_input], name="X")
Y = tf.placeholder(tf.int32, shape=[None, n_classes], name="Y")

def mlp(_X, _weights, _biases):
 
    layer1 = tf.nn.relu(tf.add(tf.matmul(_X, _weights['h1']), _biases['b1']))
    layer1 = tf.nn.dropout(layer1, 0.5)
    layer2 = tf.nn.relu(tf.add(tf.matmul(layer1, _weights['h2']), _biases['b2']))
    layer3 = tf.nn.relu(tf.add(tf.matmul(layer2, _weights['h3']), _biases['b3']))
    layer4 = tf.nn.relu(tf.add(tf.matmul(layer3, _weights['h4']), _biases['b4']))
    layer5 = tf.nn.relu(tf.add(tf.matmul(layer4, _weights['h5']), _biases['b5']))
    out_layer = ACTIVATION_FUNCTION_SIGMOID(tf.matmul(layer5, _weights['out']) + _biases['out'])
    return out_layer

weights = {
    'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1],stddev=STDDEV)),
    'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2],stddev=STDDEV)),
    'h3': tf.Variable(tf.random_normal([n_hidden_2, n_hidden_3],stddev=STDDEV)),
    'h4': tf.Variable(tf.random_normal([n_hidden_3, n_hidden_4],stddev=STDDEV)),
    'h5': tf.Variable(tf.random_normal([n_hidden_3, n_hidden_4],stddev=STDDEV)),
    'out': tf.Variable(tf.random_normal([n_hidden_5, n_classes],stddev=STDDEV)),                                   
}

biases = {
    'b1': tf.Variable(tf.random_normal([n_hidden_1])),
    'b2': tf.Variable(tf.random_normal([n_hidden_2])),
    'b3': tf.Variable(tf.random_normal([n_hidden_3])),
    'b4': tf.Variable(tf.random_normal([n_hidden_4])),
    'b5': tf.Variable(tf.random_normal([n_hidden_5])),
    'out': tf.Variable(tf.random_normal([n_classes]))
}

# weights = {
#     'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1],stddev=STDDEV)),
#     'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2],stddev=STDDEV)),
#     'h3': tf.Variable(tf.random_normal([n_hidden_2, n_hidden_3],stddev=STDDEV)),
#     'out': tf.Variable(tf.random_normal([n_hidden_3, n_classes],stddev=STDDEV)),                                   
# }

# biases = {
#     'b1': tf.Variable(tf.random_normal([n_hidden_1])),
#     'b2': tf.Variable(tf.random_normal([n_hidden_2])),
#     'b3': tf.Variable(tf.random_normal([n_hidden_3])),
#     'out': tf.Variable(tf.random_normal([n_classes]))
# }

pred = mlp(X, weights, biases)

# use a negative log loss function for logistic regression
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=Y))

# configure the optimizer
optimizer = tf.train.AdamOptimizer(learning_rate = LEARNING_RATE).minimize(cost)

# Accuracy
correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(Y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

In [20]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report


In [None]:
training_epochs = 500
# training_epochs = 10000
display_step = 1 # controls how often the loss is reported

modeltrain_log = '/datb/aphasia/languagedata/corpus/result/modeltrain_log_h_2_n_82'

with tf.Session(config=config) as sess:
#     with open(modeltrain_log+'.csv', 'w') as writeTo:
        
#         writer = csv.DictWriter(writeTo, fieldnames=['epoch', 'training_acc', 'test_acc', 'avg_cost'])
#         writer.writeheader()
        
        sess.run(tf.global_variables_initializer())

        print('Learning rate: {}'.format(LEARNING_RATE))
        print('hidden_nodes: ' + str(hidden_nodes))
        print('Training epochs: {}'.format(training_epochs))
        print('TEST_SIZE: ' + str(TEST_SIZE))
        print('Dataset rows: {}'.format(len(X_more_cut)))
        print('Dataset features: {} \n'.format(len(X_more_cut[0])))

        for epoch in range(training_epochs+1):

            avg_cost = 0.
            total_batch = int(len(X_train) / BATCH_SIZE)

            for i in range(total_batch):
                randidx = np.random.randint(int(TRAIN_SIZE), size = BATCH_SIZE)
                batch_xs = X_train[randidx, :]
                batch_ys = y_train[randidx, :]

                # Fit using batched data
                sess.run(optimizer, feed_dict={X: batch_xs, Y: batch_ys})

                # Calculate average cost
                avg_cost += sess.run(cost, feed_dict={X: batch_xs, Y: batch_ys})/total_batch

            # Display progress
            if epoch % display_step == 0:
                print ("Epoch: %03d/%03d cost: %.9f" % (epoch, training_epochs, avg_cost))
                train_acc = sess.run(accuracy, feed_dict={X: batch_xs, Y: batch_ys})
                print ("Training accuracy: %.3f" % (train_acc))
                
                test_acc = sess.run(accuracy, feed_dict={X: X_test, Y: y_test})

#                 writer.writerow({'epoch': epoch, 'training_acc': train_acc, 'test_acc':test_acc, 'avg_cost': avg_cost})

    #             print(sess.run(cost, feed_dict={X: batch_xs, Y: batch_ys}))

                # Save the variables to disk.
    #             save_path = saver.save(sess, export_path+"phonemeboundary_model-"+str(epoch))
    #             print("Model saved in path: %s" % save_path)

        print ("End of training.\n")
        print("Testing...\n")
        
        
        y_p = tf.argmax(pred, 1)
        val_accuracy, y_pred = sess.run([accuracy, y_p], feed_dict={X: X_test, Y: y_test})

        print("validation accuracy: {}".format(val_accuracy))
        y_true = np.argmax(y_test,1)
        print("Precision: {}".format(precision_score(y_true, y_pred)))
        print("Recall:{}".format(recall_score(y_true, y_pred)))
        print("f1_score:{}".format(f1_score(y_true, y_pred)))
        
        print("confusion_matrix")
        print(confusion_matrix(y_true, y_pred))
        
        print("classification_report")
        print(classification_report(y_true, y_pred))

    #     # Save the variables to disk.
    #     save_path = saver.save(sess, export_path+"phonemeboundary_model")
    #     print("Model saved in path: %s" % save_path)

        # Testing

        test_acc = sess.run(accuracy, feed_dict={X: X_test, Y: y_test})
        print ("Test accuracy: %.3f" % (test_acc))

        sess.close()
        print("Session closed!")

Learning rate: 0.005
hidden_nodes: 10
Training epochs: 500
TEST_SIZE: 0.2
Dataset rows: 300000
Dataset features: 52 

Epoch: 000/500 cost: 0.692573939
Training accuracy: 0.530
Epoch: 001/500 cost: 0.690096294
Training accuracy: 0.570
Epoch: 002/500 cost: 0.687699007
Training accuracy: 0.420
Epoch: 003/500 cost: 0.685383779
Training accuracy: 0.590
Epoch: 004/500 cost: 0.683429468
Training accuracy: 0.530
Epoch: 005/500 cost: 0.681996382
Training accuracy: 0.560
Epoch: 006/500 cost: 0.680355246
Training accuracy: 0.470
Epoch: 007/500 cost: 0.678418424
Training accuracy: 0.560
Epoch: 008/500 cost: 0.676528774
Training accuracy: 0.450
Epoch: 009/500 cost: 0.675542542
Training accuracy: 0.640
Epoch: 010/500 cost: 0.674670691
Training accuracy: 0.590
Epoch: 011/500 cost: 0.673418228
Training accuracy: 0.560
Epoch: 012/500 cost: 0.673022943
Training accuracy: 0.590
Epoch: 013/500 cost: 0.671277694
Training accuracy: 0.530
Epoch: 014/500 cost: 0.670223132
Training accuracy: 0.540
Epoch: 015/5

In [None]:
with tf.Session() as sess:
          new_saver = tf.train.import_meta_graph('my_jesse_test2_model-1000.meta')
          new_saver.restore(sess, tf.train.latest_checkpoint('./'))

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [None]:
modeltrain_log_path = '/datb/aphasia/languagedata/corpus/result/modeltrain_log.csv'

modeltrain_log = pd.read_csv(modeltrain_log_path, sep=',', skiprows=1,
                             names=['epoch', 'training_acc', 'test_acc', 'avg_cost'])

In [None]:
max_values = modeltrain_log.loc[modeltrain_log['epoch'].idxmax()]
train_acc = modeltrain_log.training_acc
test_acc = modeltrain_log.test_acc
epoch = int(max_values[0])
max_train = modeltrain_log.loc[modeltrain_log['training_acc'].idxmax()]
max_test = modeltrain_log.loc[modeltrain_log['test_acc'].idxmax()]
print('Max train: {}'.format(max_train[1]))
print('Max test: {}'.format(max_test[2]))