In [None]:
# Put the project root at the front of sys.path so the project's own packages
# (imported in the next cell) can be found from inside Jupyter.
# NOTE(review): the path below is a hard-coded absolute location — adjust it to the local checkout.
import os
import shutil
import sys
sys.path.insert(0, os.path.abspath('/home/PycharmProjects/RecurrentNetworks/'))

# Silence all Python warnings and set TensorFlow's TF_CPP_MIN_LOG_LEVEL before TensorFlow is imported
import warnings
warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [None]:
# Third-party and standard-library imports used by the experiment cells
import tensorflow as tf
import keras.backend.tensorflow_backend as K
import datetime
import re
import copy
# NOTE(review): bare attribute reference — set_session is not called here, so this line has no effect.
K.set_session

import numpy as np
import pandas as pd
import utils.definition_network as dn
import pickle

# Keras building blocks for the stacked-LSTM model assembled in generate_model
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import LSTM
from keras.layers import Embedding

# Project-local classes (resolved through the sys.path entry added in the first cell)
from network_model.model_class import ModelClass
from utils.experiment_processes import ExperimentProcesses

### 1. Tests Step 1: (A) Kernel Functions and (B) Word Embeddings

#### 1.1 Auxiliary Functions

In [None]:
# Dataset settings per disorder key. generate_model forwards 'total_registers' and
# 'subdirectory' to exp.pp_data.set_dataset_source for the selected disorder.
PARAMS_BY_DISORDER = {
    'anxiety': {
        'total_registers': 1040,
        'subdirectory': 'anxiety',
    },
    'depression': {
        'total_registers': 2160,
        'subdirectory': 'depression',
    },
    'anxiety,depression': {
        'total_registers': 880,
        'subdirectory': 'anxiety,depression',
    },
    'anx_dep_multilabel': {
        'total_registers': 2640,
        'subdirectory': 'anx_dep_multilabel',
    },
}

In [None]:
def create_directory(dir_path):
    """Create ``dir_path`` (including missing parents) and report the outcome.

    Args:
        dir_path (str): directory to create.

    Returns:
        bool: True when ``os.makedirs`` succeeded, False when it raised ``OSError``
        (which includes the case where the directory already exists).
    """
    try:
        os.makedirs(dir_path)
    except OSError:
        print("Creation of the directory %s failed" % dir_path)
        return False

    print("Successfully created the directory %s" % dir_path)
    return True

In [None]:
def create_test_dir_tree(test_dir_path, disorder_dir_path):
    """Create the output folder of one test run below ``dn.PATH_PROJECT``.

    The root ``<PATH_PROJECT><test_dir_path>`` is created first; only when that succeeds
    are the ``tokenizers/<disorder_dir_path>`` and ``pre_train_embeddings/<disorder_dir_path>``
    sub-folders created inside it.

    Args:
        test_dir_path (str): name of the test folder, relative to the project root.
        disorder_dir_path (str): disorder sub-folder name.
    """
    test_root = dn.PATH_PROJECT + test_dir_path
    if not create_directory(test_root):
        # Root could not be created (create_directory already printed a message): skip the sub-folders
        return

    for subfolder in ('/tokenizers/', '/pre_train_embeddings/'):
        create_directory(test_root + subfolder + disorder_dir_path)

In [None]:
def move_file(src_dir, dst_dir, extension_file_lst):
    """Move every entry of ``src_dir`` whose name ends with one of the given suffixes to ``dst_dir``.

    Each matching entry is moved exactly once, even when several suffixes match its name,
    and the paths are built with ``os.path.join`` so the directories may be passed with or
    without a trailing separator.

    Args:
        src_dir (str): directory whose entries are scanned (not recursive).
        dst_dir (str): existing directory that receives the matching entries.
        extension_file_lst (iterable of str): name suffixes to select, e.g. ``[".h5", ".csv"]``.

    Raises:
        OSError: propagated from ``os.listdir`` / ``shutil.move`` (e.g. missing directory).
    """
    # str.endswith accepts a tuple of suffixes, which selects each name at most once
    suffixes = tuple(extension_file_lst)
    selected = [entry for entry in os.listdir(src_dir) if entry.endswith(suffixes)]

    for entry in selected:
        target = os.path.join(dst_dir, entry)
        shutil.move(os.path.join(src_dir, entry), target)
        print('Move file to %s' % target)

In [None]:
def move_test_files(test_dir_path, disorder_dir_path):
    """Move the artefacts of a finished test run into its own folder.

    Files ending in .h5/.df/.csv/.txt are moved from the project root into
    ``<PATH_PROJECT><test_dir_path>/``; the ``.df`` files of the disorder's ``tokenizers``
    and ``pre_train_embeddings`` folders are moved into the matching sub-folders there.

    Args:
        test_dir_path (str): name of the test folder, relative to the project root.
        disorder_dir_path (str): disorder sub-folder name.
    """
    project_root = dn.PATH_PROJECT
    test_root = project_root + test_dir_path + '/'

    # Result files written to the project root
    move_file(project_root, test_root, [".h5", ".df", ".csv", ".txt"])

    # Tokenizer first, then pre-trained embeddings: same relative location on both sides
    for folder in ('tokenizers/', 'pre_train_embeddings/'):
        relative_dir = folder + disorder_dir_path + '/'
        move_file(project_root + relative_dir, test_root + relative_dir, [".df"])

In [None]:
def generate_model(set_params):
    """Train, reload and evaluate one stacked-LSTM experiment described by ``set_params``.

    Steps: configure an ``ExperimentProcesses`` instance for the SMHD dataset, build a
    ``Sequential`` model (Embedding -> ``hidden_layer`` LSTM layers -> Dense), pass it to
    ``exp.generate_model_hypeparams`` together with the training/validation data, then load
    ``<PATH_PROJECT><experiment_name>.h5`` and call ``exp.predict_samples`` on the data loaded
    with ``dn.LoadDataset.TEST_DATA_MODEL``.

    Args:
        set_params (dict): experiment settings. Keys read here:
            'log_file', 'disorder' (a key of ``PARAMS_BY_DISORDER``), 'type_prediction_label',
            'embedding_type', 'embedding_custom_file', 'use_embedding', 'function',
            'optimizer_function' (key into ``dn.OPTIMIZER_FUNCTIONS``), 'epochs', 'batch_size',
            'name_model', 'hidden_layer', 'neuronios_by_layer', 'activation',
            'return_sequences', 'dropouts', 'kernel_function', 'neuronios_dense_layer',
            'act_last_layer'.

    Returns:
        None. Results are produced through the ``ExperimentProcesses`` instance.
    """
    exp = ExperimentProcesses(set_params['log_file'])
    disorder_params = PARAMS_BY_DISORDER[set_params['disorder']]
    exp.pp_data.set_dataset_source(dataset_name='SMHD', label_set=['control', 'anxiety', 'depression'],
                                   total_registers=disorder_params['total_registers'],
                                   subdirectory=disorder_params['subdirectory'])

    # Pre-processing configuration
    exp.pp_data.vocabulary_size = 5000
    exp.pp_data.embedding_size = 300
    exp.pp_data.max_posts = 1750
    exp.pp_data.max_terms_by_post = 300
    exp.pp_data.format_input_data = dn.InputData.POSTS_ONLY_TEXT
    exp.pp_data.remove_stopwords = False
    exp.pp_data.delete_low_tfid = False
    exp.pp_data.min_df = 0
    exp.pp_data.min_tf = 0
    exp.pp_data.random_posts = False
    exp.pp_data.random_users = False
    exp.pp_data.tokenizing_type = 'WE'
    exp.pp_data.type_prediction_label = set_params['type_prediction_label']

    # Experiment configuration
    exp.use_custom_metrics = False
    exp.use_valid_set_for_train = True
    exp.valid_split_from_train_set = 0.0
    exp.imbalanced_classes = False
    exp.pp_data.embedding_type = set_params['embedding_type']
    exp.pp_data.word_embedding_custom_file = set_params['embedding_custom_file']
    exp.pp_data.use_embedding = set_params['use_embedding']
    exp.pp_data.load_dataset_type = dn.LoadDataset.TRAIN_DATA_MODEL

    # Training configuration
    model_cs = ModelClass(1)
    model_cs.loss_function = 'binary_crossentropy'
    model_cs.optmizer_function = dn.OPTIMIZER_FUNCTIONS[set_params['optimizer_function']]
    model_cs.epochs = set_params['epochs']
    model_cs.batch_size = set_params['batch_size']
    # True division: the patience is a float (e.g. 16.0 for 32 epochs)
    model_cs.patience_train = set_params['epochs'] / 2

    model_cs.use_embedding_pre_train = exp.pp_data.use_embedding
    # Membership test: `x == (RAND or NON_STATIC)` compared against only ONE of the two
    # members, because `or` returns a single operand; both modes must enable training.
    model_cs.embed_trainable = model_cs.use_embedding_pre_train in (dn.UseEmbedding.RAND,
                                                                    dn.UseEmbedding.NON_STATIC)

    # Name of the embedding used in the experiment name: the custom file stem when one is given
    emb_name = set_params['function']
    if set_params['embedding_custom_file'] != '':
        emb_name = exp.pp_data.word_embedding_custom_file.split('.')[0]

    we_file_name = ('ET_' + str(exp.pp_data.embedding_type.value) +
                    '_UE_' + str(exp.pp_data.use_embedding.value) +
                    '_EF_' + emb_name + '_' + set_params['kernel_function'].split('_')[0])

    exp.experiment_name = (set_params['name_model'] + '_lstm_exp9_var_L3' +
                           '_N' + str(set_params['neuronios_by_layer']) +
                           '_B' + str(set_params['batch_size']) +
                           '_E' + str(set_params['epochs']) +
                           '_D' + str(set_params['dropouts']) +
                           '_OF' + model_cs.optmizer_function +
                           '_HL' + str(set_params['hidden_layer']) + '_' + we_file_name)

    # TRAIN
    np.random.seed(dn.SEED)
    time_ini_rep = datetime.datetime.now()
    x_train, y_train, x_valid, y_valid, num_words, embedding_matrix = exp.pp_data.load_data()
    exp.set_period_time_end(time_ini_rep, 'Load data')

    model_cs.model = Sequential()
    model_cs.model.add(Embedding(exp.pp_data.vocabulary_size, exp.pp_data.embedding_size,
                                 trainable=model_cs.embed_trainable, name='emb_' + set_params['name_model']))

    # Index of the last LSTM layer. Derived from 'hidden_layer' instead of the loop variable,
    # which is undefined after the loop when there is a single hidden layer (empty range).
    last_lstm_idx = max(set_params['hidden_layer'] - 1, 0)

    # Intermediate LSTM layers
    for id_hl in range(last_lstm_idx):
        model_cs.model.add(LSTM(set_params['neuronios_by_layer'], activation=set_params['activation'],
                                dropout=set_params['dropouts'], recurrent_dropout=set_params['dropouts'],
                                kernel_initializer=set_params['kernel_function'],
                                return_sequences=set_params['return_sequences'],
                                name='dense_' + str(id_hl) + '_' + set_params['name_model']))

    # Last LSTM layer: return_sequences is left at the layer default
    model_cs.model.add(LSTM(set_params['neuronios_by_layer'], activation=set_params['activation'],
                            dropout=set_params['dropouts'], recurrent_dropout=set_params['dropouts'],
                            kernel_initializer=set_params['kernel_function'],
                            name='dense_' + str(last_lstm_idx) + '_' + set_params['name_model']))

    # Output layer
    model_cs.model.add(Dense(set_params['neuronios_dense_layer'],
                             kernel_initializer=set_params['kernel_function'],
                             activation=set_params['act_last_layer'],
                             name='dense_' + str(last_lstm_idx + 1) + '_' + set_params['name_model']))

    time_ini_exp = datetime.datetime.now()
    exp.generate_model_hypeparams(model_cs, x_train, y_train, x_valid, y_valid, embedding_matrix)
    exp.set_period_time_end(time_ini_exp, 'Total experiment')

    # Release the training data before the test data is loaded
    del x_train, y_train, x_valid, y_valid, num_words, embedding_matrix

    # TEST
    exp.pp_data.load_dataset_type = dn.LoadDataset.TEST_DATA_MODEL
    np.random.seed(dn.SEED)
    time_ini_rep = datetime.datetime.now()
    x_test, y_test = exp.pp_data.load_data()
    exp.set_period_time_end(time_ini_rep, 'Load data')

    # Presumably this file was written by generate_model_hypeparams — TODO confirm
    model_cs.model = exp.load_model(dn.PATH_PROJECT + exp.experiment_name + '.h5')
    exp.save_geral_configs('Experiment Specific Configuration: ' + exp.experiment_name)
    exp.save_summary_model(model_cs.model)
    exp.predict_samples(model_cs, x_test, y_test)

    del x_test, y_test, model_cs, exp

### 2.0 Test 1A - Comparing model performance using Glorot vs. LeCun Uniform Kernel Functions

#### 2.1 Definitions for test scenario

In [None]:
def test_kernel_function(id_test, kernel_function, prefix_name, disorder):
    """Run one kernel-initializer experiment with static GloVe 6B embeddings.

    Args:
        id_test (int): repetition number; accepted but not used in the function body.
        kernel_function (str): initializer name stored under 'kernel_function'.
        prefix_name (str): used as the model name and as prefix of the log file name.
        disorder (str): disorder key stored under 'disorder'.
    """
    generate_model(dict(
        disorder=disorder,
        log_file=prefix_name + '_lstm_L3',
        name_model=prefix_name,
        # Embedding configuration
        function='glove6B300d',
        type_prediction_label=dn.TypePredictionLabel.MULTI_LABEL_CATEGORICAL,
        embedding_type=dn.EmbeddingType.GLOVE_6B,
        embedding_custom_file='',
        use_embedding=dn.UseEmbedding.STATIC,
        # Network architecture and training hyper-parameters
        optimizer_function=dn.OptimizerFunction.ADAM.value,
        hidden_layer=3,
        neuronios_by_layer=16,
        activation='tanh',
        return_sequences=True,
        dropouts=0.2,
        kernel_function=kernel_function,
        neuronios_dense_layer=3,
        act_last_layer='sigmoid',
        batch_size=40,
        epochs=32,
    ))

#### 2.2 Run Kernel Function Variation Tests for Comparison

In [None]:
# Disorder key -> short tag used in the test names
disorders_dct = {'anxiety': 'a', 'depression': 'd', 'anxiety,depression': 'ad'}

for disorder, prefix in disorders_dct.items():
    for kernel_function in ('glorot_uniform', 'lecun_uniform'):
        # Ten repetitions (ids 1 to 10) for every disorder / initializer pair
        for id_test in range(1, 11):
            # The first two letters of the initializer name identify it in the test name
            name_test = 'test_kf_%s_%s%d' % (kernel_function[:2], prefix, id_test)
            print(name_test)

            create_test_dir_tree(name_test, disorder)
            test_kernel_function(id_test, kernel_function, name_test, disorder)
            move_test_files(name_test, disorder)

### 3.0 Test 1B - Comparing model performance using different Word Embeddings

#### 3.1 Auxiliary Function and Definitions for test scenario

In [None]:
def build_emb_test_name(pfx_emb, word_embedding_custom_file):
    """Split an embedding key and derive a short tag from a custom embedding file name.

    Args:
        pfx_emb (str): key of the form '<prefix>_<function>', e.g. 'wc_w2vCustom'.
        word_embedding_custom_file (str): '-'-separated file name, or '' when no custom
            file is used.

    Returns:
        tuple: ``(prefix, function, tag)``. ``tag`` is the string 'None' for an empty file
        name; otherwise the first four characters of the second '-' field, an underscore,
        and the third field (or the fifth field when the third is a single character).
    """
    pfx_parts = pfx_emb.split('_')
    prefix, function = pfx_parts[0], pfx_parts[1]

    if len(word_embedding_custom_file) == 0:
        return prefix, function, 'None'

    file_parts = word_embedding_custom_file.split('-')
    # A one-character third field (e.g. the 'A' of 'A-D-ADUsers') means the group label itself
    # contains '-', so the field to use is the fifth one
    group_idx = 4 if len(file_parts[2]) == 1 else 2

    return prefix, function, file_parts[1][0:4] + '_' + file_parts[group_idx]

In [None]:
def test_embedding_performance(id_test, set_params, prefix_name, disorder):
    """Run one word-embedding experiment with the fixed network hyper-parameters.

    Args:
        id_test (int): repetition number; accepted but not used in the function body.
        set_params (dict): embedding-related settings prepared by the caller. It is
            updated IN PLACE with the entries below before being passed to generate_model.
        prefix_name (str): used as the model name and as prefix of the log file name.
        disorder (str): disorder key stored under 'disorder'.
    """
    fixed_params = {
        'disorder': disorder,
        'log_file': prefix_name + '_lstm_L3',
        'name_model': prefix_name,
        'optimizer_function': dn.OptimizerFunction.ADAM.value,
        'hidden_layer': 3,
        'neuronios_by_layer': 16,
        'activation': 'tanh',
        'return_sequences': True,
        'dropouts': 0.2,
        'kernel_function': 'glorot_uniform',
        'neuronios_dense_layer': 3,
        'act_last_layer': 'sigmoid',
        'batch_size': 40,
        'epochs': 32,
    }
    set_params.update(fixed_params)

    generate_model(set_params)

#### 3.2 Run Word Embedding Variation Tests for Comparison

In [None]:
# Disorder key -> short tag used in the test names
disorders_dct = {'anxiety': 'a', 'depression': 'd', 'anxiety,depression': 'ad'}

# '<prefix>_<function>' key -> embedding type under test
embedding_types = {
    'g6_glove6B300d': dn.EmbeddingType.GLOVE_6B,
    'gt_gloveTwitter': dn.EmbeddingType.GLOVE_TWITTER,
    'gn_googleNews': dn.EmbeddingType.WORD2VEC,
    'wc_w2vCustom': dn.EmbeddingType.WORD2VEC_CUSTOM,
    'gc_gloveCustom': dn.EmbeddingType.GLOVE_CUSTOM,
}

use_embeddings = [dn.UseEmbedding.STATIC, dn.UseEmbedding.NON_STATIC]

# Custom embedding files per embedding type; every other type runs once with no file ('')
custom_files_by_embedding = {
    dn.EmbeddingType.WORD2VEC_CUSTOM: ['SMHD-Skipgram-AllUsers-300.bin', 'SMHD-CBOW-AllUsers-300.bin',
                                       'SMHD-Skipgram-A-D-ADUsers-300.bin', 'SMHD-CBOW-A-D-ADUsers-300.bin'],
    dn.EmbeddingType.GLOVE_CUSTOM: ['SMHD-glove-AllUsers-300.pkl', 'SMHD-glove-A-D-ADUsers-300.pkl'],
}

for disorder, prefix_disorder in disorders_dct.items():
    for pfx_emb, embedding_type in embedding_types.items():
        word_embedding_custom_files = custom_files_by_embedding.get(embedding_type, [''])

        for word_embedding_custom_file in word_embedding_custom_files:
            for use_embedding in use_embeddings:
                # Ten repetitions (ids 1 to 10) for every configuration
                for id_test in range(1, 11):
                    prefix, function, we_file_substr = build_emb_test_name(pfx_emb, word_embedding_custom_file)

                    name_test = 'test_%s%d_%s_UE_%s_WF_%s' % (prefix_disorder, id_test, prefix,
                                                               use_embedding.value, we_file_substr)
                    print(name_test)
                    create_test_dir_tree(name_test, disorder)

                    set_params = {
                        'function': function,
                        'type_prediction_label': dn.TypePredictionLabel.MULTI_LABEL_CATEGORICAL,
                        'embedding_type': embedding_type,
                        'embedding_custom_file': word_embedding_custom_file,
                        'use_embedding': use_embedding,
                    }

                    test_embedding_performance(id_test, set_params, name_test, disorder)
                    move_test_files(name_test, disorder)