In [1]:
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
#####################################################################
#                           Set C                                   #
#####################################################################
# Testing a variety of NN architectures with Embeddings             #
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
import tensorflow as tf
from keras_preprocessing.sequence import pad_sequences
import pandas as pd
from sklearn.model_selection import StratifiedKFold, KFold
pd.set_option('max_colwidth',400)
from keras.preprocessing.sequence import pad_sequences
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, SpatialDropout1D, Bidirectional, Dense, \
    LSTM, Conv1D, MaxPooling1D, Dropout, concatenate, Flatten, add
from keras import initializers, regularizers, constraints
from keras import backend as K
from keras.engine import Layer
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras import Input, Model
from keras.optimizers import Adam
from keras.models import Sequential, clone_model
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer
from utilities.preprocess import Preproccesor
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, precision_score, recall_score
import time
import numpy as np
from keras.models import model_from_json

In [0]:
# Fetch the WordNet and stop-word corpora into the local nltk_data directory.
# NOTE(review): presumably needed by utilities.preprocess — confirm against that module.
import nltk
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
# Load the corpus once; X (texts) and y (labels) are read as globals by the
# cross-validation runners defined further down.
# NOTE(review): the meaning of the boolean flag lives in utilities.preprocess — confirm.
X, y = Preproccesor.load_data(True)

class_names = ['noHateSpeech', 'hateSpeech']

# Column header shared by the results file and the notebook output.
results_header = "{:<10} | {:<7} {:<7} {:<7} {:<7} {:<7} {:<7}".format('Method','F1score','Precisi','Recall','Accurac','Specifi','Sensiti')

# Append (never truncate) so rows from earlier runs are kept; the context
# manager guarantees the handle is closed even if a write fails.
with open("../results/setC_replicate.txt", "a+") as f:
    f.write(results_header + " \n")
    f.write("=========================================================================\n")
print (results_header)

  data = pd.read_csv("../ethos_data/Davidson_Dataset_Binary.csv", delimiter='∫')


Method     | F1score Precisi Recall  Accurac Specifi Sensiti


In [None]:
!wget 'https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip'
!wget 'http://nlp.stanford.edu/data/glove.42B.300d.zip' 

In [5]:
import zipfile

# Unpack both downloaded embedding archives into the working directory and
# print the member list of each one.
for archive_name in ("crawl-300d-2M.vec.zip", "glove.42B.300d.zip"):
    with zipfile.ZipFile(archive_name, "r") as zip_ref:
        zip_ref.extractall()
        print(zip_ref.filelist)

# Drop the helper names so they do not linger in the kernel namespace.
del zip_ref, archive_name

[<ZipInfo filename='crawl-300d-2M.vec' compress_type=deflate filemode='-rw-r--r--' file_size=4514687127 compress_size=1523784963>]
[<ZipInfo filename='glove.42B.300d.txt' compress_type=deflate filemode='-rw-rw-r--' file_size=5025028820 compress_size=1877800207>]


In [7]:
!rm 'crawl-300d-2M.vec.zip'
!rm 'glove.42B.300d.zip'

In [3]:
def specificity(y_true, y_pred):
    """Return the true-negative rate TN / (TN + FP) for binary 0/1 labels.

    Args:
        y_true: ground-truth labels (0 = negative, 1 = positive).
        y_pred: predicted labels, same length as ``y_true``.

    Returns:
        The specificity, or 0 when ``y_true`` contains no negatives.
    """
    # Pin the label set so the matrix is always 2x2. Without it, a fold in
    # which only one class occurs yields a 1x1 matrix and the 4-way unpack
    # below raises ValueError.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    if (tn + fp) > 0:
        return tn / (tn + fp)
    return 0
def sensitivity(y_true, y_pred):
    """Return the true-positive rate TP / (TP + FN) for binary 0/1 labels.

    Args:
        y_true: ground-truth labels (0 = negative, 1 = positive).
        y_pred: predicted labels, same length as ``y_true``.

    Returns:
        The sensitivity, or 0 when ``y_true`` contains no positives.
    """
    # Pin the label set so the matrix is always 2x2. Without it, a fold in
    # which only one class occurs yields a 1x1 matrix and the 4-way unpack
    # below raises ValueError.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    if (tp + fn) > 0:
        return tp / (tp + fn)
    return 0

In [5]:
# Paths of the extracted pretrained word-vector files read by build_matrix.
embedding_path1 = "crawl-300d-2M.vec" #FastText
embedding_path2 = "glove.42B.300d.txt" #Glove 300d
# NOTE(review): the driver cell at the end of the notebook reassigns embed_size
# before any model is built, so this value is only an initial default.
embed_size = 300

In [6]:
def get_coefs(word,*arr):
    """Turn the fields of one embedding-file line into ``(word, vector)``.

    ``word`` is returned unchanged; the remaining positional fields are
    converted to a 1-D float32 numpy array.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector

In [7]:
def build_matrix(embedding_path, tk, max_features, embed_dim=300):
    """Build an embedding matrix for a tokenizer's vocabulary from a text vector file.

    Each line of the file is expected to be ``word v1 v2 ... vN`` separated by
    single spaces. Row ``i`` of the result holds the vector of the word whose
    tokenizer index is ``i``; words missing from the file keep an all-zero row.

    Args:
        embedding_path: path of the UTF-8 text file with the pretrained vectors.
        tk: object exposing ``word_index`` (mapping word -> integer index).
        max_features: only words with index ``< max_features`` receive a row.
        embed_dim: expected vector width. Defaults to 300, the value that was
            previously hard-coded.

    Returns:
        numpy array of shape ``(max_features + 1, embed_dim)``.
    """
    # Only words that can actually receive a row are worth parsing; this avoids
    # converting every vector in the file to floats.
    wanted = {word: i for word, i in tk.word_index.items() if i < max_features}
    embedding_matrix = np.zeros((max_features + 1, embed_dim))

    # Context manager: the (multi-gigabyte) file handle is closed deterministically.
    with open(embedding_path, encoding="utf-8") as handle:
        for line in handle:
            word, _, rest = line.strip().partition(" ")
            row = wanted.get(word)
            if row is None:
                continue
            values = rest.split(" ")
            # Skip header lines such as "2000000 300" and malformed rows.
            # A 1-element "vector" would otherwise be broadcast over the whole row.
            if len(values) != embed_dim:
                continue
            embedding_matrix[row] = np.asarray(values, dtype='float32')
    return embedding_matrix
def create_embedding_matrix(embed, tk, max_features):
    """Select which pretrained vectors back the embedding matrix.

    ``embed == 1`` uses the file at ``embedding_path1``, ``embed == 2`` the
    file at ``embedding_path2``; any other value builds both matrices and
    concatenates them along the last axis.
    """
    if embed == 1:
        sources = [embedding_path1]
    elif embed == 2:
        sources = [embedding_path2]
    else:
        sources = [embedding_path1, embedding_path2]

    matrices = [build_matrix(source, tk, max_features) for source in sources]
    if len(matrices) == 1:
        return matrices[0]
    return np.concatenate(matrices, axis=-1)
  

In [8]:
# Cross-validation setup: stratified, shuffled folds with a fixed seed, shared
# (as the global `folds`) by the run_model_on_fold* functions.
n_fold = 10
folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=7)

In [9]:
class Attention(Layer):
    """Attention pooling over the step axis of a 3-D input.

    For an input of shape ``(batch, step_dim, features)`` the layer scores
    every step with a learned weight vector (plus an optional per-step bias),
    applies ``tanh``, exponentiates, normalises the scores across steps and
    returns the score-weighted sum of the steps, shape ``(batch, features)``.

    Args:
        step_dim: number of steps in the input; used to reshape the scores.
        W_regularizer, b_regularizer: optional regularizers for weight / bias.
        W_constraint, b_constraint: optional constraints for weight / bias.
        bias: whether to add the per-step bias to the scores.
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        # NOTE(review): this flag is set before the base-class __init__ runs;
        # depending on the Keras version the base class may reset it — confirm
        # if mask propagation into this layer is ever relied upon.
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0  # filled in by build()
        super(Attention, self).__init__(**kwargs)

    def get_config(self):
        """Return the constructor arguments needed to re-create this layer.

        Only keys accepted by ``__init__`` are emitted, and regularizers /
        constraints are serialized, so ``Attention(**config)`` works. Derived
        attributes (``supports_masking``, ``init``, ``features_dim``) are not
        constructor arguments and would be rejected as unknown kwargs.
        """
        config = super().get_config().copy()
        config.update({
            'step_dim': self.step_dim,
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        })
        return config

    def build(self, input_shape):
        """Create the scoring vector W (one entry per feature) and optional bias (one per step)."""
        assert len(input_shape) == 3

        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        """The step axis is collapsed by this layer, so no mask is passed on."""
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over the step axis."""
        features_dim = self.features_dim
        step_dim = self.step_dim

        # Score each step: flatten to (batch*steps, features), project onto W,
        # then fold back to (batch, steps).
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                              K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # Zero out masked steps before normalising.
        if mask is not None:
            a *= K.cast(mask, K.floatx())

        # Normalise across steps; epsilon guards against division by zero.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Output is ``(batch, features)``."""
        return input_shape[0], self.features_dim

In [10]:
# Distribution strategy shared (as a global) by the model builders that create
# and load their models inside `strategy.scope()`.
strategy = tf.distribute.MirroredStrategy()
def build_model1(X_train, y_train, X_valid, y_valid, max_len, max_features, embed_size, embedding_matrix, lr=0.0, lr_d=0.0, spatial_dr=0.0, dense_units=128, conv_size=128, dr=0.2, patience=3, fold_id=1):
    """Train a two-stage CNN with an attention side branch; return it with its best weights.

    The frozen embedding feeds a spatial dropout whose output goes both to an
    Attention layer and to two Conv1D/MaxPooling1D stages; the flattened
    convolution output and the attention vector are concatenated and passed
    through a dense layer to a sigmoid unit. Training runs for up to 10 epochs
    with early stopping on ``val_accuracy``; the best checkpoint (written to
    ``best_model_fold_{fold_id}.hdf5``) is loaded into the returned model.
    """
    weights_file = f"best_model_fold_{fold_id}.hdf5"
    training_callbacks = [
        EarlyStopping(monitor="val_accuracy", mode="max", patience=patience),
        ModelCheckpoint(weights_file, monitor="val_accuracy", verbose=1, save_best_only=True, mode="max"),
    ]

    with strategy.scope():
        tokens = Input(shape=(max_len,))
        embedded = Embedding(max_features + 1, embed_size * 2, weights=[embedding_matrix], trainable=False)(tokens)
        dropped = SpatialDropout1D(spatial_dr)(embedded)
        attended = Attention(max_len)(dropped)

        conv = Conv1D(conv_size, 2, activation='relu', padding='same')(dropped)
        conv = MaxPooling1D(5, padding='same')(conv)
        conv = Conv1D(conv_size, 3, activation='relu', padding='same')(conv)
        conv = MaxPooling1D(5, padding='same')(conv)

        merged = concatenate([Flatten()(conv), attended])
        hidden = Dropout(dr)(Dense(dense_units, activation='relu')(merged))
        prediction = Dense(1, activation="sigmoid")(hidden)

        trainer = Model(inputs=tokens, outputs=prediction)
        trainer.compile(loss="binary_crossentropy", optimizer=Adam(lr=lr, decay=lr_d), metrics=["accuracy"])

    trainer.fit(X_train, y_train, batch_size=16, epochs=10, validation_data=(X_valid, y_valid), verbose=1, callbacks=training_callbacks)

    # Second wrapper over the same layers, restored to the best checkpoint.
    with strategy.scope():
        best = Model(inputs=tokens, outputs=prediction)
        best.load_weights(weights_file)
        best.compile(loss="binary_crossentropy", optimizer=Adam(lr=lr, decay=lr_d), metrics=["accuracy"])
    return best
def build_model2(X_train, y_train, X_valid, y_valid, max_len, max_features, embed_size, embedding_matrix, lr=0.0, lr_d=0.0, spatial_dr=0.0, dense_units=128, conv_size=128, dr=0.2, patience=3, fold_id=1):
    """Train a sequential Conv1D -> MaxPooling1D -> LSTM classifier; return it with its best weights.

    ``spatial_dr``, ``dense_units``, ``conv_size`` and ``dr`` are accepted for
    signature parity with the other builders but are not used here. Training
    runs for up to 10 epochs with early stopping on ``val_accuracy``; the best
    checkpoint (``best_model_fold_{fold_id}.hdf5``) is reloaded before returning.
    """
    weights_file = f"best_model_fold_{fold_id}.hdf5"
    training_callbacks = [
        EarlyStopping(monitor="val_accuracy", mode="max", patience=patience),
        ModelCheckpoint(weights_file, monitor="val_accuracy", verbose=1,save_best_only=True, mode="max"),
    ]

    network = Sequential([
        Embedding(max_features + 1, embed_size * 2, input_length=max_len, weights=[embedding_matrix], trainable=False),
        Conv1D(200, 10, activation='relu'),
        MaxPooling1D(pool_size=5),
        LSTM(100),
        Dense(50, activation='relu'),
        Dropout(rate=0.35),
        Dense(1, activation='sigmoid'),
    ])

    network.compile(loss="binary_crossentropy", optimizer=Adam(lr=lr, decay=lr_d), metrics=["accuracy"])
    network.fit(X_train, y_train, batch_size=16, epochs=10, validation_data=(X_valid, y_valid), verbose=1, callbacks=training_callbacks)

    # The same model object is restored to the best checkpoint and recompiled.
    network.load_weights(weights_file)
    network.compile(loss="binary_crossentropy", optimizer=Adam(lr=lr, decay=lr_d), metrics=["accuracy"])
    return network
def build_model3(X_train, y_train, X_valid, y_valid, max_len, max_features, embed_size, embedding_matrix, lr=0.0, lr_d=0.0, spatial_dr=0.0, dense_units=128, conv_size=128, dr=0.2, patience=3, fold_id=1):
    """Train a single-LSTM classifier with two dense layers; return it with its best weights.

    ``spatial_dr``, ``dense_units``, ``conv_size`` and ``dr`` are accepted for
    signature parity with the other builders but are not used here. Training
    runs for up to 10 epochs with early stopping on ``val_accuracy``; the best
    checkpoint (``best_model_fold_{fold_id}.hdf5``) is reloaded before returning.
    """
    weights_file = f"best_model_fold_{fold_id}.hdf5"
    training_callbacks = [
        EarlyStopping(monitor="val_accuracy", mode="max", patience=patience),
        ModelCheckpoint(weights_file, monitor="val_accuracy", verbose=1,save_best_only=True, mode="max"),
    ]

    tokens = Input(shape = (max_len,),name='main_input')
    embedded = Embedding(max_features + 1, embed_size * 2, weights=[embedding_matrix], trainable=False)(tokens)

    recurrent = LSTM(300)(embedded)
    dense_a = Dense(200, activation='relu')(recurrent)
    dense_a = Dropout(rate=0.15)(dense_a)
    dense_b = Dense(100, activation='relu')(dense_a)
    prediction = Dense(1, activation='sigmoid')(dense_b)

    trainer = Model(inputs=[tokens], outputs=[prediction])
    trainer.compile(loss="binary_crossentropy", optimizer=Adam(lr=lr, decay=lr_d), metrics=["accuracy"])
    # Second wrapper over the same layers; it receives the best checkpoint below.
    best = Model(inputs=[tokens], outputs=[prediction])

    trainer.fit(X_train, y_train, batch_size=16, epochs=10, validation_data=(X_valid, y_valid), verbose=1, callbacks=training_callbacks)

    best.load_weights(weights_file)
    best.compile(loss="binary_crossentropy", optimizer=Adam(lr=lr, decay=lr_d), metrics=["accuracy"])
    return best
def build_model4(X_train, y_train, X_valid, y_valid, max_len, max_features, embed_size, embedding_matrix, lr=0.0, lr_d=0.0, spatial_dr=0.0, dense_units=128, conv_size=128, dr=0.2, patience=3, fold_id=1):
    """Train a multi-branch Conv1D/LSTM network and return it with its best weights.

    A stack of five Conv1D layers is applied to the frozen embedding. The
    output of every convolution stage is pooled, dropped out and summarised by
    its own LSTM(100); a further LSTM(500) reads the raw embedding. All branch
    outputs are concatenated and passed through a dense head to a sigmoid unit.

    ``spatial_dr``, ``dense_units``, ``conv_size`` and ``dr`` are accepted for
    signature parity with the other builders but are not used here. Training
    runs for up to 10 epochs with early stopping on ``val_accuracy``; the best
    checkpoint (``best_model_fold_{fold_id}.hdf5``) is reloaded before returning.
    """
    file_path = f"best_model_fold_{fold_id}.hdf5"
    check_point = ModelCheckpoint(file_path, monitor="val_accuracy", verbose=1,save_best_only=True, mode="max")
    early_stop = EarlyStopping(monitor="val_accuracy", mode="max", patience=patience)
    main_input = Input(shape=(max_len,), dtype='int32', name='main_input')
    glove_Embed = (Embedding(max_features + 1, embed_size * 2, input_length=max_len, weights=[embedding_matrix], trainable=False))(main_input)

    # Convolution stack; each stage's output (x0..x4) is reused by a branch below.
    # NOTE(review): no `padding` is given, so each stage presumably shortens the
    # sequence — max_len must be large enough for the whole stack; confirm.
    x0 = Conv1D(128, 10, activation='relu')(glove_Embed)
    x1 = Conv1D(64, 5, activation='relu')(x0)
    x2 = Conv1D(32, 4, activation='relu')(x1)
    x3 = Conv1D(16, 3, activation='relu')(x2)
    x4 = Conv1D(8, 5, activation='relu')(x3)
    # Main branch on the deepest stage (dropout 0.25).
    x = MaxPooling1D(pool_size=3)(x4)
    x = Dropout(rate=0.25)(x)
    x = LSTM(100)(x)

    # Side branch on stage x0.
    p = MaxPooling1D(pool_size=10)(x0)
    p = Dropout(rate=0.15)(p)
    p = LSTM(100)(p)

    # Side branch on stage x1.
    o = MaxPooling1D(pool_size=8)(x1)
    o = Dropout(rate=0.15)(o)
    o = LSTM(100)(o)

    # Side branch on stage x2.
    i = MaxPooling1D(pool_size=6)(x2)
    i = Dropout(rate=0.15)(i)
    i = LSTM(100)(i)

    # Side branch on stage x3.
    r = MaxPooling1D(pool_size=4)(x3)
    r = Dropout(rate=0.15)(r)
    r = LSTM(100)(r)

    # Side branch on stage x4: same pooling as the main branch `x`, but with
    # dropout 0.15 and its own LSTM weights.
    t = MaxPooling1D(pool_size=3)(x4)
    t = Dropout(rate=0.15)(t)
    t = LSTM(100)(t)

    # Recurrent branch reading the embedding directly.
    y = LSTM(500)(glove_Embed)
    y = Dense(250,activation='relu')(y)
    y = Dropout(rate=0.15)(y)

    # Fuse all seven branch summaries.
    z = concatenate([x, p, o, i, r, t, y])

    # Dense classification head.
    z = Dense(400,activation='relu')(z)
    z = Dropout(0.15)(z)
    z = Dense(200,activation='relu')(z)
    z = Dense(100,activation='relu')(z)
    z = Dropout(0.15)(z)
    z = Dense(50,activation='relu')(z)
    output_lay = Dense(1, activation='sigmoid')(z)
    model = Model(inputs=[main_input], outputs=[output_lay])
    model.compile(loss="binary_crossentropy", optimizer=Adam(lr=lr, decay=lr_d), metrics=["accuracy"])
    # model2 wraps the same layers as `model`; it is the object that gets the
    # best checkpoint loaded and is returned.
    model2 = Model(inputs=[main_input], outputs=[output_lay])
    model.fit(X_train, y_train, batch_size=16, epochs=10, validation_data=(X_valid, y_valid), verbose=1, callbacks=[early_stop, check_point])
    model2.load_weights(file_path)
    model2.compile(loss="binary_crossentropy", optimizer=Adam(lr=lr, decay=lr_d), metrics=["accuracy"])
    return model2
def build_model5(X_train, y_train, X_valid, y_valid, max_len, max_features, embed_size, embedding_matrix, lr=0.0, lr_d=0.0, spatial_dr=0.0, dense_units=128, conv_size=128, dr=0.2, patience=3, fold_id=1):
    """Train a stacked bidirectional-LSTM classifier with max+average pooling; return it with its best weights.

    ``spatial_dr``, ``dense_units``, ``conv_size`` and ``dr`` are accepted for
    signature parity with the other builders but are not used here. Training
    runs for up to 10 epochs with early stopping on ``val_accuracy``; the best
    checkpoint (``best_model_fold_{fold_id}.hdf5``) is reloaded before returning.
    """
    weights_file = f"best_model_fold_{fold_id}.hdf5"
    training_callbacks = [
        EarlyStopping(monitor="val_accuracy", mode="max", patience=patience),
        ModelCheckpoint(weights_file, monitor="val_accuracy", verbose=1,save_best_only=True, mode="max"),
    ]

    tokens = Input(shape=(max_len,), dtype='int32', name='main_input')
    sequence = Embedding(max_features + 1, embed_size*2, input_length=max_len, weights=[embedding_matrix], trainable=False)(tokens)
    sequence = SpatialDropout1D(0.3)(sequence)
    # Two stacked bidirectional LSTMs, both emitting the full sequence.
    for _ in range(2):
        sequence = Bidirectional(LSTM(128, return_sequences=True))(sequence)

    pooled_max = GlobalMaxPooling1D()(sequence)
    pooled_avg = GlobalAveragePooling1D()(sequence)
    features = concatenate([pooled_max, pooled_avg])
    for width in (1024, 512):
        features = Dense(width, activation='relu')(features)
    prediction = Dense(1, activation='sigmoid')(features)

    trainer = Model(inputs=[tokens], outputs=[prediction])
    trainer.compile(loss="binary_crossentropy", optimizer=Adam(lr=lr, decay=lr_d), metrics=["accuracy"])
    # Second wrapper over the same layers; it receives the best checkpoint below.
    best = Model(inputs=[tokens], outputs=[prediction])

    trainer.fit(X_train, y_train, batch_size=16, epochs=10, validation_data=(X_valid, y_valid), verbose=1, callbacks=training_callbacks)

    best.load_weights(weights_file)
    best.compile(loss="binary_crossentropy", optimizer=Adam(lr=lr, decay=lr_d), metrics=["accuracy"])
    return best
def build_model6(X_train, y_train, X_valid, y_valid, max_len, max_features, embed_size, embedding_matrix, lr=0.0, lr_d=0.0, spatial_dr=0.0, dense_units=128, conv_size=128, dr=0.2, patience=3, fold_id=1):
    """Train a stacked bidirectional-LSTM classifier with attention+max pooling; return it with its best weights.

    The model is created, compiled and reloaded inside ``strategy.scope()``.
    ``spatial_dr``, ``dense_units``, ``conv_size`` and ``dr`` are accepted for
    signature parity with the other builders but are not used here. Training
    runs for up to 10 epochs with early stopping on ``val_accuracy``; the best
    checkpoint (``best_model_fold_{fold_id}.hdf5``) is reloaded before returning.
    """
    weights_file = f"best_model_fold_{fold_id}.hdf5"
    training_callbacks = [
        EarlyStopping(monitor="val_accuracy", mode="max", patience=patience),
        ModelCheckpoint(weights_file, monitor="val_accuracy", verbose=1,save_best_only=True, mode="max"),
    ]

    with strategy.scope():
        tokens = Input(shape=(max_len,), name='main_input')
        sequence = Embedding(max_features + 1, embed_size*2, input_length=max_len, weights=[embedding_matrix], trainable=False)(tokens)
        sequence = SpatialDropout1D(0.2)(sequence)
        # Two stacked bidirectional LSTMs, both emitting the full sequence.
        for _ in range(2):
            sequence = Bidirectional(LSTM(128, return_sequences=True))(sequence)

        attended = Attention(max_len)(sequence)
        pooled_max = GlobalMaxPooling1D()(sequence)
        features = concatenate([attended, pooled_max])
        for width in (1024, 512):
            features = Dense(width, activation='relu')(features)
        prediction = Dense(1, activation='sigmoid')(features)

        trainer = Model(inputs=[tokens], outputs=[prediction])
        trainer.compile(loss="binary_crossentropy", optimizer=Adam(lr=lr, decay=lr_d), metrics=["accuracy"])
        # Second wrapper over the same layers; it receives the best checkpoint below.
        best = Model(inputs=[tokens], outputs=[prediction])

    trainer.fit(X_train, y_train, batch_size=16, epochs=10, validation_data=(X_valid, y_valid), verbose=1, callbacks=training_callbacks)

    with strategy.scope():
        best.load_weights(weights_file)
        best.compile(loss="binary_crossentropy", optimizer=Adam(lr=lr, decay=lr_d), metrics=["accuracy"])
    return best
def run_model_on_fold(name, max_len, embed_size, embed, bulid_fun):
    """Cross-validate one architecture and report its fold-averaged metrics.

    For every split of the global ``folds`` over the global ``X``/``y`` a
    tokenizer is fitted on the training texts only, sequences are padded to
    ``max_len``, the embedding matrix is built and ``bulid_fun`` trains a
    model. Predictions are thresholded at 0.5 and scored. The averaged row is
    appended to ``../results/setC_replicate.txt`` and printed.

    Args:
        name: label for the results row (truncated to 7 characters).
        max_len: padded sequence length.
        embed_size: forwarded to ``bulid_fun``.
        embed: embedding selector forwarded to ``create_embedding_matrix``.
        bulid_fun: one of the ``build_model*`` functions.
    """
    max_features = 50000
    metric_names = ('F1', 'Precision', 'Recall', 'Accuracy', 'Specificity', 'Sensitivity')
    scores = {'test_' + metric: [] for metric in metric_names}

    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
        print('Fold', fold_n, 'started at', time.ctime())
        X_train, X_valid = X[train_index], X[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]
        # Fit the vocabulary on the training part only to avoid leakage.
        # NOTE(review): oov_token is the boolean True rather than a string token;
        # kept as-is so token indices stay unchanged.
        tk = Tokenizer(lower = True, filters='', num_words=max_features, oov_token = True)
        tk.fit_on_texts(X_train)
        train_tokenized = tk.texts_to_sequences(X_train)
        valid_tokenized = tk.texts_to_sequences(X_valid)
        X_train = pad_sequences(train_tokenized, maxlen=max_len)
        X_valid = pad_sequences(valid_tokenized, maxlen=max_len)
        embedding_matrix = create_embedding_matrix(embed, tk, max_features)

        model = bulid_fun(X_train, y_train, X_valid, y_valid, max_len, max_features, embed_size, embedding_matrix,
                             lr=1e-3, lr_d=0, spatial_dr=0.1, dense_units=128, conv_size=128, dr=0.1, patience=4,
                             fold_id=fold_n)

        # Sigmoid output -> hard 0/1 label.
        y_preds = [1 if prob[0] >= 0.5 else 0 for prob in model.predict(X_valid)]
        print(accuracy_score(y_valid, y_preds))
        scores['test_F1'].append(f1_score(y_valid, y_preds, average='macro'))
        scores['test_Precision'].append(precision_score(y_valid, y_preds, average='macro'))
        scores['test_Recall'].append(recall_score(y_valid, y_preds, average='macro'))
        scores['test_Accuracy'].append(accuracy_score(y_valid, y_preds))
        scores['test_Specificity'].append(specificity(y_valid, y_preds))
        scores['test_Sensitivity'].append(sensitivity(y_valid, y_preds))

    def _mean(values):
        # Average over the folds actually scored instead of a hard-coded 10,
        # so the result stays correct if the number of splits changes.
        return sum(values) / len(values) if values else 0.0

    row = "{:<10} | {:<7} {:<7} {:<7} {:<7} {:<7} {:<7}".format(
        str(name)[:7],
        *(str('%.4f' % _mean(scores['test_' + metric])) for metric in metric_names))

    # Context manager closes the results file even if the write fails.
    with open("../results/setC_replicate.txt", "a+") as f:
        f.write(row + '\n')
    print(row)


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2')


In [17]:
def build_model_load(X_train, y_train, X_valid, y_valid, max_len, max_features, embed_size, embedding_matrix, lr=0.0, lr_d=0.0, spatial_dr=0.0, dense_units=128, conv_size=128, dr=0.2, patience=3, fold_id=1):
    """Rebuild the bidirectional-LSTM + attention network and load saved weights (no training).

    The architecture matches ``build_model6``. Instead of fitting, the weights
    stored in ``best_model_fold_{fold_id}.hdf5`` are loaded, so that file must
    already exist and must have been written for this architecture.

    The training data and the ``spatial_dr``, ``dense_units``, ``conv_size``,
    ``dr`` and ``patience`` arguments are accepted only so this function can be
    passed wherever a ``build_model*`` function is expected; they are unused.

    Returns:
        The compiled model carrying the loaded weights.
    """
    file_path = f"best_model_fold_{fold_id}.hdf5"
    with strategy.scope():
        main_input = Input(shape=(max_len,), name='main_input')
        x = (Embedding(max_features + 1, embed_size*2, input_length=max_len, weights=[embedding_matrix], trainable=False))(main_input)
        x = SpatialDropout1D(0.2)(x)
        x = Bidirectional(LSTM(128, return_sequences=True))(x)
        x = Bidirectional(LSTM(128, return_sequences=True))(x)
        hidden = concatenate([
            Attention(max_len)(x),
            GlobalMaxPooling1D()(x),
        ])
        hidden = Dense(1024, activation='relu')(hidden)
        hidden = Dense(512, activation='relu')(hidden)
        output_lay = Dense(1, activation='sigmoid')(hidden)
        # No fit happens here, so a single Model is enough and no training
        # callbacks need to be constructed.
        model = Model(inputs=[main_input], outputs=[output_lay])
        model.load_weights(file_path)
        model.compile(loss="binary_crossentropy", optimizer=Adam(lr=lr, decay=lr_d), metrics=["accuracy"])
    return model
def run_model_on_fold_load(name, max_len, embed_size, embed, bulid_fun):
    """Score previously saved models on the first three cross-validation folds.

    Same pipeline as ``run_model_on_fold`` (tokenize on the training part, pad,
    build the embedding matrix, obtain a model from ``bulid_fun``, threshold
    predictions at 0.5, score), but the loop stops after folds 0-2. The
    averaged row is appended to ``../results/setC_replicate.txt`` and printed.

    Args:
        name: label for the results row (truncated to 7 characters).
        max_len: padded sequence length.
        embed_size: forwarded to ``bulid_fun``.
        embed: embedding selector forwarded to ``create_embedding_matrix``.
        bulid_fun: model factory, e.g. ``build_model_load``.
    """
    max_features = 50000
    metric_names = ('F1', 'Precision', 'Recall', 'Accuracy', 'Specificity', 'Sensitivity')
    scores = {'test_' + metric: [] for metric in metric_names}

    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
        # Only folds 0, 1 and 2 are evaluated.
        if fold_n > 2:
            break
        print('Fold', fold_n, 'started at', time.ctime())
        X_train, X_valid = X[train_index], X[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]
        # Fit the vocabulary on the training part only, exactly as during
        # training, so token indices match the saved weights.
        # NOTE(review): oov_token is the boolean True rather than a string token;
        # kept as-is so token indices stay unchanged.
        tk = Tokenizer(lower = True, filters='', num_words=max_features, oov_token = True)
        tk.fit_on_texts(X_train)
        train_tokenized = tk.texts_to_sequences(X_train)
        valid_tokenized = tk.texts_to_sequences(X_valid)
        X_train = pad_sequences(train_tokenized, maxlen=max_len)
        X_valid = pad_sequences(valid_tokenized, maxlen=max_len)
        embedding_matrix = create_embedding_matrix(embed, tk, max_features)

        model = bulid_fun(X_train, y_train, X_valid, y_valid, max_len, max_features, embed_size, embedding_matrix,
                             lr=1e-3, lr_d=0, spatial_dr=0.1, dense_units=128, conv_size=128, dr=0.1, patience=4,
                             fold_id=fold_n)

        # Sigmoid output -> hard 0/1 label.
        y_preds = [1 if prob[0] >= 0.5 else 0 for prob in model.predict(X_valid)]
        print(accuracy_score(y_valid, y_preds))
        scores['test_F1'].append(f1_score(y_valid, y_preds, average='macro'))
        scores['test_Precision'].append(precision_score(y_valid, y_preds, average='macro'))
        scores['test_Recall'].append(recall_score(y_valid, y_preds, average='macro'))
        scores['test_Accuracy'].append(accuracy_score(y_valid, y_preds))
        scores['test_Specificity'].append(specificity(y_valid, y_preds))
        scores['test_Sensitivity'].append(sensitivity(y_valid, y_preds))

    def _mean(values):
        # Average over the folds actually scored instead of a hard-coded 3,
        # so the result stays correct if the fold limit above changes.
        return sum(values) / len(values) if values else 0.0

    row = "{:<10} | {:<7} {:<7} {:<7} {:<7} {:<7} {:<7}".format(
        str(name)[:7],
        *(str('%.4f' % _mean(scores['test_' + metric])) for metric in metric_names))

    # Context manager closes the results file even if the write fails.
    with open("../results/setC_replicate.txt", "a+") as f:
        f.write(row + '\n')
    print(row)


In [18]:
# Experiment driver: for each embedding selector and sequence length, run the
# chosen architecture. The commented-out calls are the other architectures.
for emb_ma in [1]:
    # The model builders size the Embedding layer as `embed_size * 2`, so 150
    # yields 300 columns (one vector file) and 300 yields 600 (both files
    # concatenated, selector 3). This also overwrites the global embed_size.
    embed_size = 150 # * 2 = 300 for matrix 1 and 2
    if emb_ma == 3:
        embed_size = 300
    for max_len in [100]: 
        #run_model_on_fold('b1_'+str(emb_ma)+'_'+str(max_len),max_len,embed_size,emb_ma,build_model1)
        #run_model_on_fold('b2_'+str(emb_ma)+'_'+str(max_len),max_len,embed_size,emb_ma,build_model2)
        #run_model_on_fold('b3_'+str(emb_ma)+'_'+str(max_len),max_len,embed_size,emb_ma,build_model3)
        #run_model_on_fold('b4_'+str(emb_ma)+'_'+str(max_len),max_len,embed_size,emb_ma,build_model4)
        #run_model_on_fold('b5_'+str(emb_ma)+'_'+str(max_len),max_len,embed_size,emb_ma,build_model5)
        # Load-only evaluation: relies on best_model_fold_{0,1,2}.hdf5 written by an earlier training run.
        run_model_on_fold_load('b6_load'+str(emb_ma)+'_'+str(max_len),max_len,embed_size,emb_ma,build_model_load)

Fold 0 started at Fri Oct  2 12:43:56 2020
0.7676482452601856
Fold 1 started at Fri Oct  2 12:47:50 2020
0.7712787414279951
Fold 2 started at Fri Oct  2 12:51:47 2020
0.7587736990722065
b6_load    | 0.5477  0.5777  0.5470  0.7659  0.9139  0.1800 
