# CV FOR SINGLE LSTM WITH WORD EMBEDDINGS
Oversampling skipped because a deduplicated dataset is needed for cross-validation

In [2]:
!pip install matplotlib
!pip install -U pip setuptools wheel
!pip install -U spacy
!pip install spacytextblob
!python -m spacy download en_core_web_lg
!pip install -U nltk
!pip install gensim
!pip install python-Levenshtein
!pip install ftfy
!pip install emoji
!pip install pycld2
!pip install openpyxl

Collecting matplotlib
  Using cached matplotlib-3.4.3-cp37-cp37m-manylinux1_x86_64.whl (10.3 MB)
Collecting cycler>=0.10
  Using cached cycler-0.10.0-py2.py3-none-any.whl (6.5 kB)
Collecting kiwisolver>=1.0.1
  Using cached kiwisolver-1.3.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.1 MB)
Installing collected packages: kiwisolver, cycler, matplotlib
Successfully installed cycler-0.10.0 kiwisolver-1.3.2 matplotlib-3.4.3
You should consider upgrading via the '/usr/local/bin/python3.7 -m pip install --upgrade pip' command.[0m
Collecting pip
  Using cached pip-21.2.4-py3-none-any.whl (1.6 MB)
Collecting setuptools
  Downloading setuptools-58.2.0-py3-none-any.whl (946 kB)
[K     |████████████████████████████████| 946 kB 27.4 MB/s eta 0:00:01
Collecting wheel
  Using cached wheel-0.37.0-py2.py3-none-any.whl (35 kB)
Installing collected packages: wheel, setuptools, pip
  Attempting uninstall: wheel
    Found existing installation: wheel 0.36.2
    Uninstalling wheel-0.36.2:
  

In [3]:
import numpy as np
import pandas as pd
import string
import itertools
import time
import os
import re
import sklearn
import spacy
import pickle
import random
import gc

from bbclf.sentiment import get_sentiment
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models.keyedvectors import KeyedVectors
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
from text_preprocess import unfold_contractions, remove_emoji, clean_text

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam, RMSprop, Nadam
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import ( Activation, Dropout, Dense, Flatten, LSTM, Bidirectional, SpatialDropout1D,
                                      RepeatVector, MaxPooling1D, GlobalMaxPooling1D, GlobalMaxPool1D, Embedding,
                                      Input, Concatenate, Reshape, Flatten, Conv1D, GlobalAveragePooling1D )

import matplotlib.pyplot as plt
%matplotlib inline

%load_ext autoreload
%autoreload 2

In [4]:
def plot_confusion_matrix(cm, classes,                      
                          title='CONFUSION MATRIX',
                          cmap=plt.cm.PuBu):         # originally plt.cm.Blues; also good: BuPu,RdPu,PuRd,OrRd,Oranges
    '''
    Plot the confusion matrix    
    '''
    plt.rcParams['xtick.bottom'] = plt.rcParams['xtick.labelbottom'] = False
    plt.rcParams['xtick.top'] = plt.rcParams['xtick.labeltop'] = True
            
    plt.figure(figsize=(4,4))
    im = plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar(im, fraction=0.046, pad=0.05)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90)
    plt.yticks(tick_marks, classes)
    
    fmt = 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True labels')
    plt.xlabel('Predicted labels')
    plt.tight_layout()
    plt.show()

In [5]:
# DIFFERENT TEXT CLEANING METHODS
def remove_digits(s):
    s = s.translate( str.maketrans(string.digits, ' '*len(string.digits)) )
    s = re.sub('\s+', ' ', s)
    return s.strip()

def remove_punct(s):
    '''
    To avoid words being glued together, replace punct with spaces because
    there are complex words with '-' separator,
    and some people forget there should be a space after punctuation
    '''
    s = s.translate( str.maketrans(string.punctuation, ' '*len(string.punctuation) ) )
    s = re.sub('\s+', ' ', s)
    return s.strip()

def remove_non_alpha(s):
    '''
    To avoid words being glued together, replace punct with spaces because
    there are complex words with '-' separator,
    and some people forget there should be a space after punctuation
    '''
    s = s.translate( str.maketrans( punctuation, ' '*len(punctuation) ) )
    s = s.translate( str.maketrans( string.digits, ' '*len(string.digits) ) )
    s = re.sub('\s+', ' ', s)
    return s.strip()

def remove_stopwords(s):
    s = s.strip().split()
    s = [ w.strip() for w in s if remove_non_alpha(w).lower() not in sw ]
    return ' '.join(s)

def mask_entities(s):
    '''
        Replace simplified named entities with their type,
        if a word is named entity; otherwise, lemmatize word
    '''    
    ent_types = { 
                    'number':       ['CARDINAL', 'ORDINAL'],
                    'place':        ['LOC'],
                    'name':         ['PERSON'],
                    'percent':      ['PERCENT'],
                    'money':        ['MONEY'],
                    'organization': ['ORG'],
                }
        
    # replace entities w/generic names, lemmatize
    out = []
    s = nlp(s)
    for t in s:
        ent_type = ''
        for key in ent_types:
            if t.ent_type_ in ent_types[key]:
                ent_type = key
                break
        if ent_type:
            out.append( ent_type )
        else:
            out.append( t.lemma_ )
        
    # remove entity masks repeated several times in a row
    stack = ['',]
    for item in out:
        if item == stack[-1]:
            continue
        stack.append( item )        
        
    return ' '.join( stack[1:] )

def convert_emoticons(s):
    return s.replace(':)', 'happy_face').replace(':D', 'happy_face')

punctuation = ''.join([c for c in string.punctuation if c not in "'!?"])
sw = [ 'a', 'an', 'the', 'of', ]

In [6]:
def dedupe( df1, df2, col_ ):
    '''
        df2 should not contain annything from df1 in column col_
        (e.g. df2 = smaller test set OR smaller set for 1 category)
        to preserve smaller df2, duplicates are deleted from df1
    '''
    original_length = df1.shape[0]
    df2_sents = df2[col_].values
    df1 = df1[ ~df1[col_].isin(df2_sents) ]
    print( f'\tDropping {original_length - df1.shape[0]} duplicates')
    return df1, df2

def upsample( df_, to_oversample_ ):
    '''
        Upsample df_ by to_oversample_ more samples    
    '''    
    replace = False
    if len(df_) < to_oversample_:
        replace = True
        
    df_upsampled = df_.sample( n=to_oversample_, replace=replace )
    df_          = pd.concat([ df_, df_upsampled ])
    return df_.sample( frac=1 )

def intersect_length( df1, df2, col_ ):
    '''
        Return counts of the same records in df1 and df2 in column col_    
    '''    
    len1 = len([ i for i in df1[col_].values if i in df2[col_].values ])
    len2 = len([ i for i in df2[col_].values if i in df1[col_].values ])
    return len1, len2

## 1. Prepare data

In [None]:
file = ''
df   = pd.read_csv( file , sep='\t', encoding='utf-8' )

print( 'Data size:', df.shape )
print( 'Unique sentences:', len( df['sentence'].unique() ), '\n' )
print( 'Missing values:\n\n', df.isna().sum(), sep='' )
df.head(2)

In [None]:
# wrong labels
convert_2unk = [ 
                 ]
len(convert_2unk)
df.loc[ df['sentence'].isin(convert_2unk), 'label'] = 'unk'
df[ df['sentence'].isin(convert_2unk)][['sentence', 'label']]

In [9]:
df['subset'].value_counts()

train    9015
test      919
val       909
Name: subset, dtype: int64

In [10]:
nlp = spacy.load("en_core_web_lg")
#nlp.remove_pipe('parser')
ml_categories = [ ]
len( ml_categories )

15

In [11]:
#df = df[ df['is_subtle'] == 0 ]
df = df[ df['label'].isin( ml_categories ) ]
df = df.drop(['award_reason', 'is_group_award', 'group_award_id', 'keywords'], axis=1)

df['target'] = df['label'].apply( lambda x: 0 if x == 'unk' else 1 )

In [12]:
# DEDUPE BETWEEN CATEGORIES. FAVOR CATEGORY 1
print(df.shape)
df1 = df[ df['target'] == 1 ].copy()
df0 = df[ df['target'] == 0 ].copy()
df0, df1 = dedupe( df0, df1, 'sentence' )
df = pd.concat([ df0, df1 ]).copy().sample(frac=1).reset_index(drop=True)
print(df.shape)

(10843, 11)
	Dropping 6 duplicates
(10837, 11)


In [13]:
# DEDUPE TRAIN / VAL / TEST SETS. FAVOR TEST, THEN VAL SET
print(df.shape)
df_train = df[ df['subset'] == 'train' ].copy()
df_val   = df[ df['subset'] == 'val' ].copy()
df_test  = df[ df['subset'] == 'test' ].copy()

df_train, df_test = dedupe( df_train, df_test, 'sentence' )
df_val, df_test   = dedupe( df_val, df_test, 'sentence' )
#df_train, df_val  = dedupe( df_train, df_val, 'sentence' )
df = pd.concat([ df_train, df_val, df_test ]).copy().sample(frac=1).reset_index(drop=True)
print(df.shape)

(10837, 11)
	Dropping 90 duplicates
	Dropping 33 duplicates
(10714, 11)


In [14]:
print( 'Cleaning text ...', end=' ')

df['sentence_lower'] = df['sentence'].apply( convert_emoticons )
df['sentence_lower'] = df['sentence_lower'].apply( lambda x: remove_emoji(x, to_text=True) )
df['sentence_lower'] = df['sentence_lower'].apply( unfold_contractions )
df['sentence_lower'] = df['sentence_lower'].apply( mask_entities )
df['sentence_lower'] = df['sentence_lower'].apply( lambda x: x.lower() )
df['sentence_lower'] = df['sentence_lower'].apply( remove_non_alpha )
df['sentence_lower'] = df['sentence_lower'].apply( remove_stopwords )
print('Done!')

# UNIQUE WORDS (very approx. because punctuation may be included)
num_words_cased   = len(set(' '.join(df['sentence'].tolist()).split()))
num_words_uncased = len(set(' '.join(df['sentence_lower'].tolist()).split()))
print( 'Unique cased words (uncleaned):' , num_words_cased )
print( 'Unique uncased words:          ' , num_words_uncased )

df['length'] = df['sentence_lower'].apply( lambda x: len(x.split()) )
maxlen       = df['length'].max()
print( 'Maximum sentence length:', maxlen, '\n' )

Cleaning text ... Done!
Unique cased words (uncleaned): 20179
Unique uncased words:           7064
Maximum sentence length: 88 



### Augment data

In [None]:
# LOAD SENTENCES FROM ANNOTATION GUIDELINES
print( 'Augmenting data....')
df_aug = pd.read_excel( '',
                        sheet_name='guidelines',
                        engine='openpyxl',
                      )
df_aug = df_aug.dropna()

df_aug['sentence_lower'] = df_aug['sentence'].apply( clean_text )
df_aug['sentence_lower'] = df_aug['sentence_lower'].apply( convert_emoticons )
df_aug['sentence_lower'] = df_aug['sentence_lower'].apply( lambda x: remove_emoji(x, to_text=True) )
df_aug['sentence_lower'] = df_aug['sentence_lower'].apply( unfold_contractions )
df_aug['sentence_lower'] = df_aug['sentence_lower'].apply( mask_entities )
df_aug['sentence_lower'] = df_aug['sentence_lower'].apply( lambda x: x.lower() )
df_aug['sentence_lower'] = df_aug['sentence_lower'].apply( remove_non_alpha )
df_aug['sentence_lower'] = df_aug['sentence_lower'].apply( remove_stopwords )

# VERIFY LENGTH IS NOT 0
df_aug['length'] = df_aug['sentence_lower'].apply( lambda x: len(x.split()))
print( 'Lengths of sentences:', sorted( list(set(df_aug['length'].values))) )

# ADD COLUMNS TO MERGE WITH MAIN DF
df_aug['target'] = 1
df_aug['subset'] = 'train'
df_aug = df_aug[['sentence_lower', 'target', 'label', 'subset', 'source']]
for col in df.columns:
    if col not in df_aug.columns:
        df_aug[col] = np.nan
print( 'Shape of augmented data:', df_aug.shape )
df_aug.head()

In [16]:
already_in_test_set = [ i for i in df_aug['sentence_lower'].tolist() if i in df[ df['subset'] == 'test' ]['sentence_lower'].tolist() ]
df_aug = df_aug[ ~df_aug['sentence_lower'].isin( already_in_test_set )]
df_aug.shape

(605, 13)

In [None]:
# LOAD KEYWORDS
kw = pd.read_excel( 'data/kw.xlsx', engine='openpyxl' )

kw['sentence_lower'] = kw['keyword'].apply( clean_text )
kw['sentence_lower'] = kw['sentence_lower'].apply( convert_emoticons )
kw['sentence_lower'] = kw['sentence_lower'].apply( lambda x: remove_emoji(x, to_text=True) )
kw['sentence_lower'] = kw['sentence_lower'].apply( unfold_contractions )
kw['sentence_lower'] = kw['sentence_lower'].apply( mask_entities )
kw['sentence_lower'] = kw['sentence_lower'].apply( lambda x: x.lower() )
kw['sentence_lower'] = kw['sentence_lower'].apply( remove_non_alpha )
kw['sentence_lower'] = kw['sentence_lower'].apply( remove_stopwords )

# VERIFY LENGTH IS NOT 0
kw['length'] = kw['sentence_lower'].apply( lambda x: len(x.split()))
print( 'Lengths of sentences:', sorted( list(set(kw['length'].values))) )

# ADD COLUMNS TO MERGE WITH MAIN DF
kw['target'] = 1
kw['source'] = 'guide'
kw['subset'] = 'train'
kw = kw[['sentence_lower', 'target', 'label', 'subset', 'source']]
for col in df.columns:
    if col not in kw.columns:
        kw[col] = np.nan
#kw = kw[ df.columns ]
print( 'Shape of keywords data:', kw.shape )
kw.head()

In [18]:
# MERGE KEYWORDS WITH MAIN DF
print( df.shape )
df = pd.concat([ df, kw, df_aug ])
df = df.sample( frac=1 )
print( df.shape )

(10714, 13)
(11652, 13)


In [19]:
print(len(df['sentence'].unique()))

10341


In [20]:
df = df.drop_duplicates(subset=['sentence']).reset_index(drop=True)

In [21]:
df['target'].value_counts()

1    5453
0    4888
Name: target, dtype: int64

### Train-Test Split

In [22]:
# ONLY IF NO UPSAMPLING WAS MADE
df_train = df[ df['subset'].isin([ 'train', 'val' ]) ]
df_test  = df[ df['subset'].isin([ 'test' ]) ]

In [23]:
X_train = df_train['sentence_lower'].values
y_train = df_train['target'].values
X_test  = df_test['sentence_lower'].values
y_test  = df_test['target'].values

X_train, y_train = sklearn.utils.shuffle( X_train, y_train )
X_test, y_test   = sklearn.utils.shuffle( X_test, y_test   )

print( 'Shape of train / test data:', X_train.shape, X_test.shape, y_train.shape, y_test.shape )

Shape of train / test data: (9438,) (903,) (9438,) (903,)


In [24]:
maxlen

88

In [25]:
# KERAS TOKENIZER
tokenizer = Tokenizer( num_words=7000,
                       lower=True,
                       oov_token='oov',
                       filters='"#$%&()*+,-./:;<=>@[\\]^_`{|}~\t\n',        # removed '!' and '?'
                     )
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test  = tokenizer.texts_to_sequences(X_test)

vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary size:', vocab_size)

X_train        = pad_sequences(X_train, padding='post', truncating='post', maxlen=maxlen)
X_test         = pad_sequences(X_test, padding='post', truncating='post', maxlen=maxlen)

Vocabulary size: 6776


In [26]:
time_stamp = time.strftime("%Y%m%dT%H%M")

In [27]:
print('Current tokenizer time stamp:', time_stamp)
wdir          = 'data/'
use_sentiment = False

tokenizer_filepath = wdir + f'{time_stamp}_tokenizer.pkl'
X_train_filepath   = wdir + f'{time_stamp}_X_train.pkl'
y_train_filepath   = wdir + f'{time_stamp}_y_train.pkl'
X_test_filepath    = wdir + f'{time_stamp}_X_test.pkl'
y_test_filepath    = wdir + f'{time_stamp}_y_test.pkl'

with open(tokenizer_filepath, 'wb') as f:
    pickle.dump( tokenizer, f, protocol=pickle.HIGHEST_PROTOCOL )
    
with open(X_train_filepath, 'wb') as f:
    pickle.dump( X_train, f, protocol=pickle.HIGHEST_PROTOCOL )
    
with open(y_train_filepath, 'wb') as f:
    pickle.dump( y_train, f, protocol=pickle.HIGHEST_PROTOCOL )
    
with open(X_test_filepath, 'wb') as f:
    pickle.dump( X_test, f, protocol=pickle.HIGHEST_PROTOCOL )
        
with open(y_test_filepath, 'wb') as f:
    pickle.dump( y_test, f, protocol=pickle.HIGHEST_PROTOCOL )


if use_sentiment:
    X_train_sentim_filepath = wdir + f'{time_stamp}_X_train_sentim.pkl'
    X_test_sentim_filepath  = wdir + f'{time_stamp}_X_test_sentim.pkl'

    with open(X_train_sentim_filepath, 'wb') as f:
        pickle.dump( X_train_sentim, f, protocol=pickle.HIGHEST_PROTOCOL )

    with open(X_test_sentim_filepath, 'wb') as f:
        pickle.dump( X_test_sentim, f, protocol=pickle.HIGHEST_PROTOCOL )   
    

Current tokenizer time stamp: 20211003T0319


In [28]:
def get_embeddings(switch):
    
    print( 'Building embedding matrix ...', end=' ' )
    if switch == 0:

        # WORD2VEC
        word_vectors = KeyedVectors.load_word2vec_format('./pretrained_embeddings/GoogleNews-vectors-negative300.bin', binary=True)

        EMBED_SIZE=300
        embedding_matrix = np.zeros((vocab_size, EMBED_SIZE))
        for word, i in tokenizer.word_index.items():
            #if i>=NUM_WORDS:
            #    continue
            try:
                embedding_vector = word_vectors[word]
                embedding_matrix[i] = embedding_vector
            except KeyError:
                embedding_matrix[i]=np.random.normal(0,np.sqrt(0.25),EMBED_SIZE)

        del(word_vectors)
        print( 'using Word2vec!')

    elif switch == 1:

        # GLOVE
        embeddings_dictionary = dict()
        EMBED_SIZE = 300

        glove_file = open('pretrained_embeddings/glove.6B.300d.txt', encoding="utf8")

        for line in glove_file:
            records = line.split()
            word = records[0]
            vector_dimensions = np.asarray(records[1:], dtype='float32')
            embeddings_dictionary[word] = vector_dimensions
        glove_file.close()

        embedding_matrix = np.zeros((vocab_size, EMBED_SIZE))
        for word, index in tokenizer.word_index.items():
            embedding_vector = embeddings_dictionary.get(word)
            if embedding_vector is not None:
                embedding_matrix[index] = embedding_vector

        print( 'using Glove!')

    elif switch == 2:

        # spaCy    
        EMBED_SIZE = len(nlp('The').vector)
        embedding_matrix = np.zeros((vocab_size, EMBED_SIZE))
        for word, index in tokenizer.word_index.items():
            embedding_matrix[index] = nlp(word).vector

        print( 'using spaCy!')

    else:
        print('Cannot proceed without embeddings')
        unknown_variable
        
    return embedding_matrix, EMBED_SIZE


# Embeddings Switch: 0 - Word2vec, 1 - Glove, 2 - Spacy
switch_dict = { 0: 'Word2vec',
                1: 'Glove',
                2: 'spaCy',
              }

In [29]:
switch = 2
embedding_matrix, EMBED_SIZE = get_embeddings( 2 )
#embedding_matrix_glove, _    = get_embeddings( 1 )
#embedding_matrix_w2vec, _    = get_embeddings( 0 )

Building embedding matrix ... using spaCy!


## 2. Train model

__Plan__

1. With oversampling:
* learning_rates = [1e-3, 5e-4, 1e-4]
* batch_sizes    = [8, 16, 32]
* dropouts       = [0.25, 0.4, 0.5, 0.6]
* units          = [150, 100, 75]
* optimizers     = [RMSprop, Adam]

2. Without oversampling:
* Same

In [32]:
seed_value = 43
#import os
#os.environ['PYTHONHASHSEED'] = str(seed_value)
random.seed( seed_value )
np.random.seed( seed_value )
tf.random.set_seed( seed_value )

In [None]:
# MODEL 0 - PLAIN
epochs         = 21
learning_rates = [ 5e-4 ]
batch_sizes    = [ 32, 8, ]
dropouts       = [ 0.6, 0.5, 0.4, 0.25, ]
units_list     = [ 150, 100, 75, ]
optimizers     = [ RMSprop, Adam, ]                               # Adam, RMSprop, Nadam
emb            = switch_dict[ switch ]

all_combinations = list(itertools.product(*[learning_rates, batch_sizes, dropouts, units_list, optimizers]))
wdir             = 'ckpts/current/'
all_results      = dict()        # {HPs: (blind test, test set, avg cv results, clf_repb, clf_rep)}

for learning_rate, batch_size, dropout, units, optimizer in all_combinations:

    time_stamp = time.strftime("%Y%m%dT%H%M")
    file_name  = f'logs/log_{time_stamp}.txt'
        
    with open( file_name, 'w', encoding='utf-8' ) as f:
                
        experiment_name = '1 BiLSTM, UPSAMPLING, N0 SENTIMENT\n'
        f.write( experiment_name )
        
        optimizer_name = optimizer.__module__.split('.')[-1].capitalize()
        params = f'\nEmbeddings={emb}, LR={learning_rate}, batch_size={batch_size}, dropout={dropout}, units={units}, optimizer={optimizer_name}\n'
        message = '\nTimestamp: ' + time_stamp + params
        print( message )        
        f.write( message + '\n' )

        skf = StratifiedKFold(n_splits=5, shuffle=True)
        skf.get_n_splits(X_train, y_train)
                
        # CROSS-VALIDATION CYCLE - SPLIT X_TRAIN INTO 5 FOLDS
        cv_results = []      # micro F1, macro F1, class 1 F1, class 1 precision, class 1 recall, val_accu, val_score
        fold = 1
        for train_idx, val_idx in skf.split(X_train, y_train):
            
            print(f'CV validation results - fold {fold}')
            f.write(f'CV validation results - fold {fold}\n')
            X_train_cv, y_train_cv = X_train[train_idx], y_train[train_idx]
            X_val_cv, y_val_cv     = X_train[val_idx],  y_train[val_idx]
        
            deep_inputs     = Input(shape=(maxlen,))
            embedding_layer = Embedding(vocab_size, EMBED_SIZE, weights=[embedding_matrix], trainable=False)(deep_inputs)
            LSTM_1          = Bidirectional(LSTM( units, dropout=dropout, return_sequences=False ))(embedding_layer)
            #gmp1d           = GlobalMaxPool1D()(LSTM_1)
            gmp1d           = Dense(units, activation='softplus')(LSTM_1)
            dense_layer     = Dense(1, activation='sigmoid')(gmp1d)
            model           = Model(inputs=deep_inputs, outputs=dense_layer)
            #model.summary()

            model.compile( loss='binary_crossentropy', optimizer=optimizer(lr=learning_rate), metrics=['accuracy'] )

            early_stop = tf.keras.callbacks.EarlyStopping(
                                                           monitor='val_loss',
                                                           patience=3,
                                                           restore_best_weights=False,
                                                           verbose=2,
                                                         )

            reduce_lr  = tf.keras.callbacks.ReduceLROnPlateau( 
                                                               monitor="val_loss",
                                                               patience=1,
                                                               factor=0.4,
                                                               min_lr=5e-5,
                                                               verbose=2,
                                                             )

            filepath   = wdir + time_stamp + '-epoch{epoch:02d}-val_accu_{val_accuracy:.2f}-val_loss_{val_loss:.2f}.hdf5'
            checkpoint = tf.keras.callbacks.ModelCheckpoint( 
                                                             filepath,
                                                             verbose=0,
                                                           )

            history = model.fit( X_train_cv,
                                 y_train_cv,
                                 batch_size=batch_size,
                                 epochs=epochs,
                                 verbose=2,
                                 validation_data=(X_val_cv, y_val_cv),
                                 callbacks=[ early_stop, reduce_lr, ]     # checkpoint
                                )

            score        = model.evaluate(X_val_cv, y_val_cv, verbose=1)
            test_results = f'\tFold {fold}\n\tVal Score: {score[0]}\n\tVal Accuracy: {score[1]}\n'                      
            predictions  = model.predict(X_val_cv)
            predictions  = np.round( predictions )
            clf_report   = classification_report(y_val_cv, predictions, digits=4)
            
            print( test_results, '\n', clf_report, '\n', '='*70 )
            f.write( test_results + '\n' + clf_report + '\n' + '='*70 + '\n' )
                        
            clf_rep_dict = classification_report( y_val_cv, predictions, output_dict=True)            
            cv_results.append(( clf_rep_dict['accuracy'], clf_rep_dict['macro avg']['f1-score'], clf_rep_dict['1']['f1-score'],
                                clf_rep_dict['1']['precision'], clf_rep_dict['1']['recall'], score[1], score[0] ))
            fold += 1
                
        # OUTSIDE CV CYCLE (TESTING ON MODEL FROM LAST CV FOLD)        
        micro_f1  = round( np.mean([i[0] for i in cv_results]), 4 )
        macro_f1  = round( np.mean([i[1] for i in cv_results]), 4 )
        cl1_f1    = round( np.mean([i[2] for i in cv_results]), 4 )
        cl1_prec  = round( np.mean([i[3] for i in cv_results]), 4 )
        cl1_rec   = round( np.mean([i[4] for i in cv_results]), 4 )
        val_accu  = round( np.mean([i[5] for i in cv_results]), 4 )
        val_score = round( np.mean([i[6] for i in cv_results]), 4 )
                
        avg_res  = f'\tMicro F1: {micro_f1}\n'
        avg_res += f'\tMacro F1: {macro_f1}\n'
        avg_res += f'\tClass 1 F1: {cl1_f1}\n'
        avg_res += f'\tClass 1 precision1: {cl1_prec}\n'
        avg_res += f'\tClass 1 recall: {cl1_rec}\n'
        avg_res += f'\tVal accuracy: {val_accu}\n'
        avg_res += f'\tVal score: {val_score}\n'
        print('\nAverage CV results:\n', avg_res)
        f.write('\nAverage CV results:\n' + avg_res)
                
        # TEST ON REGULAR TEST SET        
        score        = model.evaluate(X_test, y_test, verbose=1)
        test_results = f'Regular Test Score: {score[0]}\nRegular Test Accuracy: {score[1]}\n'
        predictions  = model.predict(X_test)
        predictions  = np.round( predictions )
        clf_report   = classification_report(y_test, predictions, digits=4)
        print('\nOne-time test on test set\n', test_results, clf_report)
        f.write('\nOne-time test on test set\n' + test_results + '\n' + clf_report + '\n')
        
        # TEST ON BLIND TEST SET        
        scoreb        = model.evaluate(X_testb, y_testb, verbose=1)
        test_resultsb = f'Blind Test Score: {scoreb[0]}\nBlind Test Accuracy: {scoreb[1]}\n'
        predictions   = model.predict(X_testb)
        predictions   = np.round( predictions )
        clf_reportb   = classification_report(y_testb, predictions, digits=4)
        print('\nOne-time test on blind set\n', test_resultsb, clf_reportb)
        f.write('\nOne-time test on blind set\n' + test_resultsb + '\n' + clf_reportb + '\n')
                
        record_end = '='*70
        print( record_end )
        f.write( record_end + '\n')
                
        # (blind set accu, test set accu, 7 avg cv scores, clf_repb, clf_rep) - 7 cv scores are avg from 5 cv folds
        all_results[ message ] = ( scoreb[1], score[1], micro_f1, macro_f1, cl1_f1, cl1_prec, cl1_rec,
                                   val_accu, val_score, clf_reportb, clf_report, )
                
        gc.collect()


Timestamp: 20211003T0706
Embeddings=spaCy, LR=0.0005, batch_size=32, dropout=0.6, units=150, optimizer=Rmsprop

CV validation results - fold 1
Epoch 1/21
236/236 - 3s - loss: 0.6117 - accuracy: 0.6695 - val_loss: 0.5556 - val_accuracy: 0.7172
Epoch 2/21
236/236 - 3s - loss: 0.5731 - accuracy: 0.7030 - val_loss: 0.5408 - val_accuracy: 0.7315
Epoch 3/21
236/236 - 3s - loss: 0.5622 - accuracy: 0.7095 - val_loss: 0.5294 - val_accuracy: 0.7452
Epoch 4/21
236/236 - 3s - loss: 0.5397 - accuracy: 0.7203 - val_loss: 0.5256 - val_accuracy: 0.7474
Epoch 5/21
236/236 - 3s - loss: 0.5343 - accuracy: 0.7303 - val_loss: 0.4999 - val_accuracy: 0.7542
Epoch 6/21

Epoch 00006: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
236/236 - 3s - loss: 0.5176 - accuracy: 0.7403 - val_loss: 0.5122 - val_accuracy: 0.7569
Epoch 7/21

Epoch 00007: ReduceLROnPlateau reducing learning rate to 8.000000379979611e-05.
236/236 - 3s - loss: 0.5007 - accuracy: 0.7494 - val_loss: 0.5006 - val_accuracy: 

In [None]:
# SAVE ALL_RESULTS
time_stamp = time.strftime("%Y%m%dT%H%M")
all_results_path = f'logs/{time_stamp}_all_results.pkl'

with open(all_results_path, 'wb') as f:
    pickle.dump( all_results, f, protocol=pickle.HIGHEST_PROTOCOL )

In [None]:
# CONVERT ALL_RESULTS INTO DF FOR EASY SORTING
cols = [ 'accu_blind', 'accu_test', 'micro_f1', 'macro_f1', 'cl1_f1', 'cl1_prec', 'cl1_rec',
         'val_accu', 'val_score', 'clf_reportb', 'clf_report' ]
df_res = pd.DataFrame.from_dict( all_results, orient='index', columns=cols )
df_res['params'] = df_res.index
df_res = df_res.reset_index(drop=True)
df_res = df_res[ ['params']+cols ]
df_res.head()

In [None]:
df_res.to_pickle(f'logs/{time_stamp}_df_res.pkl')
df_res.to_csv(f'logs/{time_stamp}_df_res.tsv', sep='\t', encoding='utf-8', index=True)

In [None]:
gc.collect()