In [None]:
import pandas as pd
import xml.etree.ElementTree as ET
import glob, os
import numpy as np
from comet_ml import Experiment, Optimizer


In [None]:
os.environ['TF_KERAS'] = '1'
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, \
            Input, concatenate, Add, Lambda
from tensorflow.keras import regularizers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import callbacks, optimizers
from tensorflow.keras import backend as K
from tensorflow.keras.utils import plot_model

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

# Read data

In [None]:
# Locations of the eRisk 2020 Task 1 training data: a directory holding one XML file per
# subject, and the gold-label file.
# NOTE(review): absolute, user-specific paths; adjust before running on another machine.
datadir_T1 = '/home/ana/eRisk/data/eRisk2020_T1_train/eRISK2020_T1_training_data/eRISK2020_training_data/data/'
labels_file_T1 = '/home/ana/eRisk/data/eRisk2020_T1_train/eRISK2020_T1_training_data/eRISK2020_training_data/golden_truth.txt'

In [None]:
# The label file is read as space-delimited with no header row: one "<subject> <label>" pair per line.
labels_T1 = pd.read_csv(labels_file_T1, delimiter=' ', names=['subject', 'label'])

In [None]:
# Histogram of the gold labels (one entry per row of the label file).
labels_T1.label.hist()

In [None]:
# Index by subject id so a label can be looked up with labels_T1.loc[subject, 'label'].
labels_T1 = labels_T1.set_index('subject')
labels_T1

## Read text

In [None]:
# Parse every subject file into a flat list of writings: one dict per <WRITING> element
# holding the subject id plus whichever of title / text / date the element contains.
writings = []
for subject_file in os.listdir(datadir_T1):
    print(subject_file)
    with open(os.path.join(datadir_T1, subject_file)) as sf:
        contents = sf.read()
    root = ET.fromstring(contents)
    id_node = root.find('ID')
    if id_node is None:
        # Without an ID the writings cannot be attributed to a subject. Skip the file
        # instead of reusing the previous file's subject (or failing with a NameError
        # when this is the first file).
        print('Cannot extract ID', contents[:500], '\n-------\n')
        continue
    subject = id_node.text
    for w in root.iter('WRITING'):
        subject_writings = {'subject': subject}
        # If a tag is repeated inside one WRITING, the last occurrence wins.
        for title in w.findall('TITLE'):
            subject_writings['title'] = title.text
        for text in w.findall('TEXT'):
            subject_writings['text'] = text.text
        for date in w.findall('DATE'):
            subject_writings['date'] = date.text
        writings.append(subject_writings)



In [None]:
# One row per writing; a column is missing (NaN/None) for writings where that element
# was absent or had no text.
writings_df = pd.DataFrame(writings)

In [None]:
# Writings that have a title but no body text. Both conditions go into a single mask:
# chaining two boolean selections would apply a full-length mask to an already filtered frame.
writings_df[writings_df['text'].isna() & ~writings_df['title'].isna()]

In [None]:
# Give every writing the label of its author. A subject id that is not in the label
# index makes the .loc lookup raise.
writings_df['label'] = writings_df['subject'].apply(
    lambda s: labels_T1.loc[s, 'label'])

In [None]:
# Label distribution at the writing level (each writing carries its author's label).
writings_df.label.hist()

In [None]:
# Word tokenizer: keeps runs of word characters (\w+) and drops everything else.
tokenizer = RegexpTokenizer(r'\w+')

def tokenize(t):
    """Lower-case the string `t` and return its list of word tokens."""
    return tokenizer.tokenize(t.lower())

In [None]:
# Quick check of the tokenizer: punctuation is dropped, so contractions and hyphenated
# words come out as separate tokens.
tokenize("I wasn't ready to leave! buh-buw(dd). Sasa .")

In [None]:
# Tokenize titles and bodies. Non-string (missing) values become None, and so do their lengths.
def _tokens_or_none(value):
    return tokenize(value) if type(value) == str else None

def _length_or_none(value):
    return len(value) if type(value) == list else None

for _field in ('title', 'text'):
    writings_df['tokenized_' + _field] = writings_df[_field].apply(_tokens_or_none)
    writings_df[_field + '_len'] = writings_df['tokenized_' + _field].apply(_length_or_none)

In [None]:
# Summary statistics of body length, in tokens.
writings_df.text_len.describe()

In [None]:
# Summary statistics of title length, in tokens.
writings_df.title_len.describe()

In [None]:
# Per-subject averages of the numeric columns. The frame also holds text and token-list
# columns, which cannot be averaged, so restrict the aggregation explicitly.
writings_df.groupby('subject').mean(numeric_only=True)

In [None]:
# Writings whose body is longer than 100 tokens.
writings_df[writings_df['text_len'].notnull() & (writings_df['text_len'] > 100)]


# Recurrent NN

## Extract features and encode data

In [None]:
# Feature-extraction settings shared by the data encoder, the embedding loader and the model.
hyperparams_features = {
    # size of the token-id space: passed as voc_size to load_erisk_data, where id 0 is
    # padding and id voc_size-1 stands for out-of-vocabulary words
    "max_features": 20000,
    # cut texts after this number of words
    # (among top max_features most common words)
    "maxlen": 100,
    # dimensionality of the word vectors; also selects which pretrained file is loaded
    "embedding_dim": 100
}


### Emotions

In [None]:
def load_NRC(nrc_path):
    """Load a word-level emotion lexicon into a dict of word sets.

    Each non-blank line of the file must hold exactly three whitespace-separated
    fields: ``word emotion label``, where ``label`` is an integer flag.

    Args:
        nrc_path: path to the lexicon text file.

    Returns:
        dict mapping each emotion name to the set of words whose label is non-zero.
        Every emotion seen in the file gets a key, even if its set ends up empty;
        keys are in order of first appearance in the file.

    Raises:
        ValueError: if a non-blank line does not have exactly three fields or its
            label is not an integer.
    """
    emotion_words = {}
    with open(nrc_path) as in_f:
        for line in in_f:
            line = line.strip()
            if not line:
                continue
            word, emotion, label = line.split()
            # Register the emotion even when this particular word is not associated with it.
            words_for_emotion = emotion_words.setdefault(emotion, set())
            if int(label):
                words_for_emotion.add(word)
    return emotion_words

# NOTE(review): absolute, user-specific path to the NRC word-level emotion lexicon.
nrc_lexicon_path = '/home/ana/resources/NRC-Sentiment-Emotion-Lexicons/NRC-Sentiment-Emotion-Lexicons/NRC-Emotion-Lexicon-v0.92/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt'
nrc_lexicon = load_NRC(nrc_lexicon_path)
# Emotion names in the order they first appear in the lexicon file; this order fixes the
# layout of the emotion feature vector.
emotions = list(nrc_lexicon.keys())


In [None]:
def encode_emotions(tokens, emotion_lexicon, emotions, relative=True):
    """Count, per emotion, how many tokens belong to that emotion's word set.

    Args:
        tokens: list of word tokens of one text.
        emotion_lexicon: dict mapping emotion name to a set of words.
        emotions: emotion names to encode; the output follows this order.
        relative: if True, return the share of tokens per emotion (count / number
            of tokens); otherwise return raw counts.

    Returns:
        list with one number per entry of `emotions`. An emotion missing from the
        lexicon is reported on stdout and left at 0. Empty or missing `tokens`
        yield all zeros instead of a division by zero.
    """
    encoded_emotions = [0 for _ in emotions]
    if not tokens:
        return encoded_emotions
    text_len = len(tokens)
    for i, emotion in enumerate(emotions):
        try:
            # A dict lookup raises KeyError (not ValueError) for an unknown emotion.
            lexicon_words = emotion_lexicon[emotion]
        except KeyError:
            print("Emotion not found.")
            continue
        hits = sum(1 for t in tokens if t in lexicon_words)
        encoded_emotions[i] = hits / text_len if relative else hits
    return encoded_emotions

#### Sentiment

### Style features

#### Char n-grams

In [None]:
def extract_ngrams(tokens):
    """Placeholder for character n-gram features; not implemented, always returns None."""
    pass

#### Personal pronouns

In [None]:
first_person_pronouns = {"i", "me", "my", "mine"}
def encode_pronouns(tokens, pronouns={"i", "me", "my", "mine"}, relative=True):
    """Measure how often the given pronouns occur among `tokens`.

    Args:
        tokens: list of word tokens of one text.
        pronouns: collection of pronouns to look for (first person singular by default).
        relative: if True, return the share of tokens that are pronouns; otherwise
            return the raw count.

    Returns:
        np.nan when `tokens` is empty or None, otherwise the share or count.
    """
    if not tokens:
        return np.nan
    hits = sum(1 for tok in tokens if tok in pronouns)
    return hits / len(tokens) if relative else hits

#### Stopwords

In [None]:
stopword_list = stopwords.words("english")
def encode_stopwords(tokens, stopwords=stopword_list):
    """Encode which stopwords occur in a text as a binary indicator vector.

    Args:
        tokens: list of word tokens of one text (may be empty or None).
        stopwords: ordered list of stopwords; defaults to `stopword_list`. Inside
            this function the name shadows the `nltk.corpus.stopwords` module.

    Returns:
        list of 0/1 flags, one per entry of `stopwords` and in the same order;
        1 means the stopword occurs at least once in `tokens`.
    """
    # Size the vector from the argument actually passed in, not from the global
    # default, so that a custom stopword list gets a vector of matching length.
    if not tokens:
        return [0 for _ in stopwords]
    # Set membership avoids rescanning the token list once per stopword.
    present = set(tokens)
    return [1 if stopword in present else 0 for stopword in stopwords]

### Topics

### Encode data

In [None]:
from collections import Counter
def load_erisk_data(writings_df, voc_size, emotion_lexicon, seq_len, emotions =
                    ['anger', 'anticipation', 'disgust', 'fear', 'joy',
                     'negative', 'positive', 'sadness', 'surprise', 'trust'],
                    pronouns = ["i", "me", "my", "mine"],
                    train_prop=0.7, min_post_len=3, min_word_len=1,
                    random_state=None):
    """Encode all writings and split them by subject into a training and a test set.

    Args:
        writings_df: frame with columns `subject`, `label`, `tokenized_title` and
            `tokenized_text` (token lists, or None where missing).
        voc_size: size of the token-id space. Id 0 is padding, ids 1..voc_size-2 go
            to the most frequent words and id voc_size-1 to out-of-vocabulary words.
        emotion_lexicon: dict mapping emotion name to a set of words.
        seq_len: length every token-id sequence is padded or truncated to.
        emotions: emotion names, in the order used for the emotion features.
        pronouns: pronouns counted for the pronoun feature.
        train_prop: share of subjects (in sorted order) assigned to training.
        min_post_len: writings with fewer tokens (title + body) are dropped.
        min_word_len: words shorter than this are left out of the vocabulary.
        random_state: optional seed for the row shuffling; None keeps it unseeded.

    Returns:
        ``(x_train, y_train), (x_test, y_test), vocabulary`` where each ``x`` is
        ``[padded_token_ids, emotion_shares + pronoun_share, stopword_flags]``
        as numpy arrays, each ``y`` is an array of labels and ``vocabulary`` maps
        word to token id.
    """
    print("Loading data...")
    # Vocabulary over titles and bodies, most frequent words first.
    word_freqs = Counter()
    for words in writings_df.tokenized_text:
        word_freqs.update(words)
    for words in writings_df.tokenized_title:
        word_freqs.update(words)
    vocabulary = {}
    i = 1
    for w, f in word_freqs.most_common(voc_size-2): # keeping voc_size-1 for unk
        if len(w) < min_word_len:
            continue
        vocabulary[w] = i
        i += 1

    # Split by subject so that no author contributes to both sets.
    # TODO: shuffle?
    all_subjects = sorted(list(set(writings_df.subject)))
    training_subjects_size = int(len(all_subjects) * train_prop)
    training_subjects = all_subjects[:training_subjects_size]
    in_training = writings_df['subject'].isin(training_subjects)
    training_rows = writings_df[in_training].sample(frac=1, random_state=random_state) # shuffling
    test_rows = writings_df[~in_training].sample(frac=1, random_state=random_state)
    # NOTE(review): only the test set drops writings without body text; title-only
    # writings stay in the training set. Kept as in the original; confirm it is intended.
    test_rows = test_rows[~test_rows['tokenized_text'].isna()]

    def encode_text(tokens):
        # Using voc_size-1 value for OOV token
        encoded_tokens = [vocabulary.get(w, voc_size-1) for w in tokens]
        encoded_emotions = encode_emotions(tokens, emotion_lexicon, emotions)
        encoded_pronouns = encode_pronouns(tokens, pronouns)
        encoded_stopwords = encode_stopwords(tokens)
        return (encoded_tokens, encoded_emotions, encoded_pronouns, encoded_stopwords)

    def encode_rows(rows):
        # Encode each writing (title followed by body), skipping those that are too short.
        tokens_data, categ_data, sparse_data, labels = [], [], [], []
        for row in rows.itertuples():
            words = []
            if row.tokenized_title:
                words.extend(row.tokenized_title)
            if row.tokenized_text:
                words.extend(row.tokenized_text)
            if not words or len(words) < min_post_len:
                continue
            encoded_tokens, encoded_emotions, encoded_pronouns, encoded_stopwords = encode_text(words)
            tokens_data.append(encoded_tokens)
            categ_data.append(encoded_emotions + [encoded_pronouns])
            sparse_data.append(encoded_stopwords)
            labels.append(row.label)
        return tokens_data, categ_data, sparse_data, labels

    tokens_data_train, categ_data_train, sparse_data_train, labels_train = encode_rows(training_rows)
    tokens_data_test, categ_data_test, sparse_data_test, labels_test = encode_rows(test_rows)

    # using zeros for padding
    tokens_data_train_padded = sequence.pad_sequences(tokens_data_train, maxlen=seq_len)
    tokens_data_test_padded = sequence.pad_sequences(tokens_data_test, maxlen=seq_len)

    return ([np.array(tokens_data_train_padded), np.array(categ_data_train), np.array(sparse_data_train)],
            np.array(labels_train)), \
            ([np.array(tokens_data_test_padded), np.array(categ_data_test), np.array(sparse_data_test)],
             np.array(labels_test)), vocabulary

In [None]:
# Encode every writing and split by subject into train and test, using the sequence length
# and vocabulary size from hyperparams_features and the emotions found in the NRC lexicon.
(x_train, y_train), (x_test, y_test), voc = load_erisk_data(writings_df, 
                                                            seq_len=hyperparams_features['maxlen'],
                                                            voc_size=hyperparams_features['max_features'],
                                                           emotion_lexicon=nrc_lexicon,
                                                           emotions=emotions)

In [None]:
# Unpack the three parallel inputs: padded token ids, numeric features (emotion shares plus
# pronoun share) and stopword indicator vectors.
x_train_seq, x_train_categ, x_train_sparse = x_train
x_test_seq, x_test_categ, x_test_sparse = x_test
print(len(x_train_seq), 'train sequences')
print(len(x_test_seq), 'test sequences')

In [None]:
# Summing the labels counts the positive examples (assumes labels are 0/1 — TODO confirm).
print(pd.Series(y_train).sum(), "positive training examples")
print(pd.Series(y_test).sum(), "positive test examples")

In [None]:
from sklearn.utils import class_weight

# Inverse-frequency ("balanced") weights, one per class in np.unique(y_train) order.
# `classes` and `y` are passed by keyword: current scikit-learn releases declare them
# keyword-only, and the keyword form is accepted by older releases as well.
class_weights = class_weight.compute_class_weight('balanced',
                                                 classes=np.unique(y_train),
                                                 y=y_train)
class_weights

In [None]:
def load_embeddings(path, embedding_dim, voc):
    """Build an embedding matrix for `voc` from a text file of pretrained vectors.

    The file is expected to hold one vector per line: the word followed by its
    whitespace-separated coefficients.

    Args:
        path: path to the pretrained vectors, read as UTF-8.
        embedding_dim: number of coefficients per word.
        voc: dict mapping word to row index (indices start at 1).

    Returns:
        array of shape (len(voc) + 2, embedding_dim). Rows of words found in the
        file hold the pretrained vector; all other rows (including row 0 and the
        last row, reserved for padding and unknown words) keep a random
        initialisation drawn uniformly from [-0.5, 0.5).

    Raises:
        ValueError: if a line for a vocabulary word does not parse into
            `embedding_dim` floats.
    """
    # random matrix with mean value = 0
    embedding_matrix = np.random.random((len(voc)+2, embedding_dim)) - 0.5 # voc + unk + pad value(0)

    # The context manager closes the file even if a line fails to parse.
    with open(path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                continue  # blank line
            word_i = voc.get(values[0])
            # Parse the coefficients only for vocabulary words: the pretrained file is
            # typically far larger than the vocabulary.
            if word_i is not None:
                embedding_matrix[word_i] = np.asarray(values[1:], dtype='float32')

    print('Total %s word vectors.' % len(embedding_matrix))

    return embedding_matrix

# Pretrained vectors; the file name is chosen by the configured embedding dimension.
# NOTE(review): absolute, user-specific paths.
# pretrained_embeddings_path = '/home/ana/resources/glove.6B/glove.6B.%dd.txt' % hyperparams_features['embedding_dim']
pretrained_embeddings_path = '/home/ana/resources/glove.twitter.27B/glove.twitter.27B.%dd.txt' % hyperparams_features['embedding_dim']
embedding_matrix = load_embeddings(pretrained_embeddings_path, hyperparams_features['embedding_dim'], voc)


In [None]:
# Inspect the stopword indicator vector of one training example (row 90).
x_train_sparse[90]

## Define model

In [None]:
# Model and training settings.
# NOTE(review): 'dropout' is not read by build_model, and 'lr' / 'decay' are only used
# below when 'optimizer' is empty; with a string such as 'adagrad' the optimizer is
# passed to compile() by name.
hyperparams = {
    'content_dense_units': 32,
    'stopwords_dense_units': 5,
    'dropout': 0.14,
    'l2_dense': 0.0001,
    'optimizer': 'adagrad', #None,
    'decay': 0.0001,
    'lr': 0.00001,
    "batch_size": 128,
    "trainable_embeddings": False,

}
# Fall back to an explicitly configured Adam when no optimizer name is given.
if not hyperparams['optimizer']:
    hyperparams['optimizer'] = optimizers.Adam(lr=hyperparams['lr'], beta_1=0.9, beta_2=0.999, epsilon=1e-08,
                                   decay=hyperparams['decay'])

In [None]:
def recall_m(y_true, y_pred):
    """Recall over the given tensors: true positives / (actual positives + epsilon).

    Values are clipped to [0, 1] and rounded before counting.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision over the given tensors: true positives / (predicted positives + epsilon).

    Values are clipped to [0, 1] and rounded before counting.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(predicted) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of precision_m and recall_m, with epsilon in the denominator."""
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    return 2*((p*r)/(p+r+K.epsilon()))

In [None]:
def build_model(hyperparams, hyperparams_features, embedding_matrix, emotions, stopwords_list,
               ignore_layer=[]):
    """Build and compile the three-input binary classifier.

    Inputs: a padded token-id sequence ('word_seq'), numeric features
    ('numeric_input', one per emotion plus one for pronouns) and stopword
    indicators ('sparse_input'). Each input feeds its own dense layer; the branch
    outputs are concatenated and passed to a single sigmoid unit.

    Args:
        hyperparams: dict read for 'content_dense_units', 'stopwords_dense_units',
            'l2_dense', 'trainable_embeddings' and 'optimizer'.
        hyperparams_features: dict read for 'maxlen', 'max_features', 'embedding_dim'.
        embedding_matrix: initial weights for the embedding layer.
        emotions: emotion names; only their number is used.
        stopwords_list: stopwords; only their number is used.
        ignore_layer: names of branches to leave out (ablation): 'lstm_layers',
            'numerical_dense_layer', 'sparse_feat_dense_layer'.

    Returns:
        the compiled Keras model (binary cross-entropy; accuracy, F1, precision, recall).
    """
    seq_len = hyperparams_features['maxlen']

    # Word branch: embeddings averaged over the sequence axis, then a dense layer.
    # NOTE(review): the Lambda averages over all positions; whether the zero mask set on
    # the embedding layer is honoured there should be confirmed.
    tokens_features = Input(shape=(seq_len,), name='word_seq')
    embedded_tokens = Embedding(hyperparams_features['max_features'],
                                hyperparams_features['embedding_dim'],
                                input_length=seq_len,
                                mask_zero=True,
                                weights=[embedding_matrix],
                                trainable=hyperparams['trainable_embeddings'],
                                name='embeddings_layer')(tokens_features)
    averaged_embeddings = Lambda(lambda x: K.mean(x, axis=1), name='averaged_embeddings')(embedded_tokens)
    content_dense_layer = Dense(units=hyperparams['content_dense_units'],
                                kernel_regularizer=regularizers.l2(hyperparams['l2_dense']),
                                name='content_dense_layer')(averaged_embeddings)

    # Numeric branch: emotions and pronouns squeezed into one unit.
    numerical_features = Input(shape=(len(emotions) + 1,), name='numeric_input')
    dense_layer = Dense(units=1,
                        kernel_regularizer=regularizers.l2(hyperparams['l2_dense']),
                        name='numerical_dense_layer')(numerical_features)

    # Sparse branch: stopword indicators.
    sparse_features = Input(shape=(len(stopwords_list),), name='sparse_input')
    dense_layer_sparse = Dense(units=hyperparams['stopwords_dense_units'],
                               name='sparse_feat_dense_layer',
                               kernel_regularizer=regularizers.l2(hyperparams['l2_dense']))(sparse_features)

    # Ablation: pick what feeds the output unit.
    if 'lstm_layers' in ignore_layer:
        # Without the word branch the raw numeric input goes straight to the output.
        classifier_input = numerical_features
    else:
        branches = [content_dense_layer]
        if 'numerical_dense_layer' not in ignore_layer:
            branches.append(dense_layer)
        if 'sparse_feat_dense_layer' not in ignore_layer:
            branches.append(dense_layer_sparse)
        classifier_input = branches[0] if len(branches) == 1 else concatenate(branches)
    output_layer = Dense(1, activation='sigmoid')(classifier_input)

    # Compile model
    model = Model(inputs=[tokens_features, numerical_features, sparse_features], outputs=output_layer)
    model.compile(hyperparams['optimizer'], 'binary_crossentropy',
                  metrics=['binary_accuracy', f1_m, precision_m, recall_m])

    return model


In [None]:
# Full model with all three branches (no ablation).
model = build_model(hyperparams, hyperparams_features, embedding_matrix, emotions, stopword_list,
                   ignore_layer=[])
model.summary()

In [None]:
# Write the architecture diagram to disk (presumably the models/ directory must exist — confirm).
plot_model(model, 'models/mlp_plus2.png')

In [None]:
# The Comet API key is a credential and must not be stored in the notebook: take it from
# the COMET_API_KEY environment variable. If the variable is unset, None is passed and
# comet_ml falls back to its own configuration lookup.
experiment = Experiment(api_key=os.environ.get("COMET_API_KEY"),
                        project_name="mental", workspace="ananana")

# Record everything needed to reproduce the run.
experiment.log_parameters(hyperparams_features)

experiment.log_parameter('emotion_lexicon', nrc_lexicon_path)
experiment.log_parameter('emotions', emotions)
experiment.log_parameter('embeddings_path', pretrained_embeddings_path)

experiment.log_parameters(hyperparams)

In [None]:
# Preview the first vocabulary entries (word -> token id, most frequent first) rather
# than printing the whole dictionary into the notebook output.
list(voc.items())[:20]

## Train

In [None]:
class WeightsHistory(callbacks.Callback):
    """Log a histogram of selected layers' kernel weights to Comet after every epoch.

    Relies on the module-level `experiment`. Layers are looked up on the model being
    trained (`self.model`), not on whichever model the global name `model` points to.
    """
    # (layer name, histogram name) pairs, in logging order.
    _TRACKED_LAYERS = (
        ('content_dense_layer', 'content_dense_weights'),
        ('embeddings_layer', 'embedding_weights'),
        ('numerical_dense_layer', 'numerical_dense_weights'),
        ('sparse_feat_dense_layer', 'sparse_dense_weights'),
    )

    def on_train_begin(self, logs=None):
        pass

    def on_epoch_end(self, epoch, logs=None):
        for layer_name, histogram_name in self._TRACKED_LAYERS:
            # get_weights()[0] is the first weight array of the layer.
            experiment.log_histogram_3d(self.model.get_layer(layer_name).get_weights()[0],
                                        name=histogram_name,
                                        step=epoch)
        
class FreezeLayer(callbacks.Callback):
    """Set one layer's `trainable` flag when a given epoch begins.

    Args:
        logs: unused; kept for interface compatibility.
        patience: epoch index at which the flag is changed.
        layer: name of the layer to change.
        verbose: if truthy, print a message when the flag is changed.
        set_to: new value of `trainable` (False freezes, True unfreezes).

    NOTE(review): Keras may require the model to be recompiled before a change of
    `trainable` affects training; confirm this has the intended effect mid-fit.
    """
    def __init__(self, logs=None, patience=5, layer='embeddings_layer', verbose=1, set_to=False):
        super(FreezeLayer, self).__init__()
        self.freeze_epoch = patience
        self.freeze_layer = layer
        self.verbose = verbose
        self.set_to = set_to

    def on_epoch_begin(self, epoch, logs=None):
        if epoch != self.freeze_epoch:
            return
        # Operate on the model being trained rather than on the global `model`.
        layer = self.model.get_layer(self.freeze_layer)
        old_value = layer.trainable
        layer.trainable = self.set_to
        if self.verbose:
            print("Setting %s layer from %s to trainable=%s..." % (layer.name, old_value,
                                                                   layer.trainable))
        
# Callback instances referenced by train_model.
weights_history = WeightsHistory()

# At the start of epoch index 6, set the embeddings layer's `trainable` flag to True.
freeze_layer = FreezeLayer(patience=6, set_to=True)

# Multiply the learning rate by 0.05 after 4 epochs without val_loss improvement,
# never going below 1e-4.
reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.05,
                              patience=4, min_lr=0.0001, verbose=1)

In [None]:
def train_model(model, 
                x_train, y_train, x_test, y_test, 
                batch_size, epochs, class_weight, start_epoch=0, workers=4,
                model_path='/tmp/model'):
    """Fit `model`, checkpoint the best epoch, save the final model and log to Comet.

    Uses the module-level `experiment` and the shared callbacks `weights_history`,
    `reduce_lr` and `freeze_layer`.

    Args:
        model: compiled Keras model.
        x_train, y_train: training inputs and labels.
        x_test, y_test: data used for validation after every epoch.
        batch_size: mini-batch size.
        epochs: index of the last epoch to run.
        class_weight: dict mapping class label to loss weight.
        start_epoch: epoch index to start from (for resuming).
        workers: forwarded to `model.fit`.
        model_path: where the final model is saved; the best checkpoint goes to
            `<model_path>_best`.

    Returns:
        the History object returned by `model.fit`.
    """
    print('Train...')
    experiment.log_parameter('class_weight', class_weight.values())

    history = model.fit(x_train, y_train,
                        batch_size=batch_size,
                        epochs=epochs, initial_epoch=start_epoch,
                        class_weight=class_weight,
                        # Must be a tuple: Keras documents (x_val, y_val), and a list
                        # can be taken as a list of inputs without targets.
                        validation_data=(x_test, y_test),
                        workers=workers,
                        callbacks=[
                            callbacks.ModelCheckpoint(filepath='%s_best' % model_path, verbose=1, save_best_only=True),
                            callbacks.EarlyStopping(patience=15), weights_history, reduce_lr, freeze_layer,
                        ])
    model.save(model_path)
    experiment.log_parameter('model_path', model_path)
    return history

In [None]:
%%time
# Train for up to 30 epochs. The class weights are hard-coded here; the `class_weights`
# array computed with scikit-learn earlier is not passed in.
history = train_model(model, x_train, y_train, x_test, y_test,
           epochs=30, batch_size=hyperparams['batch_size'],
                      class_weight={0:0.5, 1:12}, 
                      model_path='models/mlp_plus3', workers=3)

In [None]:
# Raw weight arrays of the trained model.
model.get_weights()

In [None]:
# Number of lexicon words per emotion.
[(e,len(nrc_lexicon[e])) for e in nrc_lexicon]

In [None]:
# Write the architecture diagram of the trained model to disk.
plot_model(model, to_file='models/mlp_plus3.png')

In [None]:
# Custom metric functions, keyed by name, to be passed as `custom_objects` when a saved
# model is reloaded (see the commented-out call below).
dependencies = {
    'f1_m': f1_m,
    'precision_m': precision_m,
    'recall_m': recall_m
}
# model = load_model('models/lstm_plus1', custom_objects=dependencies)

In [None]:
# Loss and compiled metrics on the test set.
model.evaluate(x_test, y_test)

In [None]:
# Sigmoid outputs for every test example.
predictions = model.predict(x_test)

In [None]:
# Distribution of the predicted scores.
pd.Series(predictions.flatten()).hist()

In [None]:
# Number of test examples scored above 0.5.
sum(predictions>0.5)

In [None]:
# Number of test examples scored below 0.5 (a score of exactly 0.5 is counted in neither cell).
sum(predictions<0.5)

## Extra analysis


In [None]:
def merge_tokens(row):
    """Return a new list with the row's body tokens followed by its title tokens.

    Missing (None) or empty token lists are skipped.
    """
    merged = []
    for part in (row.tokenized_text, row.tokenized_title):
        if part:
            merged += part
    return merged
# Body and title tokens of every writing in one list.
writings_df['all_tokens'] = writings_df.apply(merge_tokens, axis=1)

In [None]:
def extract_emotions(tokens, emotion, relative=True, lexicon=None):
    """Measure how many tokens belong to one emotion's word set.

    Args:
        tokens: list of word tokens of one text.
        emotion: emotion name to look up in the lexicon.
        relative: if True, return the share of tokens (count / number of tokens);
            otherwise return the raw count.
        lexicon: dict mapping emotion name to a set of words; defaults to the
            module-level `nrc_lexicon`.

    Returns:
        None when `tokens` is empty or None, otherwise the share or count.

    Raises:
        KeyError: if `emotion` is not in the lexicon.
    """
    if not tokens:
        return None
    if lexicon is None:
        lexicon = nrc_lexicon
    lexicon_words = lexicon[emotion]
    hits = sum(1 for t in tokens if t in lexicon_words)
    return hits / len(tokens) if relative else hits

from functools import partial
# One column per emotion: the share of a writing's tokens (title and body) found in that
# emotion's word list; None for writings without tokens.
for emotion in emotions:
    writings_df[emotion] = writings_df['all_tokens'].apply(partial(extract_emotions, emotion=emotion, 
                                                                   relative=True))


In [None]:
# Share of first-person pronouns among each writing's tokens (NaN for writings without tokens).
writings_df['pronouns'] = writings_df['all_tokens'].apply(partial(encode_pronouns, relative=True))

In [None]:
# Pairwise correlation between the label and the numeric features. The raw `text` column
# is left out: it holds strings and cannot take part in a correlation.
writings_df[['label', 'pronouns', 'text_len'] + emotions].corr()

In [None]:
# Feature averages per label. The raw `text` column is left out: strings cannot be averaged.
writings_df[['label', 'pronouns', 'text_len'] + emotions].groupby('label').mean()

In [None]:
from nltk.sentiment import SentimentAnalyzer, SentimentIntensityAnalyzer

In [None]:
# NLTK sentiment scorer (presumably needs the VADER lexicon resource to be installed — confirm).
sid = SentimentIntensityAnalyzer()


In [None]:
# Quick check of the scorer on a sample sentence.
sid.polarity_scores("We are here today happiness is all around")

In [None]:
# 'neg' polarity score of each writing's raw body text; 0 where the text is missing.
writings_df['neg_vader'] = writings_df.text.apply(lambda t: sid.polarity_scores(t)['neg']
                                                 if type(t)==str else 0)

In [None]:
# Inspect the frame with the columns added so far.
writings_df

In [None]:
# 'pos' polarity score of each writing's raw body text; 0 where the text is missing.
writings_df['pos_vader'] = writings_df.text.apply(lambda t: sid.polarity_scores(t)['pos']
                                                 if type(t)==str else 0)

In [None]:
# Feature averages per label, now including the sentiment scores. The raw `text` column
# is left out: strings cannot be averaged.
writings_df[['label', 'pronouns', 'text_len', 'neg_vader', 'pos_vader'] + emotions].groupby('label').mean()

In [None]:
# Spearman rank correlation between the label and the numeric features. The raw `text`
# column is left out: it holds strings and cannot take part in a correlation.
writings_df[['label', 'pronouns', 'text_len', 'neg_vader', 'pos_vader'] + emotions].corr('spearman')

## Hyperparameter tuning

In [None]:
# Declare your hyperparameters search:
config = {
      "algorithm": "random",
      "parameters": {
          "lstm_units": {"type": "integer", "min": 10, "max": 100},
          "lr": {"type": "float", "min": 0.00001, "max": 0.05, "scalingType": "loguniform"},
          "l2_dense": {"type": "float", "min": 0.00001, "max": 0.5, "scalingType": "loguniform"},
          "dropout": {"type": "float", "min": 0, "max": 0.7, "scalingType": "uniform"},
          "optimizer": {"type": "categorical", "values": ["adam", "adagrad", ""]},
          "batch_size": {"type": "integer", "min": 10, "max": 512, "scalingType": "loguniform"},
          "positive_class_weight": {"type": "integer", "min": 1, "max": 25},
          "trainable_embeddings": {"type": "discrete", "values": [True, False]},
          "decay": {"type": "float", "min": 0.00000001, "max": 0.5, "scalingType": "loguniform"},          
      },
      "spec": {
          "metric": "loss",
          "objective": "minimize",
      },
  }
optimizer = Optimizer(config, api_key="eoBdVyznAhfg3bK9pZ58ZSXfv")

for experiment in optimizer.get_experiments(project_name="mental"):
    experiment.add_tag("tune")
    
    # Test the model
    hyperparams_config = {
        "lstm_units": experiment.get_parameter('lstm_units'),
        "l2_dense": experiment.get_parameter('l2_dense'),
        "dropout": experiment.get_parameter('dropout'),
        "optimizer": experiment.get_parameter('optimizer'),
        "trainable_embeddings": experiment.get_parameter('trainable_embeddings'),
        "decay": experiment.get_parameter('decay'),
        "lr": experiment.get_parameter('lr'),
        }
    if not hyperparams_config['optimizer']:
        hyperparams_config['optimizer'] = optimizers.Adam(lr=hyperparams_config['lr'], 
                                   decay=hyperparams_config['decay'])
    model = build_model(hyperparams=hyperparams_config,
                        hyperparams_features=hyperparams_features, 
                        embedding_matrix=embedding_matrix, emotions=emotions)
    history = train_model(model, 
            x_train, y_train, x_test, y_test,
            epochs=5, batch_size=experiment.get_parameter('batch_size'),
                      class_weight={0:1, 1:experiment.get_parameter('positive_class_weight')}, 
                          workers=4,
                      model_path='models/experiment')
    loss = history.history['loss'][-1]
    
    # Report the loss, if not auto-logged:
    experiment.log_metric("loss", loss)