In [None]:
import pandas as pd
import xml.etree.ElementTree as ET
import glob, os
import numpy as np
from comet_ml import Experiment, Optimizer
import pickle
import logging
import sys
from sklearn.utils import class_weight

In [2]:
os.environ['TF_KERAS'] = '1'
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Lambda, BatchNormalization, TimeDistributed, \
    CuDNNLSTM, Bidirectional, Input, concatenate, Flatten, RepeatVector, Activation, Multiply, Permute
from tensorflow.keras import regularizers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import callbacks, optimizers
from tensorflow.keras import backend as K
from tensorflow.keras.utils import plot_model, Sequence

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

import tensorflow as tf

In [3]:
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
# only reserve 1 GPU

In [4]:
# tf.enable_eager_execution()
my_seed = 1234
tf.set_random_seed(my_seed)

In [5]:
logger = logging.getLogger('training')
logger.addHandler(logging.StreamHandler(sys.stdout))
logger.setLevel(logging.DEBUG)

In [6]:
import sys
sys.setrecursionlimit(10000)

In [7]:
tf.test.is_built_with_cuda()

True

# Read data

In [8]:
def read_subject_writings(subject_file):
    writings = []
    with open(subject_file) as sf:
        contents = sf.read()
        root = ET.fromstring(contents)
        try:
            subject = root.findall('ID')[0].text.strip()
        except Exception:
            print('Cannot extract ID', contents[:500], '\n-------\n')        
        for w in root.iter('WRITING'):
            subject_writings = {'subject': subject}
            for title in w.findall('TITLE'):
                subject_writings['title'] = title.text
            for text in w.findall('TEXT'):
                subject_writings['text'] = text.text
            for date in w.findall('DATE'):
                subject_writings['date'] = date.text
            writings.append(subject_writings)
    return writings

In [9]:
root_dir = '/home/anasab/' 
# root_dir = '/home/ana/'

### eRisk 2020 T1

In [10]:
datadir_T1 = root_dir + '/eRisk/data/eRisk2020_T1_train/eRISK2020_T1_training_data/eRISK2020_training_data/data/'
labels_file_T1 = root_dir + '/eRisk/data//eRisk2020_T1_train/eRISK2020_T1_training_data/eRISK2020_training_data/golden_truth.txt'

In [11]:
def read_texts_2020(datadir_T1, labels_file_T1):
    writings = []
    for subject_file in os.listdir(datadir_T1):
        print(subject_file)
        writings.extend(read_subject_writings(os.path.join(datadir_T1, subject_file)))
    writings_df = pd.DataFrame(writings)

    labels_T1 = pd.read_csv(labels_file_T1, delimiter=' ', names=['subject', 'label'])
    labels_T1 = labels_T1.set_index('subject')

    writings_df['label'] = writings_df['subject'].apply(
    lambda s: labels_T1.loc[s, 'label'])
    
    return writings_df



### eRisk 2019 T1 (Anorexia)

In [12]:
datadirs_T1_2019 = {
    'train': ['2018 test/', '2018 train/positive_examples/', '2018 train/negative_examples/'],
    'test': ['data/']
}
datadir_root_T1_2019 = {
    'train': root_dir + '/eRisk/data/past/eRisk2019_T1/training data - t1/',
    'test': root_dir + '/eRisk/data/past/eRisk2019_T1/test data - T1/'
}
    
labels_files_T1_2019 = {
    'train': ['2018 train/risk_golden_truth.txt', '2018 test/risk-golden-truth-test.txt'],
    'test': ['T1_erisk_golden_truth.txt']
}

In [13]:
def read_texts_2019(datadir_root_T1_2019,
                   datadirs_T1_2019,
                   labels_files_T1_2019,
                   test_suffix='0000'):
    writings = {'train': [], 'test': []}
    writings_df = pd.DataFrame()
    labels_df = pd.DataFrame()

    for subset in ('train', 'test'):
        for subdir in [os.path.join(datadir_root_T1_2019[subset], subp) for subp in datadirs_T1_2019[subset]]:
            if subset=='train':
                chunkdirs = [os.path.join(datadir_root_T1_2019[subset], subdir, chunkdir) 
                             for chunkdir in os.listdir(subdir)]
            else:
                chunkdirs = [os.path.join(datadir_root_T1_2019[subset], subdir)]
                
            for chunkdir in chunkdirs:
                if not os.path.isdir(chunkdir):
                    continue
                for subject_file in os.listdir(chunkdir):
                    writings[subset].extend(read_subject_writings(os.path.join(chunkdir, subject_file)))
        writings_df_part = pd.DataFrame(writings[subset])
        # add a suffix for users in the test -- the numbers are duplicated with the ones in train
        if subset=='test':
            writings_df_part['subject'] = writings_df_part['subject'].apply(lambda s: s+test_suffix)
            print(subset, writings_df_part.subject)
        writings_df_part['subset'] = subset
        writings_df = pd.concat([writings_df, writings_df_part])
        writings_df.reindex()

        for label_file in labels_files_T1_2019[subset]:
            labels = pd.read_csv(os.path.join(datadir_root_T1_2019[subset], label_file), 
                                 delimiter='\s+', names=['subject', 'label'])
            # add a suffix for users in the test -- the numbers are duplicated with the ones in train
            if subset=='test':
                labels['subject'] = labels['subject'].apply(lambda s: s+test_suffix)
            labels_df = pd.concat([labels_df, labels])
    labels_df = labels_df.drop_duplicates()
    labels_df = labels_df.set_index('subject')

    writings_df = writings_df.drop_duplicates()
    
    writings_df = writings_df.join(labels_df, on='subject')
    
    return writings_df

## Preprocess text

In [14]:
# writings_df = read_texts_2020(datadir_T1, labels_file_T1)
# writings_df = read_texts_2019(datadir_root_T1_2019,
#                    datadirs_T1_2019,
#                    labels_files_T1_2019)
writings_df = pickle.load(open('writings_df_selfharm_liwc', 'rb'))

In [15]:
writings_df.label.hist()

<matplotlib.axes._subplots.AxesSubplot at 0x7fb114339908>

In [16]:
writings_df.head()

Unnamed: 0,subject,title,date,text,label,tokenized_title,title_len,tokenized_text,text_len,all_tokens,...,feel,excl,future,nonfl,ppron,shehe,i,we,you,they
0,subject8292,If anyone could help with which sub to put thi...,2016-08-02 09:22:12,,0,"[if, anyone, could, help, with, which, sub, to...",11.0,,,"[if, anyone, could, help, with, which, sub, to...",...,0.0,0.090909,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,subject8292,I'm literally never gonna stop waiting...,2016-08-05 09:35:55,,0,"[i, m, literally, never, gonna, stop, waiting]",7.0,,,"[i, m, literally, never, gonna, stop, waiting]",...,0.0,0.0,0.285714,0.0,0.142857,0.0,0.142857,0.0,0.0,0.0
2,subject8292,This is a really interesting study! Makes sens...,2016-08-05 21:36:24,,0,"[this, is, a, really, interesting, study, make...",9.0,,,"[this, is, a, really, interesting, study, make...",...,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,subject8292,The only thing Frank is building ...,2016-08-07 23:35:23,"... Is hype. Think about it, every time he wor...",0,"[the, only, thing, frank, is, building]",6.0,"[is, hype, think, about, it, every, time, he, ...",26.0,"[is, hype, think, about, it, every, time, he, ...",...,0.0,0.0,0.0625,0.0,0.0625,0.03125,0.0,0.03125,0.0,0.0
4,subject8292,Mostly always me during this whole charade,2016-08-09 08:39:41,,0,"[mostly, always, me, during, this, whole, char...",7.0,,,"[mostly, always, me, during, this, whole, char...",...,0.0,0.0,0.0,0.0,0.142857,0.0,0.142857,0.0,0.0,0.0


In [17]:
tokenizer = RegexpTokenizer(r'\w+')

def tokenize(t):
    return tokenizer.tokenize(t.lower())

In [18]:
def tokenize_fields(writings_df):
    writings_df['tokenized_title'] = writings_df['title'].apply(lambda t: tokenize(t) 
                                                                if type(t)==str and t else None)
    writings_df['title_len'] = writings_df['tokenized_title'].apply(lambda t: len(t) 
                                                                    if type(t)==list and t else None)
    writings_df['tokenized_text'] = writings_df['text'].apply(lambda t: tokenize(t) 
                                                              if type(t)==str and t else None)
    writings_df['text_len'] = writings_df['tokenized_text'].apply(lambda t: len(t) 
                                                                  if type(t)==list and t else None)
    return writings_df

In [19]:
writings_df.text_len.describe()

count    127941.000000
mean         32.268929
std          82.590713
min           0.000000
25%           6.000000
50%          13.000000
75%          31.000000
max        7201.000000
Name: text_len, dtype: float64

In [20]:
writings_df.title_len.describe()

count    49762.000000
mean        10.699771
std          9.282454
min          0.000000
25%          4.000000
50%          8.000000
75%         14.000000
max        149.000000
Name: title_len, dtype: float64

In [21]:
writings_df.groupby('subject').mean().describe()

Unnamed: 0,label,title_len,text_len,funct,article,affect,negemo,sad,cogmech,inhib,...,feel,excl,future,nonfl,ppron,shehe,i,we,you,they
count,340.0,336.0,340.0,340.0,340.0,340.0,340.0,340.0,340.0,340.0,...,340.0,340.0,340.0,340.0,340.0,340.0,340.0,340.0,340.0,340.0
mean,0.120588,9.514427,33.122855,0.425434,0.049284,0.08099,0.023242,0.003515,0.125608,0.00473,...,0.005483,0.02251,0.089113,0.00274,0.07982,0.008573,0.040263,0.004642,0.020469,0.005873
std,0.326128,4.714271,31.874155,0.085804,0.013413,0.032889,0.011015,0.002891,0.031706,0.002874,...,0.004131,0.00974,0.028436,0.004729,0.027475,0.0071,0.020218,0.004367,0.01231,0.00404
min,0.0,1.0,1.0,0.018182,0.005237,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,6.60119,14.402079,0.379003,0.042321,0.06309,0.01611,0.001949,0.109709,0.002829,...,0.003234,0.016795,0.073775,0.000669,0.064952,0.004052,0.027496,0.002302,0.013474,0.003566
50%,0.0,8.660264,24.212121,0.439643,0.049415,0.074207,0.022189,0.002867,0.127451,0.004594,...,0.004879,0.022203,0.088307,0.001581,0.077149,0.006945,0.03742,0.003735,0.018725,0.005123
75%,0.0,11.526931,37.878342,0.47855,0.057188,0.090063,0.028348,0.004319,0.14571,0.006224,...,0.006712,0.0273,0.105534,0.002902,0.093965,0.01143,0.04984,0.005544,0.025045,0.007488
max,1.0,32.166667,266.446446,0.646948,0.095561,0.27052,0.073699,0.020833,0.251136,0.02218,...,0.042094,0.090475,0.202499,0.045799,0.213871,0.069447,0.133143,0.037712,0.099026,0.041093


In [22]:
writings_df.groupby('subject').max().groupby('label').count()

Unnamed: 0_level_0,date,title_len,text_len,all_tokens,funct,article,affect,negemo,sad,cogmech,...,feel,excl,future,nonfl,ppron,shehe,i,we,you,they
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,299,296,299,299,299,299,299,299,299,299,...,299,299,299,299,299,299,299,299,299,299
1,41,40,41,41,41,41,41,41,41,41,...,41,41,41,41,41,41,41,41,41,41


In [23]:
print("Average number of posts per user", writings_df.groupby('subject').count().title.mean())
print("Average number of comments per user", writings_df.groupby('subject').count().text.mean())


Average number of posts per user 146.35882352941175
Average number of comments per user 376.2970588235294


In [24]:
writings_df.groupby('subject').count().title.describe()

count    340.000000
mean     146.358824
std      240.998992
min        0.000000
25%       13.000000
50%       42.500000
75%      148.500000
max      998.000000
Name: title, dtype: float64

In [25]:
writings_df.groupby('subject').count().text.describe()

count     340.000000
mean      376.297059
std       379.091730
min         1.000000
25%        54.000000
50%       214.500000
75%       646.000000
max      1350.000000
Name: text, dtype: float64

# Recurrent NN

## Extract features and encode data

In [26]:
hyperparams_features = {
    "max_features": 40000,
    # cut texts after this number of words
    # (among top max_features most common words)
    "maxlen": 100,
    "embedding_dim": 100,
    "user_level": True,
    "posts_per_user": 10,
    "batch_size": 32,
}


### Emotions

In [27]:
def load_NRC(nrc_path):
    word_emotions = {}
    emotion_words = {}
    with open(nrc_path) as in_f:
        for line in in_f:
            line = line.strip()
            if not line:
                continue
            word, emotion, label = line.split()
            if word not in word_emotions:
                word_emotions[word] = set()
            if emotion not in emotion_words:
                emotion_words[emotion] = set()
            label = int(label)
            if label:
                word_emotions[word].add(emotion)
                emotion_words[emotion].add(word)
    return emotion_words

nrc_lexicon_path = root_dir + '/resources/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt'
nrc_lexicon = load_NRC(nrc_lexicon_path)
emotions = list(nrc_lexicon.keys())


In [28]:
def encode_emotions(tokens, emotion_lexicon, emotions, relative=True):
    text_len = len(tokens)
    encoded_emotions = [0 for e in emotions]
    for i, emotion in enumerate(emotions):
        try:
            emotion_words = [t for t in tokens if t in emotion_lexicon[emotion]]
            if relative:
                encoded_emotions[i] = len(emotion_words) / len(tokens)
            else:
                encoded_emotions[i] = len(emotion_words)
        except ValueError:
            print("Emotion not found.")
    return encoded_emotions

In [29]:
from liwc_readDict import readDict

liwc = readDict(root_dir + '/resources/liwc.dic')

categories = set([c for (w,c) in liwc])
len(categories)

64

### Style features

#### Char n-grams

In [30]:
def extract_ngrams(tokens):
    pass

#### Personal pronouns

In [31]:
first_person_pronouns = {"i", "me", "my", "mine", "myself"}
def encode_pronouns(tokens, pronouns={"i", "me", "my", "mine", "myself"}, relative=True):
    if not tokens:
        return np.nan
    text_len = len(tokens)
    nr_pronouns = len([t for t in tokens if t in pronouns])
    if relative:
        return nr_pronouns/text_len
    else:
        return nr_pronouns

#### Stopwords

In [32]:
stopword_list = stopwords.words("english")
def encode_stopwords(tokens, stopwords=stopword_list):
    encoded_stopwords = [0 for s in stopword_list]
    if not tokens:
        return encoded_stopwords
    for i, stopword in enumerate(stopwords):
        if stopword in tokens:
            encoded_stopwords[i] += 1
    return encoded_stopwords

### Topics

## BERT


In [33]:
import tensorflow_hub as hub
bert_path = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"
from bert.tokenization import FullTokenizer

# TODO: do this at the beginning? Also initialize variables?
sess = tf.Session()

W0305 17:02:22.859902 140401963235072 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


In [34]:
class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Constructs a InputExample.
    Args:
      guid: Unique id for the example.
      text_a: string. The untokenized text of the first sequence. For single
        sequence tasks, only this sequence must be specified.
      text_b: (Optional) string. The untokenized text of the second sequence.
        Only must be specified for sequence pair tasks.
      label: (Optional) string. The label of the example. This should be
        specified for train and dev examples, but not for test examples.
    """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label

In [35]:
def encode_text_for_bert(tokenizer, example, max_seq_length=256):
    """Converts a single `InputExample` into a single `InputFeatures`."""

#     if isinstance(example, PaddingInputExample):
#         input_ids = [0] * max_seq_length
#         input_mask = [0] * max_seq_length
#         segment_ids = [0] * max_seq_length
#         label = 0
#         return input_ids, input_mask, segment_ids, label

    tokens_a = tokenizer.tokenize(example.text_a)
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0 : (max_seq_length - 2)]

    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    return input_ids, input_mask, segment_ids, example.label



In [36]:
def create_tokenizer_from_hub_module():
    """Get the vocab file and casing info from the Hub module."""
    bert_module =  hub.Module(bert_path)
    tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
    vocab_file, do_lower_case = sess.run(
        [
            tokenization_info["vocab_file"],
            tokenization_info["do_lower_case"],
        ]
    )

    return FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

In [37]:
# Instantiate tokenizer
bert_tokenizer = create_tokenizer_from_hub_module()

encode_text_for_bert(bert_tokenizer, InputExample(None, 
                                               "Ana are mere"), 
                       hyperparams_features['maxlen'])

Instructions for updating:
Colocations handled automatically by placer.


W0305 17:02:25.607889 140401963235072 deprecation.py:323] From /usr/local/tensorflow/python3.5/1.13.1/lib/python3.5/site-packages/tensorflow/python/ops/control_flow_ops.py:3632: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0305 17:02:27.510293 140401963235072 saver.py:1483] Saver not created because there are no variables in the graph to restore


([101,
  9617,
  2024,
  8210,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
 

In [38]:
def initialize_vars(sess):
    sess.run(tf.local_variables_initializer())
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    K.set_session(sess)

### Encode data

In [39]:
from collections import Counter
def load_erisk_data(writings_df, voc_size, emotion_lexicon, seq_len, emotions =  
                    ['anger', 'anticipation', 'disgust', 'fear', 'joy', 
                     'negative', 'positive', 'sadness', 'surprise', 'trust'],
                    liwc_categories = categories,
                    pronouns = ["i", "me", "my", "mine", "myself"],
                    train_prop=0.7, valid_prop=0.3, test_slice=2,
                    nr_slices=5,
                    min_post_len=3, min_word_len=1, 
                    user_level=True, vocabulary=None,
                   logger=logger):
    logger.debug("Loading data...\n")
    if not vocabulary:
        vocabulary = {}
        word_freqs = Counter()
        for words in writings_df.tokenized_text:
            word_freqs.update(words)
        for words in writings_df.tokenized_title:
            word_freqs.update(words)
        i = 1
        for w, f in word_freqs.most_common(voc_size-2): # keeping voc_size-1 for unk
            if len(w) < min_word_len:
                continue
            vocabulary[w] = i
            i += 1
   
    if 'subset' in writings_df.columns:
        training_subjects = list(set(writings_df[writings_df['subset']=='train'].subject))
        test_subjects = list(set(writings_df[writings_df['subset']=='test'].subject))
    else:
        all_subjects = sorted(list(set(writings_df.subject)))
        training_subjects_size = int(len(all_subjects) * train_prop)
        test_subjects_size = len(all_subjects) - training_subjects_size
        # Cross-validation, with fixed slice as input
        test_prop = 1-train_prop
        test_slice = min(test_slice, nr_slices)
        logger.debug("start index: %f, from %f\n" % (
            len(all_subjects)*(1/nr_slices)*test_slice, test_prop*test_slice))
        start_slice = int(len(all_subjects)*(1/nr_slices)*test_slice)
        test_subjects = all_subjects[start_slice: start_slice+test_subjects_size]
        training_subjects = [s for s in all_subjects if s not in test_subjects]
    training_subjects = sorted(training_subjects) # ensuring reproducibility
    valid_subjects_size = int(len(training_subjects) * valid_prop)
    valid_subjects = training_subjects[:valid_subjects_size]
    training_subjects = training_subjects[valid_subjects_size:]
    categories = [c for c in liwc_categories if c in writings_df.columns]
    logger.debug("%d training users, %d validation users, %d test users." % (
        len(training_subjects), 
          len(valid_subjects),
          len(test_subjects)))
    subjects_split = {'train': training_subjects, 
                      'valid': valid_subjects, 
                      'test': test_subjects}

    user_level_texts = {}
    for row in writings_df.sort_values(by='date').itertuples():
        words = []
        raw_text = ""
        if row.tokenized_title:
            words.extend(row.tokenized_title)
            raw_text += row.title
        if row.tokenized_text:
            words.extend(row.tokenized_text)
            raw_text += row.text
        if not words or len(words)<min_post_len:
            print(row.subject)
            continue
        label = row.label
        liwc_categs = [getattr(row, categ) for categ in categories]
        if row.subject not in user_level_texts.keys():
            user_level_texts[row.subject] = {}
            user_level_texts[row.subject]['texts'] = [words]
            user_level_texts[row.subject]['label'] = label
            user_level_texts[row.subject]['liwc'] = [liwc_categs]
            user_level_texts[row.subject]['raw'] = [raw_text]
        else:
            user_level_texts[row.subject]['texts'].append(words)
            user_level_texts[row.subject]['liwc'].append(liwc_categs)
            user_level_texts[row.subject]['raw'].append(raw_text)
            
    return user_level_texts, subjects_split, vocabulary


In [40]:
user_level_data, subjects_split, vocabulary = load_erisk_data(writings_df, 
                                                            seq_len=hyperparams_features['maxlen'],
                                                            voc_size=hyperparams_features['max_features'],
                                                           emotion_lexicon=nrc_lexicon,
                                                           emotions=emotions,
                                                           user_level=hyperparams_features['user_level'],
                                                                                logger=logger
#                                                            vocabulary=pickle.load(open('vocabulary20K_selfharm.pkl', 'rb'))
                                                                               )

Loading data...



I0305 17:02:28.167489 140401963235072 <ipython-input-39-8f3a93e32636>:12] Loading data...



start index: 136.000000, from 0.600000



I0305 17:02:29.258577 140401963235072 <ipython-input-39-8f3a93e32636>:38] start index: 136.000000, from 0.600000



166 training users, 71 validation users, 103 test users.


I0305 17:02:29.260706 140401963235072 <ipython-input-39-8f3a93e32636>:50] 166 training users, 71 validation users, 103 test users.


subject1623
subject8978
subject1623
subject1914
subject1914
subject8978
subject2947
subject2495
subject1914
subject8001
subject2495
subject6013
subject6013
subject6013
subject8001
subject2495
subject1914
subject2495
subject8472
subject8472
subject3881
subject1623
subject8472
subject8472
subject8978
subject6035
subject6035
subject8001
subject2947
subject8472
subject8472
subject3881
subject5000
subject3881
subject3881
subject814
subject814
subject814
subject814
subject814
subject814
subject814
subject814
subject814
subject203
subject6035
subject6035
subject6035
subject814
subject814
subject814
subject814
subject814
subject814
subject814
subject814
subject814
subject814
subject814
subject814
subject814
subject814
subject814
subject814
subject814
subject203
subject814
subject814
subject814
subject814
subject814
subject814
subject814
subject203
subject814
subject814
subject814
subject814
subject203
subject203
subject5000
subject5000
subject814
subject203
subject814
subject203
subject203
sub

subject4414
subject5908
subject1748
subject4414
subject4414
subject4414
subject4414
subject4414
subject8472
subject4414
subject5908
subject6035
subject519
subject5908
subject8795
subject6035
subject4414
subject4414
subject6146
subject4414
subject8795
subject8795
subject5699
subject6035
subject8795
subject814
subject8795
subject1748
subject8795
subject4414
subject8795
subject4414
subject3868
subject8472
subject814
subject8472
subject1748
subject1748
subject8472
subject47
subject47
subject203
subject203
subject8472
subject8795
subject8795
subject8472
subject1748
subject4414
subject4414
subject8472
subject4414
subject4414
subject1748
subject4414
subject4831
subject1748
subject1748
subject2948
subject1748
subject1748
subject4414
subject4414
subject4414
subject4414
subject4831
subject47
subject8472
subject5699
subject6146
subject6035
subject5036
subject1914
subject5908
subject6670
subject3725
subject6670
subject6670
subject5699
subject8472
subject3191
subject7435
subject8472
subject7435
sub

subject0
subject1748
subject1748
subject8657
subject8657
subject5908
subject9725
subject8472
subject8472
subject6013
subject9725
subject9725
subject47
subject8472
subject203
subject9725
subject203
subject203
subject5036
subject8472
subject47
subject7898
subject8472
subject5840
subject5699
subject1748
subject4247
subject4002
subject8472
subject2495
subject5140
subject1485
subject4459
subject4459
subject8472
subject9725
subject6013
subject5699
subject5699
subject8657
subject8064
subject8064
subject6013
subject5140
subject6013
subject5036
subject5036
subject6035
subject814
subject5840
subject1748
subject4414
subject4414
subject4414
subject4414
subject4414
subject5036
subject5036
subject5036
subject5036
subject5036
subject2605
subject4414
subject4414
subject4414
subject2605
subject2605
subject8657
subject2605
subject4414
subject4414
subject4414
subject4414
subject6146
subject6146
subject9114
subject682
subject2605
subject6146
subject6146
subject6146
subject6013
subject1728
subject2605
subj

subject203
subject8657
subject5036
subject4526
subject8657
subject7857
subject8657
subject5699
subject6146
subject1210
subject5409
subject203
subject814
subject1105
subject1105
subject1105
subject4459
subject4074
subject5409
subject5409
subject4074
subject4074
subject1485
subject203
subject5409
subject5793
subject6518
subject6518
subject8472
subject8200
subject1210
subject8472
subject5908
subject4831
subject1105
subject733
subject2495
subject6786
subject1793
subject501
subject8200
subject9725
subject4459
subject2495
subject1793
subject8472
subject7262
subject4831
subject2495
subject8472
subject4831
subject5699
subject5699
subject4414
subject4414
subject4414
subject8472
subject4414
subject5282
subject8472
subject5908
subject5908
subject8200
subject4196
subject4831
subject4831
subject4831
subject4831
subject5793
subject4831
subject4831
subject4831
subject4831
subject4831
subject4414
subject4414
subject4414
subject4414
subject4414
subject4414
subject4831
subject4831
subject4831
subject483

subject5793
subject7318
subject7318
subject5878
subject1105
subject7318
subject641
subject641
subject1485
subject8978
subject8978
subject9160
subject6833
subject4074
subject7857
subject5878
subject641
subject5409
subject641
subject641
subject7318
subject641
subject7318
subject1793
subject1793
subject1210
subject8472
subject641
subject379
subject641
subject2097
subject7318
subject7318
subject1793
subject6833
subject6833
subject6833
subject4278
subject4762
subject6833
subject5409
subject4074
subject4074
subject4074
subject6833
subject6833
subject7318
subject6786
subject7857
subject7318
subject7857
subject7318
subject5409
subject641
subject7698
subject641
subject8978
subject7698
subject641
subject641
subject7857
subject7692
subject641
subject217
subject641
subject6013
subject6035
subject6833
subject8726
subject641
subject641
subject6833
subject1210
subject3178
subject4071
subject1793
subject4071
subject6833
subject6644
subject7318
subject6013
subject5979
subject7857
subject641
subject641


subject8329
subject8292
subject6786
subject2605
subject8329
subject3277
subject8657
subject8657
subject9260
subject5140
subject2446
subject8990
subject5409
subject203
subject2605
subject5979
subject3277
subject3277
subject1793
subject2685
subject2685
subject2685
subject2685
subject2685
subject2446
subject2605
subject2446
subject8292
subject4226
subject7499
subject7355
subject9114
subject511
subject8726
subject733
subject2685
subject4226
subject2685
subject7857
subject7857
subject2685
subject5383
subject2685
subject9114
subject2685
subject2685
subject2685
subject7439
subject8292
subject2685
subject3277
subject2685
subject2685
subject8472
subject2685
subject9114
subject1210
subject2685
subject3277
subject9114
subject47
subject8329
subject4226
subject7318
subject2685
subject6035
subject4074
subject8329
subject1105
subject9725
subject379
subject6833
subject6833
subject6833
subject4074
subject4074
subject5140
subject6428
subject2446
subject6035
subject6035
subject2685
subject2685
subject268

subject6352
subject8472
subject8472
subject5409
subject1288
subject5833
subject5833
subject5833
subject5833
subject5833
subject6428
subject6428
subject4762
subject6352
subject1288
subject3555
subject6428
subject1288
subject1288
subject8081
subject7439
subject5979
subject203
subject4074
subject7439
subject5833
subject3178
subject8200
subject8726
subject4510
subject733
subject6352
subject7439
subject8200
subject8472
subject8472
subject5383
subject8726
subject8973
subject6352
subject2980
subject2980
subject1288
subject2980
subject8726
subject5409
subject203
subject7857
subject1055
subject5833
subject7439
subject2690
subject6428
subject6428
subject6428
subject4934
subject7318
subject1105
subject8292
subject2239
subject8292
subject8726
subject2088
subject8472
subject1623
subject4644
subject6428
subject6786
subject7439
subject7439
subject4526
subject2239
subject8472
subject7857
subject4074
subject8726
subject1793
subject2239
subject6428
subject569
subject9725
subject2690
subject5833
subject8

subject7229
subject7439
subject9725
subject9725
subject1105
subject4762
subject7439
subject6352
subject5793
subject6652
subject2495
subject5100
subject5100
subject6930
subject3994
subject4278
subject7581
subject8973
subject7439
subject6946
subject6946
subject7439
subject7439
subject7439
subject7439
subject7439
subject7581
subject4702
subject4226
subject3277
subject4226
subject3178
subject6833
subject5409
subject5282
subject5282
subject6833
subject4278
subject4278
subject5036
subject8329
subject501
subject1288
subject379
subject5833
subject6946
subject1793
subject6833
subject3555
subject3555
subject6946
subject4934
subject8329
subject6247
subject1793
subject1793
subject8973
subject6428
subject4526
subject4526
subject8292
subject6035
subject6093
subject2577
subject6833
subject4526
subject7581
subject8657
subject6833
subject7581
subject8770
subject4333
subject2577
subject5878
subject1093
subject1105
subject2446
subject7692
subject7439
subject4226
subject7263
subject6833
subject6352
subjec

subject8770
subject8726
subject8726
subject8726
subject6238
subject8933
subject7489
subject2088
subject2088
subject2495
subject2495
subject2495
subject8802
subject2495
subject2495
subject2495
subject2605
subject6238
subject2495
subject1288
subject7263
subject8292
subject8292
subject2495
subject6453
subject2495
subject4526
subject8802
subject992
subject4795
subject4795
subject7462
subject3994
subject8292
subject2690
subject5920
subject2495
subject5282
subject1469
subject3994
subject7581
subject4318
subject2495
subject7262
subject2495
subject4318
subject2495
subject4762
subject7262
subject6786
subject6833
subject3270
subject3014
subject5833
subject6903
subject5833
subject8726
subject8933
subject5409
subject2495
subject8721
subject2495
subject2495
subject2495
subject3635
subject2495
subject992
subject992
subject6238
subject4318
subject9725
subject3994
subject3994
subject4278
subject6453
subject8802
subject7229
subject6946
subject7229
subject7462
subject4934
subject6453
subject5148
subject

subject7698
subject7698
subject7698
subject8802
subject4702
subject4702
subject379
subject379
subject379
subject379
subject379
subject8721
subject501
subject4318
subject1210
subject8726
subject1485
subject5622
subject7262
subject3635
subject2577
subject2088
subject5833
subject3270
subject9260
subject4729
subject807
subject1914
subject2088
subject4247
subject9222
subject6786
subject3635
subject4513
subject4513
subject8802
subject1210
subject3635
subject3635
subject3635
subject4318
subject992
subject8933
subject3283
subject8802
subject5148
subject9260
subject8973
subject7439
subject7489
subject5840
subject8292
subject8292
subject5148
subject3555
subject4934
subject4513
subject3283
subject3270
subject992
subject9260
subject8472
subject5148
subject992
subject7377
subject1064
subject4318
subject6652
subject8329
subject8721
subject4318
subject3270
subject4071
subject6786
subject7439
subject8292
subject379
subject3270
subject7229
subject7262
subject3117
subject7229
subject2088
subject7229
sub

subject7857
subject7489
subject5282
subject5282
subject8973
subject8233
subject8481
subject4729
subject9260
subject8626
subject8472
subject8472
subject8933
subject4526
subject5342
subject6833
subject3014
subject9249
subject4071
subject7439
subject1793
subject7229
subject7439
subject5793
subject8626
subject2857
subject4513
subject2857
subject2857
subject4278
subject3014
subject2088
subject8802
subject2980
subject4526
subject4526
subject3994
subject1793
subject6946
subject203
subject379
subject4071
subject3283
subject8472
subject8472
subject8233
subject4071
subject9260
subject2567
subject7262
subject379
subject8626
subject8481
subject6786
subject8973
subject4702
subject2567
subject7661
subject6041
subject4071
subject992
subject1288
subject3283
subject8933
subject4526
subject9260
subject9260
subject9260
subject3270
subject8933
subject8233
subject8933
subject3014
subject8933
subject2567
subject379
subject8233
subject4526
subject379
subject8933
subject2857
subject2567
subject2567
subject379

subject8292
subject7263
subject8626
subject9249
subject8292
subject7263
subject8292
subject8292
subject6035
subject1623
subject1623
subject8933
subject1623
subject5528
subject8193
subject8933
subject5528
subject3014
subject3014
subject2857
subject6833
subject2857
subject7229
subject8933
subject8933
subject4513
subject7229
subject8973
subject8973
subject8973
subject6041
subject8193
subject8200
subject3667
subject1623
subject7661
subject8065
subject379
subject8933
subject4961
subject5036
subject8481
subject1623
subject8193
subject5528
subject8065
subject5528
subject8292
subject1914
subject8200
subject835
subject3014
subject8802
subject4513
subject7229
subject2857
subject3283
subject3277
subject5282
subject1623
subject4074
subject7661
subject3596
subject8193
subject1623
subject8565
subject2857
subject4526
subject8845
subject6035
subject5528
subject8193
subject3014
subject3014
subject5528
subject5528
subject3014
subject1623
subject1623
subject5622
subject4074
subject4074
subject1623
subjec

subject1623
subject4071
subject5979
subject3277
subject3277
subject6428
subject3277
subject3596
subject5528
subject5528
subject6428
subject8233
subject2577
subject8626
subject7627
subject7627
subject7355
subject3014
subject3014
subject3014
subject7229
subject8001
subject8193
subject2857
subject7229
subject6428
subject2857
subject2857
subject8065
subject2239
subject8193
subject4071
subject3014
subject3555
subject3994
subject6041
subject4762
subject8472
subject8472
subject7355
subject5528
subject5528
subject5528
subject5528
subject8481
subject7229
subject6035
subject5270
subject5148
subject4071
subject8233
subject9114
subject4934
subject5833
subject1623
subject671
subject7698
subject2088
subject2974
subject7698
subject2239
subject9249
subject4702
subject6665
subject1623
subject3994
subject2948
subject5995
subject501
subject9156
subject2088
subject4071
subject5150
subject8626
subject4071
subject4071
subject5100
subject7229
subject9949
subject9949
subject4071
subject2857
subject6041
subjec

subject3612
subject8065
subject8990
subject2088
subject8193
subject4379
subject4379
subject2475
subject6238
subject8233
subject8626
subject6238
subject5342
subject5342
subject5342
subject4729
subject4729
subject4729
subject6453
subject5995
subject5840
subject4071
subject379
subject379
subject5256
subject4513
subject4513
subject4513
subject4513
subject8626
subject4513
subject8233
subject8065
subject8065
subject9260
subject9260
subject8933
subject8933
subject8933
subject8933
subject2577
subject8001
subject5699
subject7637
subject7229
subject7229
subject8065
subject2857
subject8065
subject4071
subject8933
subject8292
subject8933
subject8933
subject5995
subject4729
subject8065
subject7627
subject7627
subject5622
subject2238
subject6453
subject8065
subject6453
subject5622
subject5622
subject5622
subject2947
subject6428
subject4074
subject9318
subject4074
subject3596
subject3596
subject4074
subject8990
subject8933
subject4513
subject6238
subject4729
subject8933
subject3277
subject379
subject

subject4702
subject9318
subject2577
subject7637
subject4198
subject8292
subject4570
subject1105
subject4247
subject4702
subject8933
subject4071
subject9260
subject4143
subject4379
subject51
subject51
subject7764
subject7764
subject2547
subject7764
subject7764
subject3994
subject6041
subject2547
subject7107
subject7462
subject5148
subject8933
subject4513
subject9961
subject9961
subject4513
subject8193
subject8933
subject9318
subject7627
subject7627
subject6786
subject4702
subject8065
subject9260
subject4702
subject9497
subject9381
subject9381
subject4071
subject9381
subject5622
subject7637
subject5622
subject5342
subject1288
subject7740
subject7740
subject4513
subject4071
subject4729
subject8933
subject8933
subject8933
subject8933
subject8933
subject2088
subject5342
subject5270
subject4961
subject9497
subject2547
subject2547
subject2547
subject2547
subject2547
subject2547
subject2547
subject2547
subject2547
subject2547
subject2547
subject9381
subject4729
subject9381
subject1469
subject2

subject5622
subject5033
subject280
subject7627
subject3270
subject8094
subject3914
subject4702
subject4702
subject9411
subject9411
subject2567
subject4570
subject9961
subject4570
subject1947
subject1947
subject3277
subject7801
subject4143
subject4143
subject1027
subject3612
subject7462
subject4702
subject4702
subject280
subject9411
subject8802
subject2580
subject8626
subject4702
subject2088
subject6035
subject7637
subject8626
subject4513
subject4702
subject7764
subject7764
subject4702
subject5100
subject4702
subject2580
subject4513
subject4143
subject4795
subject8065
subject6423
subject4702
subject4513
subject4513
subject9411
subject6866
subject6866
subject6866
subject8065
subject3635
subject3612
subject6453
subject280
subject280
subject9260
subject8472
subject1655
subject4071
subject4702
subject3904
subject6423
subject7801
subject4702
subject9411
subject4143
subject280
subject4570
subject4071
subject4143
subject9242
subject1288
subject1288
subject9156
subject4729
subject1950
subject28

subject1947
subject2690
subject8065
subject4843
subject7777
subject3994
subject7777
subject6453
subject7777
subject8233
subject8472
subject3844
subject1027
subject1027
subject7777
subject7777
subject4014
subject4014
subject7777
subject7777
subject2580
subject7262
subject4143
subject4379
subject4014
subject7777
subject1545
subject4729
subject4729
subject4729
subject4729
subject4729
subject3844
subject4071
subject4843
subject1545
subject8990
subject4702
subject3844
subject5549
subject2247
subject4843
subject4843
subject4379
subject3191
subject7262
subject7262
subject7262
subject7262
subject3844
subject7801
subject3844
subject5920
subject4014
subject4014
subject7262
subject6041
subject9918
subject5270
subject4961
subject1288
subject4702
subject4379
subject4014
subject3844
subject2247
subject3994
subject4843
subject4702
subject1469
subject2247
subject2580
subject4702
subject3844
subject4702
subject8233
subject4843
subject6259
subject2857
subject3596
subject4729
subject9242
subject7262
subj

subject8193
subject4513
subject7661
subject5456
subject3612
subject3612
subject3612
subject3612
subject3612
subject3612
subject3612
subject5456
subject5456
subject8626
subject3612
subject3612
subject3612
subject5456
subject3612
subject4513
subject7338
subject7338
subject5456
subject4143
subject4143
subject8472
subject9095
subject9095
subject9260
subject4513
subject4513
subject4513
subject4513
subject4513
subject4513
subject4513
subject4513
subject4513
subject4513
subject2577
subject2690
subject3612
subject6464
subject4513
subject8193
subject3612
subject3612
subject1469
subject3612
subject3612
subject3612
subject3844
subject2247
subject8233
subject6957
subject8193
subject4513
subject8990
subject9095
subject2567
subject6946
subject7661
subject9077
subject7777
subject7777
subject4702
subject8065
subject9095
subject3844
subject3014
subject8193
subject8481
subject8481
subject4702
subject8193
subject6464
subject2567
subject4843
subject8481
subject8882
subject6464
subject4843
subject2580
subj

subject8233
subject5409
subject8822
subject992
subject2690
subject2247
subject2088
subject2690
subject1469
subject9411
subject6041
subject5833
subject6041
subject4934
subject9829
subject4526
subject3014
subject2690
subject6866
subject6899
subject4702
subject1469
subject6464
subject2857
subject3844
subject4071
subject9156
subject4702
subject6899
subject6464
subject3844
subject9829
subject8822
subject9829
subject9829
subject8822
subject9829
subject3844
subject6899
subject6899
subject6899
subject6899
subject522
subject6899
subject9829
subject6041
subject6899
subject8481
subject3596
subject1288
subject4071
subject3844
subject3844
subject8544
subject9197
subject2247
subject5409
subject5150
subject5833
subject2690
subject2690
subject2247
subject1824
subject6918
subject6093
subject6918
subject4526
subject6918
subject2690
subject8822
subject7338
subject6464
subject2690
subject3014
subject4526
subject3844
subject9411
subject8990
subject6464
subject8626
subject8200
subject8822
subject2690
subjec

### Data Generator

In [41]:
class DataGenerator(Sequence):
    'Generates data for Keras'
    def __init__(self, user_level_data, subjects_split, set_type='train',
                 batch_size=hyperparams_features['batch_size'], seq_len=hyperparams_features['maxlen'], 
                 voc_size=hyperparams_features['max_features'], emotion_lexicon=nrc_lexicon,
                 emotions=emotions, pronouns=["i", "me", "my", "mine", "myself"], 
                 max_posts_per_user=hyperparams_features['posts_per_user'],
                 shuffle=True):
        'Initialization'
        self.seq_len = seq_len
        self.subjects_split = subjects_split
        self.set = set_type
        self.emotion_lexicon = emotion_lexicon
        self.batch_size = batch_size
        self.data = user_level_data
        self.emotions = emotions
        self.pronouns = pronouns
        self.shuffle = shuffle
        self.voc_size = voc_size
        self.max_posts_per_user = max_posts_per_user
        self.on_epoch_end()

    def __encode_text(self, tokens):
        # Using voc_size-1 value for OOV token
        encoded_tokens = [vocabulary.get(w, self.voc_size-1) for w in tokens]
        encoded_emotions = encode_emotions(tokens, self.emotion_lexicon, self.emotions)
        encoded_pronouns = encode_pronouns(tokens, self.pronouns)
        encoded_stopwords = encode_stopwords(tokens)
        return (encoded_tokens, encoded_emotions, encoded_pronouns, encoded_stopwords)
        
    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.subjects_split[self.set]) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        user_indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        # Find users
        users = [self.subjects_split[self.set][i] for i in user_indexes
                    if self.subjects_split[self.set][i] in self.data.keys()] # TODO: maybe needs a warning that user is missing

        post_indexes = {}
        # Sample post ids
        for subject in users:
            posts_len = len(self.data[subject]['texts'])
            posts_index_sample = sorted(np.random.choice(posts_len, 
                                                         min(self.max_posts_per_user, posts_len),
                                                         replace=False))
            post_indexes[subject] = posts_index_sample
        # Generate data
        X, y = self.__data_generation(users, post_indexes)

        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.subjects_split[self.set]))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, users, post_indexes):
        'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
        tokens_data = []
        categ_data = []
        sparse_data = []
        subjects = []
        labels = []
        for subject in users:
            texts = self.data[subject]['texts']
            label = self.data[subject]['label']
            
            # Sample
            texts = [texts[i] for i in post_indexes[subject]]
            liwc_selection = [self.data[subject]['liwc'][i] for i in post_indexes[subject]]
            all_words = [sum(texts, [])] # merge all texts in one list
            liwc_aggreg = [np.array(liwc_selection).mean(axis=0).tolist()]

            for i, words in enumerate(all_words):
                encoded_tokens, encoded_emotions, encoded_pronouns, encoded_stopwords = self.__encode_text(words)
                subject_id = int(subject.split('t')[1])
                tokens_data.append(encoded_tokens)
                categ_data.append(encoded_emotions + [encoded_pronouns] + liwc_aggreg[i])
                sparse_data.append(encoded_stopwords)
                labels.append(label)
                subjects.append(subject_id)

        
        # using zeros for padding
        tokens_data_padded = sequence.pad_sequences(tokens_data, maxlen=self.seq_len)

        return ([np.array(tokens_data_padded), np.array(categ_data), np.array(sparse_data),
                np.array(subjects)],
                np.array(labels))


In [49]:
class DataGeneratorHierarchical(Sequence):
    'Generates data for Keras'
    def __init__(self, user_level_data, subjects_split, set_type='train',
                 batch_size=hyperparams_features['batch_size'], seq_len=hyperparams_features['maxlen'], 
                 voc_size=hyperparams_features['max_features'], emotion_lexicon=nrc_lexicon,
                 emotions=emotions, pronouns=["i", "me", "my", "mine", "myself"], 
                 max_posts_per_user=hyperparams_features['posts_per_user'], stopwords=stopword_list,
                 liwc_categories=categories, bert_tokenizer=bert_tokenizer,
                 shuffle=True):
        'Initialization'
        self.seq_len = seq_len
        self.subjects_split = subjects_split
        self.set = set_type
        self.emotion_lexicon = emotion_lexicon
        self.bert_tokenizer = bert_tokenizer
        self.batch_size = batch_size
        self.data = user_level_data
        self.emotions = emotions
        self.pronouns = pronouns
        self.shuffle = shuffle
        self.voc_size = voc_size
        self.max_posts_per_user = max_posts_per_user
        self.categ_dim = len(emotions) + 1 + len(liwc_categories)
        self.sparse_dim = len(stopwords)
        if not shuffle:
            # Sort users so that similar post length users will be in the same batch
            self.subjects_split[self.set] = sorted(self.subjects_split[self.set],
                                                  key = lambda s: len(self.data[s]['texts'] if s in self.data
                                                               else 0))
        self.on_epoch_end()

    def __encode_text(self, tokens, raw_text):
        # Using voc_size-1 value for OOV token
        encoded_tokens = [vocabulary.get(w, self.voc_size-1) for w in tokens]
        encoded_emotions = encode_emotions(tokens, self.emotion_lexicon, self.emotions)
        encoded_pronouns = encode_pronouns(tokens, self.pronouns)
        encoded_stopwords = encode_stopwords(tokens)
        bert_ids, bert_masks, bert_segments, label = encode_text_for_bert(self.bert_tokenizer, InputExample(None, 
                                               raw_text), self.seq_len)
        return (encoded_tokens, encoded_emotions, encoded_pronouns, encoded_stopwords,
               bert_ids, bert_masks, bert_segments)
        
    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.subjects_split[self.set]) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        user_indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
#         if len(user_indexes)<self.batch_size:
#             return
        # Find users
        users = [self.subjects_split[self.set][i] for i in user_indexes
                    if self.subjects_split[self.set][i] in self.data.keys()] # TODO: maybe needs a warning that user is missing

        post_indexes = {}
        # Sample post ids
        for subject in users:
            posts_len = len(self.data[subject]['texts'])
            posts_index_sample = sorted(np.random.choice(posts_len, 
                                                         min(self.max_posts_per_user, posts_len),
                                                         replace=False))
            post_indexes[subject] = posts_index_sample
        # Generate data
        X, y = self.__data_generation(users, post_indexes)

        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.subjects_split[self.set]))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, users, post_indexes):
        'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
        user_tokens = []
        user_categ_data = []
        user_sparse_data = []
        user_bert_ids_data = []
        user_bert_masks_data = []
        user_bert_segments_data = []
        
        labels = []
        for subject in users:
            tokens_data = []
            categ_data = []
            sparse_data = []
            bert_ids_data = []
            bert_masks_data = []
            bert_segments_data = []
            
            texts = self.data[subject]['texts']
            raw_texts = self.data[subject]['raw']
            label = self.data[subject]['label']
            liwc_scores = self.data[subject]['liwc']
            
#             if len(texts) < self.max_posts_per_user:
#                 # TODO: pad with zeros
#                 pass
  
            for i in post_indexes[subject]:
                raw_text = raw_texts[i]
                words = texts[i]
                liwc = liwc_scores[i]
                encoded_tokens, encoded_emotions, encoded_pronouns, encoded_stopwords, \
                    bert_ids, bert_masks, bert_segments = self.__encode_text(words, raw_text)
                subject_id = int(subject.split('t')[1])
                tokens_data.append(encoded_tokens)
                # using zeros for padding
                # TODO: there is something wrong with this
                categ_data.append(encoded_emotions + [encoded_pronouns] + liwc)
                sparse_data.append(encoded_stopwords)
                bert_ids_data.append(bert_ids)
                bert_masks_data.append(bert_masks)
                bert_segments_data.append(bert_segments)
            tokens_data_padded = np.array(sequence.pad_sequences(tokens_data, maxlen=self.seq_len))
            user_tokens.append(tokens_data_padded)

            user_categ_data.append(categ_data)
            user_sparse_data.append(sparse_data)
            
            user_bert_ids_data.append(bert_ids_data)
            user_bert_masks_data.append(bert_masks_data)
            user_bert_segments_data.append(bert_segments_data)

            labels.append(label)
        # TODO: check this is correct. for when there are fewer posts than minimum
        user_tokens = sequence.pad_sequences(user_tokens, value=np.zeros(self.seq_len))
        user_tokens = np.rollaxis(np.dstack(user_tokens), -1)
        
        user_categ_data = sequence.pad_sequences(user_categ_data,  maxlen=self.max_posts_per_user)
        user_categ_data = np.rollaxis(np.dstack(user_categ_data), -1
                                     )
        user_sparse_data = sequence.pad_sequences(user_sparse_data,  maxlen=self.max_posts_per_user)
        user_sparse_data = np.rollaxis(np.dstack(user_sparse_data), -1)
        
        user_bert_ids_data = sequence.pad_sequences(user_bert_ids_data,  maxlen=self.max_posts_per_user)
        user_bert_ids_data = np.rollaxis(np.dstack(user_bert_ids_data), -1)
        
        user_bert_masks_data = sequence.pad_sequences(user_bert_masks_data,  maxlen=self.max_posts_per_user)
        user_bert_masks_data = np.rollaxis(np.dstack(user_bert_masks_data), -1)
        
        user_bert_segments_data = sequence.pad_sequences(user_bert_segments_data, maxlen=self.max_posts_per_user)
        user_bert_segments_data = np.rollaxis(np.dstack(user_bert_segments_data), -1)
        
        return ((user_tokens, user_categ_data, user_sparse_data, 
                 user_bert_ids_data, user_bert_masks_data, user_bert_segments_data),
                np.array(labels))


SyntaxError: invalid syntax (<ipython-input-49-f0d4f79eed4b>, line 145)

In [43]:
logger.setLevel(logging.DEBUG)

# TODO: it is skipping the last batch
x_data = {'train': [], 'valid': [], 'test': []}
y_data = {'train': [], 'valid': [], 'test': []}
for set_type in ['train', 'valid', 'test']:
    total_positive = 0
    for x, y in DataGeneratorHierarchical(user_level_data, subjects_split, 
                                          set_type=set_type):
        total_positive += pd.Series(y).sum()
        x_data[set_type].append(x)
        y_data[set_type].append(y)
    logger.info("%d %s positive examples\n" % (total_positive, set_type))


26 train positive examples



I0305 17:02:35.255606 140401963235072 <ipython-input-43-eb07efc520e4>:13] 26 train positive examples



4 valid positive examples



I0305 17:02:35.635763 140401963235072 <ipython-input-43-eb07efc520e4>:13] 4 valid positive examples



9 test positive examples



I0305 17:02:36.164901 140401963235072 <ipython-input-43-eb07efc520e4>:13] 9 test positive examples



In [44]:
tf.test.is_built_with_cuda()

True

In [45]:
encoded_for_bert = encode_text_for_bert(bert_tokenizer, InputExample(None, 
                                               "Ana are mere"), 200)

In [46]:
ids, masks, segments, label = encoded_for_bert

In [47]:
# class_weights = class_weight.compute_class_weight('balanced',
#                                                  np.unique(y_data['train']),
#                                                  y_data['train'])
# class_weights

In [48]:
def load_embeddings(path, embedding_dim, voc):
    # random matrix with mean value = 0
    embedding_matrix = np.random.random((len(voc)+2, embedding_dim)) - 0.5 # voc + unk + pad value(0)

    f = open(path)
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_i = voc.get(word)
        if word_i is not None:
            embedding_matrix[word_i] = coefs
    f.close()

    print('Total %s word vectors.' % len(embedding_matrix))

 
    return embedding_matrix

pretrained_embeddings_path = root_dir + '/resources/glove.twitter.27B/glove.twitter.27B.%dd.txt' % hyperparams_features['embedding_dim']
embedding_matrix = load_embeddings(pretrained_embeddings_path, hyperparams_features['embedding_dim'], vocabulary)


Total 40000 word vectors.


## Define model

In [None]:
hyperparams = {
    'lstm_units': 64,
    'lstm_units_user': 10,
    'dense_bow_units': 10,
    'bert_dense_units': 100,
    'bert_finetune_layers': 1,
    'dropout': 0.0,
    'l2_dense': 0.0000011,
    'l2_embeddings': 0.000001,
    'dense_sentence_units': 0,#50,
    'dense_user_units': 50,
    'optimizer': None,#'adam',
    'decay': 0.00001,
    'lr': 0.001,
    "trainable_embeddings": True,
    "reduce_lr_factor": 0.0002,
    "reduce_lr_patience": 50,
    "freeze_patience": 500,
    'threshold': 0.5,
    'ignore_layer': [],
    'norm_momentum': 0.1,

}
if not hyperparams['optimizer']:
    hyperparams['optimizer'] = optimizers.Adam(lr=hyperparams['lr'], #beta_1=0.9, beta_2=0.999, epsilon=0.0001,
                                   decay=hyperparams['decay'])

In [None]:
class Metrics():
    def __init__(self, threshold=0.5):
        self.threshold=threshold
        
    def recall_m(self, y_true, y_pred):
            y_labels = y_true
            y_pred = K.cast(K.greater(K.clip(y_pred, 0, 1), self.threshold), K.floatx())        
            possible_positives = K.sum(K.round(K.clip(y_labels, 0, 1)))
            true_positives = K.sum(K.round(K.clip(y_labels * y_pred, 0, 1)))
            recall = true_positives / (possible_positives + K.epsilon())
            return recall

    def precision_m(self, y_true, y_pred):
            y_labels = y_true
            y_pred = K.cast(K.greater(K.clip(y_pred, 0, 1), self.threshold), K.floatx())        
            true_positives = K.sum(K.round(K.clip(y_labels * y_pred, 0, 1)))
            predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
            precision = true_positives / (predicted_positives + K.epsilon())
            return precision

    def f1_m(self, y_true, y_pred):
        precision = self.precision_m(y_true, y_pred)
        recall = self.recall_m(y_true, y_pred)
        return 2*((precision*recall)/(precision+recall+K.epsilon()))

def binary_crossentropy_custom(y_true, y_pred):
    y_labels = y_true
    return K.binary_crossentropy(y_labels, 
                                 y_pred)

metrics_class = Metrics(threshold=hyperparams['threshold'])

In [None]:
class BertLayer(tf.keras.layers.Layer):
    def __init__(
        self,
        n_fine_tune_layers=10,
        pooling="first",
        bert_path="https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1", 
        **kwargs
    ):
        self.n_fine_tune_layers = n_fine_tune_layers
        self.trainable = True
        self.output_size = 768
        self.pooling = pooling
        self.bert_path = bert_path
        if self.pooling not in ["first", "mean"]:
            raise NameError(
               "Undefined pooling type (must be either first or mean, but is %s)" % self.pooling
            )

        super(BertLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.bert = hub.Module(
            self.bert_path, trainable=self.trainable, name="%s_module" % self.name
        )

        # Remove unused layers
        trainable_vars = self.bert.variables
        if self.pooling == "first":
            trainable_vars = [var for var in trainable_vars if not "/cls/" in var.name]
            trainable_layers = ["pooler/dense"]

        elif self.pooling == "mean":
            trainable_vars = [
                var
                for var in trainable_vars
                if not "/cls/" in var.name and not "/pooler/" in var.name
            ]
            trainable_layers = []
        else:
            raise NameError(
                "Undefined pooling type (must be either first or mean, but is %s)" % self.pooling
            )

        # Select how many layers to fine tune
        for i in range(self.n_fine_tune_layers):
            trainable_layers.append("encoder/layer_%s" % str(11 - i))

        # Update trainable vars to contain only the specified layers
        trainable_vars = [
            var
            for var in trainable_vars
            if any([l in var.name for l in trainable_layers])
        ]

        # Add to trainable weights
        for var in trainable_vars:
            self._trainable_weights.append(var)

        for var in self.bert.variables:
            if var not in self._trainable_weights:
                self._non_trainable_weights.append(var)

        super(BertLayer, self).build(input_shape)

    def call(self, inputs):
        inputs = [K.cast(x, dtype="int32") for x in inputs]
        input_ids, input_mask, segment_ids = inputs
        bert_inputs = dict(
            input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids
        )
        if self.pooling == "first":
            pooled = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
                "pooled_output"
            ]
        elif self.pooling == "mean":
            result = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
                "sequence_output"
            ]

            mul_mask = lambda x, m: x * tf.expand_dims(m, axis=-1)
            masked_reduce_mean = lambda x, m: tf.reduce_sum(mul_mask(x, m), axis=1) / (
                    tf.reduce_sum(m, axis=1, keepdims=True) + 1e-10)
            input_mask = tf.cast(input_mask, tf.float32)
            pooled = masked_reduce_mean(result, input_mask)
        else:
            raise NameError("Undefined pooling type (must be either first or mean, but is %s)" % self.pooling)

        return pooled

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.output_size)

In [None]:
def build_model(hyperparams, hyperparams_features, embedding_matrix, emotions, stopwords_list,
                liwc_categories,
               ignore_layer=[]):

    tokens_features = Input(shape=(hyperparams_features['maxlen'],), name='word_seq')
    embedding_layer = Embedding(hyperparams_features['max_features'], 
                                hyperparams_features['embedding_dim'], 
                                input_length=hyperparams_features['maxlen'],
                                embeddings_regularizer=regularizers.l2(hyperparams['l2_embeddings']),
                                weights=[embedding_matrix], 
                                trainable=hyperparams['trainable_embeddings'],
                               name='embeddings_layer')(
        tokens_features)
#     if 'batchnorm' not in ignore_layer:
#         embedding_layer_norm = BatchNormalization(axis=1, momentum=hyperparams['norm_momentum'],
#                                                      name='embeddings_layer_norm')(embedding_layer)
#     lstm_layers = Bidirectional(LSTM(hyperparams['lstm_units']))(embedding_layer)

    if tf.test.is_gpu_available():
        lstm_layers = CuDNNLSTM(hyperparams['lstm_units'], 
                                return_sequences='attention' not in ignore_layer, # only True if using attention
                      name='LSTM_layer')(embedding_layer)
    else:
        lstm_layers = LSTM(hyperparams['lstm_units'], 
                           return_sequences='attention' not in ignore_layer,
                      name='LSTM_layer')(embedding_layer)
    
    # Attention
    if 'attention' not in ignore_layer:
        attention = Dense(1, activation='tanh', name='attention')(lstm_layers)
        attention = Flatten()(attention)
        attention = Activation('softmax')(attention)
        attention = RepeatVector(hyperparams['lstm_units'])(attention)
        attention = Permute([2, 1])(attention)

        sent_representation = Multiply()([lstm_layers, attention])
        sent_representation = Lambda(lambda xin: K.sum(xin, axis=1), 
                                     output_shape=(hyperparams['lstm_units'],)
                                    )(sent_representation)

        
    else:
        sent_representation = lstm_layers
        
    
    sent_representation = Dropout(hyperparams['dropout'], name='lstm_att_dropout')(sent_representation)
    if hyperparams['dense_sentence_units']:
        sent_representation = Dense(units=hyperparams['dense_sentence_units'],
                                   name='dense_sent_representation')(sent_representation)
    numerical_features = Input(shape=(len(emotions) + 1 + len(liwc_categories),), name='numeric_input') # emotions and pronouns
    dense_layer = Dense(units=1,
                        kernel_regularizer=regularizers.l2(hyperparams['l2_dense']),
                        name='numerical_dense_layer',
                       )(numerical_features)
    sparse_features = Input(shape=(len(stopwords_list),), name='sparse_input') # stopwords

    dense_layer_sparse = Dense(units=hyperparams['dense_bow_units'],
                              name='sparse_feat_dense_layer',
                                kernel_regularizer=regularizers.l2(hyperparams['l2_dense']),
                              )(sparse_features)
    
    if 'batchnorm' not in ignore_layer:
        numerical_features_norm = BatchNormalization(axis=1, momentum=hyperparams['norm_momentum'],
                                                     name='numerical_features_norm')(numerical_features)
        sent_representation_norm = BatchNormalization(axis=1, momentum=hyperparams['norm_momentum'],
                                                      name='sent_repr_norm')(sent_representation)
        dense_layer_sparse_norm = BatchNormalization(axis=1, momentum=hyperparams['norm_momentum'],
                                                     name='sparse_features_norm')(dense_layer_sparse)
        
    subjects = Input(shape=(1,), name='subjects')
    

    all_layers = {
        'lstm_layers': sent_representation,
        'numerical_dense_layer': numerical_features,
        'sparse_feat_dense_layer': dense_layer_sparse
    }
    if 'batchnorm' not in ignore_layer:
        all_layers = {
            'lstm_layers': sent_representation_norm,
            'numerical_dense_layer': numerical_features_norm,
            'sparse_feat_dense_layer': dense_layer_sparse_norm
        }
    layers_to_merge = []
    for n, l in all_layers.items():
        if n in ignore_layer:
            continue
        layers_to_merge.append(l)
        
    if len(layers_to_merge) == 1:
        merged_layers = layers_to_merge[0]
    else:
        merged_layers = concatenate(layers_to_merge)
    output_layer = Dense(1, activation='sigmoid',
                         name='output_layer',
                        kernel_regularizer=regularizers.l2(hyperparams['l2_dense']))(merged_layers)

    # Compile model
    model = Model(inputs=[tokens_features, numerical_features, sparse_features, subjects], 
                  outputs=output_layer)

    model.compile(hyperparams['optimizer'], binary_crossentropy_custom,
                  metrics=[metrics_class.f1_m, metrics_class.precision_m, metrics_class.recall_m])
    return model



In [None]:
def build_hierarchical_model(hyperparams, hyperparams_features, embedding_matrix, emotions, stopwords_list,
                liwc_categories,
               ignore_layer=[]):

    # Post/sentence representation - word sequence
    tokens_features = Input(shape=(hyperparams_features['maxlen'],), name='word_seq')
    embedding_layer = Embedding(hyperparams_features['max_features'], 
                                hyperparams_features['embedding_dim'], 
                                input_length=hyperparams_features['maxlen'],
                                embeddings_regularizer=regularizers.l2(hyperparams['l2_embeddings']),
                                weights=[embedding_matrix], 
                                trainable=hyperparams['trainable_embeddings'],
                               name='embeddings_layer')(
        tokens_features)
    
    if tf.test.is_gpu_available():
        lstm_layers = CuDNNLSTM(hyperparams['lstm_units'], 
                                return_sequences='attention' not in ignore_layer, # only True if using attention
                      name='LSTM_layer')(embedding_layer)
    else:
        lstm_layers = LSTM(hyperparams['lstm_units'], 
                           return_sequences='attention' not in ignore_layer,
                      name='LSTM_layer')(embedding_layer)
    
    # Attention
    if 'attention' not in ignore_layer:
        attention = Dense(1, activation='tanh', name='attention')(lstm_layers)
        attention = Flatten()(attention)
        attention = Activation('softmax')(attention)
        attention = RepeatVector(hyperparams['lstm_units'])(attention)
        attention = Permute([2, 1])(attention)

        sent_representation = Multiply()([lstm_layers, attention])
        sent_representation = Lambda(lambda xin: K.sum(xin, axis=1), 
                                     output_shape=(hyperparams['lstm_units'],)
                                    )(sent_representation)       
    else:
        sent_representation = lstm_layers
    
    if 'batchnorm' not in ignore_layer:
        sent_representation = BatchNormalization(axis=1, momentum=hyperparams['norm_momentum'],
                                                          name='sent_repr_norm')(sent_representation)
    sent_representation = Dropout(hyperparams['dropout'], name='lstm_att_dropout')(sent_representation)


    # Hierarchy
    sentEncoder = Model(inputs=tokens_features, 
                        outputs=sent_representation)
    sentEncoder.summary()

    posts_history_input = Input(shape=(hyperparams_features['posts_per_user'], 
                                 hyperparams_features['maxlen']
                                      ), name='hierarchical_word_seq_input')

    user_encoder = TimeDistributed(sentEncoder, name='user_encoder')(posts_history_input)    
        
    # BERT encoder
    in_id_bert = Input(shape=(hyperparams_features['maxlen'],), name="input_ids_bert")
    in_mask_bert = Input(shape=(hyperparams_features['maxlen'],), name="input_masks_bert")
    in_segment_bert = Input(shape=(hyperparams_features['maxlen'],), name="segment_ids_bert")
    bert_inputs = [in_id_bert, in_mask_bert, in_segment_bert]
    
    bert_output = BertLayer(n_fine_tune_layers=hyperparams['bert_finetune_layers'], pooling="first")(bert_inputs)
    dense_bert = Dense(hyperparams['bert_dense_units'], activation='relu')(bert_output)
    bertSentEncoder = Model(bert_inputs, dense_bert)

    
    in_id_bert_history = Input(shape=(hyperparams_features['posts_per_user'],
                                                      hyperparams_features['maxlen'],), name="input_ids_bert_hist")
    in_mask_bert_history = Input(shape=(hyperparams_features['posts_per_user'],
                                                        hyperparams_features['maxlen'],), name="input_masks_bert_hist")
    in_segment_bert_history = Input(shape=(hyperparams_features['posts_per_user'],
                                                           hyperparams_features['maxlen'],), name="segment_ids_bert_hist")
    bert_inputs_history = [in_id_bert_history, in_mask_bert_history, in_segment_bert_history]
    bert_inputs_concatenated = concatenate(bert_inputs_history)
    inputs_indices = [hyperparams_features['maxlen']*i for i in range(3)]
    # slice the input in equal slices on the last dimension
    bert_encoder_layer = TimeDistributed(Lambda(lambda x: bertSentEncoder([x[:,inputs_indices[0]:inputs_indices[1]], 
                                                                  x[:,inputs_indices[1]:inputs_indices[2]],
                                                                          x[:,inputs_indices[2]:]])))(
                        bert_inputs_concatenated)
    bertUserEncoder = Model(bert_inputs_history, bert_encoder_layer)
    
    bert_user_encoder = bertUserEncoder(bert_inputs_history)
    
    # Other features 
    numerical_features_history = Input(shape=(
            hyperparams_features['posts_per_user'],
            len(emotions) + 1 + len(liwc_categories)
        ), name='numeric_input_hist') # emotions and pronouns
    sparse_features_history = Input(shape=(
            hyperparams_features['posts_per_user'],
            len(stopwords_list)
        ), name='sparse_input_hist') # stopwords
    
    
    dense_layer_sparse = Dense(units=hyperparams['dense_bow_units'],
                              name='sparse_feat_dense_layer',
                                kernel_regularizer=regularizers.l2(hyperparams['l2_dense']),
                              )
    dense_layer_sparse_user = TimeDistributed(dense_layer_sparse)(sparse_features_history)

    
    # Concatenate features
    if 'batchnorm' not in ignore_layer:
        numerical_features_history_norm = BatchNormalization(axis=1, momentum=hyperparams['norm_momentum'],
                                                     name='numerical_features_norm')(numerical_features_history)
        dense_layer_sparse_user = BatchNormalization(axis=1, momentum=hyperparams['norm_momentum'],
                                                     name='sparse_features_norm')(dense_layer_sparse_user)
    all_layers = {
        'lstm_layers': user_encoder,
        'bert_layers': bert_user_encoder,
        'numerical_dense_layer': numerical_features_history if 'batchnorm' in ignore_layer \
                    else numerical_features_history_norm,
        'sparse_feat_dense_layer': dense_layer_sparse_user,
    }
    
    layers_to_merge = [l for n,l in all_layers.items() if n not in ignore_layer]
    print(layers_to_merge)
    if len(layers_to_merge) == 1:
        merged_layers = layers_to_merge[0]
    else:
        merged_layers = concatenate(layers_to_merge)
    
    if tf.test.is_gpu_available():
        lstm_user_layers = CuDNNLSTM(hyperparams['lstm_units_user'], 
                                return_sequences='attention_user' not in ignore_layer, # only True if using attention
                      name='LSTM_layer_user')(merged_layers)
    else:
        lstm_user_layers = LSTM(hyperparams['lstm_units_user'], 
                           return_sequences='attention_user' not in ignore_layer,
                      name='LSTM_layer_user')(merged_layers)
    
    # Attention
    if 'attention' not in ignore_layer:
        attention_user = Dense(1, activation='tanh', name='attention_user')(lstm_user_layers)
        attention_user = Flatten()(attention_user)
        attention_user = Activation('softmax')(attention_user)
        attention_user = RepeatVector(hyperparams['lstm_units_user'])(attention_user)
        attention_user = Permute([2, 1])(attention_user)

        user_representation = Multiply()([lstm_user_layers, attention_user])
        user_representation = Lambda(lambda xin: K.sum(xin, axis=1), 
                                     output_shape=(hyperparams['lstm_units_user'],)
                                    )(user_representation)     
    else:
        user_representation = lstm_user_layers
    
    user_representation = Dropout(hyperparams['dropout'], name='lstm_att_dropout_user')(user_representation)
    
    
    if hyperparams['dense_user_units']:
        user_representation = Dense(units=hyperparams['dense_user_units'],
                                   name='dense_user_representation')(user_representation)
    
    # TODO: concatenate before hierarchy? (include all features in th hierarchy)
    
    output_layer = Dense(1, activation='sigmoid',
                         name='output_layer',
                        kernel_regularizer=regularizers.l2(hyperparams['l2_dense']))(user_representation)

    # Compile model
    hierarchical_model = Model(inputs=[posts_history_input, 
                                       numerical_features_history, sparse_features_history,
                                      in_id_bert_history, in_mask_bert_history, in_segment_bert_history], 
                  outputs=output_layer)
    hierarchical_model.summary()
    
    hierarchical_model.compile(hyperparams['optimizer'], binary_crossentropy_custom,
                  metrics=[metrics_class.f1_m, metrics_class.precision_m, metrics_class.recall_m])
    return hierarchical_model



In [None]:
model = build_hierarchical_model(hyperparams, hyperparams_features, embedding_matrix, emotions, stopword_list,
                    liwc_categories=[c for c in categories if c in writings_df.columns]
,
                   ignore_layer=hyperparams['ignore_layer'])


In [None]:
plot_model(model, 'models/hierarchical_model.png')

In [None]:
initialize_vars(sess)

In [None]:
experiment = Experiment(api_key="eoBdVyznAhfg3bK9pZ58ZSXfv",
                        project_name="mental", workspace="ananana", disabled=False)

experiment.log_parameters(hyperparams_features)

experiment.log_parameter('emotion_lexicon', nrc_lexicon_path)
experiment.log_parameter('emotions', emotions)
experiment.log_parameter('embeddings_path', pretrained_embeddings_path)
if 'subset' in writings_df.columns:
    experiment.add_tag('anorexia')

experiment.log_parameters(hyperparams)

## Train

In [None]:
class WeightsHistory(callbacks.Callback):
    def on_train_begin(self, logs={}):
        self.log_weights(0)

    def on_epoch_end(self, epoch, logs={}):
        if epoch % 10 == 0:
            self.log_weights(epoch)
        
    def log_weights(self, step):
        for layer in model.layers:
            try:
                experiment.log_histogram_3d(layer.get_weights()[0], 
                                            name=layer.name, step=step)
            except Exception as e:
#                 logger.debug("Logging weights error: " + str(e) + "\n")
                # Layer probably does not exist
                pass


class FreezeLayer(callbacks.Callback):
    def __init__(self, logs={}, patience=5, layer={'user_encoder':'embeddings_layer'}, verbose=1, set_to=False):
        super(FreezeLayer, self).__init__()
        self.freeze_epoch = patience
        self.freeze_layer = layer
        self.verbose = verbose
        self.set_to = set_to

    def on_epoch_begin(self, epoch, logs={}):
        if type(self.freeze_layer)==dict:
            submodel = model.get_layer(list(self.freeze_layer.keys())[0])
        else:
            submodel = model
        logging.debug("Trainable embeddings", submodel.get_layer(self.freeze_layer).trainable)
        if epoch == self.freeze_epoch:
            try:
                layer = submodel.get_layer(self.freeze_layer)
                old_value = layer.trainable
                layer.trainable = self.set_to
                # TODO: does this reset the optimizer? should I also compile the top-level model?
                model.compile(hyperparams['optimizer'], binary_crossentropy_custom,
                  metrics=[metrics_class.f1_m, metrics_class.precision_m, metrics_class.recall_m])
                if self.verbose:
                    logging.debug("Setting %s layer from %s to trainable=%s...\n" % (layer.name, old_value,
                                                                   submodel.get_layer(self.freeze_layer).trainable))
            except Exception as e:
                # layer probably does not exist
                pass

In [None]:
early_stopping_patience=50
def train_model(model, 
                data_generator_train, data_generator_valid,
                epochs, class_weight, start_epoch=0, workers=4,
                callback_list = [],
                model_path='/tmp/model',
               verbose=1):
    logging.info('Train...')
    experiment.log_parameter('class_weight', class_weight.values())
    experiment.log_parameter('callbacks', callbacks)

    history = model.fit_generator(data_generator_train,
                steps_per_epoch=100,
              epochs=epochs, initial_epoch=start_epoch, 
              class_weight=class_weight,
              validation_data=data_generator_valid,
                        verbose=verbose,
#               validation_split=0.3,
                       workers=workers,
            callbacks = [
                callbacks.ModelCheckpoint(filepath='%s_best' % model_path, verbose=1, 
                                          save_best_only=True),
                callbacks.EarlyStopping(patience=early_stopping_patience), *callback_list
            ])
    model.save(model_path)
    experiment.log_parameter('model_path', model_path)
    return model, history

In [None]:
%%time

freeze_layer = FreezeLayer(patience=hyperparams['freeze_patience'], set_to=not hyperparams['trainable_embeddings'])
weights_history = WeightsHistory()
reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=hyperparams['reduce_lr_factor'],
                          patience=hyperparams['reduce_lr_patience'], min_lr=0.000001, verbose=1)
data_generator_train = DataGeneratorHierarchical(user_level_data, subjects_split, set_type='train',
                                                max_posts_per_user=hyperparams_features['posts_per_user'])
data_generator_valid = DataGeneratorHierarchical(user_level_data, subjects_split, set_type='valid',
                                                max_posts_per_user=hyperparams_features['posts_per_user'])
# Note: FreezeLayer callback doesn't work with hierarchical architecture
model, history = train_model(model, data_generator_train, data_generator_valid,
                       epochs=100,
                      class_weight={0:0.5, 1:5}, start_epoch=0,
                      callback_list = [],#[weights_history, reduce_lr],
                      model_path='models/ham_bert_user_selfharm', workers=4)

In [None]:
# TODO: properly extract the test data without sampling
model.evaluate(DataGenerator(user_level_data, subjects_split, set_type='test', max_posts_per_user=1000))

In [None]:
dependencies = {
    'f1_m': metrics_class.f1_m,
    'precision_m': metrics_class.precision_m,
    'recall_m': metrics_class.recall_m,
    'binary_crossentropy_custom': binary_crossentropy_custom
}
model = load_model('ham_bert_user_selfharm_best', custom_objects=dependencies)


In [None]:
pd.Series([v for v in model.get_layer('attention').get_weights()[0].flatten()]).rolling(50).mean().plot()

In [None]:
pd.Series([abs(v) for v in model.get_layer('output_layer').get_weights()[0].flatten()]).plot()

### Feature importance

In [None]:
features = [
    (e, 'nrc') for e in emotions] + ['pers_pronouns'] + [
    (c, 'liwc') for c in list(categories) if c in writings_df.columns] + [
(st, 'stopword') for st in stopword_list]
weights = model.get_layer('output_layer').get_weights()[0].tolist()[-(len(features)):]

print(len(weights), len(features))
feature_importance = {}
for (i, f) in enumerate(features):
    feature_importance[f] = weights[i][0]

sorted(feature_importance.items(), key=lambda t: abs(t[1]), reverse=True)

## Evaluate per user

In [None]:
def get_data_for_point(subject, voc, hyperparams_features=hyperparams_features, nrc_lexicon=nrc_lexicon,
                      emotions=emotions):
    eval_writings_df = writings_df[writings_df['subject']==subject]
    correct_label = eval_writings_df.label.values[0]
    (x_train, y_train), (x_valid, y_valid), (x_test, y_test), voc = load_erisk_data(eval_writings_df,
                        seq_len=hyperparams_features['maxlen'],
                        voc_size=hyperparams_features['max_features'],
                        emotion_lexicon=nrc_lexicon,
                        emotions=emotions, user_level=False,
                        train_prop=0.0, vocabulary=voc)
    return x_test, y_test, correct_label

In [None]:
def predict_per_user(writings_df, majority_prop=0.2, train_prop=0.7, majority_nr=0, validate=False, voc=None,
                    random=False, nr_slices=5, test_slice=2):
    all_predictions = []
    all_labels = []
    tp = 0
    tn = 0
    fp = 0
    fn = 0
    thresh=0.5
    majority_proportion=majority_prop
    valid_prop = 0.3
    
    if 'subset' in writings_df.columns:
        training_subjects = list(set(writings_df[writings_df['subset']=='train'].subject))
        test_subjects = list(set(writings_df[writings_df['subset']=='test'].subject))
    else:
        all_subjects = sorted(list(set(writings_df.subject)))
        training_subjects_size = int(len(all_subjects) * train_prop)
        test_subjects_size = len(all_subjects) - training_subjects_size
        # Cross-validation, with fixed slice as input
        test_prop = 1-train_prop
        test_slice = min(test_slice, nr_slices)
        logger.debug("start index: %f, from %f\n" % (
            len(all_subjects)*(1/nr_slices)*test_slice, test_prop*test_slice))
        start_slice = int(len(all_subjects)*(1/nr_slices)*test_slice)
        test_subjects = all_subjects[start_slice: start_slice+test_subjects_size]
        training_subjects = [s for s in all_subjects if s not in test_subjects]
    training_subjects = sorted(training_subjects) # ensuring reproducibility
    valid_subjects_size = int(len(training_subjects) * valid_prop)
    valid_subjects = training_subjects[:valid_subjects_size]
    training_subjects = training_subjects[valid_subjects_size:]
    
    if validate:
        subjects = valid_subjects
    else:
        subjects = test_subjects
    for subject in subjects:
        x_test_user, y_test_user, label = get_data_for_point(subject, voc=voc)
        outputs = model.predict(x_test_user)
        if random:
            sigma = np.std(outputs)
            mu = np.mean(outputs)
            print("generating random outputs with sigma", sigma, "and mu", mu)
            outputs = sigma*np.random.randn(len(outputs))+mu
        positive_pred = sum(outputs>=thresh)
        negative_pred = sum(outputs<thresh)
        majority_pred = 0
        if majority_proportion and positive_pred >= majority_proportion*negative_pred:
            majority_pred = 1
        if majority_nr and positive_pred>=majority_nr:
            majority_pred = 1
        if label == 1:
            if majority_pred == 1:
                tp+=1
            else:
                fn+=1
        else:
            if majority_pred == 0:
                tn+=1
            else:
                fp+=1
        print(negative_pred, positive_pred, majority_pred)
        all_predictions.append(majority_pred)
        all_labels.append(label)
    def prec_recall_f1(tp, fp, tn, fn):
        recall = tp/(tp+fn+0.0000001)
        precision = tp/(tp+fp+0.0000001)
        f1 = 2*precision*recall/(precision+recall+0.0000001)
        print("Recall", recall, "Precision", precision, "F1", f1)
    if majority_prop:
        print("Vote proportion", majority_prop)
    if majority_nr:
        print("Vote points", majority_nr)
    prec_recall_f1(tp, fp, tn, fn)

        

In [None]:
predict_per_user(writings_df=writings_df, voc=voc, majority_prop=0.2)

## Cross-validation

In [None]:
results_per_slice = {}

In [None]:
nr_slices=5
logger.setLevel(logging.INFO)
for tslice in range(nr_slices): 
    (x_train, y_train), (x_valid, y_valid), (x_test, y_test), voc = load_erisk_data(writings_df, 
                                                                seq_len=hyperparams_features['maxlen'],
                                                                voc_size=hyperparams_features['max_features'],
                                                               emotion_lexicon=nrc_lexicon,
                                                               emotions=emotions,
                                                               user_level=hyperparams_features['user_level'],
                                                                                    test_slice=tslice,
                                                                                    nr_slices=nr_slices,
    #                                                            vocabulary=pickle.load(open('vocabulary20K_selfharm.pkl', 'rb'))
                                                                                   logger=logger)
    model, history = train_model(model, x_train, y_train, x_valid, y_valid,
           epochs=200, batch_size=hyperparams['batch_size'],
                      class_weight={0:0.5, 1:5}, start_epoch=0,
                      callback_list = [freeze_layer, weights_history, reduce_lr],
                      workers=2, verbose=0)
    results_per_slice[tslice] = model.evaluate(x_test, y_test)
    logger.info("Results for slice %d: %s\n" % (tslice, results_per_slice[tslice]))

In [None]:
print("Average F1 score: ", np.array([results_per_slice[s][1] for s in results_per_slice.keys()]).mean(),
     "all F1 scores: ", {s: v[1] for (s,v) in results_per_slice.items()} )

## Extra analysis


In [None]:
def merge_tokens(row):
    tokens = []
    if row.tokenized_text:
        tokens += row.tokenized_text
    if row.tokenized_title:
        tokens += row.tokenized_title
    return tokens
writings_df['all_tokens'] = writings_df.apply (lambda row: merge_tokens(row), axis=1)

In [None]:
# TODO: include the title
def extract_emotions(tokens, emotion, relative=True):
    if not tokens:
        return None
    emotion_words = [t for t in tokens 
                     if t in nrc_lexicon[emotion]]
    if relative:
        return len(emotion_words) / len(tokens)
    else:
        return len(emotion_words)
    
    return encoded_emotions

from functools import partial
for emotion in emotions:
    writings_df[emotion] = writings_df['all_tokens'].apply(partial(extract_emotions, emotion=emotion, 
                                                                   relative=True))


In [None]:
writings_df['pronouns'] = writings_df['all_tokens'].apply(partial(encode_pronouns, relative=True))

In [None]:
writings_df[['text', 'label', 'pronouns', 'text_len'] + emotions].corr()

In [None]:
writings_df[['text', 'label', 'pronouns', 'text_len'] + emotions].groupby('label').mean()

In [None]:
from nltk.sentiment import SentimentAnalyzer, SentimentIntensityAnalyzer

In [None]:
sid = SentimentIntensityAnalyzer()


In [None]:
sid.polarity_scores("We are here today happiness is all around")

In [None]:
writings_df['neg_vader'] = writings_df.text.apply(lambda t: sid.polarity_scores(t)['neg']
                                                 if type(t)==str else 0)

In [None]:
writings_df

In [None]:
writings_df['pos_vader'] = writings_df.text.apply(lambda t: sid.polarity_scores(t)['pos']
                                                 if type(t)==str else 0)

In [None]:
writings_df[['text', 'label', 'pronouns', 'text_len', 'neg_vader', 'pos_vader'] + emotions].groupby('label').mean()

In [None]:
writings_df[['text', 'label', 'pronouns', 'text_len', 'neg_vader', 'pos_vader'] + emotions].corr('spearman')

### LIWC

In [None]:
from liwc_readDict import readDict

liwc = readDict('/home/ana/resources/FakeOrFact/features/LIWC/LIWC/liwc.dic')

In [None]:
categories = [c for (w,c) in liwc]
set(categories)

In [None]:
liwc

In [None]:
liwc_dict = {}
for (w, c) in liwc:
    if c not in liwc_dict:
        liwc_dict[c] = []
    liwc_dict[c].append(w)


In [None]:
liwc_dict['pronoun']

In [None]:
def encode_liwc_categories(tokens, category_words, relative=True):
    category_cnt = 0
    if not tokens:
        return None
    text_len = len(tokens)
    for t in tokens:
        for word in category_words:
            if t==word or (word[-1]=='*' and t.startswith(word[:-1])) \
            or (t==word.split("'")[0]):
                category_cnt += 1
                break # one token cannot belong to more than one word in the category
    if relative:
        return category_cnt/text_len
    else:
        return category_cnt

In [None]:
%%time
from functools import partial
# for categ in ['negemo', 'posemo', 'affect', 'sad', 'anx', 'pronoun']:#liwc_dict.keys():
for categ in liwc_dict.keys():
    if categ in writings_df.columns:
        continue
    print("Encoding for category %s..." % categ)
    writings_df[categ] = writings_df['all_tokens'].apply(partial(encode_liwc_categories, 
                                                                   category_words=liwc_dict[categ], 
                                                                   relative=True))


In [None]:
writings_df.groupby('subject').mean()[['label', 'negemo', 'posemo', 'affect', 'sad', 'anx', 'pronoun']].corr()

In [None]:
writings_df[['label', 'negemo', 'posemo', 'affect', 'sad', 'anx', 'pronoun']].groupby('label').mean()

In [None]:
writings_df.groupby('subject').mean()[['label'] + categories].corr()

## Hyperparameter tuning

In [None]:
# Declare your hyperparameters search:
tune_epochs=150
config = {
      "algorithm": "random",
      "parameters": {
          "lstm_units": {"type": "integer", "min": 10, "max": 1000},
          "dense_bow_units": {"type": "integer", "min": 1, "max": 50},
          "lr": {"type": "float", "min": 0.00001, "max": 0.5, "scalingType": "loguniform"},
          "l2_dense": {"type": "float", "min": 0.0000001, "max": 0.05, "scalingType": "loguniform"},
          "l2_embeddings": {"type": "float", "min": 0.0000001, "max": 0.05, "scalingType": "loguniform"},
          "dropout": {"type": "float", "min": 0, "max": 0.7, "scalingType": "uniform"},
          "norm_momentum": {"type": "float", "min": 0.01, "max": 0.99, "scalingType": "uniform"},
          "optimizer": {"type": "categorical", "values": ["adam", "adagrad", ""]},
          "batch_size": {"type": "integer", "min": 10, "max": 512, "scalingType": "loguniform"},
          "positive_class_weight": {"type": "integer", "min": 1, "max": 25},
          "trainable_embeddings": {"type": "discrete", "values": [True, False]},
          "freeze_patience": {"type": "integer", "min": 2, "max": tune_epochs+1},
          "lr_reduce_factor": {"type": "float", "min": 0.0001, "max": 0.8},
          "lr_reduce_patience": {"type": "integer", "min": 2, "max": tune_epochs+1},
          "decay": {"type": "float", "min": 0.00000001, "max": 0.5, "scalingType": "loguniform"},
          "ignore_layers_values": {"type": "categorical", "values": ["attention", "batchnorm", ""]}
      },
      "spec": {
          "metric": "loss",
          "objective": "minimize",
      },
  }
optimizer = Optimizer(config, api_key="eoBdVyznAhfg3bK9pZ58ZSXfv")

for experiment in optimizer.get_experiments(project_name="mental"):
    experiment.add_tag("tune")
    
    # Test the model
    hyperparams_config = {
        param: experiment.get_parameter(param) for param in config['parameters'].keys()}
    if not hyperparams_config['optimizer']:
        hyperparams_config['optimizer'] = optimizers.Adam(lr=hyperparams_config['lr'], 
                                   decay=hyperparams_config['decay'])
    hyperparams_config["ignore_layers"] = []
    if hyperparams_config["ignore_layers_values"]:
        hyperparams_config["ignore_layers"] = [hyperparams_config["ignore_layers_values"]]
    model = build_model(hyperparams=hyperparams_config,
                        hyperparams_features=hyperparams_features, 
                        embedding_matrix=embedding_matrix, emotions=emotions,
                       stopwords_list=stopword_list, liwc_categories=categories)
    freeze_layer = FreezeLayer(patience=experiment.get_parameter('freeze_patience'),
                              set_to=not experiment.get_parameter('trainable_embeddings'))
    reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_loss', 
                                            factor=experiment.get_parameter('lr_reduce_factor'),
                                            patience=experiment.get_parameter('lr_reduce_patience'), 
                                            min_lr=0.000001, verbose=1)
    model, history = train_model(model, 
            x_train, y_train, x_test, y_test,
            epochs=tune_epochs, batch_size=experiment.get_parameter('batch_size'),
                      class_weight={0:1, 1:experiment.get_parameter('positive_class_weight')}, 
                          workers=2,
                          callback_list = [freeze_layer, reduce_lr],
                      model_path='models/experiment')
    loss = history.history['loss'][-1]
    
    # Report the loss, if not auto-logged:
    experiment.log_metric("loss", loss)