In [1]:
import pandas as pd
import xml.etree.ElementTree as ET
import glob, os
import numpy as np
from comet_ml import Experiment, Optimizer
import pickle
import logging
import sys
from sklearn.utils import class_weight

In [2]:
os.environ['TF_KERAS'] = '1'
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Lambda, BatchNormalization, TimeDistributed, \
    CuDNNLSTM, Bidirectional, Input, concatenate, Flatten, RepeatVector, Activation, Multiply, Permute
from tensorflow.keras import regularizers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import callbacks, optimizers
from tensorflow.keras import backend as K
from tensorflow.keras.utils import plot_model, Sequence

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

import tensorflow as tf

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
# only reserve 1 GPU

In [4]:
# tf.enable_eager_execution()
my_seed = 1234
tf.set_random_seed(my_seed)

In [5]:
logger = logging.getLogger('training')
logger.addHandler(logging.StreamHandler(sys.stdout))
logger.setLevel(logging.DEBUG)

In [6]:
import sys
sys.setrecursionlimit(10000)

In [7]:
tf.test.is_built_with_cuda()

True

# Read data

In [8]:
def read_subject_writings(subject_file):
    writings = []
    with open(subject_file) as sf:
        contents = sf.read()
        root = ET.fromstring(contents)
        try:
            subject = root.findall('ID')[0].text.strip()
        except Exception:
            print('Cannot extract ID', contents[:500], '\n-------\n')        
        for w in root.iter('WRITING'):
            subject_writings = {'subject': subject}
            for title in w.findall('TITLE'):
                subject_writings['title'] = title.text
            for text in w.findall('TEXT'):
                subject_writings['text'] = text.text
            for date in w.findall('DATE'):
                subject_writings['date'] = date.text
            writings.append(subject_writings)
    return writings

In [9]:
root_dir = '/home/anasab/' 
# root_dir = '/home/ana/'

### eRisk 2020 T1

In [10]:
datadir_T1 = root_dir + '/eRisk/data/eRisk2020_T1_train/eRISK2020_T1_training_data/eRISK2020_training_data/data/'
labels_file_T1 = root_dir + '/eRisk/data//eRisk2020_T1_train/eRISK2020_T1_training_data/eRISK2020_training_data/golden_truth.txt'

In [11]:
def read_texts_2020(datadir_T1, labels_file_T1):
    writings = []
    for subject_file in os.listdir(datadir_T1):
        print(subject_file)
        writings.extend(read_subject_writings(os.path.join(datadir_T1, subject_file)))
    writings_df = pd.DataFrame(writings)

    labels_T1 = pd.read_csv(labels_file_T1, delimiter=' ', names=['subject', 'label'])
    labels_T1 = labels_T1.set_index('subject')

    writings_df['label'] = writings_df['subject'].apply(
    lambda s: labels_T1.loc[s, 'label'])
    
    return writings_df



### eRisk 2019 T1 (Anorexia)

In [12]:
datadirs_T1_2019 = {
    'train': ['2018 test/', '2018 train/positive_examples/', '2018 train/negative_examples/'],
    'test': ['data/']
}
datadir_root_T1_2019 = {
    'train': root_dir + '/eRisk/data/past/eRisk2019_T1/training data - t1/',
    'test': root_dir + '/eRisk/data/past/eRisk2019_T1/test data - T1/'
}
    
labels_files_T1_2019 = {
    'train': ['2018 train/risk_golden_truth.txt', '2018 test/risk-golden-truth-test.txt'],
    'test': ['T1_erisk_golden_truth.txt']
}

In [13]:
def read_texts_2019(datadir_root_T1_2019,
                   datadirs_T1_2019,
                   labels_files_T1_2019,
                   test_suffix='0000'):
    writings = {'train': [], 'test': []}
    writings_df = pd.DataFrame()
    labels_df = pd.DataFrame()

    for subset in ('train', 'test'):
        for subdir in [os.path.join(datadir_root_T1_2019[subset], subp) for subp in datadirs_T1_2019[subset]]:
            if subset=='train':
                chunkdirs = [os.path.join(datadir_root_T1_2019[subset], subdir, chunkdir) 
                             for chunkdir in os.listdir(subdir)]
            else:
                chunkdirs = [os.path.join(datadir_root_T1_2019[subset], subdir)]
                
            for chunkdir in chunkdirs:
                if not os.path.isdir(chunkdir):
                    continue
                for subject_file in os.listdir(chunkdir):
                    writings[subset].extend(read_subject_writings(os.path.join(chunkdir, subject_file)))
        writings_df_part = pd.DataFrame(writings[subset])
        # add a suffix for users in the test -- the numbers are duplicated with the ones in train
        if subset=='test':
            writings_df_part['subject'] = writings_df_part['subject'].apply(lambda s: s+test_suffix)
            print(subset, writings_df_part.subject)
        writings_df_part['subset'] = subset
        writings_df = pd.concat([writings_df, writings_df_part])
        writings_df.reindex()

        for label_file in labels_files_T1_2019[subset]:
            labels = pd.read_csv(os.path.join(datadir_root_T1_2019[subset], label_file), 
                                 delimiter='\s+', names=['subject', 'label'])
            # add a suffix for users in the test -- the numbers are duplicated with the ones in train
            if subset=='test':
                labels['subject'] = labels['subject'].apply(lambda s: s+test_suffix)
            labels_df = pd.concat([labels_df, labels])
    labels_df = labels_df.drop_duplicates()
    labels_df = labels_df.set_index('subject')

    writings_df = writings_df.drop_duplicates()
    
    writings_df = writings_df.join(labels_df, on='subject')
    
    return writings_df

## Preprocess text

In [14]:
# writings_df = read_texts_2020(datadir_T1, labels_file_T1)
# writings_df = read_texts_2019(datadir_root_T1_2019,
#                    datadirs_T1_2019,
#                    labels_files_T1_2019)
writings_df = pickle.load(open('writings_df_anorexia_liwc', 'rb'))

In [15]:
writings_df.label.hist()

<matplotlib.axes._subplots.AxesSubplot at 0x7f0eef092a90>

In [16]:
writings_df.head()

Unnamed: 0,subject,title,date,text,label,tokenized_title,title_len,tokenized_text,text_len,all_tokens,...,feel,excl,future,nonfl,ppron,shehe,i,we,you,they
0,subject8292,If anyone could help with which sub to put thi...,2016-08-02 09:22:12,,0,"[if, anyone, could, help, with, which, sub, to...",11.0,,,"[if, anyone, could, help, with, which, sub, to...",...,0.0,0.090909,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,subject8292,I'm literally never gonna stop waiting...,2016-08-05 09:35:55,,0,"[i, m, literally, never, gonna, stop, waiting]",7.0,,,"[i, m, literally, never, gonna, stop, waiting]",...,0.0,0.0,0.285714,0.0,0.142857,0.0,0.142857,0.0,0.0,0.0
2,subject8292,This is a really interesting study! Makes sens...,2016-08-05 21:36:24,,0,"[this, is, a, really, interesting, study, make...",9.0,,,"[this, is, a, really, interesting, study, make...",...,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,subject8292,The only thing Frank is building ...,2016-08-07 23:35:23,"... Is hype. Think about it, every time he wor...",0,"[the, only, thing, frank, is, building]",6.0,"[is, hype, think, about, it, every, time, he, ...",26.0,"[is, hype, think, about, it, every, time, he, ...",...,0.0,0.0,0.0625,0.0,0.0625,0.03125,0.0,0.03125,0.0,0.0
4,subject8292,Mostly always me during this whole charade,2016-08-09 08:39:41,,0,"[mostly, always, me, during, this, whole, char...",7.0,,,"[mostly, always, me, during, this, whole, char...",...,0.0,0.0,0.0,0.0,0.142857,0.0,0.142857,0.0,0.0,0.0


In [17]:
tokenizer = RegexpTokenizer(r'\w+')

def tokenize(t):
    return tokenizer.tokenize(t.lower())

In [18]:
def tokenize_fields(writings_df):
    writings_df['tokenized_title'] = writings_df['title'].apply(lambda t: tokenize(t) 
                                                                if type(t)==str and t else None)
    writings_df['title_len'] = writings_df['tokenized_title'].apply(lambda t: len(t) 
                                                                    if type(t)==list and t else None)
    writings_df['tokenized_text'] = writings_df['text'].apply(lambda t: tokenize(t) 
                                                              if type(t)==str and t else None)
    writings_df['text_len'] = writings_df['tokenized_text'].apply(lambda t: len(t) 
                                                                  if type(t)==list and t else None)
    return writings_df

In [19]:
writings_df.text_len.describe()

count    127941.000000
mean         32.268929
std          82.590713
min           0.000000
25%           6.000000
50%          13.000000
75%          31.000000
max        7201.000000
Name: text_len, dtype: float64

In [20]:
writings_df.title_len.describe()

count    49762.000000
mean        10.699771
std          9.282454
min          0.000000
25%          4.000000
50%          8.000000
75%         14.000000
max        149.000000
Name: title_len, dtype: float64

In [21]:
writings_df.groupby('subject').mean().describe()

Unnamed: 0,label,title_len,text_len,funct,article,affect,negemo,sad,cogmech,inhib,...,feel,excl,future,nonfl,ppron,shehe,i,we,you,they
count,340.0,336.0,340.0,340.0,340.0,340.0,340.0,340.0,340.0,340.0,...,340.0,340.0,340.0,340.0,340.0,340.0,340.0,340.0,340.0,340.0
mean,0.120588,9.514427,33.122855,0.425434,0.049284,0.08099,0.023242,0.003515,0.125608,0.00473,...,0.005483,0.02251,0.089113,0.00274,0.07982,0.008573,0.040263,0.004642,0.020469,0.005873
std,0.326128,4.714271,31.874155,0.085804,0.013413,0.032889,0.011015,0.002891,0.031706,0.002874,...,0.004131,0.00974,0.028436,0.004729,0.027475,0.0071,0.020218,0.004367,0.01231,0.00404
min,0.0,1.0,1.0,0.018182,0.005237,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,6.60119,14.402079,0.379003,0.042321,0.06309,0.01611,0.001949,0.109709,0.002829,...,0.003234,0.016795,0.073775,0.000669,0.064952,0.004052,0.027496,0.002302,0.013474,0.003566
50%,0.0,8.660264,24.212121,0.439643,0.049415,0.074207,0.022189,0.002867,0.127451,0.004594,...,0.004879,0.022203,0.088307,0.001581,0.077149,0.006945,0.03742,0.003735,0.018725,0.005123
75%,0.0,11.526931,37.878342,0.47855,0.057188,0.090063,0.028348,0.004319,0.14571,0.006224,...,0.006712,0.0273,0.105534,0.002902,0.093965,0.01143,0.04984,0.005544,0.025045,0.007488
max,1.0,32.166667,266.446446,0.646948,0.095561,0.27052,0.073699,0.020833,0.251136,0.02218,...,0.042094,0.090475,0.202499,0.045799,0.213871,0.069447,0.133143,0.037712,0.099026,0.041093


In [22]:
writings_df.groupby('subject').max().groupby('label').count()

Unnamed: 0_level_0,date,title_len,text_len,all_tokens,funct,article,affect,negemo,sad,cogmech,...,feel,excl,future,nonfl,ppron,shehe,i,we,you,they
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,299,296,299,299,299,299,299,299,299,299,...,299,299,299,299,299,299,299,299,299,299
1,41,40,41,41,41,41,41,41,41,41,...,41,41,41,41,41,41,41,41,41,41


In [23]:
print("Average number of posts per user", writings_df.groupby('subject').count().title.mean())
print("Average number of comments per user", writings_df.groupby('subject').count().text.mean())


Average number of posts per user 146.35882352941175
Average number of comments per user 376.2970588235294


In [24]:
writings_df.groupby('subject').count().title.describe()

count    340.000000
mean     146.358824
std      240.998992
min        0.000000
25%       13.000000
50%       42.500000
75%      148.500000
max      998.000000
Name: title, dtype: float64

In [25]:
writings_df.groupby('subject').count().text.describe()

count     340.000000
mean      376.297059
std       379.091730
min         1.000000
25%        54.000000
50%       214.500000
75%       646.000000
max      1350.000000
Name: text, dtype: float64

# Recurrent NN

## Extract features and encode data

In [16]:
hyperparams_features = {
    "max_features": 40000,
    # cut texts after this number of words
    # (among top max_features most common words)
    "maxlen": 512,
    "embedding_dim": 100,
    "user_level": True,
    "posts_per_user": 50,
    "batch_size": 32,
}


### Emotions

In [17]:
def load_NRC(nrc_path):
    word_emotions = {}
    emotion_words = {}
    with open(nrc_path) as in_f:
        for line in in_f:
            line = line.strip()
            if not line:
                continue
            word, emotion, label = line.split()
            if word not in word_emotions:
                word_emotions[word] = set()
            if emotion not in emotion_words:
                emotion_words[emotion] = set()
            label = int(label)
            if label:
                word_emotions[word].add(emotion)
                emotion_words[emotion].add(word)
    return emotion_words

nrc_lexicon_path = root_dir + '/resources/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt'
nrc_lexicon = load_NRC(nrc_lexicon_path)
emotions = list(nrc_lexicon.keys())


In [18]:
def encode_emotions(tokens, emotion_lexicon, emotions, relative=True):
    text_len = len(tokens)
    encoded_emotions = [0 for e in emotions]
    for i, emotion in enumerate(emotions):
        try:
            emotion_words = [t for t in tokens if t in emotion_lexicon[emotion]]
            if relative:
                encoded_emotions[i] = len(emotion_words) / len(tokens)
            else:
                encoded_emotions[i] = len(emotion_words)
        except ValueError:
            print("Emotion not found.")
    return encoded_emotions

In [19]:
from liwc_readDict import readDict

liwc = readDict(root_dir + '/resources/liwc.dic')

categories = set([c for (w,c) in liwc])
len(categories)

64

### Style features

#### Char n-grams

In [20]:
def extract_ngrams(tokens):
    pass

#### Personal pronouns

In [21]:
first_person_pronouns = {"i", "me", "my", "mine", "myself"}
def encode_pronouns(tokens, pronouns={"i", "me", "my", "mine", "myself"}, relative=True):
    if not tokens:
        return np.nan
    text_len = len(tokens)
    nr_pronouns = len([t for t in tokens if t in pronouns])
    if relative:
        return nr_pronouns/text_len
    else:
        return nr_pronouns

#### Stopwords

In [22]:
stopword_list = stopwords.words("english")
def encode_stopwords(tokens, stopwords=stopword_list):
    encoded_stopwords = [0 for s in stopword_list]
    if not tokens:
        return encoded_stopwords
    for i, stopword in enumerate(stopwords):
        if stopword in tokens:
            encoded_stopwords[i] += 1
    return encoded_stopwords

### Topics

## BERT


In [23]:
import tensorflow_hub as hub
bert_path = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"
from bert.tokenization import FullTokenizer

# TODO: do this at the beginning? Also initialize variables?
sess = tf.Session()

W0306 17:19:08.063771 139706427881216 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


In [24]:
class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Constructs a InputExample.
    Args:
      guid: Unique id for the example.
      text_a: string. The untokenized text of the first sequence. For single
        sequence tasks, only this sequence must be specified.
      text_b: (Optional) string. The untokenized text of the second sequence.
        Only must be specified for sequence pair tasks.
      label: (Optional) string. The label of the example. This should be
        specified for train and dev examples, but not for test examples.
    """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label

In [25]:
def encode_text_for_bert(tokenizer, example, max_seq_length=256):
    """Converts a single `InputExample` into a single `InputFeatures`."""

#     if isinstance(example, PaddingInputExample):
#         input_ids = [0] * max_seq_length
#         input_mask = [0] * max_seq_length
#         segment_ids = [0] * max_seq_length
#         label = 0
#         return input_ids, input_mask, segment_ids, label

    tokens_a = tokenizer.tokenize(example.text_a)
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0 : (max_seq_length - 2)]

    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    return input_ids, input_mask, segment_ids, example.label



In [26]:
def create_tokenizer_from_hub_module():
    """Get the vocab file and casing info from the Hub module."""
    bert_module =  hub.Module(bert_path)
    tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
    vocab_file, do_lower_case = sess.run(
        [
            tokenization_info["vocab_file"],
            tokenization_info["do_lower_case"],
        ]
    )

    return FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

In [27]:
# Instantiate tokenizer
bert_tokenizer = create_tokenizer_from_hub_module()

encode_text_for_bert(bert_tokenizer, InputExample(None, 
                                               "Ana are mere"), 
                       hyperparams_features['maxlen'])

Instructions for updating:
Colocations handled automatically by placer.


W0306 17:19:09.426190 139706427881216 deprecation.py:323] From /usr/local/tensorflow/python3.5/1.13.1/lib/python3.5/site-packages/tensorflow/python/ops/control_flow_ops.py:3632: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0306 17:19:11.807401 139706427881216 saver.py:1483] Saver not created because there are no variables in the graph to restore


([101,
  9617,
  2024,
  8210,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  

In [28]:
def initialize_vars(sess):
    sess.run(tf.local_variables_initializer())
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    K.set_session(sess)

### Encode data

In [29]:
from collections import Counter
def load_erisk_data(writings_df, voc_size, emotion_lexicon, seq_len, emotions =  
                    ['anger', 'anticipation', 'disgust', 'fear', 'joy', 
                     'negative', 'positive', 'sadness', 'surprise', 'trust'],
                    liwc_categories = categories,
                    pronouns = ["i", "me", "my", "mine", "myself"],
                    train_prop=0.7, valid_prop=0.3, test_slice=2,
                    nr_slices=5,
                    min_post_len=3, min_word_len=1, 
                    user_level=True, vocabulary=None,
                   logger=logger):
    logger.debug("Loading data...\n")
    if not vocabulary:
        vocabulary = {}
        word_freqs = Counter()
        for words in writings_df.tokenized_text:
            word_freqs.update(words)
        for words in writings_df.tokenized_title:
            word_freqs.update(words)
        i = 1
        for w, f in word_freqs.most_common(voc_size-2): # keeping voc_size-1 for unk
            if len(w) < min_word_len:
                continue
            vocabulary[w] = i
            i += 1
   
    if 'subset' in writings_df.columns:
        training_subjects = list(set(writings_df[writings_df['subset']=='train'].subject))
        test_subjects = list(set(writings_df[writings_df['subset']=='test'].subject))
    else:
        all_subjects = sorted(list(set(writings_df.subject)))
        training_subjects_size = int(len(all_subjects) * train_prop)
        test_subjects_size = len(all_subjects) - training_subjects_size
        # Cross-validation, with fixed slice as input
        test_prop = 1-train_prop
        test_slice = min(test_slice, nr_slices)
        logger.debug("start index: %f, from %f\n" % (
            len(all_subjects)*(1/nr_slices)*test_slice, test_prop*test_slice))
        start_slice = int(len(all_subjects)*(1/nr_slices)*test_slice)
        test_subjects = all_subjects[start_slice: start_slice+test_subjects_size]
        training_subjects = [s for s in all_subjects if s not in test_subjects]
    training_subjects = sorted(training_subjects) # ensuring reproducibility
    valid_subjects_size = int(len(training_subjects) * valid_prop)
    valid_subjects = training_subjects[:valid_subjects_size]
    training_subjects = training_subjects[valid_subjects_size:]
    categories = [c for c in liwc_categories if c in writings_df.columns]
    logger.debug("%d training users, %d validation users, %d test users." % (
        len(training_subjects), 
          len(valid_subjects),
          len(test_subjects)))
    subjects_split = {'train': training_subjects, 
                      'valid': valid_subjects, 
                      'test': test_subjects}

    user_level_texts = {}
    for row in writings_df.sort_values(by='date').itertuples():
        words = []
        raw_text = ""
        if row.tokenized_title:
            words.extend(row.tokenized_title)
            raw_text += row.title
        if row.tokenized_text:
            words.extend(row.tokenized_text)
            raw_text += row.text
        if not words or len(words)<min_post_len:
            print(row.subject)
            continue
        label = row.label
        liwc_categs = [getattr(row, categ) for categ in categories]
        if row.subject not in user_level_texts.keys():
            user_level_texts[row.subject] = {}
            user_level_texts[row.subject]['texts'] = [words]
            user_level_texts[row.subject]['label'] = label
            user_level_texts[row.subject]['liwc'] = [liwc_categs]
            user_level_texts[row.subject]['raw'] = [raw_text]
        else:
            user_level_texts[row.subject]['texts'].append(words)
            user_level_texts[row.subject]['liwc'].append(liwc_categs)
            user_level_texts[row.subject]['raw'].append(raw_text)
            
    return user_level_texts, subjects_split, vocabulary


In [30]:
user_level_data, subjects_split, vocabulary = load_erisk_data(writings_df, 
                                                            seq_len=hyperparams_features['maxlen'],
                                                            voc_size=hyperparams_features['max_features'],
                                                           emotion_lexicon=nrc_lexicon,
                                                           emotions=emotions,
                                                           user_level=hyperparams_features['user_level'],
                                                                                logger=logger
#                                                            vocabulary=pickle.load(open('vocabulary20K_selfharm.pkl', 'rb'))
                                                                               )

Loading data...



I0306 17:19:17.105950 139706427881216 <ipython-input-29-8f3a93e32636>:12] Loading data...



331 training users, 141 validation users, 815 test users.


I0306 17:19:24.046500 139706427881216 <ipython-input-29-8f3a93e32636>:50] 331 training users, 141 validation users, 815 test users.


subject20560000
subject20560000
subject20560000
subject20560000
subject20560000
subject20560000
subject20560000
subject20560000
subject20560000
subject20560000
subject39310000
subject20560000
subject20560000
subject20560000
subject20560000
subject20560000
subject20560000
subject39310000
subject20560000
subject20560000
subject20560000
subject39310000
subject20560000
subject20560000
subject39310000
subject20560000
subject39310000
subject20560000
subject20560000
subject20560000
subject39310000
subject20560000
subject39820000
subject39820000
subject39820000
subject26600000
subject26600000
subject26600000
subject83990000
subject83990000
subject83990000
subject83990000
subject83990000
subject83990000
subject39310000
subject83990000
subject39310000
subject83990000
subject39820000
subject26690000
subject26690000
subject83990000
subject26690000
subject83990000
subject26690000
subject83990000
subject51700000
subject26690000
subject26690000
subject26690000
subject26690000
subject56660000
subject8

subject99290000
subject99290000
subject99290000
subject40480000
subject1369
subject1369
subject50740000
subject3859
subject51700000
subject8338
subject75730000
subject9589
subject38440000
subject3763
subject3813
subject40480000
subject40480000
subject467
subject47530000
subject3763
subject5984
subject1397
subject3859
subject1369
subject467
subject20560000
subject9419
subject3813
subject3763
subject47530000
subject75480000
subject75480000
subject51700000
subject51700000
subject51700000
subject82780000
subject51700000
subject51700000
subject51700000
subject1397
subject82780000
subject51700000
subject3763
subject51700000
subject467
subject3813
subject1397
subject50740000
subject51700000
subject1397
subject3859
subject3859
subject5370000
subject75730000
subject78820000
subject75480000
subject51700000
subject51700000
subject1397
subject1397
subject1397
subject1397
subject82780000
subject3763
subject896
subject8338
subject5370000
subject8338
subject51700000
subject39310000
subject49500000
su

subject467
subject2845
subject467
subject14860000
subject6293
subject2845
subject5532
subject5532
subject5532
subject2845
subject2845
subject2845
subject467
subject2845
subject95780000
subject26690000
subject86280000
subject86280000
subject86280000
subject6293
subject5532
subject3813
subject2845
subject99270000
subject3504
subject51700000
subject6293
subject5532
subject8486
subject64700000
subject97260000
subject57570000
subject3859
subject73720000
subject5776
subject18560000
subject18560000
subject63710000
subject5776
subject86280000
subject86280000
subject86280000
subject896
subject18560000
subject5759
subject15370000
subject26690000
subject5776
subject3504
subject3813
subject5776
subject86280000
subject86280000
subject6293
subject5838
subject52940000
subject86280000
subject5532
subject64700000
subject35220000
subject95780000
subject95780000
subject8338
subject78100000
subject467
subject99270000
subject5776
subject467
subject88350000
subject6792
subject3763
subject9589
subject3504
su

subject18560000
subject98590000
subject83610000
subject3859
subject94420000
subject467
subject10780000
subject4110
subject901
subject83610000
subject81190000
subject65230000
subject65230000
subject35220000
subject5838
subject61070000
subject61070000
subject61070000
subject86280000
subject6083
subject13350000
subject7831
subject11570000
subject95780000
subject6083
subject12600000
subject901
subject6083
subject82900000
subject5711
subject6792
subject83610000
subject81190000
subject81190000
subject6083
subject5711
subject11570000
subject82900000
subject69410000
subject82410000
subject45820000
subject901
subject901
subject5759
subject5759
subject1417
subject75480000
subject77240000
subject1360000
subject5711
subject56660000
subject98590000
subject3504
subject82900000
subject6083
subject6216
subject6670
subject56660000
subject82900000
subject35220000
subject1417
subject1417
subject1417
subject1417
subject1417
subject98590000
subject350
subject350
subject94420000
subject12600000
subject18560

subject4930000
subject23200000
subject4858
subject4858
subject18560000
subject9710000
subject9710000
subject35220000
subject5759
subject6670
subject508
subject9710000
subject9710000
subject9710000
subject6139
subject9710000
subject9710000
subject9710000
subject94420000
subject33520000
subject9710000
subject9710000
subject9710000
subject15130000
subject65230000
subject9710000
subject78820000
subject11570000
subject11570000
subject896
subject77460000
subject896
subject35220000
subject6168
subject35220000
subject33520000
subject35220000
subject29770000
subject9710000
subject77460000
subject18560000
subject23200000
subject35220000
subject94270000
subject896
subject6216
subject23200000
subject3763
subject18560000
subject398
subject9710000
subject6088
subject82900000
subject23200000
subject5127
subject5127
subject5127
subject5127
subject6216
subject896
subject77620000
subject77620000
subject17290000
subject7927
subject7927
subject17290000
subject9710000
subject9710000
subject9710000
subject9

subject5776
subject5776
subject9710000
subject9710000
subject5426
subject77620000
subject8411
subject3400000
subject3400000
subject8411
subject8411
subject3400000
subject69410000
subject1910000
subject35220000
subject95720000
subject4110
subject57960000
subject8411
subject8411
subject3323
subject8411
subject15130000
subject3020000
subject3020000
subject3020000
subject3020000
subject3020000
subject4110
subject3020000
subject4858
subject8411
subject8411
subject50640000
subject57960000
subject5103
subject5103
subject4110
subject531
subject18560000
subject8411
subject2992
subject76730000
subject8411
subject8411
subject2013
subject2013
subject6216
subject18560000
subject85300000
subject77620000
subject94220000
subject18560000
subject18560000
subject9710000
subject8411
subject15130000
subject73720000
subject4590000
subject25300000
subject4110
subject531
subject9803
subject896
subject8411
subject8411
subject7320
subject5103
subject18560000
subject77620000
subject6216
subject1441
subject901
su

subject95720000
subject56660000
subject98370000
subject63450000
subject95720000
subject90080000
subject95720000
subject90910000
subject95720000
subject18800000
subject74130000
subject95720000
subject95720000
subject6216
subject50640000
subject95720000
subject13700000
subject57280000
subject90080000
subject90080000
subject5127
subject91390000
subject95720000
subject95720000
subject5127
subject5127
subject6951
subject3859
subject64700000
subject3763
subject90080000
subject69110000
subject7927
subject3135
subject6216
subject97260000
subject98590000
subject5364
subject7927
subject2159
subject5426
subject5364
subject69110000
subject5426
subject69110000
subject23200000
subject3763
subject9803
subject3835
subject1811
subject33040000
subject7371
subject5825
subject19290000
subject89490000
subject7320
subject69110000
subject18560000
subject4858
subject3763
subject35890000
subject82900000
subject3400000
subject19290000
subject98590000
subject98590000
subject98590000
subject81190000
subject459000

subject5220
subject4590000
subject2129
subject2129
subject62750000
subject2129
subject62750000
subject5233
subject5233
subject5233
subject5233
subject8411
subject8411
subject8411
subject99290000
subject7469
subject78900000
subject62750000
subject7478
subject8411
subject90080000
subject8411
subject78900000
subject62750000
subject3835
subject9003
subject82900000
subject65230000
subject8411
subject6302
subject87840000
subject22820000
subject94420000
subject62750000
subject15130000
subject62750000
subject6639
subject6639
subject62750000
subject6639
subject6639
subject6639
subject6639
subject6216
subject7416
subject5969
subject6302
subject6088
subject62940000
subject2129
subject98370000
subject2129
subject62750000
subject62750000
subject7422
subject5838
subject3323
subject35740000
subject33560000
subject7422
subject18560000
subject6951
subject65230000
subject6310
subject5838
subject35740000
subject70210000
subject85760000
subject531
subject47780000
subject8411
subject73160000
subject4778000

subject20560000
subject20560000
subject6302
subject38440000
subject4556
subject97770000
subject73160000
subject73160000
subject25300000
subject25300000
subject85760000
subject95720000
subject65230000
subject62750000
subject62750000
subject62750000
subject62750000
subject62750000
subject62750000
subject62750000
subject62750000
subject62750000
subject62750000
subject62750000
subject62750000
subject62750000
subject62750000
subject62750000
subject62750000
subject62750000
subject62750000
subject62750000
subject62750000
subject98590000
subject77620000
subject77620000
subject62750000
subject62750000
subject16270000
subject16270000
subject8561
subject6310
subject62750000
subject62750000
subject62750000
subject62750000
subject62750000
subject62750000
subject62750000
subject62750000
subject62750000
subject62750000
subject62750000
subject62750000
subject62750000
subject62750000
subject62750000
subject62750000
subject98590000
subject62750000
subject47780000
subject62750000
subject47780000
subject5

subject37850000
subject87840000
subject4556
subject59550000
subject8384
subject1811
subject6216
subject91390000
subject13700000
subject3083
subject1577
subject51800000
subject4244
subject8384
subject4244
subject9730000
subject2129
subject2923
subject9730000
subject1811
subject69110000
subject2923
subject50740000
subject9744
subject7371
subject83990000
subject78900000
subject6310
subject4556
subject18560000
subject23130000
subject51800000
subject51080000
subject51800000
subject69110000
subject78900000
subject50740000
subject55360000
subject98590000
subject98590000
subject4590000
subject9730000
subject6310
subject5517
subject5517
subject5517
subject5517
subject4529
subject11650000
subject39960000
subject98590000
subject55360000
subject78900000
subject46930000
subject5517
subject7371
subject46930000
subject4110
subject5517
subject6670
subject11650000
subject6139
subject3259
subject3259
subject3259
subject8561
subject98370000
subject6310
subject4187
subject7371
subject74130000
subject18800

subject1272
subject22520000
subject6525
subject51800000
subject11650000
subject13700000
subject89390000
subject77240000
subject91160000
subject5370000
subject8561
subject36380000
subject29050000
subject6210
subject1811
subject16020000
subject7469
subject4551
subject69110000
subject85760000
subject16020000
subject50740000
subject85760000
subject354
subject16020000
subject7371
subject41800000
subject73160000
subject398
subject5838
subject39960000
subject13700000
subject41800000
subject71350000
subject5370000
subject1101
subject30990000
subject97770000
subject97770000
subject8173
subject8607
subject30990000
subject9003
subject44590000
subject7371
subject65230000
subject8607
subject8607
subject3090000
subject6670
subject49610000
subject3090000
subject7410000
subject1577
subject23200000
subject6755
subject23130000
subject59340000
subject6947
subject2129
subject3090000
subject30990000
subject9166
subject8607
subject7927
subject23200000
subject83380000
subject3259
subject2992
subject97770000


subject7410000
subject99270000
subject51080000
subject3672
subject6947
subject8561
subject51800000
subject80660000
subject3135
subject55480000
subject55480000
subject9744
subject689
subject7478
subject2037
subject3359
subject70210000
subject97770000
subject55480000
subject4187
subject55360000
subject94420000
subject55360000
subject11650000
subject3359
subject10740000
subject55360000
subject55360000
subject5196
subject77800000
subject2062
subject77800000
subject49150000
subject4153
subject2037
subject62750000
subject9166
subject77800000
subject35620000
subject55360000
subject5854
subject6310
subject2037
subject2037
subject51800000
subject55360000
subject5196
subject55360000
subject4858
subject37850000
subject88200000
subject5196
subject6951
subject50740000
subject55480000
subject55480000
subject87480000
subject13700000
subject3359
subject4244
subject8607
subject63710000
subject3504
subject95780000
subject95780000
subject95780000
subject95780000
subject95780000
subject95780000
subject957

subject4196
subject7371
subject65230000
subject80660000
subject4196
subject6951
subject2129
subject2129
subject2129
subject8561
subject8384
subject36470000
subject4196
subject4196
subject4196
subject74370000
subject5614
subject2469
subject66450000
subject5095
subject96710000
subject96710000
subject2865
subject1101
subject62750000
subject4187
subject6956
subject94420000
subject96710000
subject94420000
subject94420000
subject21190000
subject73160000
subject45960000
subject94420000
subject97770000
subject45960000
subject4556
subject40350000
subject40350000
subject322
subject322
subject322
subject322
subject322
subject322
subject7248
subject41800000
subject1137
subject4260000
subject1272
subject46830000
subject21190000
subject21190000
subject4244
subject21190000
subject8561
subject6620
subject19290000
subject40350000
subject40350000
subject2865
subject35620000
subject2865
subject5095
subject96710000
subject96710000
subject2865
subject41800000
subject41800000
subject874
subject1101
subject7

subject73120000
subject66450000
subject2865
subject26690000
subject195
subject2865
subject195
subject9800000
subject89260000
subject6680
subject65230000
subject2865
subject2865
subject2865
subject5614
subject2865
subject5614
subject5614
subject5614
subject4153
subject5614
subject39310000
subject8754
subject8740
subject39310000
subject2129
subject95780000
subject9166
subject1637
subject39310000
subject39310000
subject39310000
subject39310000
subject5614
subject2865
subject2865
subject6037
subject1604
subject689
subject9800000
subject69230000
subject80660000
subject5614
subject5614
subject2159
subject2865
subject2865
subject2865
subject5614
subject5614
subject7940
subject5614
subject98590000
subject10010000
subject3278
subject35620000
subject4187
subject6670
subject1272
subject1604
subject8561
subject8340
subject5614
subject5614
subject5614
subject5614
subject5614
subject6755
subject1989
subject69230000
subject55360000
subject55360000
subject2865
subject2865
subject2865
subject9800000
su

subject55480000
subject54830000
subject5325
subject69110000
subject63030000
subject4112
subject4112
subject5614
subject21
subject98590000
subject91160000
subject5614
subject5614
subject5614
subject98590000
subject2159
subject99270000
subject77800000
subject7779
subject7940
subject4304
subject7469
subject21
subject6620
subject6620
subject2746
subject9153
subject9153
subject4592
subject5614
subject63030000
subject19020000
subject21190000
subject5614
subject5614
subject9153
subject7087
subject9800000
subject4112
subject69110000
subject21
subject98370000
subject2129
subject6755
subject12080000
subject5297
subject26690000
subject94990000
subject8607
subject6620
subject4556
subject46110000
subject5614
subject3400000
subject69110000
subject7087
subject5614
subject7940
subject5614
subject5614
subject5614
subject4470
subject91390000
subject5028
subject1577
subject4304
subject1369
subject88530000
subject6214
subject8340
subject8562
subject5614
subject1101
subject4556
subject4304
subject28570000


subject23130000
subject5028
subject785
subject5984
subject30460000
subject5325
subject10010000
subject52940000
subject5854
subject41840000
subject4187
subject5984
subject70810000
subject69110000
subject41840000
subject94990000
subject7087
subject52900000
subject62750000
subject7087
subject35620000
subject93950000
subject93950000
subject40290000
subject52980000
subject40290000
subject8754
subject7779
subject896
subject5984
subject52940000
subject5984
subject785
subject5614
subject7779
subject8340
subject8340
subject7720000
subject5984
subject5325
subject5325
subject2901
subject8377
subject6680
subject88700000
subject9334
subject5325
subject5325
subject7720000
subject2159
subject94990000
subject74090000
subject2469
subject71920000
subject5838
subject66450000
subject6214
subject23130000
subject1272
subject5028
subject5984
subject41820000
subject5984
subject85000000
subject73950000
subject52980000
subject85000000
subject18310000
subject5325
subject5614
subject5614
subject5614
subject8340
s

subject12080000
subject97870000
subject97870000
subject97870000
subject53070000
subject8754
subject5984
subject46110000
subject7371
subject5984
subject2419
subject1369
subject5614
subject70210000
subject2159
subject6029
subject811
subject9229
subject9710000
subject2419
subject8754
subject901
subject71350000
subject46830000
subject63710000
subject6029
subject2419
subject2419
subject2419
subject2419
subject2419
subject2419
subject2419
subject6620
subject689
subject2419
subject2419
subject2419
subject2419
subject5614
subject785
subject785
subject785
subject53070000
subject9710000
subject69110000
subject1811
subject60900000
subject82640000
subject9166
subject60900000
subject2419
subject2419
subject2419
subject2419
subject2419
subject73950000
subject4112
subject5984
subject5984
subject55360000
subject16270000
subject2419
subject785
subject46830000
subject5984
subject85000000
subject72200000
subject72200000
subject5984
subject55360000
subject55360000
subject72200000
subject1989
subject3323
s

subject7285
subject5614
subject785
subject73120000
subject9166
subject8411
subject8411
subject8411
subject8411
subject8411
subject3323
subject8411
subject8411
subject23130000
subject4112
subject30990000
subject97870000
subject4112
subject8561
subject13700000
subject5614
subject6310
subject1369
subject9229
subject9229
subject6947
subject6947
subject6947
subject21190000
subject6525
subject785
subject5469
subject1989
subject5984
subject6570000
subject5614
subject6570000
subject1272
subject785
subject2419
subject2419
subject2419
subject2419
subject2419
subject2419
subject4304
subject4112
subject4222
subject13980000
subject2746
subject467
subject4112
subject1811
subject4112
subject58730000
subject78090000
subject82640000
subject6302
subject6807
subject81800000
subject6302
subject4112
subject2419
subject2419
subject6216
subject21190000
subject62940000
subject41820000
subject41820000
subject41820000
subject18560000
subject45960000
subject8401
subject8401
subject9244
subject6029
subject6154
su

subject2865
subject8340
subject91500000
subject1789
subject2865
subject34730000
subject18310000
subject18310000
subject52900000
subject6714
subject21190000
subject7077
subject55360000
subject23490000
subject5984
subject4283
subject78100000
subject12080000
subject6154
subject6037
subject6029
subject6029
subject2419
subject2419
subject2419
subject2419
subject2419
subject2419
subject75730000
subject78820000
subject2419
subject18310000
subject18310000
subject7937
subject785
subject9710000
subject10010000
subject6154
subject18310000
subject29050000
subject71350000
subject3788
subject5984
subject5984
subject88010000
subject8561
subject7077
subject95780000
subject6037
subject898
subject2419
subject2419
subject5854
subject2159
subject73950000
subject5325
subject7285
subject7371
subject95780000
subject2419
subject6029
subject39670000
subject78820000
subject62000000
subject6994
subject62000000
subject62000000
subject62000000
subject322
subject32430000
subject2159
subject1397
subject7077
subject8

subject7779
subject98590000
subject3364
subject3364
subject71350000
subject32430000
subject91390000
subject26600000
subject26600000
subject5196
subject55480000
subject5984
subject8561
subject7940
subject5614
subject65340000
subject7278
subject195
subject5614
subject195
subject898
subject5984
subject5984
subject5984
subject29050000
subject5028
subject4371
subject9710000
subject18310000
subject18310000
subject18310000
subject5984
subject896
subject2797
subject5067
subject39960000
subject2577
subject4304
subject91500000
subject5325
subject2013
subject2013
subject2013
subject901
subject354
subject1195
subject73950000
subject31210000
subject5984
subject5984
subject58920000
subject6168
subject6168
subject4551
subject195
subject5532
subject8900
subject660000
subject51080000
subject5984
subject1789
subject1397
subject5984
subject660000
subject1397
subject63030000
subject5452
subject21190000
subject5452
subject366
subject5452
subject4551
subject20150000
subject5067
subject4551
subject5196
subje

subject5984
subject21190000
subject58860000
subject5577
subject73950000
subject57620000
subject2419
subject7927
subject69230000
subject7469
subject29050000
subject2419
subject2419
subject2419
subject2419
subject2777
subject5808
subject65340000
subject65340000
subject65340000
subject65340000
subject65340000
subject65340000
subject86280000
subject4304
subject7937
subject89930000
subject6029
subject5614
subject4112
subject7077
subject1137
subject41520000
subject41520000
subject26080000
subject26080000
subject49610000
subject5984
subject8607
subject6790
subject6029
subject62940000
subject6269
subject2419
subject3364
subject98590000
subject4592
subject626
subject1195
subject5825
subject6029
subject1811
subject89490000
subject70090000
subject2777
subject5984
subject5984
subject5984
subject1137
subject90080000
subject9800000
subject9098
subject5614
subject71350000
subject5984
subject5984
subject9166
subject3530
subject8050000
subject29050000
subject8340
subject89930000
subject4304
subject1208

subject71350000
subject48780000
subject5854
subject21190000
subject49280000
subject1811
subject6790
subject4153
subject3594
subject93490000
subject10740000
subject1604
subject89600000
subject6620
subject6522
subject81800000
subject1604
subject5808
subject4588
subject2419
subject2419
subject2419
subject2419
subject2419
subject5469
subject85000000
subject55480000
subject26080000
subject8340
subject8340
subject5532
subject5984
subject9903
subject2419
subject8377
subject1811
subject6214
subject195
subject3835
subject9950
subject9950
subject4244
subject6386
subject2129
subject94420000
subject1604
subject5614
subject1272
subject9903
subject5614
subject6029
subject5984
subject6029
subject77240000
subject3835
subject41820000
subject41820000
subject41820000
subject3835
subject4110
subject10740000
subject6214
subject9803
subject2419
subject12080000
subject6214
subject49280000
subject51630000
subject41520000
subject41520000
subject7470000
subject71040000
subject9230
subject5614
subject6790
subjec

subject1989
subject70090000
subject69110000
subject69110000
subject51630000
subject1811
subject6714
subject6029
subject5984
subject5577
subject5854
subject5984
subject5984
subject3323
subject1369
subject5577
subject5577
subject4112
subject71040000
subject5854
subject9803
subject9803
subject9803
subject5984
subject9803
subject41820000
subject9803
subject9803
subject1604
subject9803
subject17290000
subject6029
subject91500000
subject5325
subject41820000
subject10440000
subject2419
subject2419
subject2419
subject2419
subject2419
subject65230000
subject9950
subject73950000
subject73950000
subject7470000
subject4304
subject72030000
subject5241
subject10740000
subject35220000
subject74090000
subject21620000
subject74090000
subject6790
subject2129
subject1811
subject94220000
subject4304
subject2257
subject9043
subject8377
subject9043
subject9043
subject21190000
subject9043
subject9950
subject9950
subject88010000
subject6790
subject163
subject4112
subject45960000
subject163
subject5311
subject

subject4482
subject4482
subject8340
subject42210000
subject6114
subject4982
subject4982
subject9230
subject5984
subject10740000
subject6029
subject9098
subject2419
subject2419
subject6029
subject8200
subject5984
subject6302
subject660000
subject5241
subject46790000
subject88350000
subject5241
subject4244
subject4556
subject8384
subject7351
subject41520000
subject901
subject4244
subject49280000
subject5808
subject10010000
subject50740000
subject6029
subject7470000
subject7940
subject9950
subject1397
subject9950
subject9950
subject11650000
subject467
subject2013
subject7470000
subject8173
subject18640000
subject6994
subject3076
subject7351
subject7285
subject5311
subject8512
subject5311
subject4556
subject2257
subject4071
subject31210000
subject3835
subject5984
subject9043
subject98590000
subject5461
subject4244
subject8255
subject4304
subject3076
subject9244
subject57960000
subject88010000
subject88010000
subject82130000
subject6790
subject6139
subject3339
subject1143
subject4244
subjec

subject2419
subject9961
subject9873
subject41520000
subject96520000
subject4112
subject41820000
subject3183
subject4482
subject29010000
subject6620000
subject88010000
subject88010000
subject5984
subject2525
subject3364
subject3364
subject4482
subject29220000
subject41820000
subject7087
subject9043
subject9043
subject9043
subject9043
subject9043
subject63980000
subject2525
subject8754
subject5984
subject3183
subject40290000
subject23490000
subject23490000
subject3614
subject8561
subject6525
subject2519
subject2257
subject5984
subject70030000
subject8512
subject8512
subject7278
subject8754
subject1397
subject4470
subject18560000
subject6837
subject1397
subject1369
subject4443
subject26690000
subject26690000
subject1143
subject8754
subject4110
subject2797
subject9710000
subject91500000
subject3530
subject4110
subject79330000
subject4443
subject5808
subject4110
subject2419
subject2419
subject2419
subject2419
subject2419
subject2419
subject4110
subject4110
subject6088
subject2923
subject231

subject8340
subject8340
subject1499
subject4443
subject84880000
subject5469
subject96520000
subject55360000
subject6877
subject5830
subject70030000
subject21190000
subject9873
subject7529
subject8512
subject6680
subject6994
subject4080000
subject7077
subject9166
subject1499
subject5984
subject5984
subject4153
subject5984
subject41820000
subject5984
subject5984
subject5984
subject84880000
subject1577
subject47530000
subject8371
subject71350000
subject7433
subject8607
subject5241
subject85000000
subject85000000
subject9950
subject9950
subject9098
subject21190000
subject1499
subject1499
subject5469
subject5469
subject5469
subject5469
subject5984
subject5469
subject5984
subject9003
subject898
subject9710000
subject7529
subject10740000
subject69230000
subject3901
subject1563
subject4443
subject87560000
subject55360000
subject4482
subject3835
subject5984
subject5830
subject6877
subject46110000
subject2419
subject88010000
subject9903
subject6680
subject1577
subject489
subject55360000
subject5

subject5984
subject5614
subject5614
subject58730000
subject94220000
subject70090000
subject6714
subject2257
subject7278
subject3076
subject5660
subject1563
subject1369
subject41820000
subject41820000
subject6310
subject5311
subject4244
subject4244
subject4244
subject195
subject4410
subject5577
subject851
subject851
subject195
subject3530
subject1369
subject6877
subject48430000
subject2777
subject47680000
subject39670000
subject4470
subject3614
subject3614
subject23490000
subject23490000
subject23490000
subject23490000
subject7779
subject6877
subject3614
subject71040000
subject4410
subject803
subject4482
subject70510000
subject9229
subject195
subject67370000
subject3614
subject803
subject803
subject67880000
subject195
subject803
subject803
subject9950
subject2257
subject803
subject69110000
subject69230000
subject5984
subject69230000
subject5577
subject4470
subject8607
subject4443
subject7940
subject8512
subject40350000
subject67880000
subject6790
subject5984
subject5984
subject3386
subj

subject2519
subject244
subject244
subject4707
subject3835
subject9950
subject244
subject244
subject244
subject91500000
subject4707
subject4707
subject9903
subject4707
subject9950
subject9950
subject4443
subject3145
subject803
subject91160000
subject2419
subject2419
subject2419
subject4707
subject4707
subject3145
subject4707
subject84340000
subject4707
subject4707
subject4707
subject9744
subject4410
subject2923
subject2923
subject2519
subject4707
subject244
subject959
subject5984
subject77400000
subject4707
subject77400000
subject4410
subject2257
subject4707
subject2257
subject21190000
subject2257
subject4707
subject4707
subject6877
subject4707
subject4707
subject2728
subject2728
subject4707
subject8561
subject6790
subject2728
subject2728
subject4707
subject94220000
subject2419
subject2419
subject508
subject4196
subject244
subject2419
subject244
subject244
subject4196
subject93490000
subject1369
subject2728
subject2728
subject3386
subject6877
subject2419
subject2419
subject2419
subject2

subject5984
subject5984
subject46410000
subject3191
subject58860000
subject2728
subject26600000
subject94220000
subject5984
subject9744
subject69110000
subject96
subject99270000
subject2901
subject9744
subject6037
subject1577
subject5984
subject4222
subject1577
subject4443
subject8754
subject5325
subject6072
subject3247
subject2419
subject2419
subject2419
subject2419
subject2419
subject96330000
subject5469
subject785
subject6680
subject6680
subject2923
subject2923
subject785
subject5660
subject69230000
subject6994
subject5577
subject4080000
subject4080000
subject4080000
subject5830
subject3288
subject93490000
subject15130000
subject1989
subject6154
subject5160
subject2419
subject2419
subject2419
subject5241
subject3910000
subject3288
subject5241
subject2419
subject2419
subject97960000
subject4080000
subject4080000
subject1404
subject94220000
subject1397
subject3093
subject97960000
subject97960000
subject41820000
subject97960000
subject898
subject34770000
subject34770000
subject1563
sub

subject1195
subject5311
subject7066
subject1195
subject88010000
subject4588
subject3530
subject4482
subject2901
subject5241
subject5241
subject9950
subject3220
subject6620
subject851
subject5095
subject6029
subject9950
subject7529
subject1989
subject9950
subject3901
subject9043
subject785
subject5095
subject4112
subject1604
subject5115
subject2419
subject2159
subject5236
subject7710
subject2901
subject7710
subject6029
subject25300000
subject69230000
subject9244
subject2991
subject7710
subject8607
subject5802
subject2519
subject322
subject3288
subject48990000
subject9229
subject47160000
subject6029
subject83990000
subject48990000
subject5984
subject5984
subject4112
subject5854
subject96
subject17630000
subject8607
subject9153
subject51630000
subject5169
subject88010000
subject5311
subject22390000
subject5311
subject4556
subject3400000
subject3400000
subject3400000
subject3400000
subject12920000
subject12920000
subject5984
subject2997
subject47160000
subject47160000
subject2901
subject64

subject3901
subject5746
subject74070000
subject7710
subject9043
subject69230000
subject83380000
subject7087
subject49440000
subject7087
subject6333
subject94220000
subject4112
subject195
subject6029
subject1369
subject1369
subject1369
subject48990000
subject5311
subject10740000
subject10740000
subject99290000
subject6407
subject5984
subject5984
subject2997
subject57620000
subject57620000
subject77460000
subject6088
subject9230
subject3297
subject1397
subject6414
subject322
subject26080000
subject6414
subject5241
subject6333
subject3359
subject94220000
subject4410
subject8371
subject6414
subject2901
subject2419
subject2419
subject2901
subject8340
subject2901
subject2013
subject2901
subject3247
subject8561
subject49280000
subject8371
subject8371
subject94220000
subject87400000
subject2419
subject5746
subject5746
subject91500000
subject4110
subject5854
subject7529
subject5236
subject6029
subject5469
subject5469
subject1577
subject71970000
subject9043
subject5984
subject1811
subject9349000

subject4410
subject36980000
subject244
subject3339
subject4443
subject3065
subject89260000
subject36980000
subject6154
subject36980000
subject6333
subject1497
subject6333
subject39670000
subject7940
subject4482
subject4592
subject2419
subject1499
subject13700000
subject15930000
subject6758
subject63610000
subject69230000
subject7710
subject7710
subject4283
subject4482
subject77460000
subject4529
subject9950
subject1497
subject4072
subject9950
subject9950
subject2621
subject4410
subject9950
subject9950
subject6072
subject7087
subject3835
subject3835
subject2419
subject2419
subject2419
subject2419
subject2419
subject7285
subject5311
subject36980000
subject36980000
subject36980000
subject5311
subject55360000
subject10740000
subject55360000
subject36980000
subject5325
subject9744
subject3191
subject32470000
subject32470000
subject17630000
subject6414
subject84880000
subject48780000
subject32430000
subject40690000
subject17630000
subject3097
subject8444
subject2013
subject47160000
subject59

subject1423
subject78900000
subject1423
subject4482
subject13700000
subject3318
subject5311
subject7248
subject3910000
subject4961
subject4470
subject9873
subject49280000
subject5614
subject9950
subject1137
subject52900000
subject9710000
subject6790
subject6892
subject5984
subject2419
subject3232
subject9950
subject9950
subject5802
subject2840
subject4110
subject6680
subject36980000
subject6088
subject3232
subject7632
subject20560000
subject5577
subject6892
subject6892
subject3191
subject3835
subject3297
subject6310
subject6310
subject6310
subject53070000
subject6892
subject1989
subject6877
subject6877
subject3428
subject4244
subject73950000
subject1074
subject4244
subject10740000
subject94220000
subject6620
subject6620
subject1074
subject94850000
subject89860000
subject3247
subject61800000
subject1563
subject4592
subject5311
subject49280000
subject7710
subject6120000
subject1772
subject1497
subject9244
subject21190000
subject5984
subject6525
subject4592
subject4592
subject4592
subject

subject6994
subject6994
subject6994
subject6994
subject3297
subject3247
subject6994
subject6994
subject9090
subject4410
subject41520000
subject6994
subject6994
subject1811
subject3339
subject8384
subject41800000
subject6758
subject1499
subject6088
subject3504
subject7478
subject1074
subject2472
subject2991
subject5984
subject5984
subject5984
subject17310000
subject5984
subject1772
subject2901
subject1074
subject4592
subject1074
subject4592
subject4592
subject5984
subject4592
subject4443
subject7710
subject94490000
subject870000
subject870000
subject4592
subject4592
subject4592
subject5984
subject4592
subject4592
subject7710
subject6407
subject7710
subject1497
subject5984
subject5854
subject2840
subject5984
subject8371
subject53070000
subject1423
subject1423
subject1423
subject2901
subject9244
subject4592
subject7710
subject7710
subject5746
subject1323
subject4556
subject2418
subject7710
subject7710
subject2923
subject63030000
subject6994
subject6994
subject7710
subject6994
subject69230

subject65230000
subject3530
subject97960000
subject7529
subject6522
subject96
subject7469
subject2991
subject1497
subject1497
subject78820000
subject3269
subject2069
subject8701
subject7416
subject4071
subject3364
subject322
subject3232
subject8701
subject3835
subject8701
subject8701
subject8701
subject7710
subject9950
subject9950
subject5577
subject5577
subject8701
subject7355
subject2423
subject88
subject9950
subject1497
subject1497
subject1604
subject5802
subject2423
subject1604
subject4071
subject9950
subject9744
subject9744
subject8701
subject3498
subject9744
subject9744
subject8701
subject8701
subject7710
subject3232
subject8701
subject8701
subject2901
subject2901
subject5241
subject5241
subject2901
subject1074
subject5838
subject7710
subject7710
subject7710
subject7710
subject94220000
subject7710
subject8057
subject3498
subject381
subject381
subject63040000
subject89860000
subject2901
subject8701
subject3498
subject9654
subject3297
subject9244
subject2901
subject3772
subject3498

subject5241
subject5241
subject5085
subject5802
subject7779
subject6029
subject2069
subject2997
subject63610000
subject5311
subject2997
subject4556
subject1074
subject5984
subject89650000
subject5085
subject5530000
subject5241
subject6029
subject17310000
subject3530
subject3530
subject4073
subject5085
subject3727
subject5236
subject2894
subject244
subject244
subject39670000
subject5614
subject244
subject3504
subject7285
subject5577
subject9950
subject3247
subject3076
subject4443
subject4073
subject626
subject4073
subject4592
subject8444
subject3835
subject4592
subject4592
subject4592
subject8377
subject4592
subject13700000
subject4592
subject626
subject4592
subject1120
subject4592
subject3727
subject1423
subject4443
subject6877
subject1423
subject9229
subject8384
subject7710
subject7710
subject7710
subject7710
subject7710
subject7710
subject7710
subject40290000
subject7710
subject7371
subject7710
subject7710
subject7710
subject7710
subject7710
subject7710
subject41820000
subject7940
su

subject2419
subject5085
subject3274
subject2419
subject9789
subject1369
subject5085
subject5719
subject5719
subject5719
subject5719
subject5719
subject5719
subject5719
subject36910000
subject3339
subject5085
subject8754
subject1074
subject1074
subject7616
subject5085
subject1074
subject5241
subject5241
subject4244
subject4244
subject3672
subject4371
subject1074
subject17630000
subject5311
subject3788
subject5311
subject1074
subject898
subject4443
subject1074
subject36980000
subject5984
subject1369
subject1369
subject195
subject195
subject2901
subject95900000
subject2901
subject96
subject3672
subject1074
subject2901
subject2419
subject2777
subject95900000
subject4110
subject64900000
subject244
subject6138
subject898
subject1497
subject8512
subject5085
subject785
subject785
subject51120000
subject6138
subject4592
subject3428
subject9903
subject9903
subject3727
subject3727
subject62000000
subject2894
subject3727
subject2894
subject26980000
subject5085
subject4592
subject4592
subject649000

subject67880000
subject7470000
subject99270000
subject70030000
subject70030000
subject70030000
subject82640000
subject62940000
subject73410000
subject96710000
subject69230000
subject69230000
subject40350000
subject94220000
subject81800000
subject25590000
subject25510000
subject94220000
subject63610000
subject63610000
subject96880000
subject96880000
subject7470000
subject77800000
subject94220000
subject40690000
subject90080000
subject89860000
subject19210000
subject89490000
subject40690000
subject63610000
subject39670000
subject73410000
subject67880000
subject13350000
subject34620000
subject64900000
subject36980000
subject63610000
subject63610000
subject64900000
subject77400000
subject39960000
subject69110000
subject89860000
subject20120000
subject96750000
subject25590000
subject25590000
subject11740000
subject97960000
subject97960000
subject84340000
subject84340000
subject78820000
subject88010000
subject39960000
subject69230000
subject97960000
subject74070000
subject80660000
subject369

subject89860000
subject25590000
subject19020000
subject77620000
subject36980000
subject36980000
subject36980000
subject42240000
subject42240000
subject870000
subject96330000
subject46790000
subject39960000
subject30990000
subject42240000
subject36980000
subject48430000
subject35180000
subject22390000
subject35880000
subject26980000
subject35880000
subject23490000
subject73160000
subject23490000
subject23170000
subject35880000
subject30890000
subject30890000
subject660000
subject49020000
subject57470000
subject35800000
subject36980000
subject65340000
subject25590000
subject54830000
subject82900000
subject89860000
subject25590000
subject25590000
subject3910000
subject42240000
subject86290000
subject40690000
subject36980000
subject89650000
subject21190000
subject25590000
subject40690000
subject36980000
subject810000
subject69230000
subject54050000
subject7470000
subject42240000
subject84880000
subject42240000
subject36980000
subject36980000
subject810000
subject36980000
subject36980000
su

subject36980000
subject810000
subject94220000
subject9730000
subject86290000
subject62000000
subject11640000
subject49280000
subject61680000
subject24470000
subject49280000
subject17310000
subject17310000
subject23960000
subject88010000
subject51120000
subject45310000
subject89860000
subject18560000
subject89490000
subject94220000
subject83990000
subject23960000
subject63610000
subject18640000
subject7470000
subject42210000
subject810000
subject88010000
subject42210000
subject64900000
subject36980000
subject57470000
subject97960000
subject29050000
subject6570000
subject64900000
subject17310000
subject47680000
subject53880000
subject25590000
subject63610000
subject870000
subject17310000
subject84340000
subject99030000
subject63480000
subject23960000
subject63030000
subject40350000
subject41520000
subject61680000
subject36980000
subject50990000
subject45310000
subject71970000
subject97870000
subject870000
subject57710000
subject51120000
subject84340000
subject72030000
subject48780000
sub

subject79940000
subject79940000
subject79940000
subject32430000
subject97960000
subject42240000
subject26980000
subject64900000
subject26980000
subject99270000
subject63610000
subject84880000
subject80730000
subject80730000
subject80730000
subject80730000
subject80730000
subject80730000
subject80730000
subject81190000
subject81190000
subject81190000
subject27260000
subject27260000
subject55480000
subject45960000
subject70080000
subject45960000
subject69110000
subject13670000
subject41840000
subject25590000
subject870000
subject71350000
subject98030000
subject55480000
subject70030000
subject19670000
subject54050000
subject41000000
subject89860000
subject69230000
subject27260000
subject97960000
subject45020000
subject97960000
subject40270000
subject80520000
subject47530000
subject17630000
subject23960000
subject45020000
subject22390000
subject62050000
subject24470000
subject67280000
subject79940000
subject98030000
subject52880000
subject41830000
subject31210000
subject54050000
subject416

subject52120000
subject32090000
subject71350000
subject70080000
subject89860000
subject32430000
subject19670000
subject78090000
subject23490000
subject41260000
subject3910000
subject76470000
subject89860000
subject4080000
subject88010000
subject36480000
subject1300000
subject55530000
subject55530000
subject31660000
subject46790000
subject31660000
subject80520000
subject55360000
subject70200000
subject77510000
subject98030000
subject69230000
subject29050000
subject69230000
subject69230000
subject98030000
subject63530000
subject84880000
subject49500000
subject86750000
subject85400000
subject36480000
subject83610000
subject55530000
subject55530000
subject89490000
subject93490000
subject4080000
subject13600000
subject4080000
subject62000000
subject4080000
subject55530000
subject13150000
subject46110000
subject49020000
subject89860000
subject53470000
subject92460000
subject41820000
subject55480000
subject27260000
subject27260000
subject41260000
subject63040000
subject70030000
subject6304000

subject81190000
subject81190000
subject81190000
subject23960000
subject54830000
subject62020000
subject81800000
subject55360000
subject55360000
subject61920000
subject61920000
subject67280000
subject89860000
subject86320000
subject70080000
subject70080000
subject70080000
subject23170000
subject36980000
subject55530000
subject36980000
subject87560000
subject89260000
subject87620000
subject91500000
subject61680000
subject51980000
subject24470000
subject28220000
subject85600000
subject90320000
subject1300000
subject13700000
subject97870000
subject46110000
subject17850000
subject17850000
subject17850000
subject24160000
subject65230000
subject88010000
subject52980000
subject76470000
subject76470000
subject98030000
subject17850000
subject57600000
subject63040000
subject26980000
subject54050000
subject26600000
subject11640000
subject13700000
subject28570000
subject59550000
subject54050000
subject45820000
subject53880000
subject55360000
subject55360000
subject46790000
subject1850000
subject266

subject84880000
subject43030000
subject63710000
subject47530000
subject80520000
subject39960000
subject84880000
subject84880000
subject91500000
subject79940000
subject1850000
subject50270000
subject55480000
subject88010000
subject76470000
subject87560000
subject45820000
subject85430000
subject13150000
subject79940000
subject79940000
subject41000000
subject620000
subject29010000
subject73160000
subject1850000
subject73300000
subject57600000
subject73160000
subject810000
subject49500000
subject20120000
subject40690000
subject67450000
subject88010000
subject88010000
subject94490000
subject1630000
subject57600000
subject9730000
subject45020000
subject67330000
subject61680000
subject85830000
subject74070000
subject74070000
subject17310000
subject70030000
subject70030000
subject36980000
subject89860000
subject48820000
subject49500000
subject15370000
subject15370000
subject61250000
subject88010000
subject88010000
subject57550000
subject31130000
subject23490000
subject46110000
subject13150000


subject13150000
subject13150000
subject60900000
subject88010000
subject45410000
subject66270000
subject66270000
subject66270000
subject36910000
subject70090000
subject78600000
subject94220000
subject50640000
subject35220000
subject3910000
subject78600000
subject3910000
subject36980000
subject57620000
subject52120000
subject49500000
subject26980000
subject78600000
subject58730000
subject41000000
subject74070000
subject98030000
subject14460000
subject98030000
subject74070000
subject74070000
subject58860000
subject53880000
subject51630000
subject51630000
subject94420000
subject75640000
subject23170000
subject73160000
subject91500000
subject26140000
subject13980000
subject1850000
subject1850000
subject1850000
subject59320000
subject91610000
subject6570000
subject17850000
subject17850000
subject17850000
subject17850000
subject84340000
subject78900000
subject85430000
subject97870000
subject24990000
subject78600000
subject33230000
subject53880000
subject53880000
subject58730000
subject7860000

subject40690000
subject98580000
subject97960000
subject47910000
subject46250000
subject70080000
subject64900000
subject35880000
subject20270000
subject41260000
subject23130000
subject20270000
subject35880000
subject36470000
subject95780000
subject76470000
subject91610000
subject11650000
subject47680000
subject91500000
subject64900000
subject64900000
subject64900000
subject7860000
subject87920000
subject97960000
subject78600000
subject21190000
subject35880000
subject35880000
subject63040000
subject79940000
subject46120000
subject49500000
subject29050000
subject94420000
subject90320000
subject7860000
subject13700000
subject67330000
subject69110000
subject1300000
subject85430000
subject57710000
subject13150000
subject66270000
subject22040000
subject27260000
subject71350000
subject2450000
subject58860000
subject98580000
subject3910000
subject41070000
subject82630000
subject26140000
subject35880000
subject13150000
subject11710000
subject49500000
subject26140000
subject83380000
subject358800

subject77400000
subject41070000
subject94270000
subject1850000
subject94270000
subject47910000
subject85430000
subject28570000
subject67330000
subject55530000
subject70590000
subject39960000
subject87400000
subject86750000
subject70030000
subject70030000
subject78090000
subject51120000
subject51120000
subject51120000
subject9680000
subject9680000
subject61680000
subject78900000
subject28660000
subject44440000
subject66270000
subject33230000
subject85830000
subject59320000
subject55530000
subject89260000
subject35800000
subject96710000
subject17850000
subject66270000
subject66270000
subject2450000
subject98580000
subject2450000
subject870000
subject35800000
subject70080000
subject84340000
subject35800000
subject32470000
subject1850000
subject70080000
subject35670000
subject98030000
subject22390000
subject37850000
subject19350000
subject53070000
subject17630000
subject53880000
subject31660000
subject47530000
subject70510000
subject35880000
subject35880000
subject85400000
subject59320000


subject55360000
subject68760000
subject13150000
subject13150000
subject46250000
subject20270000
subject20270000
subject47910000
subject76470000
subject41260000
subject53420000
subject23490000
subject98100000
subject36440000
subject58730000
subject83610000
subject36440000
subject48990000
subject63040000
subject27550000
subject63040000
subject51120000
subject13150000
subject620000
subject27550000
subject58860000
subject59320000
subject17850000
subject17850000
subject17850000
subject91390000
subject11650000
subject19670000
subject70080000
subject74070000
subject70090000
subject77400000
subject51120000
subject47530000
subject90590000
subject36470000
subject36470000
subject79940000
subject55030000
subject88010000
subject41000000
subject88350000
subject36440000
subject76470000
subject70090000
subject88350000
subject59550000
subject48780000
subject17850000
subject36470000
subject36470000
subject85000000
subject76470000
subject41260000
subject22390000
subject66890000
subject620000
subject89860

subject26540000
subject78900000
subject78900000
subject46930000
subject17630000
subject17630000
subject17630000
subject23130000
subject17630000
subject47910000
subject96230000
subject17630000
subject17630000
subject17630000
subject17630000
subject89990000
subject41000000
subject17630000
subject17630000
subject17630000
subject17290000
subject17630000
subject17630000
subject41500000
subject17630000
subject17630000
subject17630000
subject17630000
subject17630000
subject17630000
subject90320000
subject17630000
subject17630000
subject17630000
subject17630000
subject17630000
subject17630000
subject17630000
subject17630000
subject17630000
subject78900000
subject17630000
subject17630000
subject96710000
subject17630000
subject870000
subject96710000
subject96710000
subject17630000
subject17630000
subject870000
subject17630000
subject17630000
subject17630000
subject17630000
subject54440000
subject54440000
subject54440000
subject83990000
subject17630000
subject17630000
subject17630000
subject46790

subject96330000
subject51630000
subject75760000
subject71350000
subject88010000
subject20150000
subject20150000
subject52980000
subject11710000
subject22390000
subject18760000
subject63470000
subject23130000
subject99210000
subject13150000
subject13150000
subject88350000
subject84730000
subject70090000
subject70470000
subject15930000
subject90590000
subject36470000
subject90590000
subject35800000
subject98100000
subject14660000
subject70080000
subject70080000
subject14660000
subject70080000
subject14660000
subject14660000
subject80520000
subject28570000
subject14660000
subject14660000
subject61420000
subject55530000
subject76470000
subject80970000
subject98030000
subject7510000
subject14660000
subject7510000
subject98030000
subject71410000
subject7510000
subject23490000
subject13150000
subject48780000
subject68570000
subject14660000
subject48780000
subject14660000
subject14460000
subject29010000
subject14660000
subject40970000
subject70080000
subject70080000
subject14660000
subject4899

subject22000000
subject61920000
subject43030000
subject43030000
subject89990000
subject50640000
subject43030000
subject35800000
subject41500000
subject98580000
subject61690000
subject53880000
subject23130000
subject23130000
subject53880000
subject13150000
subject58860000
subject78820000
subject27260000
subject70080000
subject39670000
subject65230000
subject40970000
subject73160000
subject1850000
subject85760000
subject79940000
subject13150000
subject8400000
subject88350000
subject35800000
subject870000
subject91500000
subject65230000
subject65230000
subject85760000
subject89390000
subject91500000
subject88350000
subject84730000
subject84730000
subject59320000
subject83800000
subject47530000
subject47530000
subject41500000
subject87920000
subject82780000
subject26540000
subject26540000
subject2420000
subject7430000
subject88350000
subject82780000
subject96710000
subject70030000
subject35620000
subject35620000
subject78090000
subject13150000
subject70080000
subject70200000
subject440000


subject41500000
subject91770000
subject69230000
subject14160000
subject47910000
subject32470000
subject82600000
subject89600000
subject69230000
subject80730000
subject85430000
subject89600000
subject96230000
subject82130000
subject91770000
subject85020000
subject91500000
subject88700000
subject66270000
subject89860000
subject96230000
subject35670000
subject36440000
subject82600000
subject88010000
subject54440000
subject10370000
subject38590000
subject98590000
subject95580000
subject27260000
subject78900000
subject85430000
subject14660000
subject14460000
subject52240000
subject44760000
subject86270000
subject84740000
subject96230000
subject77400000
subject96230000
subject77400000
subject94790000
subject440000
subject47530000
subject67370000
subject70210000
subject47530000
subject82600000
subject70210000
subject57470000
subject49830000
subject98560000
subject76470000
subject40690000
subject63470000
subject82780000
subject41260000
subject96230000
subject43560000
subject96230000
subject962

subject38590000
subject55910000
subject96590000
subject11640000
subject55530000
subject85760000
subject55910000
subject11640000
subject27260000
subject620000
subject95780000
subject14460000
subject66270000
subject46250000
subject98380000
subject82130000
subject82130000
subject82780000
subject89330000
subject26080000
subject26690000
subject41260000
subject49830000
subject21190000
subject810000
subject23490000
subject43030000
subject57070000
subject41000000
subject41000000
subject41260000
subject85760000
subject57070000
subject57070000
subject14160000
subject19370000
subject85760000
subject96230000
subject41840000
subject41840000
subject55530000
subject14660000
subject98030000
subject14660000
subject61690000
subject43560000
subject43560000
subject43560000
subject43560000
subject43560000
subject26540000
subject43560000
subject84730000
subject18760000
subject91590000
subject20850000
subject66270000
subject66270000
subject66270000
subject41000000
subject10740000
subject66270000
subject42210

subject27260000
subject27260000
subject9730000
subject55530000
subject20150000
subject94790000
subject66270000
subject57070000
subject86290000
subject93130000
subject44760000
subject57470000
subject32470000
subject93950000
subject97870000
subject52690000
subject36910000
subject67880000
subject71250000
subject71350000
subject7510000
subject41520000
subject82600000
subject41520000
subject9150000
subject41260000
subject55530000
subject26130000
subject24680000
subject84880000
subject86290000
subject17290000
subject36910000
subject36910000
subject36910000
subject27260000
subject76470000
subject23130000
subject32750000
subject66270000
subject46040000
subject66270000
subject43560000
subject36910000
subject29050000
subject96560000
subject52980000
subject46250000
subject35670000
subject48780000
subject15880000
subject91500000
subject10740000
subject13150000
subject48780000
subject20150000
subject84620000
subject84880000
subject41000000
subject71350000
subject24160000
subject43520000
subject6347

subject41520000
subject46930000
subject7510000
subject7510000
subject54440000
subject14160000
subject96230000
subject96230000
subject41920000
subject39960000
subject7410000
subject11670000
subject23960000
subject74370000
subject70030000
subject70080000
subject4050000
subject49830000
subject16240000
subject43560000
subject11670000
subject11670000
subject51330000
subject3910000
subject14160000
subject18050000
subject85760000
subject52980000
subject43530000
subject89490000
subject89490000
subject43030000
subject64900000
subject84880000
subject27210000
subject55360000
subject55360000
subject55360000
subject98580000
subject2730000
subject78120000
subject27210000
subject78120000
subject26540000
subject7410000
subject40270000
subject11670000
subject96710000
subject87310000
subject5530000
subject87920000
subject45530000
subject26540000
subject21140000
subject9980000
subject86290000
subject49440000
subject68570000
subject26540000
subject59320000
subject11670000
subject23490000
subject93950000
s

subject53070000
subject49440000
subject38890000
subject66270000
subject81800000
subject26540000
subject53110000
subject79410000
subject52690000
subject36980000
subject2420000
subject11640000
subject82130000
subject48470000
subject11640000
subject11640000
subject66890000
subject32750000
subject2420000
subject2420000
subject98590000
subject14460000
subject8230000
subject23160000
subject61690000
subject88350000
subject63610000
subject41920000
subject36440000
subject14460000
subject79930000
subject7470000
subject870000
subject48470000
subject84730000
subject11650000
subject49830000
subject91610000
subject2450000
subject49830000
subject49830000
subject66270000
subject16240000
subject49440000
subject63610000
subject50640000
subject98560000
subject70030000
subject35150000
subject78330000
subject9980000
subject78330000
subject23490000
subject63470000
subject98380000
subject6620000
subject32750000
subject32750000
subject32750000
subject36440000
subject32750000
subject32750000
subject32750000
su

subject11640000
subject98580000
subject14660000
subject78330000
subject86940000
subject78330000
subject66580000
subject54440000
subject86940000
subject2420000
subject2420000
subject74200000
subject48490000
subject78900000
subject4080000
subject49290000
subject49290000
subject29010000
subject49290000
subject22000000
subject22000000
subject63470000
subject11640000
subject11640000
subject70210000
subject7410000
subject43560000
subject43560000
subject27210000
subject63530000
subject11640000
subject63470000
subject82600000
subject11640000
subject11640000
subject85430000
subject43560000
subject43560000
subject55030000
subject43560000
subject32750000
subject46410000
subject870000
subject2420000
subject66580000
subject39670000
subject91770000
subject46410000
subject59340000
subject41000000
subject66580000
subject26540000
subject1300000
subject46410000
subject14160000
subject35150000
subject36440000
subject7510000
subject7510000
subject7510000
subject7510000
subject7510000
subject36440000
subje

subject1850000
subject86940000
subject14660000
subject3700000
subject1850000
subject49290000
subject1850000
subject41500000
subject22000000
subject3700000
subject32750000
subject32750000
subject32750000
subject32750000
subject32750000
subject46790000
subject4050000
subject36440000
subject32750000
subject32750000
subject32750000
subject32750000
subject32750000
subject32750000
subject32750000
subject6570000
subject33040000
subject14660000
subject7510000
subject7510000
subject35670000
subject81800000
subject12240000
subject70210000
subject3700000
subject34930000
subject34930000
subject14460000
subject14660000
subject35800000
subject62050000
subject57600000
subject57600000
subject34930000
subject34930000
subject41500000
subject7510000
subject14460000
subject57870000
subject49440000
subject40290000
subject7510000
subject1390000
subject1390000
subject1390000
subject35800000
subject1390000
subject1390000
subject32430000
subject34930000
subject36440000
subject91500000
subject64900000
subject28

subject67450000
subject58250000
subject58250000
subject98590000
subject39960000
subject58860000
subject63030000
subject78900000
subject78900000
subject78900000
subject78900000
subject9150000
subject78900000
subject61690000
subject24680000
subject70700000
subject58250000
subject48490000
subject14480000
subject61680000
subject14480000
subject86290000
subject14480000
subject81800000
subject89860000
subject14480000
subject97870000
subject11640000
subject78330000
subject2420000
subject32430000
subject91590000
subject19480000
subject41830000
subject78330000
subject2420000
subject49440000
subject88700000
subject10740000
subject61420000
subject2420000
subject44760000
subject11640000
subject7510000
subject35800000
subject78330000
subject46930000
subject83920000
subject78330000
subject32750000
subject32750000
subject78330000
subject32750000
subject32750000
subject32750000
subject32750000
subject32750000
subject32750000
subject32750000
subject32750000
subject32750000
subject32750000
subject242000

subject32750000
subject32750000
subject32750000
subject32750000
subject50640000
subject32750000
subject32750000
subject32750000
subject32750000
subject32750000
subject63470000
subject13060000
subject96560000
subject7410000
subject22000000
subject57600000
subject48470000
subject46930000
subject22000000
subject45820000
subject89490000
subject660000
subject84880000
subject48490000
subject36440000
subject41830000
subject49290000
subject57600000
subject34930000
subject74370000
subject57470000
subject86940000
subject92400000
subject70210000
subject94790000
subject92400000
subject63470000
subject83480000
subject83380000
subject14480000
subject14480000
subject46250000
subject82780000
subject86940000
subject34200000
subject85760000
subject96230000
subject48470000
subject49440000
subject66580000
subject85760000
subject51840000
subject70700000
subject92400000
subject620000
subject22000000
subject41500000
subject94270000
subject23690000
subject48470000
subject30490000
subject94270000
subject425000

subject32750000
subject32750000
subject35800000
subject97960000
subject94790000
subject94790000
subject94790000
subject36440000
subject94940000
subject94790000
subject58860000
subject58860000
subject49500000
subject2420000
subject58860000
subject1390000
subject1390000
subject1390000
subject1390000
subject1390000
subject27550000
subject90590000
subject23840000
subject34930000
subject70210000
subject25590000
subject25590000
subject2730000
subject46380000
subject440000
subject76470000
subject52690000
subject43530000
subject73540000
subject62050000
subject870000
subject34930000
subject84880000
subject64900000
subject23840000
subject7510000
subject99230000
subject59340000
subject96560000
subject91610000
subject11200000
subject67330000
subject76660000
subject55530000
subject36440000
subject41260000
subject34930000
subject97960000
subject32400000
subject870000
subject870000
subject9980000
subject32750000
subject11670000
subject90590000
subject99230000
subject48490000
subject22470000
subject24

subject7510000
subject22470000
subject10740000
subject13980000
subject13980000
subject21800000
subject7510000
subject97770000
subject49830000
subject49830000
subject99230000
subject86940000
subject7510000
subject21800000
subject21800000
subject21800000
subject21800000
subject21800000
subject21800000
subject21800000
subject21800000
subject21800000
subject21800000
subject73160000
subject73160000
subject2420000
subject22470000
subject7510000
subject34930000
subject34930000
subject48880000
subject58250000
subject70700000
subject26540000
subject87920000
subject80840000
subject11640000
subject80840000
subject94850000
subject88700000
subject14480000
subject80840000
subject14480000
subject97870000
subject26540000
subject48490000
subject1390000
subject11650000
subject61690000
subject82600000
subject18920000
subject45410000
subject2420000
subject61690000
subject46040000
subject41260000
subject2420000
subject45410000
subject11650000
subject45410000
subject45410000
subject7510000
subject32430000
s

subject32750000
subject32750000
subject7510000
subject31660000
subject46410000
subject89260000
subject98560000
subject36440000
subject92400000
subject34770000
subject91590000
subject54440000
subject43530000
subject57770000
subject89860000
subject43530000
subject13150000
subject47680000
subject50270000
subject43530000
subject57770000
subject57770000
subject47910000
subject46120000
subject2520000
subject2520000
subject2520000
subject2520000
subject2520000
subject39190000
subject13150000
subject88350000
subject66420000
subject77400000
subject66420000
subject90570000
subject90570000
subject44460000
subject2520000
subject19290000
subject90590000
subject96230000
subject90590000
subject57770000
subject57770000
subject36910000
subject57770000
subject36910000
subject57770000
subject84340000
subject41000000
subject96230000
subject57770000
subject35150000
subject41000000
subject66420000
subject13150000
subject34930000
subject64900000
subject34930000
subject30990000
subject30990000
subject60740000

subject55530000
subject32750000
subject46930000
subject46250000
subject32750000
subject32750000
subject85020000
subject73540000
subject32750000
subject49290000
subject55870000
subject14480000
subject17850000
subject40970000
subject45030000
subject52690000
subject35980000
subject41600000
subject40970000
subject65340000
subject94730000
subject96230000
subject94940000
subject23690000
subject44460000
subject96710000
subject78870000
subject94730000
subject49440000
subject77460000
subject16920000
subject93130000
subject94730000
subject14460000
subject62050000
subject62050000
subject48470000
subject86940000
subject63710000
subject75730000
subject38890000
subject36440000
subject82640000
subject36440000
subject13150000
subject97960000
subject14160000
subject14160000
subject14160000
subject90130000
subject36440000
subject41600000
subject46040000
subject82640000
subject41820000
subject82600000
subject85760000
subject1390000
subject41820000
subject22000000
subject22000000
subject91590000
subject30

subject70090000
subject98560000
subject44460000
subject2420000
subject73540000
subject17850000
subject44460000
subject39960000
subject22470000
subject32750000
subject32750000
subject32750000
subject32750000
subject32750000
subject32750000
subject32750000
subject32750000
subject32750000
subject49440000
subject32750000
subject32750000
subject78330000
subject96330000
subject32750000
subject32750000
subject14460000
subject48470000
subject41000000
subject66420000
subject66420000
subject7510000
subject17850000
subject41000000
subject27260000
subject47680000
subject17850000
subject17850000
subject50640000
subject55480000
subject46380000
subject7470000
subject49710000
subject96230000
subject17870000
subject36910000
subject41500000
subject74200000
subject96750000
subject11640000
subject81190000
subject85500000
subject85500000
subject36440000
subject48470000
subject49290000
subject93400000
subject77460000
subject96330000
subject35800000
subject93400000
subject91640000
subject86940000
subject7746

subject80840000
subject32750000
subject38870000
subject7320000
subject49440000
subject88700000
subject620000
subject36910000
subject80840000
subject73540000
subject2420000
subject62010000
subject48780000
subject82600000
subject46040000
subject47530000
subject78900000
subject96230000
subject77520000
subject77520000
subject90130000
subject97340000
subject90320000
subject41830000
subject32750000
subject32430000
subject32750000
subject32750000
subject32750000
subject32750000
subject32750000
subject32750000
subject32750000
subject36910000
subject78330000
subject620000
subject2420000
subject82600000
subject77520000
subject52980000
subject77460000
subject98590000
subject41500000
subject36910000
subject44440000
subject77460000
subject36440000
subject35800000
subject41600000
subject14160000
subject53620000
subject98360000
subject59340000
subject78330000
subject32470000
subject29010000
subject620000
subject49710000
subject66580000
subject89990000
subject89990000
subject66580000
subject46790000
s

subject68380000
subject28660000
subject28660000
subject85300000
subject76470000
subject85300000
subject99230000
subject78330000
subject73540000
subject24160000
subject68570000
subject59550000
subject83490000
subject30490000
subject68380000
subject96230000
subject2520000
subject83800000
subject85300000
subject1390000
subject1390000
subject1390000
subject1390000
subject1390000
subject1390000
subject1390000
subject1390000
subject30490000
subject1390000
subject30490000
subject84340000
subject90130000
subject18760000
subject46380000
subject77460000
subject77460000
subject77460000
subject99230000
subject35880000
subject97860000
subject7510000
subject7510000
subject73540000
subject13150000
subject34200000
subject77460000
subject77460000
subject73540000
subject73540000
subject83490000
subject35800000
subject88120000
subject88120000
subject13980000
subject85300000
subject80920000
subject32750000
subject47680000
subject35880000
subject76470000
subject68380000
subject85300000
subject59900000
subj

subject80840000
subject7260000
subject66980000
subject41520000
subject96530000
subject7260000
subject59320000
subject38890000
subject66270000
subject7860000
subject76270000
subject90590000
subject13980000
subject58250000
subject48080000
subject81800000
subject34620000
subject620000
subject70210000
subject58250000
subject7320000
subject83800000
subject83800000
subject92460000
subject70700000
subject32430000
subject52140000
subject72390000
subject80840000
subject82640000
subject74370000
subject87920000
subject38870000
subject11200000
subject48470000
subject48470000
subject48470000
subject81800000
subject11200000
subject11200000
subject76270000
subject32430000
subject76270000
subject82130000
subject26540000
subject82640000
subject96710000
subject74370000
subject45310000
subject2420000
subject14660000
subject14460000
subject74370000
subject1390000
subject1390000
subject23690000
subject32430000
subject8230000
subject1390000
subject28660000
subject94790000
subject82780000
subject48470000
sub

subject35980000
subject86290000
subject26540000
subject26540000
subject78900000
subject49710000
subject24070000
subject85760000
subject70450000
subject94940000
subject2420000
subject59320000
subject24070000
subject90590000
subject26540000
subject60160000
subject46040000
subject79930000
subject83800000
subject6570000
subject620000
subject96750000
subject61680000
subject94790000
subject22470000
subject54440000
subject46930000
subject6570000
subject10740000
subject7320000
subject70700000
subject18940000
subject75760000
subject82130000
subject18940000
subject74370000
subject14460000
subject99950000
subject14460000
subject16240000
subject18940000
subject14460000
subject14460000
subject83480000
subject11710000
subject83480000
subject9960000
subject66540000
subject78330000
subject78330000
subject78330000
subject83480000
subject83480000
subject35980000
subject32430000
subject43530000
subject24680000
subject70700000
subject70700000
subject70700000
subject96520000
subject1390000
subject7260000
s

subject23840000
subject78330000
subject84880000
subject62140000
subject23840000
subject35800000
subject41920000
subject83800000
subject57690000
subject16340000
subject59320000
subject57690000
subject70210000
subject47160000
subject44460000
subject52690000
subject74750000
subject33230000
subject44460000
subject82600000
subject57690000
subject9980000
subject44460000
subject1390000
subject44460000
subject7320000
subject26140000
subject11710000
subject90320000
subject44460000
subject99230000
subject62050000
subject78440000
subject83490000
subject52120000
subject52120000
subject23960000
subject91610000
subject68380000
subject10790000
subject77460000
subject2420000
subject80660000
subject620000
subject44460000
subject80920000
subject43560000
subject59340000
subject24160000
subject2420000
subject4250000
subject52140000
subject47910000
subject44460000
subject44460000
subject80730000
subject98030000
subject98030000
subject13150000
subject36440000
subject23490000
subject77520000
subject46040000


subject94940000
subject68380000
subject61920000
subject7510000
subject68380000
subject93460000
subject65230000
subject77400000
subject94940000
subject82780000
subject44460000
subject78900000
subject82780000
subject620000
subject46380000
subject22000000
subject14160000
subject67180000
subject67180000
subject67180000
subject78330000
subject9980000
subject52140000
subject11650000
subject68380000
subject68380000
subject83800000
subject41600000
subject34930000
subject83800000
subject52140000
subject52140000
subject31240000
subject83800000
subject41600000
subject57600000
subject59320000
subject13600000
subject41600000
subject92400000
subject41600000
subject230000
subject75760000
subject7320000
subject32430000
subject23840000
subject13600000
subject13600000
subject13600000
subject57600000
subject13600000
subject13600000
subject94790000
subject55530000
subject41520000
subject39960000
subject620000
subject35800000
subject57690000
subject7510000
subject3700000
subject620000
subject620000
subject

subject1390000
subject85760000
subject27860000
subject55870000
subject96330000
subject1710000
subject46380000
subject68380000
subject48430000
subject77520000
subject16340000
subject90570000
subject46040000
subject92400000
subject52140000
subject20150000
subject99230000
subject52140000
subject53880000
subject870000
subject52140000
subject99230000
subject52140000
subject99230000
subject71040000
subject71040000
subject4250000
subject44780000
subject9150000
subject4250000
subject71040000
subject58860000
subject32750000
subject4250000
subject52140000
subject7860000
subject74750000
subject73540000
subject31240000
subject44780000
subject8860000
subject7260000
subject4540000
subject11650000
subject48470000
subject48470000
subject50640000
subject70700000
subject60020000
subject60020000
subject44340000
subject39960000
subject4540000
subject55870000
subject26540000
subject39960000
subject27860000
subject44780000
subject96090000
subject27860000
subject27860000
subject27860000
subject45310000
subje

subject41600000
subject620000
subject54830000
subject41600000
subject41600000
subject13060000
subject41600000


### Data Generator

In [31]:
class DataGenerator(Sequence):
    'Generates data for Keras'
    def __init__(self, user_level_data, subjects_split, set_type='train', bert_tokenizer=bert_tokenizer,
                 batch_size=hyperparams_features['batch_size'], seq_len=hyperparams_features['maxlen'], 
                 voc_size=hyperparams_features['max_features'], emotion_lexicon=nrc_lexicon,
                 emotions=emotions, pronouns=["i", "me", "my", "mine", "myself"], 
                 max_posts_per_user=hyperparams_features['posts_per_user'],
                 shuffle=True):
        'Initialization'
        self.seq_len = seq_len
        self.bert_tokenizer = bert_tokenizer
        self.subjects_split = subjects_split
        self.set = set_type
        self.emotion_lexicon = emotion_lexicon
        self.batch_size = batch_size
        self.data = user_level_data
        self.emotions = emotions
        self.pronouns = pronouns
        self.shuffle = shuffle
        self.voc_size = voc_size
        self.max_posts_per_user = max_posts_per_user
        self.on_epoch_end()

    def __encode_text(self, tokens, raw_text):
        # Using voc_size-1 value for OOV token
        encoded_tokens = [vocabulary.get(w, self.voc_size-1) for w in tokens]
        encoded_emotions = encode_emotions(tokens, self.emotion_lexicon, self.emotions)
        encoded_pronouns = encode_pronouns(tokens, self.pronouns)
        encoded_stopwords = encode_stopwords(tokens)
        bert_ids, bert_masks, bert_segments, label = encode_text_for_bert(self.bert_tokenizer, InputExample(None, 
                                               raw_text), self.seq_len)
        return (encoded_tokens, encoded_emotions, encoded_pronouns, encoded_stopwords,
               bert_ids, bert_masks, bert_segments)
        
    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.subjects_split[self.set]) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        user_indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        # Find users
        users = [self.subjects_split[self.set][i] for i in user_indexes
                    if self.subjects_split[self.set][i] in self.data.keys()] # TODO: maybe needs a warning that user is missing

        post_indexes = {}
        # Sample post ids
        for subject in users:
            posts_len = len(self.data[subject]['texts'])
            posts_index_sample = sorted(np.random.choice(posts_len, 
                                                         min(self.max_posts_per_user, posts_len),
                                                         replace=False))
            post_indexes[subject] = posts_index_sample
        # Generate data
        X, y = self.__data_generation(users, post_indexes)

        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.subjects_split[self.set]))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, users, post_indexes):
        'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
        tokens_data = []
        categ_data = []
        sparse_data = []
        subjects = []
        bert_ids_data = []
        bert_masks_data = []
        bert_segments_data = []
        labels = []

        for subject in users:
            texts = self.data[subject]['texts']
            raw_texts = self.data[subject]['raw']
            label = self.data[subject]['label']
            liwc_scores = self.data[subject]['liwc']
            
            # Sample
            texts = [texts[i] for i in post_indexes[subject]]
            liwc_selection = [liwc_scores[i] for i in post_indexes[subject]]
            raw_texts = [raw_texts[i] for i in post_indexes[subject]]
            
            all_words = [sum(texts, [])] # merge all texts in one list
            liwc_aggreg = [np.array(liwc_selection).mean(axis=0).tolist()]
            all_raw_texts = [" ".join(raw_texts)]
            
            for i, words in enumerate(all_words):
                encoded_tokens, encoded_emotions, encoded_pronouns, encoded_stopwords, \
                    bert_ids, bert_masks, bert_segments = self.__encode_text(words, all_raw_texts[i])
                subject_id = int(subject.split('t')[1])
                tokens_data.append(encoded_tokens)
                categ_data.append(encoded_emotions + [encoded_pronouns] + liwc_aggreg[i])
                sparse_data.append(encoded_stopwords)
                bert_ids_data.append(bert_ids)
                bert_masks_data.append(bert_masks)
                bert_segments_data.append(bert_segments)
                
                labels.append(label)
                subjects.append(subject_id)

        
        # using zeros for padding
        tokens_data_padded = sequence.pad_sequences(tokens_data, maxlen=self.seq_len)

        return ([np.array(tokens_data_padded), np.array(categ_data), np.array(sparse_data),
                 np.array(bert_ids_data), np.array(bert_masks_data), np.array(bert_segments_data),
                np.array(subjects)],
                np.array(labels))


In [32]:
class DataGeneratorHierarchical(Sequence):
    'Generates data for Keras'
    def __init__(self, user_level_data, subjects_split, set_type='train',
                 batch_size=hyperparams_features['batch_size'], seq_len=hyperparams_features['maxlen'], 
                 voc_size=hyperparams_features['max_features'], emotion_lexicon=nrc_lexicon,
                 emotions=emotions, pronouns=["i", "me", "my", "mine", "myself"], 
                 max_posts_per_user=hyperparams_features['posts_per_user'], stopwords=stopword_list,
                 liwc_categories=categories, bert_tokenizer=bert_tokenizer,
                 shuffle=True):
        'Initialization'
        self.seq_len = seq_len
        self.subjects_split = subjects_split
        self.set = set_type
        self.emotion_lexicon = emotion_lexicon
        self.bert_tokenizer = bert_tokenizer
        self.batch_size = batch_size
        self.data = user_level_data
        self.emotions = emotions
        self.pronouns = pronouns
        self.shuffle = shuffle
        self.voc_size = voc_size
        self.max_posts_per_user = max_posts_per_user
        self.categ_dim = len(emotions) + 1 + len(liwc_categories)
        self.sparse_dim = len(stopwords)
        if not shuffle:
            # Sort users so that similar post length users will be in the same batch
            self.subjects_split[self.set] = sorted(self.subjects_split[self.set],
                                                  key = lambda s: len(self.data[s]['texts'] if s in self.data
                                                               else 0))
        self.on_epoch_end()

    def __encode_text(self, tokens, raw_text):
        # Using voc_size-1 value for OOV token
        encoded_tokens = [vocabulary.get(w, self.voc_size-1) for w in tokens]
        encoded_emotions = encode_emotions(tokens, self.emotion_lexicon, self.emotions)
        encoded_pronouns = encode_pronouns(tokens, self.pronouns)
        encoded_stopwords = encode_stopwords(tokens)
        bert_ids, bert_masks, bert_segments, label = encode_text_for_bert(self.bert_tokenizer, InputExample(None, 
                                               raw_text), self.seq_len)
        return (encoded_tokens, encoded_emotions, encoded_pronouns, encoded_stopwords,
               bert_ids, bert_masks, bert_segments)
        
    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.subjects_split[self.set]) / self.batch_size)) + 1 # + 1 to not discard last batch

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        user_indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
#         if len(user_indexes)<self.batch_size:
#             return
        # Find users
        users = [self.subjects_split[self.set][i] for i in user_indexes
                    if self.subjects_split[self.set][i] in self.data.keys()] # TODO: maybe needs a warning that user is missing

        post_indexes = {}
        # Sample post ids
        for subject in users:
            posts_len = len(self.data[subject]['texts'])
            posts_index_sample = sorted(np.random.choice(posts_len, 
                                                         min(self.max_posts_per_user, posts_len),
                                                         replace=False))
            post_indexes[subject] = posts_index_sample
        # Generate data
        X, y = self.__data_generation(users, post_indexes)

        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.subjects_split[self.set]))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, users, post_indexes):
        'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
        user_tokens = []
        user_categ_data = []
        user_sparse_data = []
        user_bert_ids_data = []
        user_bert_masks_data = []
        user_bert_segments_data = []
        
        labels = []
        for subject in users:
            tokens_data = []
            categ_data = []
            sparse_data = []
            bert_ids_data = []
            bert_masks_data = []
            bert_segments_data = []
            
            texts = self.data[subject]['texts']
            raw_texts = self.data[subject]['raw']
            label = self.data[subject]['label']
            liwc_scores = self.data[subject]['liwc']
            
#             if len(texts) < self.max_posts_per_user:
#                 # TODO: pad with zeros
#                 pass
  
            for i in post_indexes[subject]:
                raw_text = raw_texts[i]
                words = texts[i]
                liwc = liwc_scores[i]
                encoded_tokens, encoded_emotions, encoded_pronouns, encoded_stopwords, \
                    bert_ids, bert_masks, bert_segments = self.__encode_text(words, raw_text)
                subject_id = int(subject.split('t')[1])
                tokens_data.append(encoded_tokens)
                # using zeros for padding
                # TODO: there is something wrong with this
                categ_data.append(encoded_emotions + [encoded_pronouns] + liwc)
                sparse_data.append(encoded_stopwords)
                bert_ids_data.append(bert_ids)
                bert_masks_data.append(bert_masks)
                bert_segments_data.append(bert_segments)
            tokens_data_padded = np.array(sequence.pad_sequences(tokens_data, maxlen=self.seq_len))
            user_tokens.append(tokens_data_padded)

            user_categ_data.append(categ_data)
            user_sparse_data.append(sparse_data)
            
            user_bert_ids_data.append(bert_ids_data)
            user_bert_masks_data.append(bert_masks_data)
            user_bert_segments_data.append(bert_segments_data)

            labels.append(label)

        user_tokens = sequence.pad_sequences(user_tokens, value=np.zeros(self.seq_len))
        user_tokens = np.rollaxis(np.dstack(user_tokens), -1)
        
        user_categ_data = sequence.pad_sequences(user_categ_data,  maxlen=self.max_posts_per_user)
        user_categ_data = np.rollaxis(np.dstack(user_categ_data), -1
                                     )
        user_sparse_data = sequence.pad_sequences(user_sparse_data,  maxlen=self.max_posts_per_user)
        user_sparse_data = np.rollaxis(np.dstack(user_sparse_data), -1)
        
        user_bert_ids_data = sequence.pad_sequences(user_bert_ids_data,  maxlen=self.max_posts_per_user)
        user_bert_ids_data = np.rollaxis(np.dstack(user_bert_ids_data), -1)
        
        user_bert_masks_data = sequence.pad_sequences(user_bert_masks_data,  maxlen=self.max_posts_per_user)
        user_bert_masks_data = np.rollaxis(np.dstack(user_bert_masks_data), -1)
        
        user_bert_segments_data = sequence.pad_sequences(user_bert_segments_data, maxlen=self.max_posts_per_user)
        user_bert_segments_data = np.rollaxis(np.dstack(user_bert_segments_data), -1)
        
        return ((user_tokens, user_categ_data, user_sparse_data, 
                 user_bert_ids_data, user_bert_masks_data, user_bert_segments_data),
                np.array(labels))


In [33]:
logger.setLevel(logging.DEBUG)

# TODO: it is skipping the last batch
x_data = {'train': [], 'valid': [], 'test': []}
y_data = {'train': [], 'valid': [], 'test': []}
for set_type in ['train', 'valid', 'test']:
    total_positive = 0
    for x, y in DataGenerator(user_level_data, subjects_split, 
                                          set_type=set_type):
        total_positive += pd.Series(y).sum()
        x_data[set_type].append(x)
        y_data[set_type].append(y)
    logger.info("%d %s positive examples\n" % (total_positive, set_type))


37 train positive examples



I0306 17:19:54.953492 139706427881216 <ipython-input-33-991684b1f272>:13] 37 train positive examples



22 valid positive examples



I0306 17:19:58.114716 139706427881216 <ipython-input-33-991684b1f272>:13] 22 valid positive examples



70 test positive examples



I0306 17:20:17.223898 139706427881216 <ipython-input-33-991684b1f272>:13] 70 test positive examples



In [34]:
x_data['train'][0][6].shape

(32,)

In [35]:
encoded_for_bert = encode_text_for_bert(bert_tokenizer, InputExample(None, 
                                               "Ana are mere"), 200)

In [36]:
ids, masks, segments, label = encoded_for_bert

In [37]:
# class_weights = class_weight.compute_class_weight('balanced',
#                                                  np.unique(y_data['train']),
#                                                  y_data['train'])
# class_weights

In [38]:
def load_embeddings(path, embedding_dim, voc):
    # random matrix with mean value = 0
    embedding_matrix = np.random.random((len(voc)+2, embedding_dim)) - 0.5 # voc + unk + pad value(0)

    f = open(path)
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_i = voc.get(word)
        if word_i is not None:
            embedding_matrix[word_i] = coefs
    f.close()

    print('Total %s word vectors.' % len(embedding_matrix))

 
    return embedding_matrix

pretrained_embeddings_path = root_dir + '/resources/glove.twitter.27B/glove.twitter.27B.%dd.txt' % hyperparams_features['embedding_dim']
embedding_matrix = load_embeddings(pretrained_embeddings_path, hyperparams_features['embedding_dim'], vocabulary)


Total 40000 word vectors.


## Define model

In [40]:
hyperparams = {
    'lstm_units': 256,
    'lstm_units_user': 10,
    'dense_bow_units': 20,
    'bert_dense_units': 256,
    'bert_finetune_layers': 0,
    'dropout': 0.0,
    'l2_dense': 0.0000011,
    'l2_embeddings': 0.000001,
    'dense_sentence_units': 50,
    'dense_user_units': 50,
    'optimizer': None,#'adam',
    'decay': 0.0001,
    'lr': 0.001,
    "trainable_embeddings": True,
    "reduce_lr_factor": 0.0002,
    "reduce_lr_patience": 20,
    "freeze_patience": 500,
    'threshold': 0.5,
    'ignore_layer': ['batchnorm'],
    'norm_momentum': 0.1,
    'hierarchical': False,

}
if not hyperparams['optimizer']:
    hyperparams['optimizer'] = optimizers.Adam(lr=hyperparams['lr'], #beta_1=0.9, beta_2=0.999, epsilon=0.0001,
                                   decay=hyperparams['decay'])

In [41]:
class Metrics():
    def __init__(self, threshold=0.5):
        self.threshold=threshold
        
    def recall_m(self, y_true, y_pred):
            y_labels = y_true
            y_pred = K.cast(K.greater(K.clip(y_pred, 0, 1), self.threshold), K.floatx())        
            possible_positives = K.sum(K.round(K.clip(y_labels, 0, 1)))
            true_positives = K.sum(K.round(K.clip(y_labels * y_pred, 0, 1)))
            recall = true_positives / (possible_positives + K.epsilon())
            return recall

    def precision_m(self, y_true, y_pred):
            y_labels = y_true
            y_pred = K.cast(K.greater(K.clip(y_pred, 0, 1), self.threshold), K.floatx())        
            true_positives = K.sum(K.round(K.clip(y_labels * y_pred, 0, 1)))
            predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
            precision = true_positives / (predicted_positives + K.epsilon())
            return precision

    def f1_m(self, y_true, y_pred):
        precision = self.precision_m(y_true, y_pred)
        recall = self.recall_m(y_true, y_pred)
        return 2*((precision*recall)/(precision+recall+K.epsilon()))

def binary_crossentropy_custom(y_true, y_pred):
    y_labels = y_true
    return K.binary_crossentropy(y_labels, 
                                 y_pred)

metrics_class = Metrics(threshold=hyperparams['threshold'])

In [42]:
class BertLayer(tf.keras.layers.Layer):
    def __init__(
        self,
        n_fine_tune_layers=10,
        pooling="first",
        bert_path="https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1", 
        **kwargs
    ):
        self.n_fine_tune_layers = n_fine_tune_layers
        self.trainable = True
        self.output_size = 768
        self.pooling = pooling
        self.bert_path = bert_path
        if self.pooling not in ["first", "mean"]:
            raise NameError(
               "Undefined pooling type (must be either first or mean, but is %s)" % self.pooling
            )

        super(BertLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.bert = hub.Module(
            self.bert_path, trainable=self.trainable, name="%s_module" % self.name
        )

        # Remove unused layers
        trainable_vars = self.bert.variables
        if self.pooling == "first":
            trainable_vars = [var for var in trainable_vars if not "/cls/" in var.name]
            trainable_layers = ["pooler/dense"]

        elif self.pooling == "mean":
            trainable_vars = [
                var
                for var in trainable_vars
                if not "/cls/" in var.name and not "/pooler/" in var.name
            ]
            trainable_layers = []
        else:
            raise NameError(
                "Undefined pooling type (must be either first or mean, but is %s)" % self.pooling
            )

        # Select how many layers to fine tune
        for i in range(self.n_fine_tune_layers):
            trainable_layers.append("encoder/layer_%s" % str(11 - i))

        # Update trainable vars to contain only the specified layers
        trainable_vars = [
            var
            for var in trainable_vars
            if any([l in var.name for l in trainable_layers])
        ]

        # Add to trainable weights
        for var in trainable_vars:
            self._trainable_weights.append(var)

        for var in self.bert.variables:
            if var not in self._trainable_weights:
                self._non_trainable_weights.append(var)

        super(BertLayer, self).build(input_shape)

    def call(self, inputs):
        inputs = [K.cast(x, dtype="int32") for x in inputs]
        input_ids, input_mask, segment_ids = inputs
        bert_inputs = dict(
            input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids
        )
        if self.pooling == "first":
            pooled = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
                "pooled_output"
            ]
        elif self.pooling == "mean":
            result = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
                "sequence_output"
            ]

            mul_mask = lambda x, m: x * tf.expand_dims(m, axis=-1)
            masked_reduce_mean = lambda x, m: tf.reduce_sum(mul_mask(x, m), axis=1) / (
                    tf.reduce_sum(m, axis=1, keepdims=True) + 1e-10)
            input_mask = tf.cast(input_mask, tf.float32)
            pooled = masked_reduce_mean(result, input_mask)
        else:
            raise NameError("Undefined pooling type (must be either first or mean, but is %s)" % self.pooling)

        return pooled

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.output_size)

In [43]:
def build_model(hyperparams, hyperparams_features, embedding_matrix, emotions, stopwords_list,
                liwc_categories,
               ignore_layer=[]):

    tokens_features = Input(shape=(hyperparams_features['maxlen'],), name='word_seq')
    embedding_layer = Embedding(hyperparams_features['max_features'], 
                                hyperparams_features['embedding_dim'], 
                                input_length=hyperparams_features['maxlen'],
                                embeddings_regularizer=regularizers.l2(hyperparams['l2_embeddings']),
                                weights=[embedding_matrix], 
                                trainable=hyperparams['trainable_embeddings'],
                               name='embeddings_layer')(
        tokens_features)
#     if 'batchnorm' not in ignore_layer:
#         embedding_layer_norm = BatchNormalization(axis=1, momentum=hyperparams['norm_momentum'],
#                                                      name='embeddings_layer_norm')(embedding_layer)
#     lstm_layers = Bidirectional(LSTM(hyperparams['lstm_units']))(embedding_layer)

    if tf.test.is_gpu_available():
        lstm_layers = CuDNNLSTM(hyperparams['lstm_units'], 
                                return_sequences='attention' not in ignore_layer, # only True if using attention
                      name='LSTM_layer')(embedding_layer)
    else:
        lstm_layers = LSTM(hyperparams['lstm_units'], 
                           return_sequences='attention' not in ignore_layer,
                      name='LSTM_layer')(embedding_layer)
    
    # Attention
    if 'attention' not in ignore_layer:
        attention = Dense(1, activation='tanh', name='attention')(lstm_layers)
        attention = Flatten()(attention)
        attention = Activation('softmax')(attention)
        attention = RepeatVector(hyperparams['lstm_units'])(attention)
        attention = Permute([2, 1])(attention)

        sent_representation = Multiply()([lstm_layers, attention])
        sent_representation = Lambda(lambda xin: K.sum(xin, axis=1), 
                                     output_shape=(hyperparams['lstm_units'],)
                                    )(sent_representation)

        
    else:
        sent_representation = lstm_layers
        
    
    sent_representation = Dropout(hyperparams['dropout'], name='lstm_att_dropout')(sent_representation)
    if hyperparams['dense_sentence_units']:
        sent_representation = Dense(units=hyperparams['dense_sentence_units'],
                                   name='dense_sent_representation')(sent_representation)
    
    # Other features
    numerical_features = Input(shape=(len(emotions) + 1 + len(liwc_categories),), name='numeric_input') # emotions and pronouns
    dense_layer = Dense(units=1,
                        kernel_regularizer=regularizers.l2(hyperparams['l2_dense']),
                        name='numerical_dense_layer',
                       )(numerical_features)
    sparse_features = Input(shape=(len(stopwords_list),), name='sparse_input') # stopwords

    dense_layer_sparse = Dense(units=hyperparams['dense_bow_units'],
                              name='sparse_feat_dense_layer',
                                kernel_regularizer=regularizers.l2(hyperparams['l2_dense']),
                              )(sparse_features)
    
    # BERT encoder
    in_id_bert = Input(shape=(hyperparams_features['maxlen'],), name="input_ids_bert")
    in_mask_bert = Input(shape=(hyperparams_features['maxlen'],), name="input_masks_bert")
    in_segment_bert = Input(shape=(hyperparams_features['maxlen'],), name="segment_ids_bert")
    bert_inputs = [in_id_bert, in_mask_bert, in_segment_bert]
    
    bert_output = BertLayer(n_fine_tune_layers=hyperparams['bert_finetune_layers'], pooling="first")(bert_inputs)
    dense_bert = Dense(hyperparams['bert_dense_units'], activation='relu')(bert_output)
    
    # Batch normalization
    if 'batchnorm' not in ignore_layer:
        numerical_features_norm = BatchNormalization(axis=1, momentum=hyperparams['norm_momentum'],
                                                     name='numerical_features_norm')(numerical_features)
        sent_representation_norm = BatchNormalization(axis=1, momentum=hyperparams['norm_momentum'],
                                                      name='sent_repr_norm')(sent_representation)
        dense_layer_sparse_norm = BatchNormalization(axis=1, momentum=hyperparams['norm_momentum'],
                                                     name='sparse_features_norm')(dense_layer_sparse)
        dense_bert_norm = BatchNormalization(axis=1, momentum=hyperparams['norm_momentum'],
                                                     name='bert_layer_norm_norm')(dense_bert)
        
    subjects = Input(shape=(1,), name='subjects')
    

    all_layers = {
        'lstm_layers': sent_representation,
        'numerical_dense_layer': numerical_features,
        'sparse_feat_dense_layer': dense_layer_sparse,
        'bert_layer': dense_bert,
    }
    if 'batchnorm' not in ignore_layer:
        all_layers = {
            'lstm_layers': sent_representation_norm,
            'numerical_dense_layer': numerical_features_norm,
            'sparse_feat_dense_layer': dense_layer_sparse_norm,
            'bert_layer': dense_bert_norm
        }
    layers_to_merge = []
    for n, l in all_layers.items():
        if n in ignore_layer:
            continue
        layers_to_merge.append(l)
        
    if len(layers_to_merge) == 1:
        merged_layers = layers_to_merge[0]
    else:
        merged_layers = concatenate(layers_to_merge)
    output_layer = Dense(1, activation='sigmoid',
                         name='output_layer',
                        kernel_regularizer=regularizers.l2(hyperparams['l2_dense']))(merged_layers)

    # Compile model
    model = Model(inputs=[tokens_features, numerical_features, sparse_features, 
                          in_id_bert, in_mask_bert, in_segment_bert,
                          subjects], 
                  outputs=output_layer)

    model.compile(hyperparams['optimizer'], binary_crossentropy_custom,
                  metrics=[metrics_class.f1_m, metrics_class.precision_m, metrics_class.recall_m])
    return model



In [44]:
def build_hierarchical_model(hyperparams, hyperparams_features, embedding_matrix, emotions, stopwords_list,
                liwc_categories,
               ignore_layer=[]):

    # Post/sentence representation - word sequence
    tokens_features = Input(shape=(hyperparams_features['maxlen'],), name='word_seq')
    embedding_layer = Embedding(hyperparams_features['max_features'], 
                                hyperparams_features['embedding_dim'], 
                                input_length=hyperparams_features['maxlen'],
                                embeddings_regularizer=regularizers.l2(hyperparams['l2_embeddings']),
                                weights=[embedding_matrix], 
                                trainable=hyperparams['trainable_embeddings'],
                               name='embeddings_layer')(
        tokens_features)
    
    if tf.test.is_gpu_available():
        lstm_layers = CuDNNLSTM(hyperparams['lstm_units'], 
                                return_sequences='attention' not in ignore_layer, # only True if using attention
                      name='LSTM_layer')(embedding_layer)
    else:
        lstm_layers = LSTM(hyperparams['lstm_units'], 
                           return_sequences='attention' not in ignore_layer,
                      name='LSTM_layer')(embedding_layer)
    
    # Attention
    if 'attention' not in ignore_layer:
        attention = Dense(1, activation='tanh', name='attention')(lstm_layers)
        attention = Flatten()(attention)
        attention = Activation('softmax')(attention)
        attention = RepeatVector(hyperparams['lstm_units'])(attention)
        attention = Permute([2, 1])(attention)

        sent_representation = Multiply()([lstm_layers, attention])
        sent_representation = Lambda(lambda xin: K.sum(xin, axis=1), 
                                     output_shape=(hyperparams['lstm_units'],)
                                    )(sent_representation)       
    else:
        sent_representation = lstm_layers
    
    if 'batchnorm' not in ignore_layer:
        sent_representation = BatchNormalization(axis=1, momentum=hyperparams['norm_momentum'],
                                                          name='sent_repr_norm')(sent_representation)
    sent_representation = Dropout(hyperparams['dropout'], name='lstm_att_dropout')(sent_representation)


    # Hierarchy
    sentEncoder = Model(inputs=tokens_features, 
                        outputs=sent_representation)
    sentEncoder.summary()

    posts_history_input = Input(shape=(hyperparams_features['posts_per_user'], 
                                 hyperparams_features['maxlen']
                                      ), name='hierarchical_word_seq_input')

    user_encoder = TimeDistributed(sentEncoder, name='user_encoder')(posts_history_input)    
        
    # BERT encoder
    in_id_bert = Input(shape=(hyperparams_features['maxlen'],), name="input_ids_bert")
    in_mask_bert = Input(shape=(hyperparams_features['maxlen'],), name="input_masks_bert")
    in_segment_bert = Input(shape=(hyperparams_features['maxlen'],), name="segment_ids_bert")
    bert_inputs = [in_id_bert, in_mask_bert, in_segment_bert]
    
    bert_output = BertLayer(n_fine_tune_layers=hyperparams['bert_finetune_layers'], pooling="first")(bert_inputs)
    dense_bert = Dense(hyperparams['bert_dense_units'], activation='relu')(bert_output)
    bertSentEncoder = Model(bert_inputs, dense_bert)

    
    in_id_bert_history = Input(shape=(hyperparams_features['posts_per_user'],
                                                      hyperparams_features['maxlen'],), name="input_ids_bert_hist")
    in_mask_bert_history = Input(shape=(hyperparams_features['posts_per_user'],
                                                        hyperparams_features['maxlen'],), name="input_masks_bert_hist")
    in_segment_bert_history = Input(shape=(hyperparams_features['posts_per_user'],
                                                           hyperparams_features['maxlen'],), name="segment_ids_bert_hist")
    bert_inputs_history = [in_id_bert_history, in_mask_bert_history, in_segment_bert_history]
    bert_inputs_concatenated = concatenate(bert_inputs_history)
    inputs_indices = [hyperparams_features['maxlen']*i for i in range(3)]
    # slice the input in equal slices on the last dimension
    bert_encoder_layer = TimeDistributed(Lambda(lambda x: bertSentEncoder([x[:,inputs_indices[0]:inputs_indices[1]], 
                                                                  x[:,inputs_indices[1]:inputs_indices[2]],
                                                                          x[:,inputs_indices[2]:]])))(
                        bert_inputs_concatenated)
    bertUserEncoder = Model(bert_inputs_history, bert_encoder_layer)
    
    bert_user_encoder = bertUserEncoder(bert_inputs_history)
    
    # Other features 
    numerical_features_history = Input(shape=(
            hyperparams_features['posts_per_user'],
            len(emotions) + 1 + len(liwc_categories)
        ), name='numeric_input_hist') # emotions and pronouns
    sparse_features_history = Input(shape=(
            hyperparams_features['posts_per_user'],
            len(stopwords_list)
        ), name='sparse_input_hist') # stopwords
    
    
    dense_layer_sparse = Dense(units=hyperparams['dense_bow_units'],
                              name='sparse_feat_dense_layer',
                                kernel_regularizer=regularizers.l2(hyperparams['l2_dense']),
                              )
    dense_layer_sparse_user = TimeDistributed(dense_layer_sparse)(sparse_features_history)

    
    # Concatenate features
    if 'batchnorm' not in ignore_layer:
        numerical_features_history_norm = BatchNormalization(axis=1, momentum=hyperparams['norm_momentum'],
                                                     name='numerical_features_norm')(numerical_features_history)
        dense_layer_sparse_user = BatchNormalization(axis=1, momentum=hyperparams['norm_momentum'],
                                                     name='sparse_features_norm')(dense_layer_sparse_user)
    all_layers = {
        'lstm_layers': user_encoder,
        'bert_layers': bert_user_encoder,
        'numerical_dense_layer': numerical_features_history if 'batchnorm' in ignore_layer \
                    else numerical_features_history_norm,
        'sparse_feat_dense_layer': dense_layer_sparse_user,
    }
    
    layers_to_merge = [l for n,l in all_layers.items() if n not in ignore_layer]
    print(layers_to_merge)
    if len(layers_to_merge) == 1:
        merged_layers = layers_to_merge[0]
    else:
        merged_layers = concatenate(layers_to_merge)
    
    if tf.test.is_gpu_available():
        lstm_user_layers = CuDNNLSTM(hyperparams['lstm_units_user'], 
                                return_sequences='attention_user' not in ignore_layer, # only True if using attention
                      name='LSTM_layer_user')(merged_layers)
    else:
        lstm_user_layers = LSTM(hyperparams['lstm_units_user'], 
                           return_sequences='attention_user' not in ignore_layer,
                      name='LSTM_layer_user')(merged_layers)
    
    # Attention
    if 'attention' not in ignore_layer:
        attention_user = Dense(1, activation='tanh', name='attention_user')(lstm_user_layers)
        attention_user = Flatten()(attention_user)
        attention_user = Activation('softmax')(attention_user)
        attention_user = RepeatVector(hyperparams['lstm_units_user'])(attention_user)
        attention_user = Permute([2, 1])(attention_user)

        user_representation = Multiply()([lstm_user_layers, attention_user])
        user_representation = Lambda(lambda xin: K.sum(xin, axis=1), 
                                     output_shape=(hyperparams['lstm_units_user'],)
                                    )(user_representation)     
    else:
        user_representation = lstm_user_layers
    
    user_representation = Dropout(hyperparams['dropout'], name='lstm_att_dropout_user')(user_representation)
    
    
    if hyperparams['dense_user_units']:
        user_representation = Dense(units=hyperparams['dense_user_units'],
                                   name='dense_user_representation')(user_representation)
    
    # TODO: concatenate before hierarchy? (include all features in th hierarchy)
    
    output_layer = Dense(1, activation='sigmoid',
                         name='output_layer',
                        kernel_regularizer=regularizers.l2(hyperparams['l2_dense']))(user_representation)

    # Compile model
    hierarchical_model = Model(inputs=[posts_history_input, 
                                       numerical_features_history, sparse_features_history,
                                      in_id_bert_history, in_mask_bert_history, in_segment_bert_history], 
                  outputs=output_layer)
    hierarchical_model.summary()
    
    hierarchical_model.compile(hyperparams['optimizer'], binary_crossentropy_custom,
                  metrics=[metrics_class.f1_m, metrics_class.precision_m, metrics_class.recall_m])
    return hierarchical_model



In [45]:
model = build_model(hyperparams, hyperparams_features, embedding_matrix, emotions, stopword_list,
                    liwc_categories=[c for c in categories if c in writings_df.columns]
,
                   ignore_layer=hyperparams['ignore_layer'])
model.summary()

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


W0306 17:35:30.733788 139706427881216 deprecation.py:506] From /usr/local/tensorflow/python3.5/1.13.1/lib/python3.5/site-packages/tensorflow/python/keras/layers/core.py:143: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0306 17:35:32.987660 139706427881216 saver.py:1483] Saver not created because there are no variables in the graph to restore


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
word_seq (InputLayer)           (None, 512)          0                                            
__________________________________________________________________________________________________
embeddings_layer (Embedding)    (None, 512, 100)     4000000     word_seq[0][0]                   
__________________________________________________________________________________________________
LSTM_layer (LSTM)               (None, 512, 256)     365568      embeddings_layer[0][0]           
__________________________________________________________________________________________________
attention (Dense)               (None, 512, 1)       257         LSTM_layer[0][0]                 
__________________________________________________________________________________________________
flatten (F

In [46]:
hierarchical_model = build_hierarchical_model(hyperparams, hyperparams_features, embedding_matrix, emotions, stopword_list,
                    liwc_categories=[c for c in categories if c in writings_df.columns]
,
                   ignore_layer=hyperparams['ignore_layer'])

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
word_seq (InputLayer)           (None, 512)          0                                            
__________________________________________________________________________________________________
embeddings_layer (Embedding)    (None, 512, 100)     4000000     word_seq[0][0]                   
__________________________________________________________________________________________________
LSTM_layer (LSTM)               (None, 512, 256)     365568      embeddings_layer[0][0]           
__________________________________________________________________________________________________
attention (Dense)               (None, 512, 1)       257         LSTM_layer[0][0]                 
__________________________________________________________________________________________________
flatten_1 

I0306 17:35:40.723263 139706427881216 saver.py:1483] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0306 17:35:41.369130 139706427881216 saver.py:1483] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0306 17:35:42.025110 139706427881216 saver.py:1483] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0306 17:35:42.704556 139706427881216 saver.py:1483] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0306 17:35:43.401958 139706427881216 saver.py:1483] Saver not created because there are no variables in the graph to restore


[<tf.Tensor 'time_distributed_1/Reshape_1:0' shape=(?, 50, 20) dtype=float32>, <tf.Tensor 'model_3/time_distributed/Reshape_1:0' shape=(?, 50, 256) dtype=float32>, <tf.Tensor 'numeric_input_hist:0' shape=(?, 50, 75) dtype=float32>, <tf.Tensor 'user_encoder/Reshape_1:0' shape=(?, 50, 256) dtype=float32>]
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
sparse_input_hist (InputLayer)  (None, 50, 179)      0                                            
__________________________________________________________________________________________________
input_ids_bert_hist (InputLayer (None, 50, 512)      0                                            
__________________________________________________________________________________________________
input_masks_bert_hist (InputLay (None, 50, 512)      0                                            
__

In [47]:
# plot_model(model, 'models/sequential_bert_model.png')

In [48]:
initialize_vars(sess)

In [49]:
experiment = Experiment(api_key="eoBdVyznAhfg3bK9pZ58ZSXfv",
                        project_name="mental", workspace="ananana", disabled=False)

experiment.log_parameters(hyperparams_features)

experiment.log_parameter('emotion_lexicon', nrc_lexicon_path)
experiment.log_parameter('emotions', emotions)
experiment.log_parameter('embeddings_path', pretrained_embeddings_path)
if 'subset' in writings_df.columns:
    experiment.add_tag('anorexia')

experiment.log_parameters(hyperparams)

COMET INFO: old comet version (3.0.3) detected. current: 3.1.1 please update your comet lib with command: `pip install --no-cache-dir --upgrade comet_ml`
COMET INFO: Experiment is live on comet.ml https://www.comet.ml/ananana/mental/0b0c4ee7f9c14c13a2a61f7cd5342c4b



## Train

In [50]:
class WeightsHistory(callbacks.Callback):
    def on_train_begin(self, logs={}):
        self.log_weights(0)

    def on_epoch_end(self, epoch, logs={}):
        if epoch % 10 == 0:
            self.log_weights(epoch)
        
    def log_weights(self, step):
        for layer in model.layers:
            try:
                experiment.log_histogram_3d(layer.get_weights()[0], 
                                            name=layer.name, step=step)
            except Exception as e:
#                 logger.debug("Logging weights error: " + str(e) + "\n")
                # Layer probably does not exist
                pass


class FreezeLayer(callbacks.Callback):
    def __init__(self, logs={}, patience=5, layer={'user_encoder':'embeddings_layer'}, verbose=1, set_to=False):
        super(FreezeLayer, self).__init__()
        self.freeze_epoch = patience
        self.freeze_layer = layer
        self.verbose = verbose
        self.set_to = set_to

    def on_epoch_begin(self, epoch, logs={}):
        if type(self.freeze_layer)==dict:
            submodel = model.get_layer(list(self.freeze_layer.keys())[0])
        else:
            submodel = model
        logging.debug("Trainable embeddings", submodel.get_layer(self.freeze_layer).trainable)
        if epoch == self.freeze_epoch:
            try:
                layer = submodel.get_layer(self.freeze_layer)
                old_value = layer.trainable
                layer.trainable = self.set_to
                # TODO: does this reset the optimizer? should I also compile the top-level model?
                model.compile(hyperparams['optimizer'], binary_crossentropy_custom,
                  metrics=[metrics_class.f1_m, metrics_class.precision_m, metrics_class.recall_m])
                if self.verbose:
                    logging.debug("Setting %s layer from %s to trainable=%s...\n" % (layer.name, old_value,
                                                                   submodel.get_layer(self.freeze_layer).trainable))
            except Exception as e:
                # layer probably does not exist
                pass

In [51]:
early_stopping_patience=50
def train_model(model, 
                data_generator_train, data_generator_valid,
                epochs, class_weight, start_epoch=0, workers=4,
                callback_list = [],
                model_path='/tmp/model',
               verbose=1):
    logging.info('Train...')
    experiment.log_parameter('class_weight', class_weight.values())
    experiment.log_parameter('callbacks', callbacks)

    history = model.fit_generator(data_generator_train,
                steps_per_epoch=100,
              epochs=epochs, initial_epoch=start_epoch, 
              class_weight=class_weight,
              validation_data=data_generator_valid,
                        verbose=verbose,
#               validation_split=0.3,
                       workers=workers,
            callbacks = [
#                 callbacks.ModelCheckpoint(filepath='%s_best' % model_path, verbose=1, 
#                                           save_best_only=True, save_weights_only=True),
                callbacks.EarlyStopping(patience=early_stopping_patience), *callback_list
            ])
    experiment.log_parameter('model_path', model_path)
    return model, history

In [None]:
%%time
model_path='models/seq_bert_user_selfharm'
freeze_layer = FreezeLayer(patience=hyperparams['freeze_patience'], set_to=not hyperparams['trainable_embeddings'])
weights_history = WeightsHistory()
reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=hyperparams['reduce_lr_factor'],
                          patience=hyperparams['reduce_lr_patience'], min_lr=0.000001, verbose=1)
if hyperparams['hierarchical']:
    data_generator_train = DataGeneratorHierarchical(user_level_data, subjects_split, set_type='train',
                                                    max_posts_per_user=hyperparams_features['posts_per_user'])
    data_generator_valid = DataGeneratorHierarchical(user_level_data, subjects_split, set_type='valid',
                                                    max_posts_per_user=hyperparams_features['posts_per_user'])
    model = build_hierarchical_model(hyperparams, hyperparams_features, embedding_matrix, emotions, stopword_list,
                    liwc_categories=[c for c in categories if c in writings_df.columns]
,
                   ignore_layer=hyperparams['ignore_layer'])
    initialize_vars(sess)
else:
    data_generator_train = DataGenerator(user_level_data, subjects_split, set_type='train',
                                                    max_posts_per_user=hyperparams_features['posts_per_user'])
    data_generator_valid = DataGenerator(user_level_data, subjects_split, set_type='valid',
                                                    max_posts_per_user=hyperparams_features['posts_per_user'])
    model = build_model(hyperparams, hyperparams_features, embedding_matrix, emotions, stopword_list,
                    liwc_categories=[c for c in categories if c in writings_df.columns]
,
                   ignore_layer=hyperparams['ignore_layer'])
    initialize_vars(sess)
# Note: FreezeLayer callback doesn't work with hierarchical architecture
model, history = train_model(model, data_generator_train, data_generator_valid,
                       epochs=50,
                      class_weight={0:0.5, 1:5}, start_epoch=0,
                      callback_list = [weights_history, reduce_lr],
                      model_path=model_path, workers=4)
model.save_weights(model_path)#, save_weights_only=True)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0306 17:36:06.756873 139706427881216 saver.py:1483] Saver not created because there are no variables in the graph to restore


Instructions for updating:
Use tf.cast instead.


W0306 17:36:16.394067 139706427881216 deprecation.py:323] From /usr/local/tensorflow/python3.5/1.13.1/lib/python3.5/site-packages/tensorflow/python/ops/math_ops.py:3066: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.
COMET INFO: Ignoring automatic log_parameter('verbose') because 'keras:verbose' is in COMET_LOGGING_PARAMETERS_IGNORE
COMET INFO: Ignoring automatic log_parameter('do_validation') because 'keras:do_validation' is in COMET_LOGGING_PARAMETERS_IGNORE


Epoch 1/50


COMET INFO: Ignoring automatic log_metric('batch_batch') because 'keras:batch_batch' is in COMET_LOGGING_METRICS_IGNORE
COMET INFO: Ignoring automatic log_metric('batch_size') because 'keras:batch_size' is in COMET_LOGGING_METRICS_IGNORE


Epoch 2/50

In [157]:
# TODO: properly extract the test data without sampling
model.evaluate(DataGeneratorHierarchical(user_level_data, subjects_split, 
                                         set_type='test', 
                                         max_posts_per_user=hyperparams_features['posts_per_user']))

ValueError: Error when checking model input: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 7 array(s), but instead got the following list of 6 arrays: [array([[[    0,     0,     0, ...,   132,     8,  1722],
        [    0,     0,     0, ...,    10,   223,   143],
        [    0,     0,     0, ...,     2,  1215,     3],
        ...,
        [    0,...

In [None]:
dependencies = {
    'f1_m': metrics_class.f1_m,
    'precision_m': metrics_class.precision_m,
    'recall_m': metrics_class.recall_m,
    'binary_crossentropy_custom': binary_crossentropy_custom
}
model = load_model('ham_bert_user_selfharm_best', custom_objects=dependencies)


In [None]:
pd.Series([v for v in model.get_layer('attention').get_weights()[0].flatten()]).rolling(50).mean().plot()

In [None]:
pd.Series([abs(v) for v in model.get_layer('output_layer').get_weights()[0].flatten()]).plot()

### Feature importance

In [None]:
features = [
    (e, 'nrc') for e in emotions] + ['pers_pronouns'] + [
    (c, 'liwc') for c in list(categories) if c in writings_df.columns] + [
(st, 'stopword') for st in stopword_list]
weights = model.get_layer('output_layer').get_weights()[0].tolist()[-(len(features)):]

print(len(weights), len(features))
feature_importance = {}
for (i, f) in enumerate(features):
    feature_importance[f] = weights[i][0]

sorted(feature_importance.items(), key=lambda t: abs(t[1]), reverse=True)

## Evaluate per user

In [None]:
def get_data_for_point(subject, voc, hyperparams_features=hyperparams_features, nrc_lexicon=nrc_lexicon,
                      emotions=emotions):
    eval_writings_df = writings_df[writings_df['subject']==subject]
    correct_label = eval_writings_df.label.values[0]
    (x_train, y_train), (x_valid, y_valid), (x_test, y_test), voc = load_erisk_data(eval_writings_df,
                        seq_len=hyperparams_features['maxlen'],
                        voc_size=hyperparams_features['max_features'],
                        emotion_lexicon=nrc_lexicon,
                        emotions=emotions, user_level=False,
                        train_prop=0.0, vocabulary=voc)
    return x_test, y_test, correct_label

In [None]:
def predict_per_user(writings_df, majority_prop=0.2, train_prop=0.7, majority_nr=0, validate=False, voc=None,
                    random=False, nr_slices=5, test_slice=2):
    all_predictions = []
    all_labels = []
    tp = 0
    tn = 0
    fp = 0
    fn = 0
    thresh=0.5
    majority_proportion=majority_prop
    valid_prop = 0.3
    
    if 'subset' in writings_df.columns:
        training_subjects = list(set(writings_df[writings_df['subset']=='train'].subject))
        test_subjects = list(set(writings_df[writings_df['subset']=='test'].subject))
    else:
        all_subjects = sorted(list(set(writings_df.subject)))
        training_subjects_size = int(len(all_subjects) * train_prop)
        test_subjects_size = len(all_subjects) - training_subjects_size
        # Cross-validation, with fixed slice as input
        test_prop = 1-train_prop
        test_slice = min(test_slice, nr_slices)
        logger.debug("start index: %f, from %f\n" % (
            len(all_subjects)*(1/nr_slices)*test_slice, test_prop*test_slice))
        start_slice = int(len(all_subjects)*(1/nr_slices)*test_slice)
        test_subjects = all_subjects[start_slice: start_slice+test_subjects_size]
        training_subjects = [s for s in all_subjects if s not in test_subjects]
    training_subjects = sorted(training_subjects) # ensuring reproducibility
    valid_subjects_size = int(len(training_subjects) * valid_prop)
    valid_subjects = training_subjects[:valid_subjects_size]
    training_subjects = training_subjects[valid_subjects_size:]
    
    if validate:
        subjects = valid_subjects
    else:
        subjects = test_subjects
    for subject in subjects:
        x_test_user, y_test_user, label = get_data_for_point(subject, voc=voc)
        outputs = model.predict(x_test_user)
        if random:
            sigma = np.std(outputs)
            mu = np.mean(outputs)
            print("generating random outputs with sigma", sigma, "and mu", mu)
            outputs = sigma*np.random.randn(len(outputs))+mu
        positive_pred = sum(outputs>=thresh)
        negative_pred = sum(outputs<thresh)
        majority_pred = 0
        if majority_proportion and positive_pred >= majority_proportion*negative_pred:
            majority_pred = 1
        if majority_nr and positive_pred>=majority_nr:
            majority_pred = 1
        if label == 1:
            if majority_pred == 1:
                tp+=1
            else:
                fn+=1
        else:
            if majority_pred == 0:
                tn+=1
            else:
                fp+=1
        print(negative_pred, positive_pred, majority_pred)
        all_predictions.append(majority_pred)
        all_labels.append(label)
    def prec_recall_f1(tp, fp, tn, fn):
        recall = tp/(tp+fn+0.0000001)
        precision = tp/(tp+fp+0.0000001)
        f1 = 2*precision*recall/(precision+recall+0.0000001)
        print("Recall", recall, "Precision", precision, "F1", f1)
    if majority_prop:
        print("Vote proportion", majority_prop)
    if majority_nr:
        print("Vote points", majority_nr)
    prec_recall_f1(tp, fp, tn, fn)

        

In [None]:
predict_per_user(writings_df=writings_df, voc=voc, majority_prop=0.2)

## Cross-validation

In [None]:
results_per_slice = {}

In [None]:
nr_slices=5
logger.setLevel(logging.INFO)
for tslice in range(nr_slices): 
    (x_train, y_train), (x_valid, y_valid), (x_test, y_test), voc = load_erisk_data(writings_df, 
                                                                seq_len=hyperparams_features['maxlen'],
                                                                voc_size=hyperparams_features['max_features'],
                                                               emotion_lexicon=nrc_lexicon,
                                                               emotions=emotions,
                                                               user_level=hyperparams_features['user_level'],
                                                                                    test_slice=tslice,
                                                                                    nr_slices=nr_slices,
    #                                                            vocabulary=pickle.load(open('vocabulary20K_selfharm.pkl', 'rb'))
                                                                                   logger=logger)
    model, history = train_model(model, x_train, y_train, x_valid, y_valid,
           epochs=200, batch_size=hyperparams['batch_size'],
                      class_weight={0:0.5, 1:5}, start_epoch=0,
                      callback_list = [freeze_layer, weights_history, reduce_lr],
                      workers=2, verbose=0)
    results_per_slice[tslice] = model.evaluate(x_test, y_test)
    logger.info("Results for slice %d: %s\n" % (tslice, results_per_slice[tslice]))

In [None]:
print("Average F1 score: ", np.array([results_per_slice[s][1] for s in results_per_slice.keys()]).mean(),
     "all F1 scores: ", {s: v[1] for (s,v) in results_per_slice.items()} )

## Extra analysis


In [None]:
def merge_tokens(row):
    tokens = []
    if row.tokenized_text:
        tokens += row.tokenized_text
    if row.tokenized_title:
        tokens += row.tokenized_title
    return tokens
writings_df['all_tokens'] = writings_df.apply (lambda row: merge_tokens(row), axis=1)

In [None]:
# TODO: include the title
def extract_emotions(tokens, emotion, relative=True):
    if not tokens:
        return None
    emotion_words = [t for t in tokens 
                     if t in nrc_lexicon[emotion]]
    if relative:
        return len(emotion_words) / len(tokens)
    else:
        return len(emotion_words)
    
    return encoded_emotions

from functools import partial
for emotion in emotions:
    writings_df[emotion] = writings_df['all_tokens'].apply(partial(extract_emotions, emotion=emotion, 
                                                                   relative=True))


In [None]:
writings_df['pronouns'] = writings_df['all_tokens'].apply(partial(encode_pronouns, relative=True))

In [None]:
writings_df[['text', 'label', 'pronouns', 'text_len'] + emotions].corr()

In [None]:
writings_df[['text', 'label', 'pronouns', 'text_len'] + emotions].groupby('label').mean()

In [None]:
from nltk.sentiment import SentimentAnalyzer, SentimentIntensityAnalyzer

In [None]:
sid = SentimentIntensityAnalyzer()


In [None]:
sid.polarity_scores("We are here today happiness is all around")

In [None]:
writings_df['neg_vader'] = writings_df.text.apply(lambda t: sid.polarity_scores(t)['neg']
                                                 if type(t)==str else 0)

In [None]:
writings_df

In [None]:
writings_df['pos_vader'] = writings_df.text.apply(lambda t: sid.polarity_scores(t)['pos']
                                                 if type(t)==str else 0)

In [None]:
writings_df[['text', 'label', 'pronouns', 'text_len', 'neg_vader', 'pos_vader'] + emotions].groupby('label').mean()

In [None]:
writings_df[['text', 'label', 'pronouns', 'text_len', 'neg_vader', 'pos_vader'] + emotions].corr('spearman')

### LIWC

In [None]:
from liwc_readDict import readDict

liwc = readDict('/home/ana/resources/FakeOrFact/features/LIWC/LIWC/liwc.dic')

In [None]:
categories = [c for (w,c) in liwc]
set(categories)

In [None]:
liwc

In [None]:
liwc_dict = {}
for (w, c) in liwc:
    if c not in liwc_dict:
        liwc_dict[c] = []
    liwc_dict[c].append(w)


In [None]:
liwc_dict['pronoun']

In [None]:
def encode_liwc_categories(tokens, category_words, relative=True):
    category_cnt = 0
    if not tokens:
        return None
    text_len = len(tokens)
    for t in tokens:
        for word in category_words:
            if t==word or (word[-1]=='*' and t.startswith(word[:-1])) \
            or (t==word.split("'")[0]):
                category_cnt += 1
                break # one token cannot belong to more than one word in the category
    if relative:
        return category_cnt/text_len
    else:
        return category_cnt

In [None]:
%%time
from functools import partial
# for categ in ['negemo', 'posemo', 'affect', 'sad', 'anx', 'pronoun']:#liwc_dict.keys():
for categ in liwc_dict.keys():
    if categ in writings_df.columns:
        continue
    print("Encoding for category %s..." % categ)
    writings_df[categ] = writings_df['all_tokens'].apply(partial(encode_liwc_categories, 
                                                                   category_words=liwc_dict[categ], 
                                                                   relative=True))


In [None]:
writings_df.groupby('subject').mean()[['label', 'negemo', 'posemo', 'affect', 'sad', 'anx', 'pronoun']].corr()

In [None]:
writings_df[['label', 'negemo', 'posemo', 'affect', 'sad', 'anx', 'pronoun']].groupby('label').mean()

In [None]:
writings_df.groupby('subject').mean()[['label'] + categories].corr()

## Hyperparameter tuning

In [None]:
# Declare your hyperparameters search:
tune_epochs=150
config = {
      "algorithm": "random",
      "parameters": {
          "lstm_units": {"type": "integer", "min": 10, "max": 1000},
          "dense_bow_units": {"type": "integer", "min": 1, "max": 50},
          "lr": {"type": "float", "min": 0.00001, "max": 0.5, "scalingType": "loguniform"},
          "l2_dense": {"type": "float", "min": 0.0000001, "max": 0.05, "scalingType": "loguniform"},
          "l2_embeddings": {"type": "float", "min": 0.0000001, "max": 0.05, "scalingType": "loguniform"},
          "dropout": {"type": "float", "min": 0, "max": 0.7, "scalingType": "uniform"},
          "norm_momentum": {"type": "float", "min": 0.01, "max": 0.99, "scalingType": "uniform"},
          "optimizer": {"type": "categorical", "values": ["adam", "adagrad", ""]},
          "batch_size": {"type": "integer", "min": 10, "max": 512, "scalingType": "loguniform"},
          "positive_class_weight": {"type": "integer", "min": 1, "max": 25},
          "trainable_embeddings": {"type": "discrete", "values": [True, False]},
          "freeze_patience": {"type": "integer", "min": 2, "max": tune_epochs+1},
          "lr_reduce_factor": {"type": "float", "min": 0.0001, "max": 0.8},
          "lr_reduce_patience": {"type": "integer", "min": 2, "max": tune_epochs+1},
          "decay": {"type": "float", "min": 0.00000001, "max": 0.5, "scalingType": "loguniform"},
          "ignore_layers_values": {"type": "categorical", "values": ["attention", "batchnorm", ""]}
      },
      "spec": {
          "metric": "loss",
          "objective": "minimize",
      },
  }
optimizer = Optimizer(config, api_key="eoBdVyznAhfg3bK9pZ58ZSXfv")

for experiment in optimizer.get_experiments(project_name="mental"):
    experiment.add_tag("tune")
    
    # Test the model
    hyperparams_config = {
        param: experiment.get_parameter(param) for param in config['parameters'].keys()}
    if not hyperparams_config['optimizer']:
        hyperparams_config['optimizer'] = optimizers.Adam(lr=hyperparams_config['lr'], 
                                   decay=hyperparams_config['decay'])
    hyperparams_config["ignore_layers"] = []
    if hyperparams_config["ignore_layers_values"]:
        hyperparams_config["ignore_layers"] = [hyperparams_config["ignore_layers_values"]]
    model = build_model(hyperparams=hyperparams_config,
                        hyperparams_features=hyperparams_features, 
                        embedding_matrix=embedding_matrix, emotions=emotions,
                       stopwords_list=stopword_list, liwc_categories=categories)
    freeze_layer = FreezeLayer(patience=experiment.get_parameter('freeze_patience'),
                              set_to=not experiment.get_parameter('trainable_embeddings'))
    reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_loss', 
                                            factor=experiment.get_parameter('lr_reduce_factor'),
                                            patience=experiment.get_parameter('lr_reduce_patience'), 
                                            min_lr=0.000001, verbose=1)
    model, history = train_model(model, 
            x_train, y_train, x_test, y_test,
            epochs=tune_epochs, batch_size=experiment.get_parameter('batch_size'),
                      class_weight={0:1, 1:experiment.get_parameter('positive_class_weight')}, 
                          workers=2,
                          callback_list = [freeze_layer, reduce_lr],
                      model_path='models/experiment')
    loss = history.history['loss'][-1]
    
    # Report the loss, if not auto-logged:
    experiment.log_metric("loss", loss)