In [20]:
import pandas as pd
import xml.etree.ElementTree as ET
import glob, os
import numpy as np
from comet_ml import Experiment, Optimizer
import pickle
import logging
import sys
from sklearn.utils import class_weight

In [21]:
import tensorflow as tf

In [22]:
os.environ['TF_KERAS'] = '1'
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Lambda, BatchNormalization, TimeDistributed, \
     Bidirectional, Input, concatenate, Flatten, RepeatVector, Activation, Multiply, Permute#, CuDNNLSTM
from tensorflow.keras import regularizers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import callbacks, optimizers
from tensorflow.keras import backend as K
from tensorflow.keras.utils import plot_model, Sequence

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

import tensorflow as tf

In [23]:
import tensorflow as tf
tf.test.is_gpu_available()

True

In [24]:
my_seed = 1234
# tf.set_random_seed(my_seed)

In [25]:
logger = logging.getLogger('training')
logger.addHandler(logging.StreamHandler(sys.stdout))
logger.setLevel(logging.DEBUG)

# Read data

In [26]:
def read_subject_writings(subject_file):
    writings = []
    with open(subject_file) as sf:
        contents = sf.read()
        root = ET.fromstring(contents)
        try:
            subject = root.findall('ID')[0].text.strip()
        except Exception:
            print('Cannot extract ID', contents[:500], '\n-------\n')        
        for w in root.iter('WRITING'):
            subject_writings = {'subject': subject}
            for title in w.findall('TITLE'):
                subject_writings['title'] = title.text
            for text in w.findall('TEXT'):
                subject_writings['text'] = text.text
            for date in w.findall('DATE'):
                subject_writings['date'] = date.text
            writings.append(subject_writings)
    return writings

In [27]:
# root_dir = '/home/anasab/' 
root_dir = '/home/anasab/'

In [28]:
datadir_T2 = root_dir + '/eRisk/data/eRisk2020_T2/eRisk2020_T2_TRAINING_DATA/'
labels_file_T2 = root_dir + '/eRisk/data/eRisk2020_T2/eRisk2020_T2_TRAINING_DATA/Depression Questionnaires_anon.txt'
nr_questions = 21

In [29]:
def read_texts(datadir_T2,
                labels_file_T2):
    writings = []
    writings_df = pd.DataFrame()
    labels_df = pd.DataFrame()

        
    for subject_file in os.listdir(datadir_T2):
        if not subject_file.startswith('subject'):
            continue
        writings.extend(read_subject_writings(os.path.join(datadir_T2, subject_file)))
    writings_df = pd.DataFrame(writings)
    
    labels_df = pd.read_csv(os.path.join(labels_file_T2), 
                                 delimiter='\s+', names=['subject'] + ['label%i' % i for i in range(nr_questions)])

    labels_df = labels_df.set_index('subject')
    
    writings_df = writings_df.join(labels_df, on='subject')
    
    return writings_df, labels_df

In [30]:
# writings_df, labels_df = read_texts(datadir_T2, labels_file_T2)
writings_df = pickle.load(open('writings_df_T2_liwc.pkl', 'rb'))

In [31]:
writings_df.groupby('subject').count()

Unnamed: 0_level_0,title,text,date,label0,label1,label2,label3,label4,label5,label6,...,feel,excl,future,nonfl,ppron,shehe,i,we,you,they
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
subject1272,120,120,120,120,120,120,120,120,120,120,...,120,120,120,120,120,120,120,120,120,120
subject2341,129,129,129,129,129,129,129,129,129,129,...,129,129,129,129,129,129,129,129,129,129
subject2432,332,332,332,332,332,332,332,332,332,332,...,332,332,332,332,332,332,332,332,332,332
subject2827,663,663,663,663,663,663,663,663,663,663,...,659,659,659,659,659,659,659,659,659,659
subject2903,313,313,313,313,313,313,313,313,313,313,...,313,313,313,313,313,313,313,313,313,313
subject2961,180,180,180,180,180,180,180,180,180,180,...,180,180,180,180,180,180,180,180,180,180
subject3707,1022,1022,1022,1022,1022,1022,1022,1022,1022,1022,...,1022,1022,1022,1022,1022,1022,1022,1022,1022,1022
subject3993,1510,1510,1510,1510,1510,1510,1510,1510,1510,1510,...,1509,1509,1509,1509,1509,1509,1509,1509,1509,1509
subject4058,1028,1028,1028,1028,1028,1028,1028,1028,1028,1028,...,1028,1028,1028,1028,1028,1028,1028,1028,1028,1028
subject436,29,29,29,29,29,29,29,29,29,29,...,29,29,29,29,29,29,29,29,29,29


In [32]:
writings_df

Unnamed: 0,subject,title,text,date,label0,label1,label2,label3,label4,label5,...,feel,excl,future,nonfl,ppron,shehe,i,we,you,they
0,subject5791,,"Great, thanks a ton!",2018-10-30 17:35:30,1,0,1,1,0,0,...,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0
1,subject5791,The search button gives me a 404 error,Just downloaded GBA4ios 2.1 and when I go to ...,2018-10-30 17:19:41,1,0,1,1,0,0,...,0.0,0.074468,0.074468,0.0,0.063830,0.0,0.063830,0.0,0.000000,0.0
2,subject5791,,Remindme! 1 week,2018-10-30 14:33:49,1,0,1,1,0,0,...,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0
3,subject5791,,Me too please,2018-10-19 18:06:38,1,0,1,1,0,0,...,0.0,0.000000,0.000000,0.0,0.333333,0.0,0.333333,0.0,0.000000,0.0
4,subject5791,,Any chance you can pm me what this spoiler is...,2018-10-19 18:04:14,1,0,1,1,0,0,...,0.0,0.027778,0.111111,0.0,0.138889,0.0,0.111111,0.0,0.027778,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10936,subject3993,Alternative Currency Being Considered in Penn...,,2009-01-07 18:41:30,0,0,0,0,0,0,...,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0
10937,subject3993,Asus' new keyboard. Oh wait... thats not a ke...,,2009-01-07 17:13:53,0,0,0,0,0,0,...,0.0,0.076923,0.076923,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0
10938,subject3993,Homeland Security USA - tripe to entertain mo...,,2009-01-07 07:09:19,0,0,0,0,0,0,...,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0
10939,subject3993,10 dead as Israeli missile hits near U.N. sch...,,2009-01-06 17:15:24,0,0,0,0,0,0,...,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0


## Preprocess text

In [33]:
tokenizer = RegexpTokenizer(r'\w+')

def tokenize(t):
    return tokenizer.tokenize(t.lower())

In [34]:
def tokenize_fields(writings_df):
    writings_df['tokenized_title'] = writings_df['title'].apply(lambda t: tokenize(t) 
                                                                if type(t)==str and t else None)
    writings_df['title_len'] = writings_df['tokenized_title'].apply(lambda t: len(t) 
                                                                    if type(t)==list and t else None)
    writings_df['tokenized_text'] = writings_df['text'].apply(lambda t: tokenize(t) 
                                                              if type(t)==str and t else None)
    writings_df['text_len'] = writings_df['tokenized_text'].apply(lambda t: len(t) 
                                                                  if type(t)==list and t else None)
    return writings_df

In [35]:
writings_df = tokenize_fields(writings_df)

In [36]:
writings_df.text_len.describe()

count    10409.000000
mean        50.365069
std         84.811676
min          1.000000
25%          9.000000
50%         24.000000
75%         54.000000
max       1567.000000
Name: text_len, dtype: float64

In [37]:
writings_df.title_len.describe()

count    1119.000000
mean       11.246649
std         6.979392
min         1.000000
25%         6.000000
50%        10.000000
75%        15.000000
max        51.000000
Name: title_len, dtype: float64

In [38]:
writings_df.groupby('subject').count().title.describe()

count      20.000000
mean      547.050000
std       446.144828
min        29.000000
25%       180.000000
50%       327.500000
75%      1006.250000
max      1510.000000
Name: title, dtype: float64

In [39]:
writings_df.groupby('subject').count().text.describe()

count      20.000000
mean      547.050000
std       446.144828
min        29.000000
25%       180.000000
50%       327.500000
75%      1006.250000
max      1510.000000
Name: text, dtype: float64

# Extract features

In [40]:
hyperparams_features = {
    "max_features": 20000,
    # cut texts after this number of words
    # (among top max_features most common words)
    "maxlen": 100,
    "embedding_dim": 50,
    "user_level": True,
    "posts_per_user": 10,
    "batch_size": 1,
}

#### Emotions

In [41]:
def load_NRC(nrc_path):
    word_emotions = {}
    emotion_words = {}
    with open(nrc_path) as in_f:
        for line in in_f:
            line = line.strip()
            if not line:
                continue
            word, emotion, label = line.split()
            if word not in word_emotions:
                word_emotions[word] = set()
            if emotion not in emotion_words:
                emotion_words[emotion] = set()
            label = int(label)
            if label:
                word_emotions[word].add(emotion)
                emotion_words[emotion].add(word)
    return emotion_words

nrc_lexicon_path = root_dir + '/resources/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt'
nrc_lexicon = load_NRC(nrc_lexicon_path)
emotions = list(nrc_lexicon.keys())


In [42]:
def encode_emotions(tokens, emotion_lexicon, emotions, relative=True):
    text_len = len(tokens)
    encoded_emotions = [0 for e in emotions]
    for i, emotion in enumerate(emotions):
        try:
            emotion_words = [t for t in tokens if t in emotion_lexicon[emotion]]
            if relative:
                encoded_emotions[i] = len(emotion_words) / len(tokens)
            else:
                encoded_emotions[i] = len(emotion_words)
        except ValueError:
            print("Emotion not found.")
    return encoded_emotions

In [43]:
from liwc_readDict import readDict

liwc = readDict(root_dir + '/resources/liwc.dic')

categories = set([c for (w,c) in liwc])
len(categories)

64

#### Personal pronouns

In [44]:
first_person_pronouns = {"i", "me", "my", "mine", "myself"}
def encode_pronouns(tokens, pronouns={"i", "me", "my", "mine", "myself"}, relative=True):
    if not tokens:
        return np.nan
    text_len = len(tokens)
    nr_pronouns = len([t for t in tokens if t in pronouns])
    if relative:
        return nr_pronouns/text_len
    else:
        return nr_pronouns

#### Stopwords

In [45]:
stopword_list = stopwords.words("english")
def encode_stopwords(tokens, stopwords=stopword_list):
    encoded_stopwords = [0 for s in stopword_list]
    if not tokens:
        return encoded_stopwords
    for i, stopword in enumerate(stopwords):
        if stopword in tokens:
            encoded_stopwords[i] += 1
    return encoded_stopwords

### BERT

In [46]:
# from easybert import Bert
# bert = Bert("https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1")

In [47]:
# x = bert.embed("A sequence of words is a sequebce.")

In [48]:
# def encode_bert(sequence):
#     return bert.embed(sequence)

## Simple transformers


In [49]:
# import simpletransformers

In [50]:
# prefix='/home/anasab/eRisk/data/'
# train_df = pd.read_csv(prefix + 'train.csv', header=None)
# train_df.head()

# eval_df = pd.read_csv(prefix + 'test.csv', header=None)
# eval_df.head()

# train_df[0] = (train_df[0] == 2).astype(int)
# eval_df[0] = (eval_df[0] == 2).astype(int)

# train_df = pd.DataFrame({
#     'text': train_df[1].replace(r'\n', ' ', regex=True),
#     'label':train_df[0]
# })

# print(train_df.head())

# eval_df = pd.DataFrame({
#     'text': eval_df[1].replace(r'\n', ' ', regex=True),
#     'label':eval_df[0]
# })

# print(eval_df.head())

In [51]:
# from simpletransformers.classification import ClassificationModel


# # Create a TransformerModel
# model = ClassificationModel('roberta', 'roberta-base')

# # Train the model
# model.train_model(train_df)

# # Evaluate the model
# result, model_outputs, wrong_predictions = model.eval_model(eval_df)

In [52]:
# from bert import albert_tokenization
# from bert import bert_tokenization

In [53]:
import tensorflow_hub as hub
# bert_path = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"
bert_path = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1"
import bert
from bert.tokenization.bert_tokenization import FullTokenizer

In [54]:
bert_layer = hub.KerasLayer(bert_path,# signature='tokens' , signature_outputs_as_dict=True,
                            trainable=True)


In [55]:
class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Constructs a InputExample.
    Args:
      guid: Unique id for the example.
      text_a: string. The untokenized text of the first sequence. For single
        sequence tasks, only this sequence must be specified.
      text_b: (Optional) string. The untokenized text of the second sequence.
        Only must be specified for sequence pair tasks.
      label: (Optional) string. The label of the example. This should be
        specified for train and dev examples, but not for test examples.
    """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label

In [56]:
def encode_text_for_bert(tokenizer, example, max_seq_length=256):
    """Converts a single `InputExample` into a single `InputFeatures`."""

#     if isinstance(example, PaddingInputExample):
#         input_ids = [0] * max_seq_length
#         input_mask = [0] * max_seq_length
#         segment_ids = [0] * max_seq_length
#         label = 0
#         return input_ids, input_mask, segment_ids, label

    tokens_a = tokenizer.tokenize(example.text_a)
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0 : (max_seq_length - 2)]

    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    return input_ids, input_mask, segment_ids, example.label


In [57]:
def create_tokenizer_from_hub_module():
    """Get the vocab file and casing info from the Hub module."""
#     bert_module =  hub.Module(bert_path)
#     tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
    vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
    do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()

#     vocab_file, do_lower_case = sess.run(
#         [
#             tokenization_info["vocab_file"],
#             tokenization_info["do_lower_case"],
#         ]
#     )

    return FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

In [58]:
# Instantiate tokenizer
# bert_tokenizer = FullTokenizer()
bert_tokenizer = create_tokenizer_from_hub_module()

encode_text_for_bert(bert_tokenizer, InputExample(None, 
                                               "Ana are mere"), 
                       hyperparams_features['maxlen'])

([101,
  9617,
  2024,
  8210,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
 

### tfhub albert

In [59]:
# input_ids = tf.keras.layers.Input(shape=[None], dtype=tf.int32)
# input_mask = tf.keras.layers.Input(shape=[None], dtype=tf.int32)
# sequence_mask = tf.keras.layers.Input(shape=[None], dtype=tf.int32)

# albert = hub.KerasLayer(
#     "https://tfhub.dev/google/albert_xlarge/3",
#     trainable=True,
#     signature="tokens",
#     output_key="pooled_output",
# )

# features = {
#     "input_ids": input_ids,
#     "input_mask": input_mask,
#     "segment_ids": sequence_mask,
# }
# out = albert(features)
# model = tf.keras.Model(inputs=[input_ids, input_mask, sequence_mask], outputs=out)
# model.compile("adam", loss="sparse_categorical_crossentropy")
# model.summary()

### Encode data

In [60]:
from collections import Counter
def encode_labels(labels):
    '''Convert ia to i and ib to -i'''
    encoded_labels = []
    for i, l in enumerate(labels):
        try:
            encoded_labels.append(int(l))
        except Exception as e:
            logger.debug("Encoding label %s\n" % l)
        
            if str(l)[-1] == 'a':
                encoded_labels.append(int(l[0]))
            elif str(l)[-1] == 'b':
                encoded_labels.append(-int(l[0]))
            else:
                logger.warning("Coult not encode label %s\n" % l)
    return encoded_labels

def load_erisk_data(writings_df, voc_size, emotion_lexicon, seq_len, emotions =  
                    ['anger', 'anticipation', 'disgust', 'fear', 'joy', 
                     'negative', 'positive', 'sadness', 'surprise', 'trust'],
                    liwc_categories = categories, ignore_features=[],
                    pronouns = ["i", "me", "my", "mine", "myself"],
                    train_prop=0.7, valid_prop=0.3, test_slice=2,
                    nr_slices=5,
                    min_post_len=3, min_word_len=1, 
                    user_level=True, vocabulary=None,
                   logger=logger):
    logger.debug("Loading data...\n")
    if not vocabulary:
        vocabulary = {}
        word_freqs = Counter()
        for words in writings_df.tokenized_text:
            word_freqs.update(words)
        for words in writings_df.tokenized_title:
            word_freqs.update(words)
        i = 1
        for w, f in word_freqs.most_common(voc_size-2): # keeping voc_size-1 for unk
            if len(w) < min_word_len:
                continue
            vocabulary[w] = i
            i += 1

    if 'subset' in writings_df.columns:
        training_subjects = list(set(writings_df[writings_df['subset']=='train'].subject))
        test_subjects = list(set(writings_df[writings_df['subset']=='test'].subject))
    else:
        all_subjects = sorted(list(set(writings_df.subject)))
        training_subjects_size = int(len(all_subjects) * train_prop)
        test_subjects_size = len(all_subjects) - training_subjects_size
        # Cross-validation, with fixed slice as input
        test_prop = 1-train_prop
        test_slice = min(test_slice, nr_slices)
        logger.debug("start index: %f, from %f\n" % (
            len(all_subjects)*(1/nr_slices)*test_slice, test_prop*test_slice))
        start_slice = int(len(all_subjects)*(1/nr_slices)*test_slice)
        test_subjects = all_subjects[start_slice: start_slice+test_subjects_size]
        training_subjects = [s for s in all_subjects if s not in test_subjects]
    training_subjects = sorted(training_subjects) # ensuring reproducibility
    valid_subjects_size = int(len(training_subjects) * valid_prop)
    valid_subjects = training_subjects[:valid_subjects_size]
    training_subjects = training_subjects[valid_subjects_size:]
    categories = [c for c in liwc_categories if c in writings_df.columns]
    logger.debug("%d training users, %d validation users, %d test users." % (
        len(training_subjects), 
          len(valid_subjects),
          len(test_subjects)))
    subjects_split = {'train': training_subjects, 
                      'valid': valid_subjects, 
                      'test': test_subjects}

    user_level_texts = {}
    for row in writings_df.sort_values(by='date').itertuples():
        words = []
        raw_text = ""
        if row.tokenized_title:
            words.extend(row.tokenized_title)
            raw_text += row.title
        if row.tokenized_text:
            words.extend(row.tokenized_text)
            raw_text += row.text
        if not words or len(words)<min_post_len:
            print(row.subject)
            continue
        labels = [getattr(row, 'label%d'%i) for i in range(nr_questions)]
        liwc_categs = [getattr(row, categ) for categ in categories]
        if row.subject not in user_level_texts.keys():
            user_level_texts[row.subject] = {}
            user_level_texts[row.subject]['texts'] = [words]
            user_level_texts[row.subject]['labels'] = encode_labels(labels)
            user_level_texts[row.subject]['liwc'] = [liwc_categs]
            user_level_texts[row.subject]['raw'] = [raw_text]
        else:
            user_level_texts[row.subject]['texts'].append(words)
            user_level_texts[row.subject]['liwc'].append(liwc_categs)
            user_level_texts[row.subject]['raw'].append(raw_text)

    return user_level_texts, subjects_split, vocabulary


In [61]:
user_level_data, subjects_split, vocabulary = load_erisk_data(writings_df, 
                                                            seq_len=hyperparams_features['maxlen'],
                                                            voc_size=hyperparams_features['max_features'],
                                                           emotion_lexicon=nrc_lexicon,
                                                           emotions=emotions,
                                                           user_level=hyperparams_features['user_level'],
                                                                                logger=logger
#                                                            vocabulary=pickle.load(open('vocabulary20K_selfharm.pkl', 'rb'))
                                                                               )

Loading data...

Loading data...



DEBUG:training:Loading data...



start index: 8.000000, from 0.600000

start index: 8.000000, from 0.600000



DEBUG:training:start index: 8.000000, from 0.600000



10 training users, 4 validation users, 6 test users.
10 training users, 4 validation users, 6 test users.


DEBUG:training:10 training users, 4 validation users, 6 test users.


subject3993
subject3993
subject3993
subject9798
subject9798
subject9798
subject9798
subject9798
subject9798
subject9798
subject9798
subject2903
subject9798
subject6619
subject2903
subject2903
subject9798
subject6619
subject6619
subject6619
subject6619
subject6619
subject6619
subject6619
subject6619
subject6619
subject6619
subject3993
subject5791
subject6619
subject5791
subject9798
subject6619
subject9798
subject6619
subject6619
subject6619
subject6619
subject6619
subject6619
subject5791
subject7039
subject5791
subject5791
subject6619
subject6619
subject6619
subject6619
subject6619
subject6635
subject6619
subject6635
subject6619
subject6635
subject6635
subject7039
subject3993
subject6635
subject6619
subject7039
subject9694
subject6619
subject7039
subject7039
subject7039
subject7039
subject7039
subject7039
subject7039
subject7039
subject7039
subject6619
subject7039
subject9694
subject6635
subject6635
subject7039
subject9694
subject9694
subject9694
subject9694
subject9694
subject7039
subj

subject6900
subject9798
subject2903
subject2903


In [62]:
def load_embeddings(path, embedding_dim, voc):
    # random matrix with mean value = 0
    embedding_matrix = np.random.random((len(voc)+2, embedding_dim)) - 0.5 # voc + unk + pad value(0)

    f = open(path)
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_i = voc.get(word)
        if word_i is not None:
            embedding_matrix[word_i] = coefs
    f.close()

    print('Total %s word vectors.' % len(embedding_matrix))

 
    return embedding_matrix

pretrained_embeddings_path = root_dir + '/resources/glove.twitter.27B/glove.twitter.27B.%dd.txt' % hyperparams_features['embedding_dim']
embedding_matrix = load_embeddings(pretrained_embeddings_path, hyperparams_features['embedding_dim'], vocabulary)


Total 20000 word vectors.


## Data Generator

In [63]:
class DataGenerator(Sequence):
    'Generates data for Keras'
    def __init__(self, user_level_data, subjects_split, set_type='train', bert_tokenizer=bert_tokenizer,
                 batch_size=hyperparams_features['batch_size'], seq_len=hyperparams_features['maxlen'], 
                 voc_size=hyperparams_features['max_features'], emotion_lexicon=nrc_lexicon,
                 emotions=emotions, pronouns=["i", "me", "my", "mine", "myself"], 
                 max_posts_per_user=hyperparams_features['posts_per_user'],
                 shuffle=True):
        'Initialization'
        self.seq_len = seq_len
        self.bert_tokenizer = bert_tokenizer
        self.subjects_split = subjects_split
        self.set = set_type
        self.emotion_lexicon = emotion_lexicon
        self.batch_size = batch_size
        self.data = user_level_data
        self.emotions = emotions
        self.pronouns = pronouns
        self.shuffle = shuffle
        self.voc_size = voc_size
        self.max_posts_per_user = max_posts_per_user
        self.on_epoch_end()

    def __encode_text(self, tokens, raw_text):
        # Using voc_size-1 value for OOV token
        encoded_tokens = [vocabulary.get(w, self.voc_size-1) for w in tokens]
        encoded_emotions = encode_emotions(tokens, self.emotion_lexicon, self.emotions)
        encoded_pronouns = encode_pronouns(tokens, self.pronouns)
        encoded_stopwords = encode_stopwords(tokens)
        bert_ids, bert_masks, bert_segments, label = encode_text_for_bert(self.bert_tokenizer, InputExample(None, 
                                               raw_text), self.seq_len)
        return (encoded_tokens, encoded_emotions, encoded_pronouns, encoded_stopwords,
               bert_ids, bert_masks, bert_segments)
        
    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.subjects_split[self.set]) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        user_indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        # Find users
        users = [self.subjects_split[self.set][i] for i in user_indexes
                    if self.subjects_split[self.set][i] in self.data.keys()] # TODO: maybe needs a warning that user is missing

        post_indexes = {}
        # Sample post ids
        for subject in users:
            posts_len = len(self.data[subject]['texts'])
            posts_index_sample = sorted(np.random.choice(posts_len, 
                                                         min(self.max_posts_per_user, posts_len),
                                                         replace=False))
            post_indexes[subject] = posts_index_sample
        # Generate data
        X, y = self.__data_generation(users, post_indexes)

        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.subjects_split[self.set]))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, users, post_indexes):
        'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
        tokens_data = []
        categ_data = []
        sparse_data = []
        subjects = []
        bert_ids_data = []
        bert_masks_data = []
        bert_segments_data = []
        labels = []

        for subject in users:
            texts = self.data[subject]['texts']
            raw_texts = self.data[subject]['raw']
            label = self.data[subject]['label']
            liwc_scores = self.data[subject]['liwc']
            
            # Sample
            texts = [texts[i] for i in post_indexes[subject]]
            liwc_selection = [liwc_scores[i] for i in post_indexes[subject]]
            raw_texts = [raw_texts[i] for i in post_indexes[subject]]
            
            all_words = [sum(texts, [])] # merge all texts in one list -- Ok, why sum?? this is wrong!!
            liwc_aggreg = [np.array(liwc_selection).mean(axis=0).tolist()]
            all_raw_texts = [" ".join(raw_texts)]
            
            for i, words in enumerate(all_words):
                encoded_tokens, encoded_emotions, encoded_pronouns, encoded_stopwords, \
                    bert_ids, bert_masks, bert_segments = self.__encode_text(words, all_raw_texts[i])
                subject_id = int(subject.split('t')[1])
                tokens_data.append(encoded_tokens)
                categ_data.append(encoded_emotions + [encoded_pronouns] + liwc_aggreg[i])
                sparse_data.append(encoded_stopwords)
                bert_ids_data.append(bert_ids)
                bert_masks_data.append(bert_masks)
                bert_segments_data.append(bert_segments)
                
                labels.append(label)
                subjects.append(subject_id)

        
        # using zeros for padding
        tokens_data_padded = sequence.pad_sequences(tokens_data, maxlen=self.seq_len)

        return ([np.array(tokens_data_padded), np.array(categ_data), np.array(sparse_data),
                 np.array(bert_ids_data), np.array(bert_masks_data), np.array(bert_segments_data),
                np.array(subjects)],
                np.array(labels))

In [64]:
class DataGenerator(Sequence):
    'Generates data for Keras'
    def __init__(self, user_level_data, subjects_split, batch_size=hyperparams_features['batch_size'], 
                 seq_len=hyperparams_features['maxlen'], voc_size=hyperparams_features['max_features'], 
                 emotion_lexicon=nrc_lexicon, set_type='train', test_user_indexes=[0],
                 emotions=emotions, pronouns=["i", "me", "my", "mine", "myself"], 
                 max_posts_per_user=hyperparams_features['posts_per_user'],
                 bert_tokenizer=bert_tokenizer,
                 shuffle=True):
        'Initialization'
        self.seq_len = seq_len
        self.emotion_lexicon = emotion_lexicon
        self.bert_tokenizer = bert_tokenizer
        self.batch_size = batch_size
        self.data = user_level_data
        self.all_users = list(self.data.keys())
        self.emotions = emotions
        self.pronouns = pronouns
        self.set = set_type
        self.subjects_split = subjects_split
        self.shuffle = shuffle
        self.voc_size = voc_size
        self.max_posts_per_user = max_posts_per_user
        self.test_user_indexes = test_user_indexes
        self.on_epoch_end()


    def __encode_text(self, tokens, raw_text):
        # Using voc_size-1 value for OOV token
        encoded_tokens = [vocabulary.get(w, self.voc_size-1) for w in tokens]
        encoded_emotions = encode_emotions(tokens, self.emotion_lexicon, self.emotions)
        encoded_pronouns = encode_pronouns(tokens, self.pronouns)
        encoded_stopwords = encode_stopwords(tokens)
        bert_ids, bert_masks, bert_segments, label = encode_text_for_bert(self.bert_tokenizer, InputExample(None, 
                                               raw_text), self.seq_len)
        return (encoded_tokens, encoded_emotions, encoded_pronouns, encoded_stopwords,
               bert_ids, bert_masks, bert_segments)
        
    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.subjects_split[self.set]) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        user_indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        # Find users
        users = [self.subjects_split[self.set][i] for i in user_indexes
                    if self.subjects_split[self.set][i] in self.data.keys()] # TODO: maybe needs a warning that user is missing

        post_indexes = {}
        # Sample post ids
        for subject in users:
            posts_len = len(self.data[subject]['texts'])
            posts_index_sample = sorted(np.random.choice(posts_len, 
                                                         min(self.max_posts_per_user, posts_len),
                                                         replace=False))
            post_indexes[subject] = posts_index_sample
        # Generate data
        X, y = self.__data_generation(users, post_indexes)

        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.subjects_split[self.set]))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, users, post_indexes):
        'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
        tokens_data = []
        categ_data = []
        sparse_data = []
        subjects = []
        bert_ids_data = []
        bert_masks_data = []
        bert_segments_data = []
        labels = []
        for subject in users:
            texts = self.data[subject]['texts']
            label = self.data[subject]['labels']
            raw_texts = self.data[subject]['raw']

            # Sample
            texts = [texts[i] for i in post_indexes[subject]]
            liwc_selection = [self.data[subject]['liwc'][i] for i in post_indexes[subject]]
            raw_texts = [raw_texts[i] for i in post_indexes[subject]]

            all_words = [sum(texts, [])] # merge all texts in one list
            liwc_aggreg = [np.array(liwc_selection).mean(axis=0).tolist()]
            all_raw_texts = [" ".join(raw_texts)]

            
            for i, words in enumerate(all_words):
                encoded_tokens, encoded_emotions, encoded_pronouns, encoded_stopwords, \
                    bert_ids, bert_masks, bert_segments = self.__encode_text(words, all_raw_texts[i])
                subject_id = int(subject.split('t')[1])
                tokens_data.append(encoded_tokens)
                categ_data.append(encoded_emotions + [encoded_pronouns] + liwc_aggreg[i])
                sparse_data.append(encoded_stopwords)
                labels.append(label)
                bert_ids_data.append(bert_ids)
                bert_masks_data.append(bert_masks)
                bert_segments_data.append(bert_segments)
                
                subjects.append(subject_id)

        
        # using zeros for padding
        tokens_data_padded = sequence.pad_sequences(tokens_data, maxlen=self.seq_len)

        return ([np.array(tokens_data_padded), np.array(categ_data), np.array(sparse_data),
#                 np.array(subjects),
                np.array(bert_ids_data), np.array(bert_masks_data), np.array(bert_segments_data),],
                np.array(labels).reshape(-1, len(labels)).tolist()) # to have one array per output

In [65]:
# TODO: Don't split into the 3 sets, do leave-one-out cross-validation

In [66]:
def get_subjects_split(test_size=hyperparams_features['batch_size']):
    test_user_indexes = [np.random.randint(len(user_level_data)) for i in range(test_size)]

    subjects_split = {'test': [u for i,u in 
                               enumerate(user_level_data.keys()) if i in test_user_indexes],
                     'train': [u for i,u in 
                               enumerate(user_level_data.keys()) if i not in test_user_indexes],}
    return subjects_split

In [67]:
logger.setLevel(logging.DEBUG)

# TODO: it is skipping the last batch
x_data = {'train': [], 'test': []}
y_data = {'train': [], 'test': []}
subjects_split = get_subjects_split()
for set_type in ['test']:
    for x, y in DataGenerator(user_level_data, batch_size=hyperparams_features['batch_size'],
                            set_type=set_type,
                             subjects_split=subjects_split):
        print(x)
        x_data[set_type].append(x)
        y_data[set_type].append(y)
#         break


[array([[   97,  4565,    52,    79,   214,    51,    10,     4,   285,
          717,     2,    36,     1,   294,     5,   202,    62,   970,
           12,   108,    15,    52,    79,     3,  5811,    35,   559,
            8,   272,    50,    23,    11,     5,    17,   942,     3,
         3877,     1,   508,     1,   184,     1,   616,    34,   789,
           18,    71,    44,     1,    68,     7,   290,    18,     1,
         1593,     5,     1,  3408,     5,    97,   102,    31,   646,
           35,    55,    23,     6,   291,     3,    10,    51,    57,
        13934,     8,   274,    55,    77,    23,  1218,     8,     1,
          272,   170,    71,     5,   112,  2051,   473,     1,   682,
           86,    40,    16,  2104,    77,   472,    53,     3,   731,
          545]], dtype=int32), array([[0.01456311, 0.01941748, 0.        , 0.01941748, 0.01456311,
        0.02427184, 0.03883495, 0.00485437, 0.01456311, 0.02427184,
        0.02427184, 0.03542292, 0.01157407, 0.64589

In [68]:
subjects_split

{'test': ['subject4058'],
 'train': ['subject3993',
  'subject9798',
  'subject7039',
  'subject436',
  'subject6619',
  'subject6635',
  'subject2903',
  'subject5791',
  'subject9694',
  'subject5897',
  'subject2341',
  'subject2432',
  'subject3707',
  'subject1272',
  'subject9218',
  'subject9454',
  'subject2961',
  'subject2827',
  'subject6900']}

In [69]:
x_data['train']

[]

In [70]:
np.array([[1,2],[3,4]]).reshape(2,-1)

array([[1, 2],
       [3, 4]])

In [71]:
sum([len(subjects_split[s]) for s in ['train', 'test']])

20

In [72]:
x_data['train']

[]

In [74]:
y_data

{'train': [],
 'test': [[[0],
   [0],
   [0],
   [1],
   [0],
   [0],
   [0],
   [0],
   [1],
   [0],
   [0],
   [0],
   [0],
   [0],
   [3],
   [2],
   [1],
   [0],
   [0],
   [3],
   [0]]]}

# Train

In [75]:
hyperparams = {
    'lstm_units': 10,
    'dense_bow_units': 20,
    'dropout': 0.0,
    'l2_dense': 0.00000011,
    'l2_embeddings': 0.000001,
    'dense_sentence_units': 100,
    'optimizer': 'adam',
    'bert_dense_units': 100,
    'decay': 0.00001,
    'lr': 0.001,
    "trainable_embeddings": True,
    "reduce_lr_factor": 0.0002,
    "reduce_lr_patience": 1000,
    "freeze_patience": 500,
    'threshold': 0.5,
    'bert_len': 768,
    'ignore_layer': ['batchnorm'],
    'norm_momentum': 0.1,

}
if not hyperparams['optimizer']:
    hyperparams['optimizer'] = optimizers.Adam(lr=hyperparams['lr'], #beta_1=0.9, beta_2=0.999, epsilon=0.0001,
                                   decay=hyperparams['decay'])

In [76]:
# class Metrics():
#     def __init__(self, threshold=0.5):
#         self.threshold=threshold
        
#     def recall_m(self, y_true, y_pred):
#             y_labels = y_true
#             y_pred = K.cast(K.greater(K.clip(y_pred, 0, 1), self.threshold), K.floatx())        
#             possible_positives = K.sum(K.round(K.clip(y_labels, 0, 1)))
#             true_positives = K.sum(K.round(K.clip(y_labels * y_pred, 0, 1)))
#             recall = true_positives / (possible_positives + K.epsilon())
#             return recall

#     def precision_m(self, y_true, y_pred):
#             y_labels = y_true
#             y_pred = K.cast(K.greater(K.clip(y_pred, 0, 1), self.threshold), K.floatx())        
#             true_positives = K.sum(K.round(K.clip(y_labels * y_pred, 0, 1)))
#             predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
#             precision = true_positives / (predicted_positives + K.epsilon())
#             return precision

#     def f1_m(self, y_true, y_pred):
#         precision = self.precision_m(y_true, y_pred)
#         recall = self.recall_m(y_true, y_pred)
#         return 2*((precision*recall)/(precision+recall+K.epsilon()))

# def binary_crossentropy_custom(y_true, y_pred):
#     y_labels = y_true
#     return K.binary_crossentropy(y_labels, 
#                                  y_pred)

# metrics_class = Metrics(threshold=hyperparams['threshold'])

In [77]:
def build_model(hyperparams, hyperparams_features, embedding_matrix, emotions, stopwords_list,
                liwc_categories, nr_classes,
               ignore_layer=[]):

    tokens_features = Input(shape=(hyperparams_features['maxlen'],), name='word_seq')
    embedding_layer = Embedding(hyperparams_features['max_features'], 
                                hyperparams_features['embedding_dim'], 
                                input_length=hyperparams_features['maxlen'],
                                embeddings_regularizer=regularizers.l2(hyperparams['l2_embeddings']),
                                weights=[embedding_matrix], 
                                trainable=hyperparams['trainable_embeddings'],
                               name='embeddings_layer')(
        tokens_features)
#     if 'batchnorm' not in ignore_layer:
#         embedding_layer_norm = BatchNormalization(axis=-1, momentum=hyperparams['norm_momentum'],
#                                                      name='embeddings_layer_norm')(embedding_layer)
#     lstm_layers = Bidirectional(LSTM(hyperparams['lstm_units']))(embedding_layer)


    lstm_layers = LSTM(hyperparams['lstm_units'], 
                           return_sequences='attention' not in ignore_layer,
                      name='LSTM_layer')(embedding_layer)
    
    # Attention
    if 'attention' not in ignore_layer:
        attention = Dense(1, activation='tanh', name='attention')(lstm_layers)
        attention = Flatten()(attention)
        attention = Activation('softmax')(attention)
        attention = RepeatVector(hyperparams['lstm_units'])(attention)
        attention = Permute([2, 1])(attention)

        sent_representation = Multiply()([lstm_layers, attention])
        sent_representation = Lambda(lambda xin: K.sum(xin, axis=1), 
                                     output_shape=(hyperparams['lstm_units'],)
                                    )(sent_representation)

        
    else:
        sent_representation = lstm_layers
        
    
    sent_representation = Dropout(hyperparams['dropout'], name='lstm_att_dropout')(sent_representation)
    if hyperparams['dense_sentence_units']:
        sent_representation = Dense(units=hyperparams['dense_sentence_units'],
                                   name='dense_sent_representation')(sent_representation)
    numerical_features = Input(shape=(len(emotions) + 1 + len(liwc_categories),), name='numeric_input') # emotions and pronouns
    
    in_id_bert = Input(shape=(hyperparams_features['maxlen'],), name="input_ids_bert")
    in_mask_bert = Input(shape=(hyperparams_features['maxlen'],), name="input_masks_bert")
    in_segment_bert = Input(shape=(hyperparams_features['maxlen'],), name="segment_ids_bert")
    bert_layer = hub.KerasLayer(
        "https://tfhub.dev/google/albert_xlarge/3",
        trainable=False,
        signature="tokens",
        output_key="pooled_output",
    )

    bert_features = {
        "input_ids": in_id_bert,
        "input_mask": in_mask_bert,
        "segment_ids": in_segment_bert,
    }
    bert_output = bert_layer(bert_features)
    dense_layer_bert = Dense(units=hyperparams['bert_dense_units'],
                        kernel_regularizer=regularizers.l2(hyperparams['l2_dense']),
                        name='bert_dense_layer',
                       )(bert_output)
    sparse_features = Input(shape=(len(stopwords_list),), name='sparse_input') # stopwords

    if hyperparams['dense_bow_units']:
        dense_layer_sparse = Dense(units=hyperparams['dense_bow_units'],
                              name='sparse_feat_dense_layer',
                                kernel_regularizer=regularizers.l2(hyperparams['l2_dense']),
                              )(sparse_features)
    else:
        dense_layer_sparse = sparse_features
    
    if 'batchnorm' not in ignore_layer:
        numerical_features_norm = BatchNormalization(axis=1, momentum=hyperparams['norm_momentum'],
                                                     name='numerical_features_norm')(numerical_features)
        sent_representation_norm = BatchNormalization(axis=1, momentum=hyperparams['norm_momentum'],
                                                      name='sent_repr_norm')(sent_representation)

        dense_layer_sparse_norm = BatchNormalization(axis=1, momentum=hyperparams['norm_momentum'],
                                                         name='sparse_features_norm')(dense_layer_sparse)
        dense_layer_bert_norm = BatchNormalization(axis=1, momentum=hyperparams['norm_momentum'],
                                                         name='bert_features_norm')(dense_layer_bert)
        
    subjects = Input(shape=(1,), name='subjects')
    

    all_layers = {
        'lstm_layers': sent_representation,
        'numerical_dense_layer': numerical_features,
        'sparse_feat_dense_layer': dense_layer_sparse,
        'bert_layer': dense_layer_bert
    }
    if 'batchnorm' not in ignore_layer:
        all_layers = {
            'lstm_layers': sent_representation_norm,
            'numerical_dense_layer': numerical_features_norm,
            'sparse_feat_dense_layer': dense_layer_sparse_norm,
            'bert_layer': dense_layer_bert_norm
        }
    layers_to_merge = []
    for n, l in all_layers.items():
        if n in ignore_layer:
            continue
        layers_to_merge.append(l)
        
    if len(layers_to_merge) == 1:
        merged_layers = layers_to_merge[0]
    else:
        merged_layers = concatenate(layers_to_merge)
    output_layers = []
    for label in range(nr_classes):
        output_layer = Dense(1, activation='softmax',
                         name='output_layer%d' % label,
                        kernel_regularizer=regularizers.l2(hyperparams['l2_dense']))(merged_layers)
        output_layers.append(output_layer)

    # Compile model
    model = Model(inputs=[tokens_features, numerical_features, sparse_features,
                         in_id_bert, in_mask_bert, in_segment_bert], 
                  outputs=output_layers)

    model.compile(hyperparams['optimizer'], {'output_layer%d'%i: 
                                             'mean_squared_error' for i in range(nr_classes)},
                  metrics={'output_layer%d' % label: 
                           ['accuracy', 'mean_squared_error'] for label in range(nr_classes)})
    return model



In [78]:
model = build_model(hyperparams, hyperparams_features, embedding_matrix, emotions, stopword_list,
                    liwc_categories=[c for c in categories if c in writings_df.columns],
                    nr_classes=1,
                   ignore_layer=hyperparams['ignore_layer'])
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
word_seq (InputLayer)           [(None, 100)]        0                                            
__________________________________________________________________________________________________
embeddings_layer (Embedding)    (None, 100, 50)      1000000     word_seq[0][0]                   
__________________________________________________________________________________________________
LSTM_layer (LSTM)               (None, 100, 10)      2440        embeddings_layer[0][0]           
__________________________________________________________________________________________________
attention (Dense)               (None, 100, 1)       11          LSTM_layer[0][0]                 
______________________________________________________________________________________________

In [83]:
def train_model(model, 
                data_generator_train, data_generator_valid,
                epochs, start_epoch=0, workers=4,
                callback_list = [],
                model_path='/tmp/model',
               verbose=1):
    logging.info('Train...')
    experiment.log_parameter('callbacks', callbacks)

    history = model.fit_generator(data_generator_train,
#               batch_size=batch_size,
#                 steps_per_epoch=steps_per_epoch,
              epochs=epochs, initial_epoch=start_epoch, 
              validation_data=data_generator_valid,
                        verbose=verbose,
#               validation_split=0.3,
                       workers=workers,
            callbacks = [
                callbacks.ModelCheckpoint(filepath='%s_best' % model_path, verbose=1, 
                                          save_best_only=True),
                callbacks.EarlyStopping(patience=500), *callback_list
            ])
    model.save(model_path)
    experiment.log_parameter('model_path', model_path)
    return model, history

In [84]:
experiment = Experiment(api_key="eoBdVyznAhfg3bK9pZ58ZSXfv",
                        project_name="mental", workspace="ananana", disabled=False)

experiment.log_parameters(hyperparams_features)

experiment.log_parameter('emotion_lexicon', nrc_lexicon_path)
experiment.log_parameter('emotions', emotions)
experiment.log_parameter('embeddings_path', pretrained_embeddings_path)

experiment.add_tag('T2')
experiment.log_parameters(hyperparams)

COMET INFO: ----------------------------
COMET INFO: Comet.ml Experiment Summary:
COMET INFO:   Data:
COMET INFO:     url: https://www.comet.ml/ananana/mental/81432da8b90b4dcba49474b12d695748
COMET INFO:   Metrics [count] (min, max):
COMET INFO:     sys.gpu.0.free_memory [2]    : (350224384.0, 350224384.0)
COMET INFO:     sys.gpu.0.gpu_utilization [2]: (0.0, 0.0)
COMET INFO:     sys.gpu.0.total_memory       : (8370061312.0, 8370061312.0)
COMET INFO:     sys.gpu.0.used_memory [2]    : (8019836928.0, 8019836928.0)
COMET INFO:     sys.gpu.1.free_memory [2]    : (8239185920.0, 8239185920.0)
COMET INFO:     sys.gpu.1.gpu_utilization [2]: (0.0, 0.0)
COMET INFO:     sys.gpu.1.total_memory       : (8367439872.0, 8367439872.0)
COMET INFO:     sys.gpu.1.used_memory [2]    : (128253952.0, 128253952.0)
COMET INFO:   Uploads:
COMET INFO:     git-patch: 1
COMET INFO: ----------------------------
COMET INFO: Experiment is live on comet.ml https://www.comet.ml/ananana/mental/5a77836e030f43b7bb9b036fee

In [85]:
subjects_split = get_subjects_split()
data_generator_train = DataGenerator(user_level_data, set_type='train', 
                                     subjects_split=subjects_split)
data_generator_valid = DataGenerator(user_level_data, set_type='test',  
                                     subjects_split=subjects_split)
model, history = train_model(model, data_generator_train, data_generator_valid,
           epochs=1000, start_epoch=0,
                      callback_list = [],
                      model_path='models/mlp_t21', workers=1)

AttributeError: 'int' object has no attribute 'shape'

In [88]:
for d in DataGenerator(user_level_data, set_type='train', 
                                     subjects_split=subjects_split):
    print((d))
    break

([array([[    2,   100,   216,    74,     1,  1035,     8,     1,   889,
          291,     3,     1,  1991,    11,     1,  1927,  1164,   247,
            8,   171,   127,  5001,     2,    73,     2,   236,     3,
          109,    44,     4,   317,    24,  5315,  4116,    18,    16,
         1255,    12,   173,   174,  1238,   377,   145,  1000,    15,
           64,    31,   153,    31,   340, 17929, 17930,     2,   287,
            1,  1252,   908,     5,    84,   434,     2,  3645,     3,
            9,  1249,    21,    16,   871,    11,    16,  2786,    15,
            2,   105,  1029,     1,  2426,     3,   321,   142,     2,
          688,    53,     1,   438,   405,  4908,    64,     1,  3256,
         1177,     1,   540,    20,    67,  1971,    65,  1532,     1,
          525]], dtype=int32), array([[8.15660685e-03, 2.28384992e-02, 9.78792822e-03, 1.63132137e-02,
        2.93637847e-02, 2.12071778e-02, 5.05709625e-02, 8.15660685e-03,
        3.26264274e-03, 2.61011419e-02, 6.

# Extra analysis


### Extract LIWC

In [None]:
def merge_tokens(row):
    tokens = []
    if row.tokenized_text:
        tokens += row.tokenized_text
    if row.tokenized_title:
        tokens += row.tokenized_title
    return tokens
writings_df['all_tokens'] = writings_df.apply (lambda row: merge_tokens(row), axis=1)

In [None]:
# TODO: include the title
def extract_emotions(tokens, emotion, relative=True):
    if not tokens:
        return None
    emotion_words = [t for t in tokens 
                     if t in nrc_lexicon[emotion]]
    if relative:
        return len(emotion_words) / len(tokens)
    else:
        return len(emotion_words)
    
    return encoded_emotions

from functools import partial
for emotion in emotions:
    writings_df[emotion] = writings_df['all_tokens'].apply(partial(extract_emotions, emotion=emotion, 
                                                                   relative=True))

In [None]:
writings_df['pronouns'] = writings_df['all_tokens'].apply(partial(encode_pronouns, relative=True))

In [None]:
writings_df[['label%i'%i for i in range(21)] + ['text', 'pronouns', 'text_len'] + emotions].corr('spearman')

In [None]:
# writings_df['label15'] = writings_df['label15'].apply(lambda l: encode_labels([l])[0])

In [None]:
# writings_df['label17'] = writings_df['label17'].apply(lambda l: encode_labels([l])[0])

In [None]:
writings_df.groupby('subject').mean()[
    ['label%i'%i for i in range(21)] + ['pronouns', 'text_len'] + emotions].corr(
    'spearman')[['pronouns', 'text_len'] + emotions]

In [None]:
writings_df.corrwith?

In [None]:
from liwc_readDict import readDict

liwc = readDict('/home/ana/resources/FakeOrFact/features/LIWC/LIWC/liwc.dic')
categories = [c for (w,c) in liwc]
set(categories)
liwc_dict = {}
for (w, c) in liwc:
    if c not in liwc_dict:
        liwc_dict[c] = []
    liwc_dict[c].append(w)

In [None]:
def encode_liwc_categories(tokens, category_words, relative=True):
    category_cnt = 0
    if not tokens:
        return None
    text_len = len(tokens)
    for t in tokens:
        for word in category_words:
            if t==word or (word[-1]=='*' and t.startswith(word[:-1])) \
            or (t==word.split("'")[0]):
                category_cnt += 1
                break # one token cannot belong to more than one word in the category
    if relative:
        return category_cnt/text_len
    else:
        return category_cnt

In [None]:
%%time
from functools import partial
# for categ in ['negemo', 'posemo', 'affect', 'sad', 'anx', 'pronoun']:#liwc_dict.keys():
for categ in liwc_dict.keys():
    if categ in writings_df.columns:
        continue
    print("Encoding for category %s..." % categ)
    writings_df[categ] = writings_df['all_tokens'].apply(partial(encode_liwc_categories, 
                                                                   category_words=liwc_dict[categ], 
                                                                   relative=True))


In [None]:
relevant_categs = ['posemo', 'negemo', 'anx', 'sad', 'affect', 'feel', 'social', 'health', 
                   'sexual', 'present', 'cogmech', 'inhib']
writings_df.groupby('subject').mean()[
    ['label%i'%i for i in range(21)] + relevant_categs].corr(
    'spearman')[relevant_categs]

In [None]:
list(writings_df.groupby('subject').min()[
    ['label%i'%i for i in range(21)] + list(liwc_dict.keys())].corr()[list(liwc_dict.keys())].mean().sort_values().index)

In [None]:
pickle.dump(writings_df, open('writings_df_T2_liwc.pkl', 'wb+'))