In [None]:
import pandas as pd
import xml.etree.ElementTree as ET
import glob, os
import numpy as np
from comet_ml import Experiment, Optimizer
import pickle
import logging
import sys
from sklearn.utils import class_weight

In [None]:
os.environ['TF_KERAS'] = '1'
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Lambda, BatchNormalization, TimeDistributed, \
    CuDNNLSTM, Bidirectional, Input, concatenate, Flatten, RepeatVector, Activation, Multiply, Permute
from tensorflow.keras import regularizers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import callbacks, optimizers
from tensorflow.keras import backend as K
from tensorflow.keras.utils import plot_model, Sequence

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

import tensorflow as tf

In [None]:
# Seed every random number generator the notebook draws from. TensorFlow is
# seeded for graph-level randomness; NumPy is seeded as well because the
# embedding initialisation, batch shuffling, post sampling and the choice of
# the held-out user all go through np.random.
my_seed = 1234
tf.set_random_seed(my_seed)
np.random.seed(my_seed)

In [None]:
logger = logging.getLogger('training')
# Re-running this cell must not attach a second stdout handler (every message
# would then be printed twice), so a handler is only added when none exists.
if not logger.handlers:
    logger.addHandler(logging.StreamHandler(sys.stdout))
logger.setLevel(logging.DEBUG)

# Read data

In [None]:
def read_subject_writings(subject_file):
    """Parse one subject XML file into a list of per-writing dicts.

    Args:
        subject_file: path of an XML file whose root contains an ``ID``
            element and any number of ``WRITING`` elements.

    Returns:
        A list with one dict per ``WRITING`` element. Every dict has a
        ``'subject'`` key (the stripped ID text, or ``None`` when the ID could
        not be read) plus ``'title'``, ``'text'`` and ``'date'`` for whichever
        of the ``TITLE`` / ``TEXT`` / ``DATE`` children are present.
    """
    with open(subject_file) as sf:
        contents = sf.read()
    root = ET.fromstring(contents)

    id_node = root.find('ID')
    if id_node is None or id_node.text is None:
        # Previously `subject` stayed unbound here and the loop below failed
        # with a NameError; keep the writings and mark the subject as unknown.
        print('Cannot extract ID', contents[:500], '\n-------\n')
        subject = None
    else:
        subject = id_node.text.strip()

    writings = []
    for w in root.iter('WRITING'):
        subject_writings = {'subject': subject}
        # When a tag is repeated inside one WRITING, the last occurrence wins.
        for tag, key in (('TITLE', 'title'), ('TEXT', 'text'), ('DATE', 'date')):
            for node in w.findall(tag):
                subject_writings[key] = node.text
        writings.append(subject_writings)
    return writings

In [None]:
# root_dir = '/home/anasab/' 
# Machine-specific base directory; the data and resource paths used further
# down are built by string concatenation from it.
root_dir = '/home/ana/'

In [None]:
# Directory listed by read_texts for the per-subject files.
datadir_T2 = root_dir + '/eRisk/data/eRisk2020_T2/eRisk2020_T2_TRAINING_DATA/'
# Whitespace-separated questionnaire answers, one row per subject.
labels_file_T2 = root_dir + '/eRisk/data/eRisk2020_T2/eRisk2020_T2_TRAINING_DATA/Depression Questionnaires_anon.txt'
# Number of label columns (label0 .. label20) and of model output heads.
nr_questions = 21

In [None]:
def read_texts(datadir_T2,
                labels_file_T2):
    """Read every subject file and attach the questionnaire labels.

    Args:
        datadir_T2: directory to scan; only entries whose name starts with
            ``'subject'`` are parsed (with ``read_subject_writings``).
        labels_file_T2: whitespace-separated file whose first column is the
            subject id, followed by ``nr_questions`` label columns.

    Returns:
        ``(writings_df, labels_df)``: one row per writing with the columns
        ``label0 .. label<nr_questions-1>`` joined on ``'subject'``, and the
        label table indexed by subject.
    """
    writings = []
    for subject_file in os.listdir(datadir_T2):
        if not subject_file.startswith('subject'):
            continue
        writings.extend(read_subject_writings(os.path.join(datadir_T2, subject_file)))
    writings_df = pd.DataFrame(writings)

    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    label_columns = ['subject'] + ['label%i' % i for i in range(nr_questions)]
    labels_df = pd.read_csv(labels_file_T2, delimiter=r'\s+', names=label_columns)
    labels_df = labels_df.set_index('subject')

    writings_df = writings_df.join(labels_df, on='subject')

    return writings_df, labels_df

In [None]:
# writings_df, labels_df = read_texts(datadir_T2, labels_file_T2)
# Load the cached frame instead of re-parsing the raw files. The context
# manager closes the file handle once loading is done.
# NOTE(review): pickle.load can execute arbitrary code; only load files you
# produced yourself.
with open('writings_df_T2_liwc.pkl', 'rb') as cache_file:
    writings_df = pickle.load(cache_file)

In [None]:
# Non-null value count of every column, per subject.
writings_df.groupby('subject').count()

In [None]:
# Display the loaded frame.
# NOTE(review): this renders the whole DataFrame; writings_df.head() would keep the output small.
writings_df

## Preprocess text

In [None]:
# Shared tokenizer: every maximal run of \w characters becomes one token.
tokenizer = RegexpTokenizer(r'\w+')

def tokenize(t):
    """Lower-case the string `t` and split it with the shared tokenizer."""
    lowered = t.lower()
    return tokenizer.tokenize(lowered)

In [None]:
def tokenize_fields(writings_df):
    """Add token and token-count columns for the title and the text.

    For each of ``'title'`` and ``'text'`` this writes ``tokenized_<field>``
    (token list, or None unless the value is a non-empty ``str``) and
    ``<field>_len`` (number of tokens, or None unless the token list is a
    non-empty ``list``). The frame is modified in place and also returned.
    """
    def _tokens_or_none(value):
        # Anything that is not a non-empty plain str (e.g. NaN) has no tokens.
        if type(value) == str and value:
            return tokenize(value)
        return None

    def _count_or_none(value):
        if type(value) == list and value:
            return len(value)
        return None

    for field in ('title', 'text'):
        tokens_column = 'tokenized_%s' % field
        writings_df[tokens_column] = writings_df[field].apply(_tokens_or_none)
        writings_df['%s_len' % field] = writings_df[tokens_column].apply(_count_or_none)
    return writings_df

In [None]:
# Adds tokenized_title / title_len / tokenized_text / text_len to the frame.
writings_df = tokenize_fields(writings_df)

In [None]:
# Summary statistics of the number of tokens per post text.
writings_df.text_len.describe()

In [None]:
# Summary statistics of the number of tokens per post title.
writings_df.title_len.describe()

In [None]:
# Distribution, over subjects, of the number of non-null titles.
writings_df.groupby('subject').count().title.describe()

In [None]:
# Distribution, over subjects, of the number of non-null texts.
writings_df.groupby('subject').count().text.describe()

# Extract features

In [None]:
# Feature-extraction settings shared by data loading, the batch generator and
# the model's input layers.
hyperparams_features = {
    # vocabulary size: passed as voc_size to load_erisk_data and used as the
    # input dimension of the embedding layer
    "max_features": 20000,
    # cut texts after this number of words
    # (among top max_features most common words)
    "maxlen": 100,
    # selects the GloVe file and sets the embedding width
    "embedding_dim": 50,
    # forwarded to load_erisk_data
    "user_level": True,
    # maximum number of posts sampled per user for one batch
    "posts_per_user": 10,
    # number of users per batch
    "batch_size": 20,
}

#### Emotions

In [None]:
def load_NRC(nrc_path):
    """Load a word-level emotion lexicon into an emotion -> words mapping.

    Every non-blank line must split on whitespace into exactly three fields:
    ``word emotion flag``, where ``flag`` is an integer and a non-zero value
    means the word carries the emotion.

    Args:
        nrc_path: path of the lexicon file.

    Returns:
        dict mapping each emotion seen in the file to the set of words flagged
        for it. An emotion that only occurs with flag 0 maps to an empty set.

    Raises:
        ValueError: if a non-blank line does not have three fields or its
            flag is not an integer.
    """
    emotion_words = {}
    with open(nrc_path) as in_f:
        for line in in_f:
            line = line.strip()
            if not line:
                continue
            word, emotion, label = line.split()
            # Register the emotion even when this particular word is not
            # flagged, so that every emotion in the file becomes a key.
            flagged_words = emotion_words.setdefault(emotion, set())
            if int(label):
                flagged_words.add(word)
    return emotion_words

# Emotion lexicon file, loaded as {emotion: set of words}.
nrc_lexicon_path = root_dir + '/resources/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt'
nrc_lexicon = load_NRC(nrc_lexicon_path)
# Emotion names in the lexicon's key order; this order fixes the position of
# each emotion in the encoded feature vectors.
emotions = list(nrc_lexicon.keys())


In [None]:
def encode_emotions(tokens, emotion_lexicon, emotions, relative=True):
    """Count, per emotion, how many tokens belong to that emotion's word set.

    Args:
        tokens: list of tokens; may be empty or None.
        emotion_lexicon: mapping from emotion name to a container of words.
        emotions: emotion names; the result follows this order.
        relative: when True the count is divided by the number of tokens.

    Returns:
        A list with one entry per emotion. Entries stay 0 for an empty token
        list and for emotions that are missing from ``emotion_lexicon``.
    """
    encoded_emotions = [0 for e in emotions]
    if not tokens:
        # Nothing to count, and the relative form would divide by zero.
        return encoded_emotions
    text_len = len(tokens)
    for i, emotion in enumerate(emotions):
        try:
            lexicon_words = emotion_lexicon[emotion]
        except KeyError:
            # A missing key raises KeyError (not ValueError), so this is the
            # exception that has to be caught to reach the message below.
            print("Emotion not found.")
            continue
        matches = sum(1 for t in tokens if t in lexicon_words)
        encoded_emotions[i] = matches / text_len if relative else matches
    return encoded_emotions

In [None]:
from liwc_readDict import readDict

# LIWC dictionary as returned by readDict; the comprehension below unpacks
# each entry as a (word, category) pair.
liwc = readDict(root_dir + '/resources/liwc.dic')

# Distinct LIWC category names.
categories = set([c for (w,c) in liwc])
# Cell output: the number of distinct categories.
len(categories)

#### Personal pronouns

In [None]:
first_person_pronouns = {"i", "me", "my", "mine", "myself"}
def encode_pronouns(tokens, pronouns=first_person_pronouns, relative=True):
    """Measure how often the given pronouns occur in a token list.

    Args:
        tokens: list of tokens; may be empty or None.
        pronouns: container of pronouns to look for. Defaults to the
            module-level ``first_person_pronouns`` set instead of repeating
            its literal.
        relative: when True return the share of tokens that are pronouns,
            otherwise the raw count.

    Returns:
        ``np.nan`` for empty/None input, otherwise the share (float) or count
        (int) of tokens found in ``pronouns``.
    """
    if not tokens:
        return np.nan
    nr_pronouns = sum(1 for t in tokens if t in pronouns)
    if relative:
        return nr_pronouns / len(tokens)
    return nr_pronouns

#### Stopwords

In [None]:
stopword_list = stopwords.words("english")
def encode_stopwords(tokens, stopwords=stopword_list):
    """Encode which stopwords occur in a token list as a 0/1 vector.

    Args:
        tokens: list of tokens; may be empty or None.
        stopwords: ordered stopwords; position i of the result refers to
            ``stopwords[i]``. Inside this function the name shadows the
            imported nltk ``stopwords`` module.

    Returns:
        A list of ``len(stopwords)`` integers: 1 where the stopword occurs at
        least once in ``tokens``, otherwise 0. All zeros for empty input.
    """
    # Size the vector from the list that was actually passed in. Sizing it
    # from the global list broke any call with a custom stopword list.
    encoded_stopwords = [0 for s in stopwords]
    if not tokens:
        return encoded_stopwords
    # Set membership avoids rescanning the token list once per stopword.
    present_tokens = set(tokens)
    for i, stopword in enumerate(stopwords):
        if stopword in present_tokens:
            encoded_stopwords[i] += 1
    return encoded_stopwords

### Encode data

In [None]:
from collections import Counter
def encode_labels(labels):
    '''Turn questionnaire answers into integers.

    Values accepted by int() are used as they are. Otherwise the answer is
    read as "<digit>a" -> digit or "<digit>b" -> -digit; anything else is
    reported through the logger and left out of the result.
    '''
    encoded_labels = []
    for raw_label in labels:
        try:
            value = int(raw_label)
        except Exception:
            logger.debug("Encoding label %s\n" % raw_label)

            variant = str(raw_label)[-1]
            if variant == 'a':
                value = int(raw_label[0])
            elif variant == 'b':
                value = -int(raw_label[0])
            else:
                logger.warning("Coult not encode label %s\n" % raw_label)
                continue
        encoded_labels.append(value)
    return encoded_labels

def load_erisk_data(writings_df, voc_size, emotion_lexicon, seq_len, emotions =  
                    ['anger', 'anticipation', 'disgust', 'fear', 'joy', 
                     'negative', 'positive', 'sadness', 'surprise', 'trust'],
                    liwc_categories = categories, ignore_features=[],
                    pronouns = ["i", "me", "my", "mine", "myself"],
                    train_prop=0.7, valid_prop=0.3, test_slice=2,
                    nr_slices=5,
                    min_post_len=3, min_word_len=1, 
                    user_level=True, vocabulary=None,
                   logger=logger):
    """Build the vocabulary, split the subjects and group posts per subject.

    Args:
        writings_df: frame with at least the columns ``subject``, ``date``,
            ``tokenized_title``, ``tokenized_text`` and ``label0`` ..
            ``label<nr_questions-1>``; an optional ``subset`` column with the
            values 'train'/'test' overrides the computed split.
        voc_size: at most ``voc_size-2`` most frequent words enter the
            vocabulary.
        liwc_categories: candidate feature columns; only those present in
            ``writings_df`` are read.
        train_prop, valid_prop, test_slice, nr_slices: control the subject
            split (see the inline comments).
        min_post_len: posts with fewer tokens (title + text) are skipped.
        min_word_len: shorter words are left out of the vocabulary.
        vocabulary: existing word -> index mapping; built here when falsy.
        logger: logger used for the debug messages.
        emotion_lexicon, seq_len, emotions, ignore_features, pronouns,
        user_level: accepted but not read anywhere in this function.

    Returns:
        ``(user_level_texts, subjects_split, vocabulary)`` where
        ``user_level_texts[subject]`` holds ``'texts'`` (list of token lists),
        ``'labels'`` (encoded labels) and ``'liwc'`` (list of per-post feature
        lists), and ``subjects_split`` maps 'train'/'valid'/'test' to lists of
        subjects.

    Relies on the module-level ``nr_questions`` and ``encode_labels``.
    """
    logger.debug("Loading data...\n")
    if not vocabulary:
        # Frequency-ranked vocabulary over titles and texts; indices start at 1.
        vocabulary = {}
        word_freqs = Counter()
        for words in writings_df.tokenized_text:
            word_freqs.update(words)
        for words in writings_df.tokenized_title:
            word_freqs.update(words)
        i = 1
        for w, f in word_freqs.most_common(voc_size-2): # keeping voc_size-1 for unk
            if len(w) < min_word_len:
                continue
            vocabulary[w] = i
            i += 1

    if 'subset' in writings_df.columns:
        # A predefined split takes precedence over the computed one.
        training_subjects = list(set(writings_df[writings_df['subset']=='train'].subject))
        test_subjects = list(set(writings_df[writings_df['subset']=='test'].subject))
    else:
        all_subjects = sorted(list(set(writings_df.subject)))
        training_subjects_size = int(len(all_subjects) * train_prop)
        test_subjects_size = len(all_subjects) - training_subjects_size
        # Cross-validation, with fixed slice as input
        test_prop = 1-train_prop
        test_slice = min(test_slice, nr_slices)
        logger.debug("start index: %f, from %f\n" % (
            len(all_subjects)*(1/nr_slices)*test_slice, test_prop*test_slice))
        # The test block starts at slice `test_slice` of `nr_slices` equal
        # slices and contains `test_subjects_size` consecutive subjects.
        start_slice = int(len(all_subjects)*(1/nr_slices)*test_slice)
        test_subjects = all_subjects[start_slice: start_slice+test_subjects_size]
        training_subjects = [s for s in all_subjects if s not in test_subjects]
    training_subjects = sorted(training_subjects) # ensuring reproducibility
    # The validation subjects are taken from the front of the sorted training list.
    valid_subjects_size = int(len(training_subjects) * valid_prop)
    valid_subjects = training_subjects[:valid_subjects_size]
    training_subjects = training_subjects[valid_subjects_size:]
    # Local name; the module-level `categories` is only read through the
    # `liwc_categories` default.
    categories = [c for c in liwc_categories if c in writings_df.columns]
    logger.debug("%d training users, %d validation users, %d test users." % (
        len(training_subjects), 
          len(valid_subjects),
          len(test_subjects)))
    subjects_split = {'train': training_subjects, 
                      'valid': valid_subjects, 
                      'test': test_subjects}

    user_level_texts = {}
    # Posts are visited in `date` order, so each subject's lists are sorted by date.
    for row in writings_df.sort_values(by='date').itertuples():
        words = []
        if row.tokenized_title:
            words.extend(row.tokenized_title)
        if row.tokenized_text:
            words.extend(row.tokenized_text)
        if not words or len(words)<min_post_len:
            # Too short to use: print the owning subject and skip the post.
            print(row.subject)
            continue
        labels = [getattr(row, 'label%d'%i) for i in range(nr_questions)]
        liwc_categs = [getattr(row, categ) for categ in categories]
        if row.subject not in user_level_texts.keys():
            # Labels are stored once, from the subject's first kept post.
            user_level_texts[row.subject] = {}
            user_level_texts[row.subject]['texts'] = [words]
            user_level_texts[row.subject]['labels'] = encode_labels(labels)
            user_level_texts[row.subject]['liwc'] = [liwc_categs]
        else:
            user_level_texts[row.subject]['texts'].append(words)
            user_level_texts[row.subject]['liwc'].append(liwc_categs)
    return user_level_texts, subjects_split, vocabulary


In [None]:
# Group the posts per subject and build the vocabulary. No vocabulary is
# passed in, so it is built from writings_df (see the commented alternative).
user_level_data, subjects_split, vocabulary = load_erisk_data(writings_df, 
                                                            seq_len=hyperparams_features['maxlen'],
                                                            voc_size=hyperparams_features['max_features'],
                                                           emotion_lexicon=nrc_lexicon,
                                                           emotions=emotions,
                                                           user_level=hyperparams_features['user_level'],
                                                                                logger=logger
#                                                            vocabulary=pickle.load(open('vocabulary20K_selfharm.pkl', 'rb'))
                                                                               )

In [None]:
def load_embeddings(path, embedding_dim, voc):
    """Build an embedding matrix for `voc` from a text file of word vectors.

    Args:
        path: text file with one ``word v1 v2 ...`` entry per line.
        embedding_dim: number of values per vector.
        voc: mapping from word to row index.

    Returns:
        Array of shape ``(len(voc)+2, embedding_dim)``. Rows of words found in
        the file hold their vector; all other rows keep a uniform random
        initialisation in [-0.5, 0.5).
    """
    # random matrix with mean value = 0
    embedding_matrix = np.random.random((len(voc)+2, embedding_dim)) - 0.5 # voc + unk + pad value(0)

    # The context manager closes the file even if a line fails to parse.
    with open(path) as f:
        for line in f:
            values = line.split()
            if not values:
                # A blank line has no word to look up.
                continue
            word_i = voc.get(values[0])
            if word_i is not None:
                # Parse the numbers only for words that are actually needed.
                embedding_matrix[word_i] = np.asarray(values[1:], dtype='float32')

    print('Total %s word vectors.' % len(embedding_matrix))

    return embedding_matrix

# GloVe Twitter vectors; the file is selected by the configured embedding size.
pretrained_embeddings_path = root_dir + '/resources/glove.twitter.27B/glove.twitter.27B.%dd.txt' % hyperparams_features['embedding_dim']
# One row per vocabulary index plus two extra rows (see load_embeddings).
embedding_matrix = load_embeddings(pretrained_embeddings_path, hyperparams_features['embedding_dim'], vocabulary)


## Data Generator

In [None]:
class DataGenerator(Sequence):
    """Keras Sequence that yields user-level batches.

    Each sample is one user: a random sample of the user's posts is merged
    into a single token sequence and turned into four inputs (padded word
    indices, numeric features, stopword indicators, subject id) plus one
    target list per questionnaire item.

    With ``set_type='test'`` the generator has exactly one batch holding only
    the user at ``test_user_index``; for any other ``set_type`` that user is
    left out of every batch.
    """
    def __init__(self, user_level_data, batch_size=hyperparams_features['batch_size'], 
                 seq_len=hyperparams_features['maxlen'], voc_size=hyperparams_features['max_features'], 
                 emotion_lexicon=nrc_lexicon, set_type='train', test_user_index=0,
                 emotions=emotions, pronouns=["i", "me", "my", "mine", "myself"], 
                 max_posts_per_user=hyperparams_features['posts_per_user'],
                 shuffle=True):
        """Store the configuration and create the first epoch's user order.

        Args:
            user_level_data: dict subject -> {'texts', 'labels', 'liwc'}.
            batch_size: number of users drawn per batch.
            seq_len: length the token sequences are padded/truncated to.
            voc_size: vocabulary size; ``voc_size-1`` encodes unknown words.
            emotion_lexicon: mapping emotion -> words.
            set_type: 'test' selects the single-user mode described above.
            test_user_index: position (in ``user_level_data`` key order) of
                the held-out user.
            emotions: emotion names, in feature order.
            pronouns: pronouns counted by ``encode_pronouns``.
            max_posts_per_user: maximum number of posts sampled per user.
            shuffle: reshuffle the user order at the end of every epoch.
        """
        self.seq_len = seq_len
        self.emotion_lexicon = emotion_lexicon
        self.batch_size = batch_size
        self.data = user_level_data
        self.all_users = list(self.data.keys())
        self.emotions = emotions
        self.pronouns = pronouns
        self.set = set_type
        self.shuffle = shuffle
        self.voc_size = voc_size
        self.max_posts_per_user = max_posts_per_user
        self.test_user_index = test_user_index
        self.on_epoch_end()

    def __encode_text(self, tokens):
        """Encode one token list as (word ids, emotions, pronouns, stopwords)."""
        # Using voc_size-1 value for OOV token
        # NOTE(review): reads the module-level `vocabulary`, not an attribute.
        encoded_tokens = [vocabulary.get(w, self.voc_size-1) for w in tokens]
        encoded_emotions = encode_emotions(tokens, self.emotion_lexicon, self.emotions)
        encoded_pronouns = encode_pronouns(tokens, self.pronouns)
        encoded_stopwords = encode_stopwords(tokens)
        return (encoded_tokens, encoded_emotions, encoded_pronouns, encoded_stopwords)
        
    def __len__(self):
        """Number of batches per epoch (1 in test mode).

        Users beyond the last full batch are not visited within an epoch.
        """
        if self.set == 'test':
            return 1
        return int(np.floor(len(self.data) / self.batch_size))

    def __getitem__(self, index):
        """Generate batch number `index` as ``(X, y)``."""
        if self.set == 'test':
            users = [self.all_users[self.test_user_index]]
        else:
            # Generate indexes of the batch
            user_indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
            # Find users, leaving out the held-out one (so the batch that
            # would contain it is one user short).
            users = [self.all_users[i] for i in user_indexes
                     if i!=self.test_user_index
                    ]

        post_indexes = {}
        # Sample post ids without replacement; sorting keeps the posts in
        # their stored order.
        for subject in users:
            posts_len = len(self.data[subject]['texts'])
            posts_index_sample = sorted(np.random.choice(posts_len, 
                                                         min(self.max_posts_per_user, posts_len),
                                                         replace=False))
            post_indexes[subject] = posts_index_sample
        # Generate data
        X, y = self.__data_generation(users, post_indexes)

        return X, y

    def on_epoch_end(self):
        """Reset the user order, shuffling it when `shuffle` is set."""
        self.indexes = np.arange(len(self.data))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, users, post_indexes):
        """Encode the sampled posts of `users` into model inputs and targets.

        Returns:
            ``([tokens, numeric, stopwords, subject_ids], y)`` where ``y`` is
            a list with one entry per questionnaire item, each entry holding
            that item's label for every user of the batch, in batch order.
        """
        tokens_data = []
        categ_data = []
        sparse_data = []
        subjects = []
        labels = []
        for subject in users:
            texts = self.data[subject]['texts']
            label = self.data[subject]['labels']
            
            # Sample
            texts = [texts[i] for i in post_indexes[subject]]
            liwc_selection = [self.data[subject]['liwc'][i] for i in post_indexes[subject]]

            all_words = [sum(texts, [])] # merge all texts in one list
            # LIWC features are averaged over the sampled posts.
            liwc_aggreg = [np.array(liwc_selection).mean(axis=0).tolist()]

            for i, words in enumerate(all_words):
                encoded_tokens, encoded_emotions, encoded_pronouns, encoded_stopwords = self.__encode_text(words)
                # Numeric id: whatever follows the first 't' of the subject
                # name (e.g. 'subject12' -> 12).
                subject_id = int(subject.split('t')[1])
                tokens_data.append(encoded_tokens)
                categ_data.append(encoded_emotions + [encoded_pronouns] + liwc_aggreg[i])
                sparse_data.append(encoded_stopwords)
                labels.append(label)
                subjects.append(subject_id)

        
        # using zeros for padding
        tokens_data_padded = sequence.pad_sequences(tokens_data, maxlen=self.seq_len)

        # `labels` has one row per user and one column per question. The
        # model needs one list per question, which is the transpose. A
        # reshape would keep the flat element order and therefore mix the
        # answers of different users as soon as a batch holds more than one.
        return ([np.array(tokens_data_padded), np.array(categ_data), np.array(sparse_data),
                np.array(subjects)],
                np.array(labels).T.tolist()) # to have one array per output

In [None]:
# TODO: Don't split into the 3 sets, do leave-one-out cross-validation

In [None]:
logger.setLevel(logging.DEBUG)

# Materialise one pass over the generator in both modes so the batches can
# be inspected in the cells below.
# TODO: it is skipping the last batch
x_data = {'train': [], 'test': []}
y_data = {'train': [], 'test': []}
# Held-out user, drawn at random from all users.
test_user_index = np.random.randint(len(user_level_data))
for set_type in ['train', 'test']:
    for x, y in DataGenerator(user_level_data, 
                                          set_type=set_type,
                             test_user_index=test_user_index):
        x_data[set_type].append(x)
        y_data[set_type].append(y)


In [None]:
# Targets of the first test batch (a 'test' generator reports a length of 1).
y_data['test'][0]

In [None]:
# Scratch check of how reshape lays out a small 2x2 array.
np.array([[1,2],[3,4]]).reshape(2,-1)

In [None]:
# Total number of subjects across the train / valid / test lists.
sum([len(subjects_split[s]) for s in ['train', 'valid', 'test']])

In [None]:
# Inputs collected from the test-mode generator.
x_data['test']

In [None]:
# Targets collected from both generators.
# NOTE(review): this prints every collected batch; consider showing a slice.
y_data

# Train

In [None]:
# Model and training settings. build_model reads lstm_units, dense_bow_units,
# dropout, l2_dense, l2_embeddings, dense_sentence_units, optimizer,
# trainable_embeddings and norm_momentum; ignore_layer is passed to it
# separately. reduce_lr_factor, reduce_lr_patience, freeze_patience and
# threshold are not read by any active code in this notebook.
hyperparams = {
    'lstm_units': 0,
    'dense_bow_units': 20,
    'dropout': 0.0,
    'l2_dense': 0.00000011,
    'l2_embeddings': 0.000001,
    'dense_sentence_units': 100,
    'optimizer': 'adam',
    'decay': 0.00001,
    'lr': 0.001,
    "trainable_embeddings": True,
    "reduce_lr_factor": 0.0002,
    "reduce_lr_patience": 1000,
    "freeze_patience": 500,
    'threshold': 0.5,
    # branches left out of the merged representation by build_model
    'ignore_layer': ['lstm_layers', 'batchnorm'],
    'norm_momentum': 0.1,

}
# Only runs when 'optimizer' is falsy. With the value 'adam' set above it is
# skipped, so 'lr' and 'decay' are not applied.
if not hyperparams['optimizer']:
    hyperparams['optimizer'] = optimizers.Adam(lr=hyperparams['lr'], #beta_1=0.9, beta_2=0.999, epsilon=0.0001,
                                   decay=hyperparams['decay'])

In [None]:
# class Metrics():
#     def __init__(self, threshold=0.5):
#         self.threshold=threshold
        
#     def recall_m(self, y_true, y_pred):
#             y_labels = y_true
#             y_pred = K.cast(K.greater(K.clip(y_pred, 0, 1), self.threshold), K.floatx())        
#             possible_positives = K.sum(K.round(K.clip(y_labels, 0, 1)))
#             true_positives = K.sum(K.round(K.clip(y_labels * y_pred, 0, 1)))
#             recall = true_positives / (possible_positives + K.epsilon())
#             return recall

#     def precision_m(self, y_true, y_pred):
#             y_labels = y_true
#             y_pred = K.cast(K.greater(K.clip(y_pred, 0, 1), self.threshold), K.floatx())        
#             true_positives = K.sum(K.round(K.clip(y_labels * y_pred, 0, 1)))
#             predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
#             precision = true_positives / (predicted_positives + K.epsilon())
#             return precision

#     def f1_m(self, y_true, y_pred):
#         precision = self.precision_m(y_true, y_pred)
#         recall = self.recall_m(y_true, y_pred)
#         return 2*((precision*recall)/(precision+recall+K.epsilon()))

# def binary_crossentropy_custom(y_true, y_pred):
#     y_labels = y_true
#     return K.binary_crossentropy(y_labels, 
#                                  y_pred)

# metrics_class = Metrics(threshold=hyperparams['threshold'])

In [None]:
def build_model(hyperparams, hyperparams_features, embedding_matrix, emotions, stopwords_list,
                liwc_categories, nr_classes,
               ignore_layer=[]):
    """Build and compile the multi-input, multi-output questionnaire model.

    Inputs (in order): ``word_seq`` (padded word indices), ``numeric_input``
    (emotions + pronoun share + LIWC features), ``sparse_input`` (stopword
    indicators) and ``subjects`` (subject id, not connected to any layer).

    Args:
        hyperparams: dict with lstm_units, dropout, l2_dense, l2_embeddings,
            dense_sentence_units, dense_bow_units, trainable_embeddings,
            norm_momentum and optimizer.
        hyperparams_features: dict with maxlen, max_features, embedding_dim.
        embedding_matrix: initial weights of the embedding layer.
        emotions: emotion names (only their number is used).
        stopwords_list: stopwords (only their number is used).
        liwc_categories: LIWC feature names (only their number is used).
        nr_classes: number of output heads, one per questionnaire item.
        ignore_layer: names of parts to leave out: 'attention', 'batchnorm',
            and/or any of the branch names 'lstm_layers',
            'numerical_dense_layer', 'sparse_feat_dense_layer'.

    Returns:
        The compiled Keras model, with a mean-squared-error loss per head.
    """
    tokens_features = Input(shape=(hyperparams_features['maxlen'],), name='word_seq')
    embedding_layer = Embedding(hyperparams_features['max_features'], 
                                hyperparams_features['embedding_dim'], 
                                input_length=hyperparams_features['maxlen'],
                                embeddings_regularizer=regularizers.l2(hyperparams['l2_embeddings']),
                                weights=[embedding_matrix], 
                                trainable=hyperparams['trainable_embeddings'],
                               name='embeddings_layer')(
        tokens_features)

    # The attention block needs the full output sequence.
    use_attention = 'attention' not in ignore_layer
    # CuDNNLSTM when a GPU is available, the generic LSTM otherwise.
    lstm_class = CuDNNLSTM if tf.test.is_gpu_available() else LSTM
    lstm_layers = lstm_class(hyperparams['lstm_units'],
                             return_sequences=use_attention,
                             name='LSTM_layer')(embedding_layer)

    # Attention
    if use_attention:
        # One tanh score per time step, softmax-normalised over time and
        # repeated across the LSTM units so it can weight the LSTM outputs.
        attention = Dense(1, activation='tanh', name='attention')(lstm_layers)
        attention = Flatten()(attention)
        attention = Activation('softmax')(attention)
        attention = RepeatVector(hyperparams['lstm_units'])(attention)
        attention = Permute([2, 1])(attention)

        sent_representation = Multiply()([lstm_layers, attention])
        sent_representation = Lambda(lambda xin: K.sum(xin, axis=1), 
                                     output_shape=(hyperparams['lstm_units'],)
                                    )(sent_representation)
    else:
        sent_representation = lstm_layers

    sent_representation = Dropout(hyperparams['dropout'], name='lstm_att_dropout')(sent_representation)
    if hyperparams['dense_sentence_units']:
        sent_representation = Dense(units=hyperparams['dense_sentence_units'],
                                   name='dense_sent_representation')(sent_representation)
    numerical_features = Input(shape=(len(emotions) + 1 + len(liwc_categories),), name='numeric_input') # emotions and pronouns
    # NOTE(review): this layer is built but never merged; the
    # 'numerical_dense_layer' branch below uses the raw input instead.
    dense_layer = Dense(units=1,
                        kernel_regularizer=regularizers.l2(hyperparams['l2_dense']),
                        name='numerical_dense_layer',
                       )(numerical_features)
    sparse_features = Input(shape=(len(stopwords_list),), name='sparse_input') # stopwords

    if hyperparams['dense_bow_units']:
        dense_layer_sparse = Dense(units=hyperparams['dense_bow_units'],
                              name='sparse_feat_dense_layer',
                                kernel_regularizer=regularizers.l2(hyperparams['l2_dense']),
                              )(sparse_features)
    else:
        dense_layer_sparse = sparse_features

    if 'batchnorm' not in ignore_layer:
        numerical_features_norm = BatchNormalization(axis=1, momentum=hyperparams['norm_momentum'],
                                                     name='numerical_features_norm')(numerical_features)
        sent_representation_norm = BatchNormalization(axis=1, momentum=hyperparams['norm_momentum'],
                                                      name='sent_repr_norm')(sent_representation)

        dense_layer_sparse_norm = BatchNormalization(axis=1, momentum=hyperparams['norm_momentum'],
                                                         name='sparse_features_norm')(dense_layer_sparse)

    subjects = Input(shape=(1,), name='subjects')

    all_layers = {
        'lstm_layers': sent_representation,
        'numerical_dense_layer': numerical_features,
        'sparse_feat_dense_layer': dense_layer_sparse
    }
    if 'batchnorm' not in ignore_layer:
        all_layers = {
            'lstm_layers': sent_representation_norm,
            'numerical_dense_layer': numerical_features_norm,
            'sparse_feat_dense_layer': dense_layer_sparse_norm
        }
    # Keep only the branches that were not switched off.
    layers_to_merge = [l for n, l in all_layers.items() if n not in ignore_layer]

    if len(layers_to_merge) == 1:
        merged_layers = layers_to_merge[0]
    else:
        merged_layers = concatenate(layers_to_merge)

    # One regression head per questionnaire item, sized by `nr_classes`
    # rather than a module-level constant. The activation is linear because a
    # softmax over a single unit always outputs 1.0, which would make every
    # head constant and untrainable.
    output_names = ['output_layer%d' % label for label in range(nr_classes)]
    output_layers = [
        Dense(1, activation='linear',
              name=output_name,
              kernel_regularizer=regularizers.l2(hyperparams['l2_dense']))(merged_layers)
        for output_name in output_names
    ]

    # Compile model
    model = Model(inputs=[tokens_features, numerical_features, sparse_features, subjects], 
                  outputs=output_layers)

    model.compile(hyperparams['optimizer'],
                  {output_name: 'mean_squared_error' for output_name in output_names},
                  metrics={output_name: ['accuracy', 'mean_squared_error']
                           for output_name in output_names})
    return model



In [None]:
# Build the model with one output per questionnaire item; only the LIWC
# categories that exist as columns of writings_df count as numeric features.
model = build_model(hyperparams, hyperparams_features, embedding_matrix, emotions, stopword_list,
                    liwc_categories=[c for c in categories if c in writings_df.columns],
                    nr_classes=nr_questions,
                   ignore_layer=hyperparams['ignore_layer'])
model.summary()

In [None]:
def train_model(model, 
                data_generator_train, data_generator_valid,
                epochs, start_epoch=0, workers=4,
                callback_list = [],
                model_path='/tmp/model',
               verbose=1):
    """Fit `model` from generators, checkpoint the best weights and save it.

    Args:
        model: compiled Keras model.
        data_generator_train: generator of training batches.
        data_generator_valid: generator used as validation data.
        epochs: index of the last epoch to run.
        start_epoch: epoch to start from (for resuming).
        workers: number of generator workers.
        callback_list: extra callbacks, appended after the built-in
            ModelCheckpoint and EarlyStopping callbacks. Not modified.
        model_path: where the final model is saved; the best checkpoint goes
            to ``'<model_path>_best'``.
        verbose: Keras verbosity level.

    Returns:
        ``(model, history)``.

    Uses the module-level ``experiment`` (Comet) and ``logger`` objects.
    """
    # Use the notebook's own logger instead of the root logger.
    logger.info('Train...')
    # Log the callbacks that were passed in. `callbacks` is the imported
    # Keras module and says nothing about this run.
    experiment.log_parameter('callbacks', callback_list)

    history = model.fit_generator(data_generator_train,
              epochs=epochs, initial_epoch=start_epoch, 
              validation_data=data_generator_valid,
                        verbose=verbose,
                       workers=workers,
            callbacks = [
                callbacks.ModelCheckpoint(filepath='%s_best' % model_path, verbose=1, 
                                          save_best_only=True),
                callbacks.EarlyStopping(patience=500), *callback_list
            ])
    model.save(model_path)
    experiment.log_parameter('model_path', model_path)
    return model, history

In [None]:
# The Comet API key is a credential and must not be stored in the notebook.
# It is read from the COMET_API_KEY environment variable; set that variable
# before running this cell.
experiment = Experiment(api_key=os.environ.get("COMET_API_KEY"),
                        project_name="mental", workspace="ananana", disabled=False)

# Record the feature settings and resources used for this run.
experiment.log_parameters(hyperparams_features)

experiment.log_parameter('emotion_lexicon', nrc_lexicon_path)
experiment.log_parameter('emotions', emotions)
experiment.log_parameter('embeddings_path', pretrained_embeddings_path)

experiment.add_tag('T2')
experiment.log_parameters(hyperparams)

In [None]:
# Hold out one randomly chosen user: the training generator leaves that user
# out of its batches, and the validation generator (set_type='test') yields
# only that user.
test_user_index = np.random.randint(len(user_level_data))
data_generator_train = DataGenerator(user_level_data, set_type='train', test_user_index=test_user_index)
data_generator_valid = DataGenerator(user_level_data, set_type='test',  test_user_index=test_user_index)
model, history = train_model(model, data_generator_train, data_generator_valid,
           epochs=1000, start_epoch=0,
                      callback_list = [],
                      model_path='models/mlp_t21', workers=1)

# Extra analysis


### Extract LIWC

In [None]:
def merge_tokens(row):
    """Return the row's text tokens followed by its title tokens.

    A field that is empty or None contributes nothing; the result is always a
    new list.
    """
    merged = []
    for field_tokens in (row.tokenized_text, row.tokenized_title):
        if field_tokens:
            merged += field_tokens
    return merged
# Row-wise: text tokens followed by title tokens, in a single list per post.
writings_df['all_tokens'] = writings_df.apply (lambda row: merge_tokens(row), axis=1)

In [None]:
# TODO: include the title
def extract_emotions(tokens, emotion, relative=True):
    """Measure how many tokens belong to one emotion of `nrc_lexicon`.

    Args:
        tokens: list of tokens; may be empty or None.
        emotion: key of the module-level ``nrc_lexicon``.
        relative: when True return the share of matching tokens, otherwise
            the raw count.

    Returns:
        None for empty/None input, otherwise the share (float) or count (int).

    Raises:
        KeyError: if ``emotion`` is not in ``nrc_lexicon`` (non-empty input).
    """
    if not tokens:
        return None
    lexicon_words = nrc_lexicon[emotion]
    nr_emotion_words = sum(1 for t in tokens if t in lexicon_words)
    if relative:
        return nr_emotion_words / len(tokens)
    # The former trailing `return encoded_emotions` was unreachable and named
    # an undefined variable, so it has been dropped.
    return nr_emotion_words

from functools import partial
# One column per emotion: share of a post's tokens that belong to it.
for emotion in emotions:
    writings_df[emotion] = writings_df['all_tokens'].apply(partial(extract_emotions, emotion=emotion, 
                                                                   relative=True))

In [None]:
# Share of a post's tokens that are pronouns from encode_pronouns' default set.
writings_df['pronouns'] = writings_df['all_tokens'].apply(partial(encode_pronouns, relative=True))

In [None]:
# Post-level Spearman correlations between labels, text length, pronoun share and emotion shares.
# NOTE(review): the selection includes the raw 'text' column; confirm how the installed pandas treats non-numeric columns in corr().
writings_df[['label%i'%i for i in range(21)] + ['text', 'pronouns', 'text_len'] + emotions].corr('spearman')

In [None]:
# writings_df['label15'] = writings_df['label15'].apply(lambda l: encode_labels([l])[0])

In [None]:
# writings_df['label17'] = writings_df['label17'].apply(lambda l: encode_labels([l])[0])

In [None]:
# Subject-level version: average every column per subject, then show the
# Spearman correlations restricted to the feature columns.
writings_df.groupby('subject').mean()[
    ['label%i'%i for i in range(21)] + ['pronouns', 'text_len'] + emotions].corr(
    'spearman')[['pronouns', 'text_len'] + emotions]

In [None]:
writings_df.corrwith?

In [None]:
from liwc_readDict import readDict

# NOTE(review): this re-loads LIWC from a different, absolute path than the
# earlier cell and rebinds `categories` from a set to a list with repeats.
liwc = readDict('/home/ana/resources/FakeOrFact/features/LIWC/LIWC/liwc.dic')
categories = [c for (w,c) in liwc]
# No effect here: the value is neither stored nor the last line of the cell.
set(categories)
# category -> list of dictionary words, in file order.
liwc_dict = {}
for (w, c) in liwc:
    if c not in liwc_dict:
        liwc_dict[c] = []
    liwc_dict[c].append(w)

In [None]:
def encode_liwc_categories(tokens, category_words, relative=True):
    """Count the tokens that match at least one word of a LIWC category.

    A token matches a dictionary word when it equals the word, equals the
    part of the word before its first apostrophe, or, for a word ending in
    ``*``, starts with the word minus that ``*``. Each token counts once.

    Args:
        tokens: list of string tokens; may be empty or None.
        category_words: dictionary words of one category.
        relative: when True return the share of matching tokens, otherwise
            the raw count.

    Returns:
        None for empty/None input, otherwise the share (float) or count (int).
    """
    if not tokens:
        return None

    # Index the category once instead of scanning every word for every
    # token: exact forms go into a set, wildcard stems into a prefix tuple.
    exact_forms = set()
    wildcard_stems = []
    for word in category_words:
        exact_forms.add(word)
        exact_forms.add(word.split("'")[0])
        # endswith() is safe for an empty word, where word[-1] would raise.
        if word.endswith('*'):
            wildcard_stems.append(word[:-1])
    wildcard_stems = tuple(wildcard_stems)

    # str.startswith accepts a tuple and returns False for an empty one.
    category_cnt = sum(1 for t in tokens
                       if t in exact_forms or t.startswith(wildcard_stems))
    if relative:
        return category_cnt / len(tokens)
    return category_cnt

In [None]:
%%time
from functools import partial
# for categ in ['negemo', 'posemo', 'affect', 'sad', 'anx', 'pronoun']:#liwc_dict.keys():
# One column per LIWC category (share of matching tokens per post).
# Categories that already exist as columns are skipped, so re-running the
# cell only computes what is missing.
for categ in liwc_dict.keys():
    if categ in writings_df.columns:
        continue
    print("Encoding for category %s..." % categ)
    writings_df[categ] = writings_df['all_tokens'].apply(partial(encode_liwc_categories, 
                                                                   category_words=liwc_dict[categ], 
                                                                   relative=True))


In [None]:
# Hand-picked LIWC categories; shown as subject-level Spearman correlations
# (per-subject means) against the labels and against each other.
relevant_categs = ['posemo', 'negemo', 'anx', 'sad', 'affect', 'feel', 'social', 'health', 
                   'sexual', 'present', 'cogmech', 'inhib']
writings_df.groupby('subject').mean()[
    ['label%i'%i for i in range(21)] + relevant_categs].corr(
    'spearman')[relevant_categs]

In [None]:
# LIWC categories sorted by their mean correlation with all selected columns,
# computed on per-subject minima.
# NOTE(review): this cell aggregates with min() where the cells above use mean(); confirm that is intended.
list(writings_df.groupby('subject').min()[
    ['label%i'%i for i in range(21)] + list(liwc_dict.keys())].corr()[list(liwc_dict.keys())].mean().sort_values().index)

In [None]:
# Cache the enriched frame; this is the file the loading cell at the top of
# the notebook reads. The context manager flushes and closes the file.
with open('writings_df_T2_liwc.pkl', 'wb+') as cache_file:
    pickle.dump(writings_df, cache_file)