In [None]:
import pandas as pd
import xml.etree.ElementTree as ET
import glob, os
import numpy as np
from comet_ml import Experiment, Optimizer
import pickle
import logging
import sys
from sklearn.utils import class_weight
import re

In [None]:
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ['TF_KERAS'] = '1'

# only reserve 1 GPU

In [None]:
import tensorflow as tf
# tf.version

In [None]:
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Lambda, BatchNormalization, TimeDistributed, \
     Bidirectional, Input, concatenate, Flatten, RepeatVector, Activation, Multiply, Permute#, CuDNNLSTM
from tensorflow.keras import regularizers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import callbacks, optimizers
from tensorflow.keras import backend as K
from tensorflow.keras.utils import plot_model, Sequence

from nltk.tokenize import RegexpTokenizer, TweetTokenizer
from nltk.corpus import stopwords

import tensorflow as tf

In [None]:
import tensorflow as tf
tf.test.is_gpu_available()

In [None]:
my_seed = 1234
# tf.set_random_seed(my_seed)

In [None]:
logger = logging.getLogger('training')
# logging.getLogger returns the same object on every call, so re-running this
# cell would otherwise stack one more stdout handler each time and print every
# message several times.
if not logger.handlers:
    logger.addHandler(logging.StreamHandler(sys.stdout))
logger.setLevel(logging.DEBUG)

# Read data

In [None]:
def read_subject_writings(subject_file):
    """Parse one subject XML file into a list of per-writing dicts.

    Args:
        subject_file: path of an XML file containing an ``ID`` element and
            any number of ``WRITING`` elements.

    Returns:
        A list with one dict per ``WRITING``. Each dict has a ``'subject'``
        key plus ``'title'``, ``'text'`` and ``'date'`` for whichever of the
        ``TITLE`` / ``TEXT`` / ``DATE`` children are present (values are the
        raw element text, which may be ``None`` for empty elements).
        When the ``ID`` element is missing or empty, a diagnostic is printed
        and ``'subject'`` is ``None`` instead of the function crashing.
    """
    with open(subject_file) as sf:
        contents = sf.read()
    root = ET.fromstring(contents)

    try:
        subject = root.findall('ID')[0].text.strip()
    except (IndexError, AttributeError):
        # IndexError: no <ID> element; AttributeError: <ID/> with no text.
        print('Cannot extract ID', contents[:500], '\n-------\n')
        # Bind the name so the loop below does not raise NameError.
        subject = None

    writings = []
    for w in root.iter('WRITING'):
        subject_writings = {'subject': subject}
        for key, tag in (('title', 'TITLE'), ('text', 'TEXT'), ('date', 'DATE')):
            # If a tag is repeated, the last occurrence wins.
            for node in w.findall(tag):
                subject_writings[key] = node.text
        writings.append(subject_writings)
    return writings

In [None]:
# root_dir = '/home/anasab/' 
root_dir = '/home/anasab/'

In [None]:
datadir_T2 = root_dir + '/eRisk/data/eRisk2020_T2/eRisk2020_T2_TEST_DATA/'
# labels_file_T2 = root_dir + '/eRisk/data/eRisk2020_T2/eRisk2020_T2_TRAINING_DATA/Depression Questionnaires_anon.txt'
nr_questions = 21

In [None]:
def read_texts(datadir_T2,
                labels_file_T2=None):
    """Load every subject file of a directory into one writings DataFrame.

    Args:
        datadir_T2: directory to scan; only entries whose name starts with
            ``'subject'`` are read (via ``read_subject_writings``).
        labels_file_T2: optional path of a whitespace-separated labels file
            with a subject column followed by ``nr_questions`` label columns.

    Returns:
        ``(writings_df, labels_df)``. ``labels_df`` is indexed by subject and
        joined onto ``writings_df`` when a labels file is given; otherwise it
        is an empty DataFrame.
    """
    writings = []
    # os.listdir returns entries in arbitrary order; sort so the row order of
    # the resulting frame is reproducible between runs and machines.
    for subject_file in sorted(os.listdir(datadir_T2)):
        if not subject_file.startswith('subject'):
            continue
        writings.extend(read_subject_writings(os.path.join(datadir_T2, subject_file)))
    writings_df = pd.DataFrame(writings)

    labels_df = pd.DataFrame()
    if labels_file_T2:
        label_columns = ['label%i' % i for i in range(nr_questions)]
        # Raw string: '\s' in a plain literal is an invalid escape sequence.
        labels_df = pd.read_csv(labels_file_T2,
                                delimiter=r'\s+', names=['subject'] + label_columns)
        labels_df = labels_df.set_index('subject')
        writings_df = writings_df.join(labels_df, on='subject')

    return writings_df, labels_df

In [None]:
writings_df, labels_df = read_texts(datadir_T2)#, labels_file_T2)
# writings_df = pickle.load(open('writings_df_T2_liwc.pkl', 'rb'))

In [None]:
writings_df.groupby('subject').count()

In [None]:
writings_df

## Preprocess text

In [None]:
tokenizer = RegexpTokenizer(r'\w+')

def tokenize(t):
    """Lower-case ``t`` and split it with the module-level ``\\w+`` regexp tokenizer."""
    lowered = t.lower()
    return tokenizer.tokenize(lowered)

In [None]:

tt = TweetTokenizer()
sw = stopwords.words("english")
def tokenize_tweets(t, tokenizer=tt, stop=True):
    """Tokenize lower-cased text and keep only purely alphabetic (a-z) tokens.

    Args:
        t: text to tokenize.
        tokenizer: object with a ``tokenize(str)`` method; defaults to the
            module-level ``TweetTokenizer``.
        stop: when truthy, stopwords are KEPT; when falsy, tokens found in the
            English stopword list ``sw`` are removed.
            NOTE(review): the flag name reads like the opposite -- confirm the
            intended meaning before relying on it.

    Returns:
        List of token strings.
    """
    alphabetic = []
    for token in tokenizer.tokenize(t.lower()):
        if re.match("^[a-z]*$", token):
            alphabetic.append(token)
    if stop:
        return alphabetic
    return [token for token in alphabetic if token not in sw]

def tokenize_fields(writings_df):
    """Add tokenized and length columns for ``title`` and ``text``, in place.

    For each of the two fields this adds ``tokenized_<field>`` (result of
    ``tokenize_tweets`` for non-empty strings, otherwise ``None``) and
    ``<field>_len`` (token count for non-empty token lists, otherwise
    ``None``). The same DataFrame object is modified and returned.
    """
    def _tokens_or_none(value):
        # Only non-empty Python strings are tokenized.
        return tokenize_tweets(value) if type(value) == str and value else None

    def _count_or_none(tokens):
        # Empty token lists count as missing, just like non-lists.
        return len(tokens) if type(tokens) == list and tokens else None

    for field in ('title', 'text'):
        tokenized_col = 'tokenized_' + field
        writings_df[tokenized_col] = writings_df[field].apply(_tokens_or_none)
        writings_df[field + '_len'] = writings_df[tokenized_col].apply(_count_or_none)
    return writings_df

In [None]:
writings_df = tokenize_fields(writings_df)

In [None]:
writings_df.text_len.describe()

In [None]:
writings_df.title_len.describe()

In [None]:
writings_df.groupby('subject').count().title.describe()

In [None]:
writings_df.groupby('subject').count().text.describe()

# RoBERTa & co

In [None]:
from simpletransformers.classification import MultiLabelClassificationModel


In [None]:
seq_len = 300

In [None]:
# Aggregate by users
writings_df = writings_df.fillna(value={'text': '', 'title':''})
column_functions = {'text': lambda t: " ".join(t), 
                                        'title': lambda t: " ".join(t),
                                        'tokenized_text': 'sum',
                                        'tokenized_title': 'sum',
                                        'text_len': 'sum',
                                        'title_len': 'sum'}
if 'label1' in writings_df.columns:
    column_functions.update({'label%i'%i: 'min' for i in range(21)})
writings_per_user_df = writings_df.groupby('subject').aggregate(column_functions)
#                                          'subset': 'min'})
# writings_per_user_df = writings_per_user_df.fillna("")
writings_per_user_df['text'] = writings_per_user_df['text'] + " " +  writings_per_user_df['title']
writings_per_user_df['text_len'] = writings_per_user_df['text_len'] + writings_per_user_df['title_len']

In [None]:
writings_per_user_df.text_len.describe()

In [None]:
def reverse_text(text):
    """Return the whitespace-separated words of ``text`` in reverse order, joined by single spaces."""
    words = text.split()
    return " ".join(reversed(words))
def truncate_text_beginning(text, seq_len=seq_len, epsilon=20):
    """Drop the beginning of ``text``, keeping its last ``seq_len + epsilon`` whitespace-separated words.

    The kept words are re-joined with single spaces. The default ``seq_len``
    is the module-level value captured when this function is defined.
    """
    keep = seq_len + epsilon
    words = text.split()
    return " ".join(words[-keep:])
# writings_per_user_df['text'] = writings_per_user_df['text'].apply(truncate_text_beginning)

In [None]:
def get_subjects_split(writings_df, train_prop=0.8, test_slice=2, nr_slices=5, valid_prop=0):
    """Split the subjects of ``writings_df`` into train / validation / test lists.

    If the frame has a ``subset`` column, subjects marked ``'train'`` and
    ``'test'`` are used directly. Otherwise the sorted subject list is cut so
    that a contiguous test block starts at slice ``test_slice`` of
    ``nr_slices`` and holds the subjects not covered by ``train_prop``.
    The first ``valid_prop`` fraction of the sorted training subjects becomes
    the validation set.

    Returns:
        dict with keys ``'train'``, ``'valid'`` and ``'test'`` mapping to lists
        of subject ids.
    """
    if 'subset' in writings_df.columns:
        subset = writings_df['subset']
        training_subjects = list(set(writings_df[subset == 'train'].subject))
        test_subjects = list(set(writings_df[subset == 'test'].subject))
    else:
        all_subjects = sorted(set(writings_df.subject))
        nr_subjects = len(all_subjects)
        nr_test = nr_subjects - int(nr_subjects * train_prop)
        # Cross-validation, with fixed slice as input
        test_slice = min(test_slice, nr_slices)
        slice_start = nr_subjects*(1/nr_slices)*test_slice
        logger.debug("start index: %f, from %f\n" % (
            slice_start, (1-train_prop)*test_slice))
        first_test = int(slice_start)
        test_subjects = all_subjects[first_test: first_test+nr_test]
        held_out = set(test_subjects)
        training_subjects = [s for s in all_subjects if s not in held_out]

    # Sorting keeps the validation cut reproducible.
    training_subjects = sorted(training_subjects)
    nr_valid = int(len(training_subjects) * valid_prop)
    valid_subjects, training_subjects = training_subjects[:nr_valid], training_subjects[nr_valid:]
    logger.debug("%d training users, %d validation users, %d test users." % (
        len(training_subjects),
        len(valid_subjects),
        len(test_subjects)))
    return {'train': training_subjects,
            'valid': valid_subjects,
            'test': test_subjects}

subjects_split = get_subjects_split(writings_df, nr_slices=5, test_slice=4)

In [None]:
train_df = writings_per_user_df[writings_per_user_df.index.isin(subjects_split['train'])]#[['text', 'labels']]
test_df = writings_per_user_df[writings_per_user_df.index.isin(subjects_split['test'])]#[['text', 'labels']]

In [None]:
train_df.head()

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
train_df['labels'] = train_df[['label%d'%i for i in range(21)]].values.tolist()
binarizer = MultiLabelBinarizer()
binarizer.fit([range(-3,3,1)])
# TODO: this is not the right way. it's not multilabel...?
train_df['labels'] = train_df['labels'].apply(lambda l: binarizer.transform((l,)))

In [None]:
train_df.head()

In [None]:
# Training options passed to the simpletransformers model below.
args = {
#    'model_type':  'roberta',
#    'model_name': 'roberta-base',
    'output_dir': 'outputs/',
    'cache_dir': 'cache/',
    'fp16': True,
    'fp16_opt_level': 'O1',
    'max_seq_length': seq_len,#256, #128,
    'train_batch_size': 8,
    'eval_batch_size': 8,
    'gradient_accumulation_steps': 1,
    'num_train_epochs': 2,
    'weight_decay': 0,
    'learning_rate': 4e-6,
    'adam_epsilon': 1e-8,
    'warmup_ratio': 0.06,
    'warmup_steps': 0,
    'max_grad_norm': 1.0,
    'logging_steps': 50,
    'save_steps': 2000,
    'eval_all_checkpoints': False,
    # This key used to appear twice in the literal (same value both times);
    # a dict keeps only the last occurrence, so one entry is enough.
    'evaluate_during_training': True,
    'evaluate_during_training_verbose': True,
    'evaluate_during_training_steps': 3,
    'use_tensorboard': True,
#     'tensorboard_dir': 'tensorboard/',
    'overwrite_output_dir': True,
    'reprocess_input_data': False,
}

In [None]:
# Create a TransformerModel
model = MultiLabelClassificationModel('roberta', 'roberta-base', args=args)

In [None]:
# Train the model
# The previous form of this call had a stray `text_` token (a SyntaxError) and
# passed `prec=precision_score, f1=f1_score`, names that are never imported in
# this notebook.
# NOTE(review): extra metric callables can be re-added as keyword arguments
# once they are imported; presumably they must accept the multi-label model
# outputs -- confirm against the simpletransformers docs before enabling.
model.train_model(train_df=train_df, eval_df=test_df)


# Extract features

In [None]:
hyperparams_features = {
    "max_features": 40000,
    # cut texts after this number of words
    # (among top max_features most common words)
    "maxlen": 512,
    "embedding_dim": 50,
    "user_level": True,
    "posts_per_user": 10,
    "batch_size": 2,
}

#### Emotions

In [None]:
def load_NRC(nrc_path):
    """Read a word-level emotion lexicon into an ``emotion -> set of words`` dict.

    Each non-blank line must hold three whitespace-separated fields:
    ``word emotion flag``. A word is added to the emotion's set only when the
    integer flag is non-zero, but every emotion seen in the file gets a key
    (possibly mapping to an empty set).

    Args:
        nrc_path: path of the lexicon file.

    Returns:
        dict mapping each emotion name to the set of associated words.

    Raises:
        ValueError: if a non-blank line does not have exactly three fields or
            its flag is not an integer.
    """
    emotion_words = {}
    with open(nrc_path) as in_f:
        for line in in_f:
            line = line.strip()
            if not line:
                continue
            word, emotion, label = line.split()
            # Register the emotion before looking at the flag so that emotions
            # with only zero-flag rows still appear in the result.
            words_for_emotion = emotion_words.setdefault(emotion, set())
            if int(label):
                words_for_emotion.add(word)
    return emotion_words

nrc_lexicon_path = root_dir + '/resources/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt'
nrc_lexicon = load_NRC(nrc_lexicon_path)
emotions = list(nrc_lexicon.keys())


In [None]:
def encode_emotions(tokens, emotion_lexicon, emotions, relative=True):
    """Count, per emotion, how many tokens belong to that emotion's word set.

    Args:
        tokens: sequence of token strings (may be empty or ``None``).
        emotion_lexicon: mapping ``emotion -> collection of words``.
        emotions: ordered emotion names; fixes the order of the output.
        relative: when True each count is divided by ``len(tokens)``.

    Returns:
        List with one number per entry of ``emotions``. Empty input yields all
        zeros (instead of dividing by zero), and an emotion missing from the
        lexicon is reported and left at 0.
    """
    encoded_emotions = [0 for _ in emotions]
    if not tokens:
        # Nothing to count, and the relative form would divide by zero.
        return encoded_emotions

    text_len = len(tokens)
    for i, emotion in enumerate(emotions):
        try:
            lexicon_words = emotion_lexicon[emotion]
        except KeyError:
            # A missing key raises KeyError, not ValueError.
            print("Emotion not found.")
            continue
        hits = sum(1 for t in tokens if t in lexicon_words)
        encoded_emotions[i] = hits / text_len if relative else hits
    return encoded_emotions

In [None]:
from liwc_readDict import readDict

liwc = readDict(root_dir + '/resources/liwc.dic')

categories = set([c for (w,c) in liwc])
len(categories)

#### Personal pronouns

In [None]:
first_person_pronouns = {"i", "me", "my", "mine", "myself"}
def encode_pronouns(tokens, pronouns={"i", "me", "my", "mine", "myself"}, relative=True):
    """Measure how often the given pronouns occur among ``tokens``.

    Returns ``np.nan`` for empty/missing ``tokens``; otherwise the number of
    tokens found in ``pronouns``, divided by the token count when
    ``relative`` is true.
    """
    if not tokens:
        return np.nan
    hits = sum(1 for token in tokens if token in pronouns)
    return hits / len(tokens) if relative else hits

#### Stopwords

In [None]:
stopword_list = stopwords.words("english")
def encode_stopwords(tokens, stopwords=stopword_list):
    """Build a 0/1 presence vector over ``stopwords`` for the given ``tokens``.

    The result has one entry per stopword, in the order of ``stopwords``:
    1 when that stopword occurs in ``tokens``, else 0. Empty or missing
    ``tokens`` give an all-zero vector.
    """
    if not tokens:
        return [0 for _ in stopwords]
    return [1 if word in tokens else 0 for word in stopwords]

### BERT

In [None]:
# from easybert import Bert
# bert = Bert("https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1")

In [None]:
# x = bert.embed("A sequence of words is a sequebce.")

In [None]:
# def encode_bert(sequence):
#     return bert.embed(sequence)

## Simple transformers


In [None]:
# import simpletransformers

In [None]:
# prefix='/home/anasab/eRisk/data/'
# train_df = pd.read_csv(prefix + 'train.csv', header=None)
# train_df.head()

# eval_df = pd.read_csv(prefix + 'test.csv', header=None)
# eval_df.head()

# train_df[0] = (train_df[0] == 2).astype(int)
# eval_df[0] = (eval_df[0] == 2).astype(int)

# train_df = pd.DataFrame({
#     'text': train_df[1].replace(r'\n', ' ', regex=True),
#     'label':train_df[0]
# })

# print(train_df.head())

# eval_df = pd.DataFrame({
#     'text': eval_df[1].replace(r'\n', ' ', regex=True),
#     'label':eval_df[0]
# })

# print(eval_df.head())

In [None]:
# from simpletransformers.classification import ClassificationModel


# # Create a TransformerModel
# model = ClassificationModel('roberta', 'roberta-base')

# # Train the model
# model.train_model(train_df)

# # Evaluate the model
# result, model_outputs, wrong_predictions = model.eval_model(eval_df)

In [None]:
# from bert import albert_tokenization
# from bert import bert_tokenization

In [None]:
import tensorflow_hub as hub
# bert_path = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"
# bert_path = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1"
bert_path = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

import bert
# from bert.tokenization.bert_tokenization import FullTokenizer
from bert.tokenization import FullTokenizer

In [None]:
bert_layer = hub.KerasLayer(bert_path,# signature='tokens' , signature_outputs_as_dict=True,
                            trainable=False)


In [None]:
class InputExample(object):
    """Plain container for one text example used for sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Store the fields of one example.

        Args:
            guid: unique id for the example.
            text_a: untokenized text of the first sequence.
            text_b: optional untokenized text of a second sequence
                (for sequence-pair tasks).
            label: optional label of the example.
        """
        self.guid, self.text_a = guid, text_a
        self.text_b, self.label = text_b, label

In [None]:
def encode_text_for_bert(tokenizer, example, max_seq_length=512):
    """Turn ``example.text_a`` into fixed-length id / mask / segment lists.

    The text is tokenized, truncated to ``max_seq_length - 2`` tokens, wrapped
    in ``[CLS]`` ... ``[SEP]``, converted to ids and zero-padded on the right
    up to ``max_seq_length``. Only ``text_a`` is encoded; ``text_b`` is not
    read here.

    Args:
        tokenizer: object providing ``tokenize(str)`` and
            ``convert_tokens_to_ids(list)``.
        example: object with ``text_a`` and ``label`` attributes.
        max_seq_length: length of every returned list.

    Returns:
        ``(input_ids, input_mask, segment_ids, example.label)``; the mask is 1
        for real tokens and 0 for padding, segment ids are all 0.
    """
    # Two positions are reserved for the [CLS] and [SEP] markers.
    budget = max_seq_length - 2
    pieces = tokenizer.tokenize(example.text_a)
    if len(pieces) > budget:
        pieces = pieces[0:budget]

    tokens = ["[CLS]"] + list(pieces) + ["[SEP]"]
    segment_ids = [0] * len(tokens)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)

    # Right-pad all three lists with zeros (a non-positive count pads nothing).
    padding = [0] * (max_seq_length - len(input_ids))
    input_ids.extend(padding)
    input_mask.extend(padding)
    segment_ids.extend(padding)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    return input_ids, input_mask, segment_ids, example.label


In [None]:
def create_tokenizer_from_hub_module():
    """Build a ``FullTokenizer`` from the vocab file and casing flag of ``bert_layer``.

    Both values are read from the module-level ``bert_layer``'s
    ``resolved_object`` and converted with ``.numpy()``.
    """
    hub_object = bert_layer.resolved_object
    return FullTokenizer(vocab_file=hub_object.vocab_file.asset_path.numpy(),
                         do_lower_case=hub_object.do_lower_case.numpy())

In [None]:
# Instantiate tokenizer
# bert_tokenizer = FullTokenizer()
bert_tokenizer = create_tokenizer_from_hub_module()

encode_text_for_bert(bert_tokenizer, InputExample(None, 
                                               "Ana are mere"), 
                       hyperparams_features['maxlen'])

### tfhub albert

In [None]:
# input_ids = tf.keras.layers.Input(shape=[None], dtype=tf.int32)
# input_mask = tf.keras.layers.Input(shape=[None], dtype=tf.int32)
# sequence_mask = tf.keras.layers.Input(shape=[None], dtype=tf.int32)

# albert = hub.KerasLayer(
#     "https://tfhub.dev/google/albert_xlarge/3",
#     trainable=True,
#     signature="tokens",
#     output_key="pooled_output",
# )

# features = {
#     "input_ids": input_ids,
#     "input_mask": input_mask,
#     "segment_ids": sequence_mask,
# }
# out = albert(features)
# model = tf.keras.Model(inputs=[input_ids, input_mask, sequence_mask], outputs=out)
# model.compile("adam", loss="sparse_categorical_crossentropy")
# model.summary()

### Encode data

In [None]:
from collections import Counter
def encode_labels(labels):
    '''Convert questionnaire answers to ints: "ia" becomes i and "ib" becomes -i.

    Values that ``int()`` accepts are converted directly. Otherwise the value
    is read as ``<number><a|b>``: the whole numeric prefix is used (not just
    its first character) and the ``b`` variant is negated.

    Returns:
        List of ints. NOTE: a label that cannot be encoded is logged and
        skipped, so the result can be shorter than ``labels`` and positions
        after a skipped label shift by one.
    '''
    suffix_sign = {'a': 1, 'b': -1}
    encoded_labels = []
    for l in labels:
        try:
            encoded_labels.append(int(l))
            continue
        except (TypeError, ValueError, OverflowError):
            logger.debug("Encoding label %s\n" % l)

        # Work on the string form so non-string values cannot raise on indexing.
        raw = str(l)
        sign = suffix_sign.get(raw[-1:])
        try:
            if sign is None:
                raise ValueError(raw)
            encoded_labels.append(sign * int(raw[:-1]))
        except ValueError:
            logger.warning("Coult not encode label %s\n" % l)
    return encoded_labels

def _build_erisk_vocabulary(writings_df, voc_size, min_word_len):
    """Map the most frequent tokens of titles and texts to ids starting at 1."""
    word_freqs = Counter()
    for column in (writings_df.tokenized_text, writings_df.tokenized_title):
        for words in column:
            # Rows without tokens hold None/NaN; only real token lists count.
            if isinstance(words, (list, tuple)):
                word_freqs.update(words)
    vocabulary = {}
    next_id = 1
    for w, _ in word_freqs.most_common(voc_size-2): # keeping voc_size-1 for unk
        if len(w) < min_word_len:
            continue
        vocabulary[w] = next_id
        next_id += 1
    return vocabulary


def _split_erisk_subjects(writings_df, train_prop, valid_prop, test_slice, nr_slices, logger):
    """Return the ``{'train', 'valid', 'test'}`` subject lists for the frame."""
    if 'subset' in writings_df.columns:
        training_subjects = list(set(writings_df[writings_df['subset']=='train'].subject))
        test_subjects = list(set(writings_df[writings_df['subset']=='test'].subject))
    else:
        all_subjects = sorted(list(set(writings_df.subject)))
        training_subjects_size = int(len(all_subjects) * train_prop)
        test_subjects_size = len(all_subjects) - training_subjects_size
        # Cross-validation, with fixed slice as input
        test_prop = 1-train_prop
        test_slice = min(test_slice, nr_slices)
        logger.debug("start index: %f, from %f\n" % (
            len(all_subjects)*(1/nr_slices)*test_slice, test_prop*test_slice))
        start_slice = int(len(all_subjects)*(1/nr_slices)*test_slice)
        test_subjects = all_subjects[start_slice: start_slice+test_subjects_size]
        held_out = set(test_subjects)  # set lookup instead of a list scan per subject
        training_subjects = [s for s in all_subjects if s not in held_out]
    training_subjects = sorted(training_subjects) # ensuring reproducibility
    valid_subjects_size = int(len(training_subjects) * valid_prop)
    valid_subjects = training_subjects[:valid_subjects_size]
    training_subjects = training_subjects[valid_subjects_size:]
    logger.debug("%d training users, %d validation users, %d test users." % (
        len(training_subjects),
        len(valid_subjects),
        len(test_subjects)))
    return {'train': training_subjects,
            'valid': valid_subjects,
            'test': test_subjects}


def load_erisk_data(writings_df, voc_size, emotion_lexicon, seq_len, emotions =  
                    ['anger', 'anticipation', 'disgust', 'fear', 'joy', 
                     'negative', 'positive', 'sadness', 'surprise', 'trust'],
                    liwc_categories = categories, ignore_features=[],
                    pronouns = ["i", "me", "my", "mine", "myself"],
                    train_prop=0.7, valid_prop=0.3, test_slice=2,
                    nr_slices=5,
                    min_post_len=3, min_word_len=1, 
                    user_level=True, vocabulary=None,
                   logger=logger):
    """Group tokenized writings per subject and compute the subject split.

    Args:
        writings_df: frame with ``subject``, ``date``, ``title``, ``text``,
            ``tokenized_title`` and ``tokenized_text`` columns, optionally
            ``label0`` .. ``label<nr_questions-1>``, ``subset`` and LIWC
            category columns.
        voc_size: vocabulary size used when a vocabulary has to be built.
        liwc_categories: candidate LIWC column names; only those present in
            ``writings_df`` are read.
        train_prop, valid_prop, test_slice, nr_slices: split parameters.
        min_post_len: posts with fewer tokens are skipped.
        min_word_len: shorter words are left out of a built vocabulary.
        vocabulary: existing ``word -> id`` mapping; built from the data when
            empty or ``None``.
        logger: logger receiving progress messages.
        emotion_lexicon, seq_len, emotions, ignore_features, pronouns,
        user_level: accepted for call compatibility; not used in this function.

    Returns:
        ``(user_level_texts, subjects_split, vocabulary)``.
        ``user_level_texts[subject]`` holds ``'texts'`` (token lists, one per
        post, in date order), ``'raw'`` (title and text joined by a space),
        ``'liwc'`` (per-post category values) and ``'labels'`` (encoded
        labels of the subject's first kept post, or ``[]`` when the frame has
        no label columns).
    """
    logger.debug("Loading data...\n")
    if not vocabulary:
        vocabulary = _build_erisk_vocabulary(writings_df, voc_size, min_word_len)

    subjects_split = _split_erisk_subjects(writings_df, train_prop, valid_prop,
                                           test_slice, nr_slices, logger)

    # Named liwc_columns so it cannot be confused with the module-level
    # `categories`. NOTE(review): column order follows the iteration order of
    # `liwc_categories`; a set gives no stable order across runs.
    liwc_columns = [c for c in liwc_categories if c in writings_df.columns]
    label_columns = ['label%d' % i for i in range(nr_questions)]
    # Unlabeled (test) data has no label columns; tolerate it instead of
    # failing with AttributeError on the first row.
    has_labels = all(c in writings_df.columns for c in label_columns)

    user_level_texts = {}
    for row in writings_df.sort_values(by='date').itertuples():
        words = []
        raw_parts = []
        if row.tokenized_title:
            words.extend(row.tokenized_title)
            raw_parts.append(row.title)
        if row.tokenized_text:
            words.extend(row.tokenized_text)
            raw_parts.append(row.text)
        if not words or len(words)<min_post_len:
            logger.debug("Skipping short post of %s\n" % row.subject)
            continue
        # Join with a space so the last title word and the first text word
        # do not fuse into one token.
        raw_text = " ".join(raw_parts)
        liwc_categs = [getattr(row, categ) for categ in liwc_columns]
        if row.subject not in user_level_texts:
            labels = [getattr(row, c) for c in label_columns] if has_labels else []
            user_level_texts[row.subject] = {
                'texts': [words],
                'labels': encode_labels(labels),
                'liwc': [liwc_categs],
                'raw': [raw_text],
            }
        else:
            user_level_texts[row.subject]['texts'].append(words)
            user_level_texts[row.subject]['liwc'].append(liwc_categs)
            user_level_texts[row.subject]['raw'].append(raw_text)

    return user_level_texts, subjects_split, vocabulary


In [None]:
vocabulary_list = pickle.load(open('all_vocab_clpsych_erisk_20000.pkl', 'rb'))
vocabulary_dict={}
for i,w in enumerate(vocabulary_list):
    vocabulary_dict[w] = i
user_level_data, subjects_split, vocabulary = load_erisk_data(writings_df, 
                                                            seq_len=hyperparams_features['maxlen'],
                                                            voc_size=hyperparams_features['max_features'],
                                                           emotion_lexicon=nrc_lexicon,
                                                           emotions=emotions,
                                                           user_level=hyperparams_features['user_level'],
                                                                                logger=logger,
#                                                            vocabulary=pickle.load(open('vocabulary_40K_all.pkl', 'rb'))
                                                             vocabulary=vocabulary_dict,           
                                                             )

In [None]:
user_level_data['subject6900']['labels']

In [None]:
def load_embeddings(path, embedding_dim, voc):
    """Build an embedding matrix for ``voc`` from a text file of word vectors.

    Args:
        path: text file with one ``word v1 v2 ...`` entry per line.
        embedding_dim: number of values per vector.
        voc: mapping ``word -> row index``.

    Returns:
        Array of shape ``(len(voc) + 2, embedding_dim)``. Rows of words found
        in the file hold the file's vector; all other rows keep their random
        initialisation drawn from [-0.5, 0.5).
    """
    # random matrix with mean value = 0
    embedding_matrix = np.random.random((len(voc)+2, embedding_dim)) - 0.5 # voc + unk + pad value(0)

    # `with` closes the file even if a line fails to parse; explicit UTF-8
    # avoids depending on the platform's default encoding.
    with open(path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                continue  # blank line
            word_i = voc.get(values[0])
            if word_i is not None:
                # Parse the vector only for words that are actually used.
                embedding_matrix[word_i] = np.asarray(values[1:], dtype='float32')

    print('Total %s word vectors.' % len(embedding_matrix))

    return embedding_matrix

pretrained_embeddings_path = root_dir + '/resources/glove.twitter.27B/glove.twitter.27B.%dd.txt' % hyperparams_features['embedding_dim']
embedding_matrix = load_embeddings(pretrained_embeddings_path, hyperparams_features['embedding_dim'], vocabulary)


## Data Generator

In [None]:
class DataGenerator(Sequence):
    """Keras ``Sequence`` producing user-level feature batches.

    For every user of the selected split it samples up to
    ``max_posts_per_user`` posts, merges them into one document and encodes
    vocabulary ids, emotion/pronoun/LIWC features, a stopword presence vector
    and BERT inputs.

    NOTE(review): the next cell defines another class with the same name,
    which replaces this one once that cell runs.
    """
    def __init__(self, user_level_data, subjects_split, set_type='train', bert_tokenizer=bert_tokenizer,
                 batch_size=hyperparams_features['batch_size'], seq_len=hyperparams_features['maxlen'], 
                 voc_size=hyperparams_features['max_features'], emotion_lexicon=nrc_lexicon, sparse_words=stopword_list,
                 emotions=emotions, pronouns=["i", "me", "my", "mine", "myself"], 
                 max_posts_per_user=hyperparams_features['posts_per_user'],
                 shuffle=True):
        """Store the configuration and build the first epoch's index order.

        ``sparse_words`` defaults to the stopword *list*; the nltk
        ``stopwords`` corpus object that used to be the default cannot be
        passed to ``encode_stopwords``.
        """
        self.seq_len = seq_len
        self.bert_tokenizer = bert_tokenizer
        self.subjects_split = subjects_split
        self.set = set_type
        self.emotion_lexicon = emotion_lexicon
        self.batch_size = batch_size
        self.data = user_level_data
        self.emotions = emotions
        self.pronouns = pronouns
        self.shuffle = shuffle
        self.sparse_words = sparse_words
        self.voc_size = voc_size
        self.max_posts_per_user = max_posts_per_user
        self.on_epoch_end()

    def __encode_text(self, tokens, raw_text):
        """Encode one merged document into all per-document feature groups."""
        # Using voc_size-1 value for OOV token
        encoded_tokens = [vocabulary.get(w, self.voc_size-1) for w in tokens]
        encoded_emotions = encode_emotions(tokens, self.emotion_lexicon, self.emotions)
        encoded_pronouns = encode_pronouns(tokens, self.pronouns)
        encoded_stopwords = encode_stopwords(tokens, self.sparse_words)
        # The label returned by encode_text_for_bert is not used here.
        bert_ids, bert_masks, bert_segments, _ = encode_text_for_bert(
            self.bert_tokenizer, InputExample(None, raw_text), self.seq_len)
        return (encoded_tokens, encoded_emotions, encoded_pronouns, encoded_stopwords,
               bert_ids, bert_masks, bert_segments)

    def __len__(self):
        """Number of full batches per epoch (a trailing partial batch is dropped)."""
        return int(np.floor(len(self.subjects_split[self.set]) / self.batch_size))

    def __getitem__(self, index):
        """Generate batch number ``index`` as ``(X, y)``."""
        # Generate indexes of the batch
        user_indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        # Find users
        split_subjects = self.subjects_split[self.set]
        users = [split_subjects[i] for i in user_indexes
                    if split_subjects[i] in self.data.keys()] # TODO: maybe needs a warning that user is missing

        # Sample post ids (without replacement, kept in original post order)
        post_indexes = {}
        for subject in users:
            posts_len = len(self.data[subject]['texts'])
            post_indexes[subject] = sorted(np.random.choice(
                posts_len,
                min(self.max_posts_per_user, posts_len),
                replace=False))
        # Generate data
        return self.__data_generation(users, post_indexes)

    def on_epoch_end(self):
        """Rebuild (and optionally shuffle) the subject index order."""
        self.indexes = np.arange(len(self.subjects_split[self.set]))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, users, post_indexes):
        """Encode the sampled posts of ``users`` into model inputs and labels.

        Returns ``(X, y)`` where ``X`` is a list of seven arrays (padded token
        ids, categorical features, stopword vector, BERT ids, BERT masks, BERT
        segments, numeric subject ids) and ``y`` is the array of labels.
        """
        tokens_data = []
        categ_data = []
        sparse_data = []
        subjects = []
        bert_ids_data = []
        bert_masks_data = []
        bert_segments_data = []
        labels = []

        for subject in users:
            user_data = self.data[subject]
            # load_erisk_data stores the encoded labels under 'labels'.
            label = user_data['labels']
            selected = post_indexes[subject]

            # Sample
            texts = [user_data['texts'][i] for i in selected]
            liwc_selection = [user_data['liwc'][i] for i in selected]
            raw_texts = [user_data['raw'][i] for i in selected]

            # Merge the sampled posts into one document; a flat comprehension
            # gives the same result as sum(texts, []) in linear time.
            words = [w for post in texts for w in post]
            liwc_mean = np.array(liwc_selection).mean(axis=0).tolist()
            liwc_std = np.array(liwc_selection).std(axis=0).tolist()
            raw_text = " ".join(raw_texts)

            encoded_tokens, encoded_emotions, encoded_pronouns, encoded_stopwords, \
                bert_ids, bert_masks, bert_segments = self.__encode_text(words, raw_text)
            # Numeric part of ids shaped like 'subject1234'.
            subject_id = int(subject.split('t')[1])
            tokens_data.append(encoded_tokens)
            categ_data.append(encoded_emotions + [encoded_pronouns] + liwc_mean + liwc_std)
            sparse_data.append(encoded_stopwords)
            bert_ids_data.append(bert_ids)
            bert_masks_data.append(bert_masks)
            bert_segments_data.append(bert_segments)

            labels.append(label)
            subjects.append(subject_id)

        # using zeros for padding
        tokens_data_padded = sequence.pad_sequences(tokens_data, maxlen=self.seq_len)

        return ([np.array(tokens_data_padded), np.array(categ_data), np.array(sparse_data),
                 np.array(bert_ids_data), np.array(bert_masks_data), np.array(bert_segments_data),
                np.array(subjects)],
                np.array(labels))

In [None]:
class DataGenerator(Sequence):
    """Keras ``Sequence`` that yields one batch of user-level features per step.

    For every step, up to ``batch_size`` users are taken from
    ``subjects_split[set_type]``; for each of them up to ``max_posts_per_user``
    posts are sampled without replacement, merged into a single token list and a
    single raw string, and encoded into the model inputs.

    ``user_level_data`` maps a subject id to a dict with the keys ``'texts'``
    (list of token lists), ``'raw'`` (list of raw strings), ``'liwc'`` (list of
    per-post score lists) and ``'labels'``.

    The encoding relies on module-level names defined elsewhere in the notebook:
    ``vocabulary``, ``encode_emotions``, ``encode_pronouns``,
    ``encode_stopwords``, ``encode_text_for_bert`` and ``InputExample``.
    """

    def __init__(self, user_level_data, subjects_split, batch_size=hyperparams_features['batch_size'],
                 seq_len=hyperparams_features['maxlen'], voc_size=hyperparams_features['max_features'],
                 emotion_lexicon=nrc_lexicon, set_type='train', test_user_indexes=[0],
                 emotions=emotions, pronouns=["i", "me", "my", "mine", "myself"],
                 max_posts_per_user=hyperparams_features['posts_per_user'],
                 bert_tokenizer=bert_tokenizer,
                 shuffle=True):
        """Store the configuration and build the initial user ordering.

        The list defaults are only read, never mutated, so sharing them between
        instances is harmless.
        """
        self.seq_len = seq_len
        self.emotion_lexicon = emotion_lexicon
        self.bert_tokenizer = bert_tokenizer
        self.batch_size = batch_size
        self.data = user_level_data
        self.all_users = list(self.data.keys())
        self.emotions = emotions
        self.pronouns = pronouns
        self.set = set_type
        self.subjects_split = subjects_split
        self.shuffle = shuffle
        self.voc_size = voc_size
        self.max_posts_per_user = max_posts_per_user
        self.test_user_indexes = test_user_indexes
        self.on_epoch_end()

    def __encode_text(self, tokens, raw_text):
        """Encode one merged document into all feature views.

        Returns a tuple ``(encoded_tokens, encoded_emotions, encoded_pronouns,
        encoded_stopwords, bert_ids, bert_masks, bert_segments)``.
        """
        # Using voc_size-1 value for OOV token
        encoded_tokens = [vocabulary.get(w, self.voc_size-1) for w in tokens]
        encoded_emotions = encode_emotions(tokens, self.emotion_lexicon, self.emotions)
        encoded_pronouns = encode_pronouns(tokens, self.pronouns)
        encoded_stopwords = encode_stopwords(tokens)
        # The fourth value returned by the BERT encoder (a label) is not used here.
        bert_ids, bert_masks, bert_segments, _ = encode_text_for_bert(
            self.bert_tokenizer, InputExample(None, raw_text), self.seq_len)
        return (encoded_tokens, encoded_emotions, encoded_pronouns, encoded_stopwords,
                bert_ids, bert_masks, bert_segments)

    def __len__(self):
        """Number of batches per epoch.

        Rounded up so that the final, possibly smaller, batch is still served;
        flooring silently dropped those users and produced zero batches whenever
        the split held fewer users than ``batch_size``.
        """
        return int(np.ceil(len(self.subjects_split[self.set]) / self.batch_size))

    def __getitem__(self, index):
        """Generate the batch at position ``index`` as ``(X, y)``."""
        # Slicing past the end is safe: the last batch simply holds fewer users.
        user_indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        split_users = self.subjects_split[self.set]
        # Users of the split that have no entry in the data are silently left out.
        # TODO: maybe needs a warning that user is missing
        users = [split_users[i] for i in user_indexes if split_users[i] in self.data.keys()]

        # Sample, per user, which posts take part in this batch (kept in order).
        post_indexes = {}
        for subject in users:
            posts_len = len(self.data[subject]['texts'])
            post_indexes[subject] = sorted(np.random.choice(posts_len,
                                                            min(self.max_posts_per_user, posts_len),
                                                            replace=False))
        X, y = self.__data_generation(users, post_indexes)

        return X, y

    def on_epoch_end(self):
        """Rebuild (and optionally shuffle) the user ordering after each epoch."""
        self.indexes = np.arange(len(self.subjects_split[self.set]))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, users, post_indexes):
        """Encode the sampled posts of ``users`` into model inputs and labels.

        Returns ``([tokens, categorical, sparse, bert_ids, bert_masks,
        bert_segments], labels)``; every array has one row per user.
        """
        tokens_data = []
        categ_data = []
        sparse_data = []
        subjects = []
        bert_ids_data = []
        bert_masks_data = []
        bert_segments_data = []
        labels = []
        for subject in users:
            user_data = self.data[subject]
            sampled = post_indexes[subject]

            texts = [user_data['texts'][i] for i in sampled]
            liwc_selection = [user_data['liwc'][i] for i in sampled]
            raw_texts = [user_data['raw'][i] for i in sampled]

            # Merge all sampled posts of the user into one document.
            # (A flat comprehension avoids the quadratic cost of sum(texts, []).)
            words = [w for post in texts for w in post]
            liwc_aggreg = np.array(liwc_selection).mean(axis=0).tolist()
            merged_raw_text = " ".join(raw_texts)

            encoded_tokens, encoded_emotions, encoded_pronouns, encoded_stopwords, \
                bert_ids, bert_masks, bert_segments = self.__encode_text(words, merged_raw_text)

            tokens_data.append(encoded_tokens)
            categ_data.append(encoded_emotions + [encoded_pronouns] + liwc_aggreg)
            sparse_data.append(encoded_stopwords)
            labels.append(user_data['labels'])
            bert_ids_data.append(bert_ids)
            bert_masks_data.append(bert_masks)
            bert_segments_data.append(bert_segments)
            # Collected but currently not part of the returned inputs.
            subjects.append(int(subject.split('t')[1]))

        # using zeros for padding
        tokens_data_padded = sequence.pad_sequences(tokens_data, maxlen=self.seq_len)

        return ([np.array(tokens_data_padded), np.array(categ_data), np.array(sparse_data),
                 np.array(bert_ids_data, dtype=np.int32),
                 np.array(bert_masks_data, dtype=np.int32),
                 np.array(bert_segments_data, dtype=np.int32)],
                np.array(labels))

In [None]:
# TODO: Don't split into the 3 sets, do leave-one-out cross-validation

In [None]:
def get_subjects_split(test_size=hyperparams_features['batch_size']):
    """Randomly split the users of ``user_level_data`` into train and test sets.

    Args:
        test_size: number of users to hold out for testing. It is capped at the
            number of available users.

    Returns:
        dict with the keys ``'test'`` and ``'train'``, each a list of subject
        ids in the iteration order of ``user_level_data``.

    The test users are drawn WITHOUT replacement, so the test set always holds
    exactly ``min(test_size, number of users)`` distinct users. Drawing each
    index independently could repeat an index and shrink the test set.
    """
    all_subjects = list(user_level_data.keys())
    nr_test = min(test_size, len(all_subjects))
    if nr_test > 0:
        test_user_indexes = set(
            np.random.choice(len(all_subjects), size=nr_test, replace=False).tolist())
    else:
        test_user_indexes = set()

    subjects_split = {'test': [u for i, u in enumerate(all_subjects)
                               if i in test_user_indexes],
                      'train': [u for i, u in enumerate(all_subjects)
                                if i not in test_user_indexes]}
    return subjects_split

In [None]:
# Sanity check of the generator: draw a fresh random train/test split and
# materialise every batch of both sets into x_data / y_data for inspection.
logger.setLevel(logging.DEBUG)

# TODO: it is skipping the last batch
x_data = {'train': [], 'test': []}
y_data = {'train': [], 'test': []}
subjects_split = get_subjects_split()
for set_type in ['train', 'test']:
    for x, y in DataGenerator(user_level_data, batch_size=hyperparams_features['batch_size'],
                            set_type=set_type,
                             subjects_split=subjects_split):
        # NOTE: prints the complete list of input arrays of every batch
        print(x)
        x_data[set_type].append(x)
        y_data[set_type].append(y)
#         break


In [None]:
y_data['test'][0]

In [None]:
x_data['train'][0][5].shape

In [None]:
np.array([[1,2],[3,4]]).reshape(2,-1)

In [None]:
sum([len(subjects_split[s]) for s in ['train', 'test']])

In [None]:
x_data['train']

In [None]:
y_data

# Train

In [None]:
# Model and training hyper-parameters; build_model reads them by key.
hyperparams = {
    'lstm_units': 10,
    'dense_bow_units': 20,
    'dropout': 0.0,
    'l2_dense': 0.00000011,
    'l2_embeddings': 0.000001,
    'dense_sentence_units': 100,
    'optimizer': 'adam',
    'bert_dense_units': 256,
    'decay': 0.00001,
    'lr': 0.01,
    "trainable_embeddings": False,
    "reduce_lr_factor": 0.0002,
    "reduce_lr_patience": 1000,
    "freeze_patience": 500,
    'threshold': 0.5,
    'bert_len': 768,
    # Names checked by build_model: 'batchnorm' switches the normalisation layers
    # off and the other entries drop feature branches from the merge, so with
    # this list only the 'bert_layer' branch feeds the outputs.
    'ignore_layer': ['batchnorm', 'lstm_layers', 'numerical_dense_layer', 'sparse_feat_dense_layer'],
    'norm_momentum': 0.1,

}
# Fallback only: 'optimizer' is set to the non-empty string 'adam' above, so this
# branch is skipped and 'lr' / 'decay' are not applied through it.
if not hyperparams['optimizer']:
    hyperparams['optimizer'] = optimizers.Adam(lr=hyperparams['lr'], #beta_1=0.9, beta_2=0.999, epsilon=0.0001,
                                   decay=hyperparams['decay'])

In [None]:
# class Metrics():
#     def __init__(self, threshold=0.5):
#         self.threshold=threshold
        
#     def recall_m(self, y_true, y_pred):
#             y_labels = y_true
#             y_pred = K.cast(K.greater(K.clip(y_pred, 0, 1), self.threshold), K.floatx())        
#             possible_positives = K.sum(K.round(K.clip(y_labels, 0, 1)))
#             true_positives = K.sum(K.round(K.clip(y_labels * y_pred, 0, 1)))
#             recall = true_positives / (possible_positives + K.epsilon())
#             return recall

#     def precision_m(self, y_true, y_pred):
#             y_labels = y_true
#             y_pred = K.cast(K.greater(K.clip(y_pred, 0, 1), self.threshold), K.floatx())        
#             true_positives = K.sum(K.round(K.clip(y_labels * y_pred, 0, 1)))
#             predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
#             precision = true_positives / (predicted_positives + K.epsilon())
#             return precision

#     def f1_m(self, y_true, y_pred):
#         precision = self.precision_m(y_true, y_pred)
#         recall = self.recall_m(y_true, y_pred)
#         return 2*((precision*recall)/(precision+recall+K.epsilon()))

# def binary_crossentropy_custom(y_true, y_pred):
#     y_labels = y_true
#     return K.binary_crossentropy(y_labels, 
#                                  y_pred)

# metrics_class = Metrics(threshold=hyperparams['threshold'])

In [None]:
def build_model(hyperparams, hyperparams_features, embedding_matrix, emotions, stopwords_list,
                liwc_categories, nr_classes,
               ignore_layer=None):
    """Build and compile the multi-input, multi-output Keras model.

    Four feature branches are created and the ones not listed in
    ``ignore_layer`` are concatenated before the output heads:

    * ``'lstm_layers'``: embedding -> LSTM (-> attention) over the word ids,
    * ``'numerical_dense_layer'``: the numeric input (emotions, pronouns, LIWC),
    * ``'sparse_feat_dense_layer'``: dense projection of the sparse input,
    * ``'bert_layer'``: dense projection of the pooled ALBERT output.

    Args:
        hyperparams: dict of model hyper-parameters (units, regularisation,
            dropout, optimizer, ...).
        hyperparams_features: dict providing ``'maxlen'``, ``'max_features'``
            and ``'embedding_dim'``.
        embedding_matrix: initial weights of the embedding layer.
        emotions: emotion names; only their count sizes the numeric input.
        stopwords_list: only its length sizes the sparse input.
        liwc_categories: only their count sizes the numeric input.
        nr_classes: number of output heads (``output_layer0`` ...).
        ignore_layer: names of branches/features to leave out; besides the
            branch names above, ``'attention'`` and ``'batchnorm'`` are
            recognised. ``None`` means nothing is ignored.

    Returns:
        The compiled ``Model`` with inputs ``[word_seq, numeric_input,
        sparse_input, input_ids_bert, input_masks_bert, segment_ids_bert]``.

    Requires ``hub`` (tensorflow_hub) to be importable in the notebook scope.
    """
    ignore_layer = [] if ignore_layer is None else ignore_layer
    use_attention = 'attention' not in ignore_layer
    use_batchnorm = 'batchnorm' not in ignore_layer

    # ---- Word sequence branch: embeddings -> LSTM ----
    tokens_features = Input(shape=(hyperparams_features['maxlen'],), name='word_seq')
    embedding_layer = Embedding(hyperparams_features['max_features'],
                                hyperparams_features['embedding_dim'],
                                input_length=hyperparams_features['maxlen'],
                                embeddings_regularizer=regularizers.l2(hyperparams['l2_embeddings']),
                                weights=[embedding_matrix],
                                trainable=hyperparams['trainable_embeddings'],
                                name='embeddings_layer')(tokens_features)

    # The attention block needs the full sequence of LSTM states.
    lstm_layers = LSTM(hyperparams['lstm_units'],
                       return_sequences=use_attention,
                       name='LSTM_layer')(embedding_layer)

    if use_attention:
        # One score per time step, softmax-normalised over time, then broadcast
        # over the LSTM units and used as weights for a sum over time.
        attention = Dense(1, activation='tanh', name='attention')(lstm_layers)
        attention = Flatten()(attention)
        attention = Activation('softmax')(attention)
        attention = RepeatVector(hyperparams['lstm_units'])(attention)
        attention = Permute([2, 1])(attention)

        sent_representation = Multiply()([lstm_layers, attention])
        sent_representation = Lambda(lambda xin: K.sum(xin, axis=1),
                                     output_shape=(hyperparams['lstm_units'],)
                                     )(sent_representation)
    else:
        sent_representation = lstm_layers

    sent_representation = Dropout(hyperparams['dropout'], name='lstm_att_dropout')(sent_representation)
    if hyperparams['dense_sentence_units']:
        sent_representation = Dense(units=hyperparams['dense_sentence_units'],
                                    name='dense_sent_representation')(sent_representation)

    # ---- Numeric branch: emotions + 1 pronoun feature + LIWC categories ----
    numerical_features = Input(shape=(len(emotions) + 1 + len(liwc_categories),),
                               name='numeric_input')

    # ---- ALBERT branch ----
    in_id_bert = Input(shape=(hyperparams_features['maxlen'],), dtype='int32', name="input_ids_bert")
    in_mask_bert = Input(shape=(hyperparams_features['maxlen'],), dtype='int32', name="input_masks_bert")
    in_segment_bert = Input(shape=(hyperparams_features['maxlen'],), dtype='int32', name="segment_ids_bert")

    albert = hub.KerasLayer(
        "https://tfhub.dev/google/albert_base/3",
        trainable=False,
        signature="tokens",
        output_key="pooled_output",
    )
    bert_features = {
        "input_ids": in_id_bert,
        "input_mask": in_mask_bert,
        "segment_ids": in_segment_bert,
    }
    bert_output = albert(bert_features)
    dense_layer_bert = Dense(units=hyperparams['bert_dense_units'],
                             kernel_regularizer=regularizers.l2(hyperparams['l2_dense']),
                             name='bert_dense_layer',
                             )(bert_output)

    # ---- Sparse branch (stopword features) ----
    sparse_features = Input(shape=(len(stopwords_list),), name='sparse_input')
    if hyperparams['dense_bow_units']:
        dense_layer_sparse = Dense(units=hyperparams['dense_bow_units'],
                                   name='sparse_feat_dense_layer',
                                   kernel_regularizer=regularizers.l2(hyperparams['l2_dense']),
                                   )(sparse_features)
    else:
        dense_layer_sparse = sparse_features

    # ---- Select (optionally batch-normalised) branches to merge ----
    if use_batchnorm:
        numerical_features_norm = BatchNormalization(axis=1, momentum=hyperparams['norm_momentum'],
                                                     name='numerical_features_norm')(numerical_features)
        sent_representation_norm = BatchNormalization(axis=1, momentum=hyperparams['norm_momentum'],
                                                      name='sent_repr_norm')(sent_representation)
        dense_layer_sparse_norm = BatchNormalization(axis=1, momentum=hyperparams['norm_momentum'],
                                                     name='sparse_features_norm')(dense_layer_sparse)
        dense_layer_bert_norm = BatchNormalization(axis=1, momentum=hyperparams['norm_momentum'],
                                                   name='bert_features_norm')(dense_layer_bert)
        all_layers = {
            'lstm_layers': sent_representation_norm,
            'numerical_dense_layer': numerical_features_norm,
            'sparse_feat_dense_layer': dense_layer_sparse_norm,
            'bert_layer': dense_layer_bert_norm
        }
    else:
        all_layers = {
            'lstm_layers': sent_representation,
            'numerical_dense_layer': numerical_features,
            'sparse_feat_dense_layer': dense_layer_sparse,
            'bert_layer': dense_layer_bert
        }

    layers_to_merge = [layer for name, layer in all_layers.items() if name not in ignore_layer]
    if len(layers_to_merge) == 1:
        merged_layers = layers_to_merge[0]
    else:
        merged_layers = concatenate(layers_to_merge)

    # ---- One single-unit head per class ----
    # The heads are trained with mean squared error, so they must be able to
    # produce arbitrary real values. A softmax over a single unit is constant
    # (always 1.0) and has zero gradient, so a linear activation is used instead.
    output_layers = []
    for label in range(nr_classes):
        output_layer = Dense(1, activation='linear',
                             name='output_layer%d' % label,
                             kernel_regularizer=regularizers.l2(hyperparams['l2_dense']))(merged_layers)
        output_layers.append(output_layer)

    # Compile model
    model = Model(inputs=[tokens_features, numerical_features, sparse_features,
                          in_id_bert, in_mask_bert, in_segment_bert],
                  outputs=output_layers)

    model.compile(hyperparams['optimizer'],
                  {'output_layer%d' % i: 'mean_squared_error' for i in range(nr_classes)},
                  metrics={'output_layer%d' % label: ['accuracy', 'mean_squared_error']
                           for label in range(nr_classes)})
    return model



In [None]:
hub.KerasLayer

In [None]:
model = build_model(hyperparams, hyperparams_features, embedding_matrix, emotions, stopword_list,
                    liwc_categories=[c for c in categories if c in writings_df.columns],
                    nr_classes=nr_questions,
                   ignore_layer=hyperparams['ignore_layer'])
model.summary()

In [None]:
def train_model(model,
                data_generator_train, data_generator_valid,
                epochs, start_epoch=0, workers=4,
                callback_list=None,
                model_path='/tmp/model',
               verbose=1):
    """Train ``model`` from generators, checkpoint the best weights and save it.

    Args:
        model: compiled Keras model.
        data_generator_train: generator providing the training batches.
        data_generator_valid: generator providing the validation batches.
        epochs: index of the last epoch to run.
        start_epoch: epoch to start from (passed as ``initial_epoch``).
        workers: number of generator workers.
        callback_list: extra Keras callbacks, appended after the built-in
            checkpoint and early-stopping callbacks. ``None`` means none.
        model_path: where the final model is saved; the best checkpoint goes to
            ``'<model_path>_best'``.
        verbose: Keras verbosity level.

    Returns:
        ``(model, history)``.

    Uses the notebook-level ``experiment`` (Comet) and ``logger`` objects.
    """
    callback_list = [] if callback_list is None else list(callback_list)

    # Use the notebook's configured logger: a call on the root logger is filtered
    # out at the default level and implicitly installs a root handler.
    logger.info('Train...')

    all_callbacks = [
        callbacks.ModelCheckpoint(filepath='%s_best' % model_path, verbose=1,
                                  save_best_only=True),
        callbacks.EarlyStopping(patience=500),
        *callback_list,
    ]
    # Log the callbacks that are actually used (not the keras `callbacks` module).
    experiment.log_parameter('callbacks', [type(cb).__name__ for cb in all_callbacks])

    history = model.fit_generator(data_generator_train,
                                  epochs=epochs, initial_epoch=start_epoch,
                                  validation_data=data_generator_valid,
                                  verbose=verbose,
                                  workers=workers,
                                  callbacks=all_callbacks)
    model.save(model_path)
    experiment.log_parameter('model_path', model_path)
    return model, history

In [None]:
# The Comet API key is a secret and must not be stored in the notebook: it is
# read from the COMET_API_KEY environment variable instead.
# NOTE(review): the key that used to be hard-coded here has been committed and
# should be revoked/rotated.
experiment = Experiment(api_key=os.environ.get("COMET_API_KEY"),
                        project_name="mental", workspace="ananana", disabled=False)

experiment.log_parameters(hyperparams_features)

experiment.log_parameter('emotion_lexicon', nrc_lexicon_path)
experiment.log_parameter('emotions', emotions)
experiment.log_parameter('embeddings_path', pretrained_embeddings_path)

experiment.add_tag('T2')
experiment.log_parameters(hyperparams)

In [None]:
subjects_split = get_subjects_split(test_size=10)
data_generator_train = DataGenerator(user_level_data, set_type='train', 
                                     subjects_split=subjects_split)
data_generator_valid = DataGenerator(user_level_data, set_type='test',  
                                     subjects_split=subjects_split)
model, history = train_model(model, data_generator_train, data_generator_valid,
           epochs=1000, start_epoch=0,
                      callback_list = [],
                      model_path='models/bert_t21', workers=1)

In [None]:
subjects_split

In [None]:
for d in DataGenerator(user_level_data, set_type='train', 
                                     subjects_split=subjects_split):
    print(d[0])
    break

In [None]:
tf.test.is_gpu_available()

# SVM

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score


In [None]:
# Collapse the per-post frame into one row per subject.
# Missing text/title become empty strings so they can be joined below.
writings_df = writings_df.fillna(value={'text': '', 'title':''})
# Aggregation per column: strings are space-joined, the remaining columns use
# the named pandas reductions.
column_functions = {'text': lambda t: " ".join(t), 
                                        'title': lambda t: " ".join(t),
                                        'text_len': 'sum',
                                        'title_len': 'sum',
                                        'tokenized_text': 'sum',
                                        'tokenized_title': 'sum',
                     }
# One label column per question; 21 is hard-coded here.
column_functions.update({'label%i'%i: 'min' for i in range(21)})
column_functions.update({c: 'mean' for c in list(categories) + emotions + ["pronouns"]})
writings_per_user_df = writings_df.groupby('subject').aggregate(column_functions)

In [None]:
writings_df.groupby('subject').mean().columns

In [None]:
def get_avg_embedding(writings_df, subject, column):
    """Average the embedding vectors stored in ``column`` over one subject's rows.

    Every cell of ``column`` is converted to a numpy array and the mean is taken
    over the resulting array of arrays.
    """
    def _to_array(cell):
        return np.array(cell)

    is_subject = writings_df['subject'] == subject
    subject_cells = writings_df[is_subject][column]
    vectors = subject_cells.apply(_to_array).values
    return vectors.mean()

In [None]:
avg_embeddings_text = {s: get_avg_embedding(writings_df, s, 'use_embeddings_text') 
                       for s in set(writings_df.subject.values)}
avg_embeddings_title = {s: get_avg_embedding(writings_df, s, 'use_embeddings_title') 
                       for s in set(writings_df.subject.values)}

In [None]:
series_embeddings_text = pd.Series(avg_embeddings_text)
series_embeddings_text.name = 'avg_embeddings_text'
series_embeddings_title = pd.Series(avg_embeddings_title)
series_embeddings_title.name = 'avg_embeddings_title'

In [None]:
writings_per_user_df = writings_per_user_df.join(series_embeddings_text, on='subject')
writings_per_user_df = writings_per_user_df.join(series_embeddings_title, on='subject')

In [None]:
writings_per_user_df.columns.values

In [None]:
writings_per_user_df.join?

### Universal sentence encoder

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
# import sentencepiece
# import tensorflow_text


In [None]:
import tensorflow as tf
os.environ['TFHUB_CACHE_DIR'] = '/home/anasab/tf_cache'

import tensorflow_hub as hub
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3"
# Import the Universal Sentence Encoder's TF Hub module
embed = hub.Module(module_url)

In [None]:
# embed_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder-large/4",input_shape=[],dtype=tf.string,trainable=True)

# model = tf.keras.Sequential()
# model.add(embed_layer)
# model.add(tf.keras.layers.Dense(16, activation='relu'))
# model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# model.summary()

# model.compile(optimizer='adam',
#           loss='binary_crossentropy',
#           metrics=['accuracy'])

# history = model.fit(train_dataset,
#                 validation_data=validation_dataset,
#                 epochs=30,
#                 verbose=1
#                )

In [None]:
# tf.version

In [None]:
# g = tf.Graph()
# with g.as_default():
#     use = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
#     # use = tf.saved_model.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")
#     # hub.Module("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")

In [None]:
# tf.compat.v1.disable_eager_execution()

# module_url = "https://tfhub.dev/google/universal-sentence-encoder/4" #@param ["https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5"]
# model = hub.load(module_url)

In [None]:
# with tf.compat.v1.Session() as session:
#     session.run([tf.compat.v1.global_variables_initializer(), tf.compat.v1.tables_initializer()])
#     message_embeddings = session.run(embed(["The cat is on the may"]))

In [None]:
# Close the session of a previous run of this cell, if there is one. On a fresh
# kernel `sess` does not exist yet and an unconditional close() would raise.
if 'sess' in globals():
    sess.close()
sess_config = tf.ConfigProto(
        device_count={ 'GPU' : 1, 'CPU': 4 },
        intra_op_parallelism_threads = 0,
        inter_op_parallelism_threads = 4,
        allow_soft_placement=True
    )
sess_config.gpu_options.allow_growth = True
sess_config.gpu_options.per_process_gpu_memory_fraction = 1
sess = tf.Session(config=sess_config)

def initialize_vars(sess):
    """Run the local, global and table initializers in ``sess`` and register it with Keras."""
    sess.run(tf.local_variables_initializer())
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    K.set_session(sess)
initialize_vars(sess)

In [None]:
embeddings = embed(["hello world", "good morning"])

embeddings.eval(session=sess)

In [None]:
def get_use(texts):
    """Return the sentence-encoder embeddings of ``texts``.

    Args:
        texts: iterable of strings.

    Returns:
        The embeddings computed by the notebook-level ``embed`` module,
        evaluated in the notebook-level session ``sess``.

    The module is applied ONCE to a string placeholder and the resulting op is
    reused for every call. Applying ``embed`` to new inputs on each call adds a
    fresh set of ops to the graph every time, so the graph keeps growing and
    calls get slower (this function is called once per row elsewhere).
    """
    cached = getattr(get_use, '_cached_ops', None)
    # Rebuild the op if the `embed` module was re-created since the last call.
    if cached is None or cached[0] is not embed:
        text_input = tf.placeholder(dtype=tf.string, shape=[None])
        cached = (embed, text_input, embed(text_input))
        get_use._cached_ops = cached
    _, text_input, embeddings = cached
    return sess.run(embeddings, feed_dict={text_input: list(texts)})

In [None]:
get_use(["Come on, man!"])

In [None]:
%%time

In [None]:
%%time
writings_df['use_embedding_text'] = writings_df['text'].apply(lambda t: get_use([t])[0])

In [None]:
all_texts = writings_df.text.values


In [None]:
%%time
all_embeddings_dict = {}
# Embed the texts in chunks of 2000. Stepping with range() ends at the last
# non-empty chunk, so no empty batch is sent to the encoder when the number of
# texts is an exact multiple of the chunk size.
for i, start in enumerate(range(0, len(all_texts), 2000)):
    all_embeddings_dict[i] = get_use(all_texts[start:start + 2000])

In [None]:
[len(all_embeddings_dict[i]) for i in all_embeddings_dict]
# len(all_embeddings_dict)

In [None]:
# Concatenate every computed chunk in order, however many there are
# (a fixed range(18) only fits one particular dataset size).
all_embeddings_texts = np.concatenate([all_embeddings_dict[i] for i in sorted(all_embeddings_dict)])

In [None]:
[all_embeddings_dict[i] for i in range(18)]

In [None]:
# all_embeddings.tolist()
writings_df['use_embeddings_text'] = all_embeddings_texts.tolist()

In [None]:
writings_df.head()

In [None]:
# pickle.dump(writings_df, open("writings_df_t2_test_wuse.pkl", "wb+"))

In [None]:
writings_embeddings = writings_df[['subject', 'text', 'title', 'use_embeddings_text', 'use_embeddings_title']]

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# writings_embeddings['use_similarity'] = writings_embeddings['use_embeddings_text'].apply()

In [None]:
cosine_similarity(writings_embeddings['use_embeddings_text'], writings_embeddings['use_embeddings_title'])

In [None]:
# embed = hub.Module("../resources/sentence_wise_email/module/module_useT")

In [None]:
# with tf.compat.v1.Session() as sess:
#     model(["The cat is on the mat"])

In [None]:
# features = [np.random.rand(75) for i in range(20)]
# features = writings_per_user_df[list(categories) + emotions + ["pronouns"]]
features = writings_per_user_df['avg_embeddings_title'].values.tolist()

def cross_validation(folds=2):
    """Cross-validate one RBF SVM per label and return the mean score.

    For each of the 21 label columns of ``writings_per_user_df`` an
    ``SVC(kernel='rbf', C=5)`` is scored with ``folds``-fold cross-validation on
    the notebook-level ``features``; the per-label fold averages are averaged
    over the 21 labels.
    """
    svmmodels = {}
    per_label_means = []
    for label_idx in range(21):
        labels = writings_per_user_df['label%d' % label_idx].values
        classifier = SVC(kernel='rbf', C=5)
        svmmodels[label_idx] = classifier
        cvscores = cross_val_score(classifier, features, labels, cv=folds)
        per_label_means.append(sum(cvscores)/folds)
    return sum(per_label_means)/21

print(cross_validation())

In [None]:
def results_for_label(features, l, train_examples=16):
    """Fit an SVM for label ``l`` on the first rows and evaluate on the rest.

    The first ``train_examples`` rows of ``features`` train a default ``SVC``;
    the remaining rows are predicted. The predictions, the held-out labels and
    the training labels are printed, and an element-wise comparison between the
    held-out labels and the predictions is returned.
    """
    labels = writings_per_user_df['label%d' % l].values
    train_labels = labels[:train_examples]
    heldout_labels = labels[train_examples:]

    svmmodel = SVC()
    svmmodel.fit(features[:train_examples], train_labels)
    predictions = svmmodel.predict(features[train_examples:])

    print(l, predictions, heldout_labels, train_labels)
    return heldout_labels == predictions

In [None]:
features

In [None]:
# Evaluate the per-label SVMs on the held-out users: one comparison array
# (held-out label == prediction) per label.
cumresults = []
for l in range(21):
    results = results_for_label(features, l)
    cumresults.append(results)

# Number of held-out users = length of one comparison array.
nrusers = len(cumresults[0])
nrques = 21
# Count, per held-out user, how many of the labels were predicted correctly.
correct_per_user = {u: 0 for u in range(nrusers)}
for q, ques in enumerate(cumresults):
    for u, answ in enumerate(cumresults[q]):
        if answ:
            correct_per_user[u] += 1

# Per-user fraction of correct answers, then its average over all users ("AHR").
for u in correct_per_user:
    print("u", u, correct_per_user[u]/nrques)
print("AHR", sum(correct_per_user.values())/nrusers/nrques)

## Data augmentation

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.linear_model import LogisticRegression, LinearRegression

In [None]:
hyperparams_features = {
    "max_features": 20002,
    # cut texts after this number of words
    # (among top max_features most common words)
    "maxlen": 512,
    "embedding_dim": 100,
    "user_level": True,
    "posts_per_user": 10,
    "batch_size": 1,
}

In [None]:
from collections import Counter
def encode_labels(labels):
    """Convert raw labels to integers.

    A label that ``int()`` accepts is used as is. Otherwise a label of the form
    ``'<number>a'`` is encoded as ``number`` and ``'<number>b'`` as ``-number``
    (the whole numeric prefix is used, so multi-digit labels are kept intact).

    Labels that cannot be encoded are reported with a warning and skipped, so
    the result may be shorter than ``labels``.

    Args:
        labels: iterable of raw label values.

    Returns:
        list of int.
    """
    suffix_sign = {'a': 1, 'b': -1}
    encoded_labels = []
    for l in labels:
        try:
            encoded_labels.append(int(l))
            continue
        except (TypeError, ValueError, OverflowError):
            logger.debug("Encoding label %s\n" % l)

        text = str(l).strip()
        sign = suffix_sign.get(text[-1:])
        try:
            if sign is None:
                raise ValueError(text)
            encoded_labels.append(sign * int(text[:-1]))
        except ValueError:
            logger.warning("Could not encode label %s\n" % l)
    return encoded_labels

def load_erisk_data(writings_df, voc_size, emotion_lexicon, emotions =
                    ['anger', 'anticipation', 'disgust', 'fear', 'joy',
                     'negative', 'positive', 'sadness', 'surprise', 'trust'],
                    liwc_categories = categories, ignore_features=[],
                    pronouns = ["i", "me", "my", "mine", "myself"],
                    train_prop=0.7, valid_prop=0.0, test_slice=2,
                    nr_slices=5,
                    min_post_len=3, min_word_len=1,
                    user_level=True, vocabulary=None,
                   logger=logger):
    """Group the writings per user and split the users into train/valid/test.

    Args:
        writings_df: one row per post, with at least the columns ``subject``,
            ``date``, ``title``, ``text``, ``tokenized_title``,
            ``tokenized_text`` and ``label0`` ... ``label<nr_questions-1>``
            (``nr_questions`` is a notebook-level name).
        voc_size: vocabulary size; only used when the vocabulary is built here.
        liwc_categories: candidate LIWC columns; those present in
            ``writings_df`` are collected per post.
        train_prop, test_slice, nr_slices: control the split when
            ``writings_df`` has no ``subset`` column.
        valid_prop: share of the (sorted) training users moved to validation.
        min_post_len: posts with fewer tokens (title + text) are skipped.
        min_word_len: shorter words are left out of a vocabulary built here.
        vocabulary: word -> id mapping; built from word frequencies when empty
            or ``None``.
        logger: logger used for progress messages.
        emotion_lexicon, emotions, ignore_features, pronouns, user_level:
            accepted but not used by this function.

    Returns:
        ``(user_level_texts, subjects_split, vocabulary)`` where
        ``user_level_texts[subject]`` holds the keys ``'texts'``, ``'labels'``,
        ``'liwc'`` and ``'raw'`` and ``subjects_split`` holds the keys
        ``'train'``, ``'valid'`` and ``'test'``.
    """
    logger.debug("Loading data...\n")

    # ---- Vocabulary (ids start at 1) ----
    if not vocabulary:
        vocabulary = {}
        word_freqs = Counter()
        for words in writings_df.tokenized_text:
            word_freqs.update(words)
        for words in writings_df.tokenized_title:
            word_freqs.update(words)
        next_id = 1
        for w, f in word_freqs.most_common(voc_size-2): # keeping voc_size-1 for unk
            if len(w) < min_word_len:
                continue
            vocabulary[w] = next_id
            next_id += 1

    # ---- Train / test users ----
    if 'subset' in writings_df.columns:
        training_subjects = list(set(writings_df[writings_df['subset']=='train'].subject))
        test_subjects = list(set(writings_df[writings_df['subset']=='test'].subject))
    else:
        all_subjects = sorted(list(set(writings_df.subject)))
        training_subjects_size = int(len(all_subjects) * train_prop)
        test_subjects_size = len(all_subjects) - training_subjects_size
        # Cross-validation, with fixed slice as input
        test_prop = 1-train_prop
        test_slice = min(test_slice, nr_slices)
        logger.debug("start index: %f, from %f\n" % (
            len(all_subjects)*(1/nr_slices)*test_slice, test_prop*test_slice))
        start_slice = int(len(all_subjects)*(1/nr_slices)*test_slice)
        test_subjects = all_subjects[start_slice: start_slice+test_subjects_size]
        training_subjects = [s for s in all_subjects if s not in test_subjects]

    # ---- Validation users are taken from the front of the sorted training users ----
    training_subjects = sorted(training_subjects) # ensuring reproducibility
    valid_subjects_size = int(len(training_subjects) * valid_prop)
    valid_subjects = training_subjects[:valid_subjects_size]
    training_subjects = training_subjects[valid_subjects_size:]
    logger.debug("%d training users, %d validation users, %d test users." % (
        len(training_subjects),
        len(valid_subjects),
        len(test_subjects)))
    subjects_split = {'train': training_subjects,
                      'valid': valid_subjects,
                      'test': test_subjects}

    # ---- Per-user posts, in chronological order ----
    liwc_columns = [c for c in liwc_categories if c in writings_df.columns]
    user_level_texts = {}
    for row in writings_df.sort_values(by='date').itertuples():
        words = []
        raw_parts = []
        if row.tokenized_title:
            words.extend(row.tokenized_title)
            raw_parts.append(row.title)
        if row.tokenized_text:
            words.extend(row.tokenized_text)
            raw_parts.append(row.text)
        if not words or len(words)<min_post_len:
            logger.debug("Skipping short post of subject %s\n" % row.subject)
            continue
        # Keep title and body as separate words in the raw text; gluing them
        # together would merge the last title word with the first body word.
        raw_text = " ".join(raw_parts)

        liwc_categs = [getattr(row, categ) for categ in liwc_columns]
        if row.subject not in user_level_texts:
            # The labels are taken from the first kept post of the user.
            labels = [getattr(row, 'label%d'%i) for i in range(nr_questions)]
            user_level_texts[row.subject] = {
                'texts': [words],
                'labels': encode_labels(labels),
                'liwc': [liwc_categs],
                'raw': [raw_text],
            }
        else:
            user_level_texts[row.subject]['texts'].append(words)
            user_level_texts[row.subject]['liwc'].append(liwc_categs)
            user_level_texts[row.subject]['raw'].append(raw_text)

    return user_level_texts, subjects_split, vocabulary


In [None]:
# Load the pre-computed vocabulary (a list of words) and turn it into a
# word -> position mapping.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
vocabulary_path = 'all_vocab_clpsych_erisk_%d.pkl' % (
    hyperparams_features['max_features']-2)
with open(vocabulary_path, 'rb') as vocabulary_file:  # closes the handle after loading
    vocabulary_list = pickle.load(vocabulary_file)
# NOTE(review): ids start at 0 here, while the generators pad sequences with 0
# and load_erisk_data numbers a self-built vocabulary from 1; confirm against
# the embedding matrix before changing the offset.
vocabulary_dict = {w: i for i, w in enumerate(vocabulary_list)}
user_level_data, subjects_split, vocabulary = load_erisk_data(
    writings_df,
    voc_size=hyperparams_features['max_features'],
    emotion_lexicon=nrc_lexicon,
    emotions=emotions,
    user_level=hyperparams_features['user_level'],
    logger=logger,
    vocabulary=vocabulary_dict,
)

In [None]:
writings_df

In [None]:
class DataGenerator(Sequence):
    """Keras ``Sequence`` serving batches of per-user groups of posts.

    The epoch index is a list of ``(user_position, post_indexes)`` pairs built
    once in ``__post_indexes_per_user``. ``__getitem__`` turns a slice of that
    list into ``(inputs, labels, subject_ids)``.
    """

    def __init__(self, user_level_data, subjects_split, set_type='train', #bert_tokenizer=bert_tokenizer,
                 batch_size=1, seq_len=50, 
                 voc_size=hyperparams_features['max_features'], emotion_lexicon=nrc_lexicon,
                 hierarchical=False, pad_value=0, padding='pre', sparse_words=stopword_list,
                 post_groups_per_user=1, posts_per_group=10,
                 sampling_distr_alfa=0.1, sampling_distr='exp', # 'exp', 'uniform'
                 emotions=emotions, pronouns=["i", "me", "my", "mine", "myself"], pad_with_duplication=False,
                 max_posts_per_user=None, sample_seqs=False,
                 shuffle=True):
        """Store the configuration and precompute the post groups of every user.

        Args:
            user_level_data: dict keyed by subject; each value is read through
                the keys 'texts', 'labels', 'liwc' and 'raw'.
            subjects_split: dict mapping a set name to its list of subjects.
            set_type: key of ``subjects_split`` served by this generator.
            batch_size: number of index entries (post groups) per batch.
            seq_len: ``maxlen`` used when padding/truncating token sequences.
            voc_size: vocabulary size; ``voc_size-1`` encodes unknown tokens.
            emotion_lexicon, emotions: forwarded to ``encode_emotions``.
            hierarchical: if True, keep one sequence per post instead of
                merging all posts of a group into a single sequence.
            pad_value: fill value for user-level padding (hierarchical mode).
            padding: 'pre' or 'post'; used for both padding and truncating
                token sequences.
            sparse_words: forwarded to ``encode_stopwords``.
            post_groups_per_user: cap on (or, when sampling, number of) groups
                generated per user; a falsy value keeps all sequential groups.
            posts_per_group: number of posts in one group.
            sampling_distr_alfa, sampling_distr: parameters of the random
                sampling of posts ('exp' or 'uniform').
            pronouns: forwarded to ``encode_pronouns``.
            pad_with_duplication: sample posts with replacement.
            max_posts_per_user: if set, only the first N posts are grouped.
            sample_seqs: sample post groups randomly instead of taking
                consecutive chunks.
            shuffle: shuffle the group order at the end of each epoch.
        """
        self.seq_len = seq_len
        self.subjects_split = subjects_split
        self.set = set_type
        self.emotion_lexicon = emotion_lexicon
        self.batch_size = batch_size
        self.hierarchical = hierarchical
        self.data = user_level_data
        self.pad_value = pad_value
        self.sampling_distr_alfa = sampling_distr_alfa
        self.sampling_distr = sampling_distr
        self.emotions = emotions
        self.pronouns = pronouns
        self.sparse_words = sparse_words
        self.sample_seqs = sample_seqs
        self.pad_with_duplication = pad_with_duplication
        self.padding = padding
        self.shuffle = shuffle
        self.voc_size = voc_size
        self.max_posts_per_user = max_posts_per_user
        self.post_groups_per_user = post_groups_per_user
        self.posts_per_group = posts_per_group
        self.__post_indexes_per_user()
        self.on_epoch_end()

    @staticmethod
    def _random_sample(population_size, sample_size, sampling_distr, alfa=0.1, replacement=False):
        """Return a sorted random sample of indexes from ``range(population_size)``.

        At most ``population_size`` indexes are drawn, so short histories yield
        smaller samples even when ``replacement`` is True.

        Raises:
            ValueError: if ``sampling_distr`` is neither 'exp' nor 'uniform'.
        """
        if sampling_distr == 'exp':
            # Later indexes get exponentially larger probabilities.
            probabilities = DataGenerator.__generate_reverse_exponential_indices(population_size, alfa)
        elif sampling_distr == 'uniform':
            probabilities = None
        else:
            raise ValueError("Unknown sampling distribution %r (expected 'exp' or 'uniform')"
                             % (sampling_distr,))
        return sorted(np.random.choice(population_size,
                                       min(sample_size, population_size),
                                       p=probabilities,
                                       replace=replacement))

    @staticmethod
    def __generate_reverse_exponential_indices(max_index, alfa=1):
        """Return probabilities proportional to ``exp(alfa * x)`` for x in ``range(max_index)``.

        The largest exponent is subtracted before exponentiating, so long
        histories do not overflow to inf/nan; the normalized result is
        mathematically unchanged.
        """
        if max_index <= 0:
            return []
        exponents = alfa * np.arange(max_index, dtype=float)
        weights = np.exp(exponents - exponents.max())
        return (weights / weights.sum()).tolist()

    def __post_indexes_per_user(self):
        """Build ``indexes_per_user`` and ``indexes_with_user`` for the current set.

        Users of the split that are missing from the data are logged and skipped.
        """
        subjects = self.subjects_split[self.set]
        self.indexes_per_user = {u: [] for u in range(len(subjects))}
        self.indexes_with_user = []
        for u, subject in enumerate(subjects):
            if subject not in self.data:
                logger.warning("User %s has no posts in %s set. Ignoring.\n" % (
                    subject, self.set))
                continue
            user_posts = self.data[subject]['texts']
            if self.max_posts_per_user:
                user_posts = user_posts[:self.max_posts_per_user]
            nr_posts = len(user_posts)
            nr_post_groups = int(np.ceil(nr_posts / self.posts_per_group))

            if self.post_groups_per_user:
                if self.sample_seqs:
                    # Random groups can be drawn as often as requested.
                    nr_post_groups = self.post_groups_per_user
                else:
                    # Sequential groups cannot exceed what the user's posts provide.
                    nr_post_groups = min(self.post_groups_per_user, nr_post_groups)
            for i in range(nr_post_groups):
                if self.sample_seqs:
                    # Random ordered sample of the posts
                    group = DataGenerator._random_sample(population_size=nr_posts,
                                                         sample_size=self.posts_per_group,
                                                         sampling_distr=self.sampling_distr,
                                                         alfa=self.sampling_distr_alfa,
                                                         replacement=self.pad_with_duplication)
                else:
                    # i-th consecutive chunk of posts
                    group = range(i*self.posts_per_group,
                                  min((i+1)*self.posts_per_group, nr_posts))
                self.indexes_per_user[u].append(group)
                self.indexes_with_user.append((u, group))

    @staticmethod
    def _numeric_subject_id(subject):
        """Return the first run of digits in ``subject`` as an int, or ``subject`` itself if it has none."""
        try:
            return int(re.findall('[0-9]+', subject)[0])
        except IndexError:
            return subject

    def __encode_text(self, tokens, raw_text):
        """Encode one token list into (token ids, emotions, pronouns, stopwords).

        ``raw_text`` is currently unused; it was only needed by the disabled
        BERT inputs. Token ids come from the notebook-level ``vocabulary``.
        """
        # Using voc_size-1 value for OOV token
        encoded_tokens = [vocabulary.get(w, self.voc_size-1) for w in tokens]
        encoded_emotions = encode_emotions(tokens, self.emotion_lexicon, self.emotions)
        encoded_pronouns = encode_pronouns(tokens, self.pronouns)
        encoded_stopwords = encode_stopwords(tokens, self.sparse_words)
        return (encoded_tokens, encoded_emotions, encoded_pronouns, encoded_stopwords,)

    def __len__(self):
        """Number of batches per epoch (the last, possibly smaller, batch is kept)."""
        return int(np.ceil(len(self.indexes) / self.batch_size))

    def __getitem__(self, index):
        """Generate batch number ``index`` as ``(inputs, labels, subject_ids)``."""
        batch = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        subjects = self.subjects_split[self.set]

        # Users and post groups are kept as parallel lists, so two groups of
        # the same user in one batch are not collapsed into one.
        users = []
        post_indexes = []
        for u, group in batch:
            user = subjects[u]
            if user not in self.data:
                # TODO: maybe needs a warning that user is missing
                continue
            users.append(user)
            post_indexes.append(group)

        if self.hierarchical:
            X, y, g = self.__data_generation_hierarchical(users, post_indexes)
        else:
            X, y, g = self.__data_generation(users, post_indexes)

        return X, y, g

    def on_epoch_end(self):
        """Reset the epoch index and optionally shuffle it."""
        # Work on a copy so the shuffle does not reorder indexes_with_user.
        self.indexes = list(self.indexes_with_user)
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, users, post_indexes):
        """Build a flat batch: the posts of each group are merged into one sequence.

        Args:
            users: subjects of the batch.
            post_indexes: for each entry of ``users``, the indexes of the posts to use.

        Returns:
            ``([tokens, categorical_features, sparse_features], labels, subject_ids)``
            where categorical features are emotions + pronouns + per-category
            LIWC mean + per-category LIWC standard deviation over the group.
        """
        tokens_data = []
        categ_data = []
        sparse_data = []
        subjects = []
        labels = []

        for subject, group in zip(users, post_indexes):
            subject_data = self.data[subject]
            label = subject_data['labels']

            texts = [subject_data['texts'][i] for i in group]
            liwc_selection = np.array([subject_data['liwc'][i] for i in group])
            raw_texts = [subject_data['raw'][i] for i in group]

            all_words = [w for text in texts for w in text]  # merge all texts in one list
            liwc_mean = liwc_selection.mean(axis=0).tolist()
            liwc_std = liwc_selection.std(axis=0).tolist()
            all_raw_text = " ".join(raw_texts)

            encoded_tokens, encoded_emotions, encoded_pronouns, encoded_stopwords, \
                = self.__encode_text(all_words, all_raw_text)
            tokens_data.append(encoded_tokens)
            categ_data.append(encoded_emotions + [encoded_pronouns] + liwc_mean + liwc_std)
            sparse_data.append(encoded_stopwords)
            labels.append(label)
            subjects.append(DataGenerator._numeric_subject_id(subject))

        # using zeros for padding
        tokens_data_padded = sequence.pad_sequences(tokens_data, maxlen=self.seq_len,
                                                    padding=self.padding,
                                                    truncating=self.padding)

        return ([np.array(tokens_data_padded), np.array(categ_data), np.array(sparse_data)],
                np.array(labels),
                np.array(subjects),
                )

    def _pad_user_level(self, per_user_data, dtype='int32'):
        """Pad/truncate every user's list of per-post features to ``posts_per_group`` posts."""
        padded = sequence.pad_sequences(per_user_data,
                                        maxlen=self.posts_per_group,
                                        value=self.pad_value,
                                        dtype=dtype)
        return np.rollaxis(np.dstack(padded), -1)

    def __data_generation_hierarchical(self, users, post_indexes):
        """Build a hierarchical batch: one encoded sequence per post, grouped per user.

        Args:
            users: subjects of the batch.
            post_indexes: for each entry of ``users``, the indexes of the posts to use.

        Returns:
            ``((tokens, categorical_features, sparse_features), labels, subject_ids)``
            with one row of labels and one subject id per group.
        """
        user_tokens = []
        user_categ_data = []
        user_sparse_data = []
        labels = []
        subject_ids = []

        for subject, group in zip(users, post_indexes):
            subject_data = self.data[subject]
            texts = subject_data['texts']
            raw_texts = subject_data['raw']
            liwc_scores = subject_data['liwc']

            tokens_data = []
            categ_data = []
            sparse_data = []
            for i in group:
                encoded_tokens, encoded_emotions, encoded_pronouns, encoded_stopwords, \
                    = self.__encode_text(texts[i], raw_texts[i])
                tokens_data.append(encoded_tokens)
                categ_data.append(encoded_emotions + [encoded_pronouns] + liwc_scores[i])
                sparse_data.append(encoded_stopwords)

            # using zeros for padding
            tokens_data_padded = np.array(sequence.pad_sequences(tokens_data, maxlen=self.seq_len,
                                                                 padding=self.padding,
                                                                 truncating=self.padding))
            user_tokens.append(tokens_data_padded)
            user_categ_data.append(categ_data)
            user_sparse_data.append(sparse_data)

            # The user-level labels are stored under 'labels'.
            labels.append(subject_data['labels'])
            subject_ids.append(DataGenerator._numeric_subject_id(subject))

        user_tokens = self._pad_user_level(user_tokens)
        # pad_sequences defaults to int32, which would truncate the float
        # features (e.g. relative LIWC frequencies) to integers.
        user_categ_data = self._pad_user_level(user_categ_data, dtype='float32')
        # NOTE(review): kept at the default int32 -- confirm encode_stopwords
        # returns integer counts; switch to float32 if it returns frequencies.
        user_sparse_data = self._pad_user_level(user_sparse_data)

        return ((user_tokens, user_categ_data, user_sparse_data),
                np.array(labels), np.array(subject_ids))


In [None]:
# The 200 most frequent words of the pickled vocabulary counter.
with open("all_vocab_clpsyck_erisk.pkl", "rb") as vocab_counts_file:
    most_common_words = [k for k, v in
                         pickle.load(vocab_counts_file).most_common(200)]

In [None]:
# Candidate word lists extracted from the embeddings; the correlated and
# anti-correlated lists are capped at 300 entries.
with open("common_words_uneven_neighbors_overlap.pkl", "rb") as features_file:
    features_extracted_embeddings_nonover = pickle.load(features_file)
with open("common_words_even_neighbors_overlap.pkl", "rb") as features_file:
    features_extracted_embeddings_over = pickle.load(features_file)
with open("common_words_uneven_neighbors_correlated.pkl", "rb") as features_file:
    features_extracted_embeddings_corr = pickle.load(features_file)[:300]
with open("common_words_uneven_neighbors_anticorrelated2.pkl", "rb") as features_file:
    features_extracted_embeddings_uncorr = pickle.load(features_file)[:300]

In [None]:
# NOTE(review): the feature-selection section of this notebook writes a *set*
# to this file, so the order of the list (and therefore which 300 entries
# survive the slice) may not be reproducible -- confirm.
with open("common_best_features_erisk_clpsych.pkl", "rb") as best_features_file:
    best_features = list(pickle.load(best_features_file))[:300]
len(best_features)

In [None]:
most_common_words

In [None]:
best_features

In [None]:
features_extracted_embeddings_corr

In [None]:
features_extracted_embeddings_uncorr

In [None]:
features_extracted_embeddings_nonover

In [None]:
def get_data(set_type='train', liwc=True, bow=True):
    """Collect one flat feature vector per post group from ``DataGenerator``.

    Args:
        set_type: name of the subject split to read; a falsy value reads both
            'train' and 'test'.
        liwc: include the categorical features (second generator input).
        bow: include the sparse word features (third generator input).

    Returns:
        Tuple ``(features, labels, subjects)`` of numpy arrays with one row
        per generated batch (``batch_size=1``).

    Raises:
        ValueError: if both ``liwc`` and ``bow`` are False, which would
            otherwise yield labels without any matching features.
    """
    if not (liwc or bow):
        raise ValueError("At least one of `liwc` and `bow` must be True.")

    set_types = [set_type] if set_type else ['train', 'test']
    features = []
    labels = []
    subjects = []
    for current_set in set_types:
        generator = DataGenerator(user_level_data, subjects_split, batch_size=1, sample_seqs=False,
                                  post_groups_per_user=1, posts_per_group=100,
                                  set_type=current_set, sampling_distr='exp',
                                  sparse_words=features_extracted_embeddings_corr)
        for inputs, batch_labels, batch_subjects in generator:
            selected = []
            if liwc:
                selected.append(inputs[1].flatten())
            if bow:
                selected.append(inputs[2].flatten())
            features.append(selected[0] if len(selected) == 1 else np.concatenate(selected))

            labels.append(batch_labels.flatten())
            subjects.append(batch_subjects.flatten())
    return np.array(features), np.array(labels), np.array(subjects)

In [None]:
f, l, g = get_data(set_type='train', bow=True, liwc=False)
len(f), f[0].shape, len(l), l[0].shape, len(g), g[0].shape

In [None]:
features_extracted_embeddings_over

In [None]:
model_class = MultinomialNB
model_arguments = {}
# model_arguments = {'kernel': 'rbf', 'C': 5}

In [None]:

from sklearn.model_selection import GroupKFold, LeaveOneGroupOut, cross_val_score


def cross_validation(features, labels, groups, folds=2, nr_labels=21):
    """Average grouped cross-validation score over one classifier per label column.

    A fresh ``model_class(**model_arguments)`` is scored with ``GroupKFold``
    for each of the first ``nr_labels`` columns of ``labels``.

    Args:
        features: 2-D feature matrix, one row per sample.
        labels: 2-D label matrix; column ``l`` is the target of classifier ``l``.
        groups: group identifier per sample, so a group never spans folds.
        folds: number of ``GroupKFold`` splits.
        nr_labels: number of label columns to evaluate.

    Returns:
        Sum of the per-label mean fold scores divided by ``nr_labels``. Labels
        whose score is NaN add nothing to the sum but still count in the divisor.
    """
    total_score = 0
    for label_index in range(nr_labels):
        label = labels[:, label_index]
        model = model_class(**model_arguments)
        # Alternative: LeaveOneGroupOut().split(features, label, groups)
        validator = GroupKFold(n_splits=folds)
        cvscores = cross_val_score(model, features, label, groups=groups, cv=validator)
        score = sum(cvscores)/folds
        print(cvscores)
        if not np.isnan(score):
            total_score += score
    return total_score/nr_labels

f, l, g = get_data(set_type=None, bow=True, liwc=True)
print("final score", cross_validation(f,l,g, folds=5))

In [None]:
def results_for_label(train_features, train_labels, test_features, test_labels, label_index=None):
    """Fit ``model_class`` on one label and report which test predictions are correct.

    Args:
        train_features, train_labels: training data for this label.
        test_features, test_labels: evaluation data for this label.
        label_index: optional identifier shown in the debug output. The
            function no longer reads the notebook-level loop variable ``l``.

    Returns:
        Boolean numpy array, True where the rounded prediction equals the
        expected label.
    """
    model = model_class(**model_arguments)
    model.fit(train_features, train_labels)
    # for regression, round to the nearest integer
    predictions = [round(n) for n in model.predict(test_features)]
    print(label_index, predictions, test_labels, train_labels)
    # Compare as arrays so the result is element-wise even for list inputs.
    return np.asarray(test_labels) == np.asarray(predictions)

In [None]:
# TODO: Is this right?
# TODO: I should do a majority vote or something similar. This is not precisely AHR, unless I have 1 group per user

trainf, trainl, trainu = get_data(set_type='train', bow=True, liwc=True)
testf, testl, testu = get_data(set_type='test', bow=True, liwc=True)

# One row of per-sample correctness flags for each of the 21 questions.
cumresults = []
for l in range(21):
    results = results_for_label(trainf, trainl[:,l], testf, testl[:,l])
    cumresults.append(results)

nrques = 21
test_users = set(testu.flatten())
correct_per_user = dict.fromkeys(test_users, 0)
total_per_user = dict.fromkeys(test_users, 0)
# Samples appear in the same order as in testu, so the position selects the user.
for question_results in cumresults:
    for position, answ in enumerate(question_results):
        user = testu[position].item()
        if answ:
            correct_per_user[user] += 1
        total_per_user[user] += 1

for u in correct_per_user:
    print("u", u, correct_per_user[u]/total_per_user[u], "correct", correct_per_user[u], "total", total_per_user[u])
# print("AHR", sum(correct_per_user.values())/len(set(testu.flatten()))/nrques)
print("AHR", sum(correct_per_user.values())/sum(total_per_user.values()))

In [None]:
writings_df.columns

## Feature extraction

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

In [None]:
# Each pickle holds one newline-separated string of texts.
with open("all_texts_clpsych_erisk_positive.pkl", "rb") as texts_file:
    texts_positive = pickle.load(texts_file).split("\n")
with open("all_texts_clpsych_erisk_negative.pkl", "rb") as texts_file:
    texts_negative = pickle.load(texts_file).split("\n")

In [None]:
tt = TweetTokenizer()
sw = stopwords.words("english")
def tokenize_tweets(t, tokenizer=tt, stop=True):
    """Lower-case and tokenize ``t``, keeping only purely alphabetic (a-z) tokens.

    Args:
        t: text to tokenize.
        tokenizer: object exposing ``tokenize(str)``.
        stop: when False, English stopwords are additionally removed.
            NOTE(review): the flag reads as "keep stopwords" -- confirm this
            is the intended meaning and not an inverted condition.

    Returns:
        List of the remaining tokens.
    """
    alphabetic_tokens = []
    for token in tokenizer.tokenize(t.lower()):
        if re.match("^[a-z]*$", token):
            alphabetic_tokens.append(token)
    if stop:
        return alphabetic_tokens
    return [token for token in alphabetic_tokens if token not in sw]

In [None]:
# First 100000 keys of the pickled vocabulary counter, used as a fixed
# vocabulary for the count / tf-idf pipeline.
with open("all_vocab_clpsyck_erisk.pkl", "rb") as vocab_counts_file:
    vocabulary_kbest = list(pickle.load(vocab_counts_file).keys())[:100000]
pipe = Pipeline([('count', CountVectorizer(vocabulary=vocabulary_kbest, tokenizer=tt.tokenize)),
                    ('tfid', TfidfTransformer())]).fit(texts_positive + texts_negative)


In [None]:
positive_X = pipe['count'].transform(["\n".join(texts_positive)]).toarray()

In [None]:
negative_X = pipe['count'].transform(["\n".join(texts_negative)]).toarray()

In [None]:
positive_X.mean()

In [None]:
negative_X.sum()

In [None]:
positive_X.shape

In [None]:
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif, f_classif, SelectPercentile

In [None]:
best_features = {'kbest': None, 'percentile': None}
selectors = {'kbest': None, 'percentile': None}

In [None]:
# KBest
# Fit a chi2 selector on two samples only: the word counts of the positive
# texts (class 1) and of the negative texts (class 0).
kbest = SelectKBest(chi2, k=200)
kbest.fit_transform([positive_X.flatten(), negative_X.flatten()], [1,0])
selectors['kbest'] = kbest

# Percentile
# NOTE(review): SelectPercentile's `percentile` is expressed in percent
# (0-100), so 0.4 keeps 0.4% of the features -- confirm 40 was not intended.
percent = SelectPercentile(chi2, percentile=0.4)
percent.fit_transform([positive_X.flatten(), negative_X.flatten()], [1,0])
selectors['percentile'] = percent
        
def get_features(selector, vocabulary):
    """Return the entries of ``vocabulary`` whose position is flagged by ``selector.get_support()``."""
    support = selector.get_support()
    return [w for i, w in enumerate(vocabulary) if support[i]]

for key, selector in selectors.items():
    best_features[key] = get_features(selector, vocabulary_kbest)

In [None]:
len(common_features)

In [None]:
len(best_features['kbest'])

In [None]:
# Features selected by both the k-best and the percentile selectors.
common_features = set(best_features['kbest']).intersection(set(best_features['percentile']))
with open("common_best_features_erisk_clpsych.pkl", "wb+") as common_features_file:
    pickle.dump(common_features, common_features_file)
len(common_features)

In [None]:
with open("kbest_features_erisk_clpsych200.pkl", "wb+") as kbest_features_file:
    pickle.dump(best_features['kbest'], kbest_features_file)

# Extra analysis


### Extract LIWC

In [None]:
def merge_tokens(row):
    """Concatenate a row's text tokens and title tokens (in that order) into a new list.

    Empty or missing (falsy) token fields are skipped.
    """
    merged = []
    for part in (row.tokenized_text, row.tokenized_title):
        if part:
            merged.extend(part)
    return merged
writings_df['all_tokens'] = writings_df.apply (lambda row: merge_tokens(row), axis=1)

In [None]:
# TODO: include the title
def extract_emotions(tokens, emotion, relative=True):
    """Count the tokens that belong to ``nrc_lexicon[emotion]``.

    Args:
        tokens: list of tokens; a falsy value yields None.
        emotion: key into the notebook-level ``nrc_lexicon``.
        relative: if True return the share of matching tokens, otherwise the
            raw count.

    Returns:
        None for empty input, else a float (relative) or an int (absolute).
    """
    if not tokens:
        return None
    emotion_vocabulary = nrc_lexicon[emotion]
    nr_emotion_words = sum(1 for t in tokens if t in emotion_vocabulary)
    if relative:
        return nr_emotion_words / len(tokens)
    return nr_emotion_words

from functools import partial
for emotion in emotions:
    writings_df[emotion] = writings_df['all_tokens'].apply(partial(extract_emotions, emotion=emotion, 
                                                                   relative=True))

In [None]:
writings_df['pronouns'] = writings_df['all_tokens'].apply(partial(encode_pronouns, relative=True))

In [None]:
# writings_df[['label%i'%i for i in range(21)] + ['text', 'pronouns', 'text_len'] + emotions].corr('spearman')

In [None]:
# writings_df['label15'] = writings_df['label15'].apply(lambda l: encode_labels([l])[0])

In [None]:
# writings_df['label17'] = writings_df['label17'].apply(lambda l: encode_labels([l])[0])

In [None]:
writings_df.groupby('subject').mean()[
    ['label%i'%i for i in range(21)] + ['pronouns', 'text_len'] + emotions].corr(
    'spearman')[['pronouns', 'text_len'] + emotions]

In [None]:
writings_df.corrwith?

In [None]:
from liwc_readDict import readDict

# readDict yields (word, category) pairs.
# NOTE(review): absolute, user-specific path -- consider a configurable location.
liwc = readDict('/home/anasab/resources/liwc.dic')
# One entry per (word, category) pair, so category names repeat in this list.
categories = [category for (_, category) in liwc]
# category -> list of its words, in dictionary order
liwc_dict = {}
for (w, c) in liwc:
    liwc_dict.setdefault(c, []).append(w)

In [None]:
writings_per_user_df['all_tokens'] = writings_per_user_df.apply (lambda row: merge_tokens(row), axis=1)

In [None]:
def encode_liwc_categories(tokens, category_words, relative=True):
    """Count the tokens matching any word of one LIWC category.

    A token matches a dictionary word when it equals the word, when the word
    ends in ``*`` and the token starts with the part before the ``*``, or when
    the token equals the part of the word before its first apostrophe. Each
    token is counted at most once.

    Args:
        tokens: list of tokens; a falsy value yields None.
        category_words: iterable of dictionary entries of the category.
            Empty entries are tolerated.
        relative: if True return the share of matching tokens, otherwise the
            raw count.

    Returns:
        None for empty input, else a float (relative) or an int (absolute).
    """
    if not tokens:
        return None

    # Index the category once instead of scanning every word for every token.
    exact_forms = set()
    prefixes = []
    for word in category_words:
        exact_forms.add(word)
        exact_forms.add(word.split("'")[0])
        if word.endswith('*'):
            prefixes.append(word[:-1])
    prefixes = tuple(prefixes)  # str.startswith accepts a tuple of candidates

    category_cnt = sum(1 for t in tokens
                       if t in exact_forms or t.startswith(prefixes))
    if relative:
        return category_cnt/len(tokens)
    return category_cnt

In [None]:
%%time
from functools import partial
# for categ in ['negemo', 'posemo', 'affect', 'sad', 'anx', 'pronoun']:#liwc_dict.keys():
for categ in liwc_dict.keys():
    if categ in writings_df.columns:
        continue
    print("Encoding for category %s..." % categ)
    writings_df[categ] = writings_df['all_tokens'].apply(partial(encode_liwc_categories, 
                                                                   category_words=liwc_dict[categ], 
                                                                   relative=True))


In [None]:
# pickle.dump(writings_per_user_df, open("writings_df_t2_test_liwc.pkl", "wb+"))

In [None]:
relevant_categs = ['posemo', 'negemo', 'anx', 'sad', 'affect', 'feel', 'social', 'health', 
                   'sexual', 'present', 'cogmech', 'inhib']
writings_df.groupby('subject').mean()[
    ['label%i'%i for i in range(21)] + relevant_categs].corr(
    'spearman')[relevant_categs]

In [None]:
list(writings_df.groupby('subject').min()[
    ['label%i'%i for i in range(21)] + list(liwc_dict.keys())].corr()[list(liwc_dict.keys())].mean().sort_values().index)

In [None]:
with open('writings_df_T2_liwc.pkl', 'wb+') as writings_file:
    pickle.dump(writings_df, writings_file)

## Feature extraction / analysis

#### Correlations with means

In [None]:
label_liwc_correlations_df = writings_df.groupby('subject').mean().corr(method='spearman')[
    ['label%d'%d for d in range(21)]]
label_liwc_correlations_df

In [None]:
# For every label, take the 5 categories with the highest correlation and
# count how often each category makes it into a top-5.
top_correlated_categs_overall_cnt = Counter()
for d in range(21):
    label_correlations = label_liwc_correlations_df['label%d'%d][list(categories)]
    top_correlated_categs = list(label_correlations.sort_values(ascending=False)[:5].index)
    print(d, top_correlated_categs)
    top_correlated_categs_overall_cnt.update(top_correlated_categs)
top_correlated_categs_overall = [k for k, v in top_correlated_categs_overall_cnt.most_common(10)]
top_correlated_categs_overall_cnt.most_common(10)


In [None]:
writings_df.groupby('subject').mean()[top_correlated_categs_overall]

In [None]:
sorted_categs_values = {}
for subject in set(writings_df.subject):
    sorted_categs_values[subject] = writings_df[writings_df['subject']==subject][top_correlated_categs_overall + ['date']].sort_values('date')

In [None]:
for categ in top_correlated_categs_overall:
    sorted_categs_values['subject2827'][categ].rolling(50).mean().plot(legend=True)


In [None]:
for categ in top_correlated_categs_overall:
    sorted_categs_values['subject9798'][categ].rolling(50).mean().plot(legend=True)


In [None]:
for categ in top_correlated_categs_overall:
    sorted_categs_values['subject9218'][categ].rolling(50).mean().plot(legend=True)

#### Correlations with deviations

In [None]:
column_functions = {'text': lambda t: " ".join(t), 
                                        'title': lambda t: " ".join(t),
                                        'text_len': 'sum',
                                        'title_len': 'sum'}
column_functions.update({'label%i'%i: 'min' for i in range(21)})
column_functions.update({categ: 'std' for categ in categories})
writings_df_categs_devs = writings_df.groupby('subject').aggregate(column_functions)[
    ['label%i'%i for i in range(21)] + list(categories)
]
#                                          'subset': 'min'})

In [None]:
writings_df_categs_devs

In [None]:
label_liwc_dev_correlations_df = writings_df_categs_devs.corr(method='spearman')[
    ['label%d'%d for d in range(21)]]
label_liwc_dev_correlations_df

In [None]:
# Same top-5 counting as for the means, using the correlations between the
# labels and the per-subject standard deviation of each category.
top_correlated_categs_overall_devs_cnt = Counter()
for d in range(21):
    label_dev_correlations = label_liwc_dev_correlations_df['label%d'%d][list(categories)]
    top_correlated_categs = list(label_dev_correlations.sort_values(ascending=False)[:5].index)
    print(d, top_correlated_categs)
    top_correlated_categs_overall_devs_cnt.update(top_correlated_categs)
top_correlated_categs_overall_devs = [k for k, v in top_correlated_categs_overall_devs_cnt.most_common(10)]
top_correlated_categs_overall_devs_cnt.most_common(10)


In [None]:
writings_df.groupby('subject').std()[top_correlated_categs_overall_devs]

In [None]:
sorted_categs_devs_values = {}
for subject in set(writings_df.subject):
    sorted_categs_devs_values[subject] = writings_df[writings_df['subject']==subject][top_correlated_categs_overall_devs + ['date']].sort_values('date')

In [None]:
for categ in top_correlated_categs_overall_devs:
    sorted_categs_devs_values['subject2827'][categ].rolling(50).mean().plot(legend=True)


In [None]:
for categ in top_correlated_categs_overall_devs[:2]:
    sorted_categs_devs_values['subject9694'][categ].rolling(50).mean().plot(legend=True)

In [None]:
for categ in top_correlated_categs_overall_devs[:2]:
    sorted_categs_devs_values['subject1272'][categ].rolling(50).mean().plot(legend=True)

In [None]:
writings_df.groupby('subject')[['label%d'%d for d in range(21)]].mean().sum(axis=1).sort_values()

In [None]:
from matplotlib import pyplot as plt
for categ in top_correlated_categs_overall_devs[:3]:
    sorted_categs_devs_values['subject3993'][categ].rolling(50).mean().plot(legend=True)    
plt.show()
for categ in top_correlated_categs_overall_devs[:3]:
    sorted_categs_devs_values['subject9454'][categ].rolling(50).mean().plot(legend=True)

In [None]:
low_subjects = ['subject9694', 'subject2903']
high_subjects = ['subject5897', 'subject436', 'subject9454']
all_subjects = list(set(writings_df.subject))[:10]
for subject in low_subjects+high_subjects:
#     color='k'
#     if subject in high_subjects:
#         color='b'
#     else:
#         color='y'
    plt.plot(sorted_categs_values[subject]['ipron'][:500].rolling(50).mean().values)#, color)
    
plt.legend(low_subjects+high_subjects)

In [None]:
bl