In [1]:
import pandas as pd
import xml.etree.ElementTree as ET
import glob, os
import numpy as np
from comet_ml import Experiment, Optimizer
import pickle
import logging
import sys
from sklearn.utils import class_weight

In [2]:
os.environ['TF_KERAS'] = '1'
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Lambda, BatchNormalization, TimeDistributed, \
    CuDNNLSTM, Bidirectional, Input, concatenate, Flatten, RepeatVector, Activation, Multiply, Permute
from tensorflow.keras import regularizers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import callbacks, optimizers
from tensorflow.keras import backend as K
from tensorflow.keras.utils import plot_model, Sequence

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

import tensorflow as tf

In [3]:
my_seed = 1234
tf.set_random_seed(my_seed)

In [4]:
logger = logging.getLogger('training')
logger.addHandler(logging.StreamHandler(sys.stdout))
logger.setLevel(logging.DEBUG)

# Read data

In [5]:
def read_subject_writings(subject_file):
    writings = []
    with open(subject_file) as sf:
        contents = sf.read()
        root = ET.fromstring(contents)
        try:
            subject = root.findall('ID')[0].text.strip()
        except Exception:
            print('Cannot extract ID', contents[:500], '\n-------\n')        
        for w in root.iter('WRITING'):
            subject_writings = {'subject': subject}
            for title in w.findall('TITLE'):
                subject_writings['title'] = title.text
            for text in w.findall('TEXT'):
                subject_writings['text'] = text.text
            for date in w.findall('DATE'):
                subject_writings['date'] = date.text
            writings.append(subject_writings)
    return writings

In [6]:
# root_dir = '/home/anasab/' 
root_dir = '/home/ana/'

In [7]:
datadir_T2 = root_dir + '/eRisk/data/eRisk2020_T2/eRisk2020_T2_TRAINING_DATA/'
labels_file_T2 = root_dir + '/eRisk/data/eRisk2020_T2/eRisk2020_T2_TRAINING_DATA/Depression Questionnaires_anon.txt'
nr_questions = 21

In [8]:
def read_texts(datadir_T2,
                labels_file_T2):
    writings = []
    writings_df = pd.DataFrame()
    labels_df = pd.DataFrame()

        
    for subject_file in os.listdir(datadir_T2):
        if not subject_file.startswith('subject'):
            continue
        writings.extend(read_subject_writings(os.path.join(datadir_T2, subject_file)))
    writings_df = pd.DataFrame(writings)
    
    labels_df = pd.read_csv(os.path.join(labels_file_T2), 
                                 delimiter='\s+', names=['subject'] + ['label%i' % i for i in range(nr_questions)])

    labels_df = labels_df.set_index('subject')
    
    writings_df = writings_df.join(labels_df, on='subject')
    
    return writings_df, labels_df

In [9]:
# writings_df, labels_df = read_texts(datadir_T2, labels_file_T2)
writings_df = pickle.load(open('writings_df_T2_liwc.pkl', 'rb'))

In [10]:
writings_df.groupby('subject').count()

Unnamed: 0_level_0,title,text,date,label0,label1,label2,label3,label4,label5,label6,...,feel,excl,future,nonfl,ppron,shehe,i,we,you,they
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
subject1272,120,120,120,120,120,120,120,120,120,120,...,120,120,120,120,120,120,120,120,120,120
subject2341,129,129,129,129,129,129,129,129,129,129,...,129,129,129,129,129,129,129,129,129,129
subject2432,332,332,332,332,332,332,332,332,332,332,...,332,332,332,332,332,332,332,332,332,332
subject2827,663,663,663,663,663,663,663,663,663,663,...,659,659,659,659,659,659,659,659,659,659
subject2903,313,313,313,313,313,313,313,313,313,313,...,313,313,313,313,313,313,313,313,313,313
subject2961,180,180,180,180,180,180,180,180,180,180,...,180,180,180,180,180,180,180,180,180,180
subject3707,1022,1022,1022,1022,1022,1022,1022,1022,1022,1022,...,1022,1022,1022,1022,1022,1022,1022,1022,1022,1022
subject3993,1510,1510,1510,1510,1510,1510,1510,1510,1510,1510,...,1509,1509,1509,1509,1509,1509,1509,1509,1509,1509
subject4058,1028,1028,1028,1028,1028,1028,1028,1028,1028,1028,...,1028,1028,1028,1028,1028,1028,1028,1028,1028,1028
subject436,29,29,29,29,29,29,29,29,29,29,...,29,29,29,29,29,29,29,29,29,29


In [11]:
writings_df

Unnamed: 0,subject,title,text,date,label0,label1,label2,label3,label4,label5,...,feel,excl,future,nonfl,ppron,shehe,i,we,you,they
0,subject5791,,"Great, thanks a ton!",2018-10-30 17:35:30,1,0,1,1,0,0,...,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0
1,subject5791,The search button gives me a 404 error,Just downloaded GBA4ios 2.1 and when I go to ...,2018-10-30 17:19:41,1,0,1,1,0,0,...,0.0,0.074468,0.074468,0.0,0.063830,0.0,0.063830,0.0,0.000000,0.0
2,subject5791,,Remindme! 1 week,2018-10-30 14:33:49,1,0,1,1,0,0,...,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0
3,subject5791,,Me too please,2018-10-19 18:06:38,1,0,1,1,0,0,...,0.0,0.000000,0.000000,0.0,0.333333,0.0,0.333333,0.0,0.000000,0.0
4,subject5791,,Any chance you can pm me what this spoiler is...,2018-10-19 18:04:14,1,0,1,1,0,0,...,0.0,0.027778,0.111111,0.0,0.138889,0.0,0.111111,0.0,0.027778,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10936,subject3993,Alternative Currency Being Considered in Penn...,,2009-01-07 18:41:30,0,0,0,0,0,0,...,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0
10937,subject3993,Asus' new keyboard. Oh wait... thats not a ke...,,2009-01-07 17:13:53,0,0,0,0,0,0,...,0.0,0.076923,0.076923,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0
10938,subject3993,Homeland Security USA - tripe to entertain mo...,,2009-01-07 07:09:19,0,0,0,0,0,0,...,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0
10939,subject3993,10 dead as Israeli missile hits near U.N. sch...,,2009-01-06 17:15:24,0,0,0,0,0,0,...,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0


## Preprocess text

In [12]:
tokenizer = RegexpTokenizer(r'\w+')

def tokenize(t):
    return tokenizer.tokenize(t.lower())

In [13]:
def tokenize_fields(writings_df):
    writings_df['tokenized_title'] = writings_df['title'].apply(lambda t: tokenize(t) 
                                                                if type(t)==str and t else None)
    writings_df['title_len'] = writings_df['tokenized_title'].apply(lambda t: len(t) 
                                                                    if type(t)==list and t else None)
    writings_df['tokenized_text'] = writings_df['text'].apply(lambda t: tokenize(t) 
                                                              if type(t)==str and t else None)
    writings_df['text_len'] = writings_df['tokenized_text'].apply(lambda t: len(t) 
                                                                  if type(t)==list and t else None)
    return writings_df

In [14]:
writings_df = tokenize_fields(writings_df)

In [15]:
writings_df.text_len.describe()

count    10409.000000
mean        50.365069
std         84.811676
min          1.000000
25%          9.000000
50%         24.000000
75%         54.000000
max       1567.000000
Name: text_len, dtype: float64

In [16]:
writings_df.title_len.describe()

count    1119.000000
mean       11.246649
std         6.979392
min         1.000000
25%         6.000000
50%        10.000000
75%        15.000000
max        51.000000
Name: title_len, dtype: float64

In [17]:
writings_df.groupby('subject').count().title.describe()

count      20.000000
mean      547.050000
std       446.144828
min        29.000000
25%       180.000000
50%       327.500000
75%      1006.250000
max      1510.000000
Name: title, dtype: float64

In [18]:
writings_df.groupby('subject').count().text.describe()

count      20.000000
mean      547.050000
std       446.144828
min        29.000000
25%       180.000000
50%       327.500000
75%      1006.250000
max      1510.000000
Name: text, dtype: float64

# Extract features

In [64]:
hyperparams_features = {
    "max_features": 20000,
    # cut texts after this number of words
    # (among top max_features most common words)
    "maxlen": 100,
    "embedding_dim": 50,
    "user_level": True,
    "posts_per_user": 10,
    "batch_size": 20,
}

#### Emotions

In [65]:
def load_NRC(nrc_path):
    word_emotions = {}
    emotion_words = {}
    with open(nrc_path) as in_f:
        for line in in_f:
            line = line.strip()
            if not line:
                continue
            word, emotion, label = line.split()
            if word not in word_emotions:
                word_emotions[word] = set()
            if emotion not in emotion_words:
                emotion_words[emotion] = set()
            label = int(label)
            if label:
                word_emotions[word].add(emotion)
                emotion_words[emotion].add(word)
    return emotion_words

nrc_lexicon_path = root_dir + '/resources/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt'
nrc_lexicon = load_NRC(nrc_lexicon_path)
emotions = list(nrc_lexicon.keys())


In [66]:
def encode_emotions(tokens, emotion_lexicon, emotions, relative=True):
    text_len = len(tokens)
    encoded_emotions = [0 for e in emotions]
    for i, emotion in enumerate(emotions):
        try:
            emotion_words = [t for t in tokens if t in emotion_lexicon[emotion]]
            if relative:
                encoded_emotions[i] = len(emotion_words) / len(tokens)
            else:
                encoded_emotions[i] = len(emotion_words)
        except ValueError:
            print("Emotion not found.")
    return encoded_emotions

In [67]:
from liwc_readDict import readDict

liwc = readDict(root_dir + '/resources/liwc.dic')

categories = set([c for (w,c) in liwc])
len(categories)

64

#### Personal pronouns

In [68]:
first_person_pronouns = {"i", "me", "my", "mine", "myself"}
def encode_pronouns(tokens, pronouns={"i", "me", "my", "mine", "myself"}, relative=True):
    if not tokens:
        return np.nan
    text_len = len(tokens)
    nr_pronouns = len([t for t in tokens if t in pronouns])
    if relative:
        return nr_pronouns/text_len
    else:
        return nr_pronouns

#### Stopwords

In [69]:
stopword_list = stopwords.words("english")
def encode_stopwords(tokens, stopwords=stopword_list):
    encoded_stopwords = [0 for s in stopword_list]
    if not tokens:
        return encoded_stopwords
    for i, stopword in enumerate(stopwords):
        if stopword in tokens:
            encoded_stopwords[i] += 1
    return encoded_stopwords

### Encode data

In [70]:
from collections import Counter
def encode_labels(labels):
    '''Convert ia to i and ib to -i'''
    encoded_labels = []
    for i, l in enumerate(labels):
        try:
            encoded_labels.append(int(l))
        except Exception as e:
            logger.debug("Encoding label %s\n" % l)
        
            if str(l)[-1] == 'a':
                encoded_labels.append(int(l[0]))
            elif str(l)[-1] == 'b':
                encoded_labels.append(-int(l[0]))
            else:
                logger.warning("Coult not encode label %s\n" % l)
    return encoded_labels

def load_erisk_data(writings_df, voc_size, emotion_lexicon, seq_len, emotions =  
                    ['anger', 'anticipation', 'disgust', 'fear', 'joy', 
                     'negative', 'positive', 'sadness', 'surprise', 'trust'],
                    liwc_categories = categories, ignore_features=[],
                    pronouns = ["i", "me", "my", "mine", "myself"],
                    train_prop=0.7, valid_prop=0.3, test_slice=2,
                    nr_slices=5,
                    min_post_len=3, min_word_len=1, 
                    user_level=True, vocabulary=None,
                   logger=logger):
    logger.debug("Loading data...\n")
    if not vocabulary:
        vocabulary = {}
        word_freqs = Counter()
        for words in writings_df.tokenized_text:
            word_freqs.update(words)
        for words in writings_df.tokenized_title:
            word_freqs.update(words)
        i = 1
        for w, f in word_freqs.most_common(voc_size-2): # keeping voc_size-1 for unk
            if len(w) < min_word_len:
                continue
            vocabulary[w] = i
            i += 1

    if 'subset' in writings_df.columns:
        training_subjects = list(set(writings_df[writings_df['subset']=='train'].subject))
        test_subjects = list(set(writings_df[writings_df['subset']=='test'].subject))
    else:
        all_subjects = sorted(list(set(writings_df.subject)))
        training_subjects_size = int(len(all_subjects) * train_prop)
        test_subjects_size = len(all_subjects) - training_subjects_size
        # Cross-validation, with fixed slice as input
        test_prop = 1-train_prop
        test_slice = min(test_slice, nr_slices)
        logger.debug("start index: %f, from %f\n" % (
            len(all_subjects)*(1/nr_slices)*test_slice, test_prop*test_slice))
        start_slice = int(len(all_subjects)*(1/nr_slices)*test_slice)
        test_subjects = all_subjects[start_slice: start_slice+test_subjects_size]
        training_subjects = [s for s in all_subjects if s not in test_subjects]
    training_subjects = sorted(training_subjects) # ensuring reproducibility
    valid_subjects_size = int(len(training_subjects) * valid_prop)
    valid_subjects = training_subjects[:valid_subjects_size]
    training_subjects = training_subjects[valid_subjects_size:]
    categories = [c for c in liwc_categories if c in writings_df.columns]
    logger.debug("%d training users, %d validation users, %d test users." % (
        len(training_subjects), 
          len(valid_subjects),
          len(test_subjects)))
    subjects_split = {'train': training_subjects, 
                      'valid': valid_subjects, 
                      'test': test_subjects}

    user_level_texts = {}
    for row in writings_df.sort_values(by='date').itertuples():
        words = []
        if row.tokenized_title:
            words.extend(row.tokenized_title)
        if row.tokenized_text:
            words.extend(row.tokenized_text)
        if not words or len(words)<min_post_len:
            print(row.subject)
            continue
        labels = [getattr(row, 'label%d'%i) for i in range(nr_questions)]
        liwc_categs = [getattr(row, categ) for categ in categories]
        if row.subject not in user_level_texts.keys():
            user_level_texts[row.subject] = {}
            user_level_texts[row.subject]['texts'] = [words]
            user_level_texts[row.subject]['labels'] = encode_labels(labels)
            user_level_texts[row.subject]['liwc'] = [liwc_categs]
        else:
            user_level_texts[row.subject]['texts'].append(words)
            user_level_texts[row.subject]['liwc'].append(liwc_categs)
    return user_level_texts, subjects_split, vocabulary


In [71]:
user_level_data, subjects_split, vocabulary = load_erisk_data(writings_df, 
                                                            seq_len=hyperparams_features['maxlen'],
                                                            voc_size=hyperparams_features['max_features'],
                                                           emotion_lexicon=nrc_lexicon,
                                                           emotions=emotions,
                                                           user_level=hyperparams_features['user_level'],
                                                                                logger=logger
#                                                            vocabulary=pickle.load(open('vocabulary20K_selfharm.pkl', 'rb'))
                                                                               )

Loading data...



DEBUG:training:Loading data...



start index: 8.000000, from 0.600000



DEBUG:training:start index: 8.000000, from 0.600000



10 training users, 4 validation users, 6 test users.


DEBUG:training:10 training users, 4 validation users, 6 test users.


subject3993
subject3993
subject3993
subject9798
subject9798
subject9798
subject9798
subject9798
subject9798
subject9798
subject9798
subject2903
subject9798
subject6619
subject2903
subject2903
subject9798
subject6619
subject6619
subject6619
subject6619
subject6619
subject6619
subject6619
subject6619
subject6619
subject6619
subject3993
subject5791
subject6619
subject5791
subject9798
subject6619
subject9798
subject6619
subject6619
subject6619
subject6619
subject6619
subject6619
subject5791
subject7039
subject5791
subject5791
subject6619
subject6619
subject6619
subject6619
subject6619
subject6635
subject6619
subject6635
subject6619
subject6635
subject6635
subject7039
subject3993
subject6635
subject6619
subject7039
subject9694
subject6619
subject7039
subject7039
subject7039
subject7039
subject7039
subject7039
subject7039
subject7039
subject7039
subject6619
subject7039
subject9694
subject6635
subject6635
subject7039
subject9694
subject9694
subject9694
subject9694
subject9694
subject7039
subj

subject6900
subject6900
subject6900
subject6900
subject9798
subject3993
subject9694
subject6900
subject9694
subject9694
subject2903
subject4058
subject3707
subject3707
subject9694
subject9694
subject9694
subject9798
subject3993
subject9694
subject2903
subject6900
subject6900
subject6900
subject9798
subject2903
subject2903


In [72]:
def load_embeddings(path, embedding_dim, voc):
    # random matrix with mean value = 0
    embedding_matrix = np.random.random((len(voc)+2, embedding_dim)) - 0.5 # voc + unk + pad value(0)

    f = open(path)
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_i = voc.get(word)
        if word_i is not None:
            embedding_matrix[word_i] = coefs
    f.close()

    print('Total %s word vectors.' % len(embedding_matrix))

 
    return embedding_matrix

pretrained_embeddings_path = root_dir + '/resources/glove.twitter.27B/glove.twitter.27B.%dd.txt' % hyperparams_features['embedding_dim']
embedding_matrix = load_embeddings(pretrained_embeddings_path, hyperparams_features['embedding_dim'], vocabulary)


Total 20000 word vectors.


## Data Generator

In [86]:
class DataGenerator(Sequence):
    'Generates data for Keras'
    def __init__(self, user_level_data, batch_size=hyperparams_features['batch_size'], 
                 seq_len=hyperparams_features['maxlen'], voc_size=hyperparams_features['max_features'], 
                 emotion_lexicon=nrc_lexicon, set_type='train', test_user_index=0,
                 emotions=emotions, pronouns=["i", "me", "my", "mine", "myself"], 
                 max_posts_per_user=hyperparams_features['posts_per_user'],
                 shuffle=True):
        'Initialization'
        self.seq_len = seq_len
        self.emotion_lexicon = emotion_lexicon
        self.batch_size = batch_size
        self.data = user_level_data
        self.all_users = list(self.data.keys())
        self.emotions = emotions
        self.pronouns = pronouns
        self.set = set_type
        self.shuffle = shuffle
        self.voc_size = voc_size
        self.max_posts_per_user = max_posts_per_user
        self.test_user_index = test_user_index
        self.on_epoch_end()

    def __encode_text(self, tokens):
        # Using voc_size-1 value for OOV token
        encoded_tokens = [vocabulary.get(w, self.voc_size-1) for w in tokens]
        encoded_emotions = encode_emotions(tokens, self.emotion_lexicon, self.emotions)
        encoded_pronouns = encode_pronouns(tokens, self.pronouns)
        encoded_stopwords = encode_stopwords(tokens)
        return (encoded_tokens, encoded_emotions, encoded_pronouns, encoded_stopwords)
        
    def __len__(self):
        'Denotes the number of batches per epoch'
        if self.set == 'test':
            return 1
        return int(np.floor(len(self.data) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        if self.set == 'test':
            users = [self.all_users[self.test_user_index]]
        else:
            # Generate indexes of the batch
            user_indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
            # Find users

            users = [self.all_users[i] for i in user_indexes
                     if i!=self.test_user_index
                    ]

        post_indexes = {}
        # Sample post ids
        for subject in users:
            posts_len = len(self.data[subject]['texts'])
            posts_index_sample = sorted(np.random.choice(posts_len, 
                                                         min(self.max_posts_per_user, posts_len),
                                                         replace=False))
            post_indexes[subject] = posts_index_sample
        # Generate data
        X, y = self.__data_generation(users, post_indexes)

        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.data))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, users, post_indexes):
        'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
        tokens_data = []
        categ_data = []
        sparse_data = []
        subjects = []
        labels = []
        for subject in users:
            texts = self.data[subject]['texts']
            label = self.data[subject]['labels']
            
            # Sample
            texts = [texts[i] for i in post_indexes[subject]]
            liwc_selection = [self.data[subject]['liwc'][i] for i in post_indexes[subject]]

            all_words = [sum(texts, [])] # merge all texts in one list
            liwc_aggreg = [np.array(liwc_selection).mean(axis=0).tolist()]

            for i, words in enumerate(all_words):
                encoded_tokens, encoded_emotions, encoded_pronouns, encoded_stopwords = self.__encode_text(words)
                subject_id = int(subject.split('t')[1])
                tokens_data.append(encoded_tokens)
                categ_data.append(encoded_emotions + [encoded_pronouns] + liwc_aggreg[i])
                sparse_data.append(encoded_stopwords)
                labels.append(label)
                subjects.append(subject_id)

        
        # using zeros for padding
        tokens_data_padded = sequence.pad_sequences(tokens_data, maxlen=self.seq_len)

        return ([np.array(tokens_data_padded), np.array(categ_data), np.array(sparse_data),
                np.array(subjects)],
                np.array(labels).reshape(-1, len(labels)).tolist()) # to have one array per output

In [87]:
# TODO: Don't split into the 3 sets, do leave-one-out cross-validation

In [88]:
logger.setLevel(logging.DEBUG)

# TODO: it is skipping the last batch
x_data = {'train': [], 'test': []}
y_data = {'train': [], 'test': []}
test_user_index = np.random.randint(len(user_level_data))
for set_type in ['train', 'test']:
    for x, y in DataGenerator(user_level_data, 
                                          set_type=set_type,
                             test_user_index=test_user_index):
        x_data[set_type].append(x)
        y_data[set_type].append(y)


In [89]:
y_data['test'][0]

[[0.0],
 [0.0],
 [0.0],
 [1.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [1.0],
 [0.0],
 [0.0],
 [1.0],
 [-2.0],
 [0.0],
 [0.0],
 [2.0],
 [1.0],
 [0.0]]

In [90]:
np.array([[1,2],[3,4]]).reshape(2,-1)

array([[1, 2],
       [3, 4]])

In [91]:
sum([len(subjects_split[s]) for s in ['train', 'valid', 'test']])

20

In [92]:
x_data['test']

[[array([[  206,   174,   399,    15,     2,   757, 11823,    14, 19060,
              2,   214,     4,  8411,    18, 19061,   278,    15,     7,
             14,    25,  1927,   969,  5554,   218,     2,   140,   720,
              9,    52,    34,     1,   105,    59,  2011,  3078,    77,
           5427,    12,  1180,  4866,  5428,     1,   806,   494,  5429,
            495,  1826,  1461,  1180,   418,     5,  2876,  5430,     2,
            119,   446,    19,  3078,    15,   253,    52,   365,    13,
             57,   547,  3495,    34,    81,   322,     1,   161,  2712,
             55,    91,  2881,  3425,   225,   164,  1080,    35,  3425,
             10,  3495,   472,     3,  1691,     1,   708,     8,     1,
            495,   803,   495,  1826,  1461,     5,  5431,     4,  4782,
            744]], dtype=int32),
  array([[0.02380952, 0.01984127, 0.01190476, 0.03174603, 0.02380952,
          0.0515873 , 0.04365079, 0.02777778, 0.01587302, 0.02380952,
          0.03571429, 0.

In [93]:
y_data

{'train': [[[1.0,
    1.0,
    0.0,
    1.0,
    0.0,
    0.0,
    2.0,
    0.0,
    0.0,
    0.0,
    0.0,
    1.0,
    0.0,
    0.0,
    0.0,
    0.0,
    2.0,
    0.0,
    0.0],
   [0.0,
    2.0,
    1.0,
    3.0,
    3.0,
    2.0,
    3.0,
    2.0,
    2.0,
    2.0,
    2.0,
    1.0,
    1.0,
    3.0,
    1.0,
    3.0,
    2.0,
    1.0,
    2.0],
   [1.0,
    2.0,
    1.0,
    0.0,
    1.0,
    3.0,
    3.0,
    2.0,
    1.0,
    3.0,
    3.0,
    3.0,
    2.0,
    3.0,
    2.0,
    3.0,
    3.0,
    2.0,
    2.0],
   [-3.0,
    1.0,
    3.0,
    3.0,
    3.0,
    0.0,
    2.0,
    2.0,
    1.0,
    2.0,
    1.0,
    0.0,
    3.0,
    2.0,
    1.0,
    2.0,
    2.0,
    3.0,
    1.0],
   [2.0,
    2.0,
    2.0,
    0.0,
    2.0,
    2.0,
    1.0,
    1.0,
    1.0,
    3.0,
    0.0,
    1.0,
    0.0,
    0.0,
    0.0,
    1.0,
    1.0,
    2.0,
    3.0],
   [0.0,
    0.0,
    2.0,
    2.0,
    -3.0,
    1.0,
    -1.0,
    2.0,
    1.0,
    3.0,
    0.0,
    0.0,
    0.0,
    1.0,
  

# Train

In [94]:
hyperparams = {
    'lstm_units': 0,
    'dense_bow_units': 20,
    'dropout': 0.0,
    'l2_dense': 0.00000011,
    'l2_embeddings': 0.000001,
    'dense_sentence_units': 100,
    'optimizer': 'adam',
    'decay': 0.00001,
    'lr': 0.001,
    "trainable_embeddings": True,
    "reduce_lr_factor": 0.0002,
    "reduce_lr_patience": 1000,
    "freeze_patience": 500,
    'threshold': 0.5,
    'ignore_layer': ['lstm_layers', 'batchnorm'],
    'norm_momentum': 0.1,

}
if not hyperparams['optimizer']:
    hyperparams['optimizer'] = optimizers.Adam(lr=hyperparams['lr'], #beta_1=0.9, beta_2=0.999, epsilon=0.0001,
                                   decay=hyperparams['decay'])

In [95]:
# class Metrics():
#     def __init__(self, threshold=0.5):
#         self.threshold=threshold
        
#     def recall_m(self, y_true, y_pred):
#             y_labels = y_true
#             y_pred = K.cast(K.greater(K.clip(y_pred, 0, 1), self.threshold), K.floatx())        
#             possible_positives = K.sum(K.round(K.clip(y_labels, 0, 1)))
#             true_positives = K.sum(K.round(K.clip(y_labels * y_pred, 0, 1)))
#             recall = true_positives / (possible_positives + K.epsilon())
#             return recall

#     def precision_m(self, y_true, y_pred):
#             y_labels = y_true
#             y_pred = K.cast(K.greater(K.clip(y_pred, 0, 1), self.threshold), K.floatx())        
#             true_positives = K.sum(K.round(K.clip(y_labels * y_pred, 0, 1)))
#             predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
#             precision = true_positives / (predicted_positives + K.epsilon())
#             return precision

#     def f1_m(self, y_true, y_pred):
#         precision = self.precision_m(y_true, y_pred)
#         recall = self.recall_m(y_true, y_pred)
#         return 2*((precision*recall)/(precision+recall+K.epsilon()))

# def binary_crossentropy_custom(y_true, y_pred):
#     y_labels = y_true
#     return K.binary_crossentropy(y_labels, 
#                                  y_pred)

# metrics_class = Metrics(threshold=hyperparams['threshold'])

In [98]:
def build_model(hyperparams, hyperparams_features, embedding_matrix, emotions, stopwords_list,
                liwc_categories, nr_classes,
               ignore_layer=[]):

    tokens_features = Input(shape=(hyperparams_features['maxlen'],), name='word_seq')
    embedding_layer = Embedding(hyperparams_features['max_features'], 
                                hyperparams_features['embedding_dim'], 
                                input_length=hyperparams_features['maxlen'],
                                embeddings_regularizer=regularizers.l2(hyperparams['l2_embeddings']),
                                weights=[embedding_matrix], 
                                trainable=hyperparams['trainable_embeddings'],
                               name='embeddings_layer')(
        tokens_features)
#     if 'batchnorm' not in ignore_layer:
#         embedding_layer_norm = BatchNormalization(axis=-1, momentum=hyperparams['norm_momentum'],
#                                                      name='embeddings_layer_norm')(embedding_layer)
#     lstm_layers = Bidirectional(LSTM(hyperparams['lstm_units']))(embedding_layer)

    if tf.test.is_gpu_available():
        lstm_layers = CuDNNLSTM(hyperparams['lstm_units'], 
                                return_sequences='attention' not in ignore_layer, # only True if using attention
                      name='LSTM_layer')(embedding_layer)
    else:
        lstm_layers = LSTM(hyperparams['lstm_units'], 
                           return_sequences='attention' not in ignore_layer,
                      name='LSTM_layer')(embedding_layer)
    
    # Attention
    if 'attention' not in ignore_layer:
        attention = Dense(1, activation='tanh', name='attention')(lstm_layers)
        attention = Flatten()(attention)
        attention = Activation('softmax')(attention)
        attention = RepeatVector(hyperparams['lstm_units'])(attention)
        attention = Permute([2, 1])(attention)

        sent_representation = Multiply()([lstm_layers, attention])
        sent_representation = Lambda(lambda xin: K.sum(xin, axis=1), 
                                     output_shape=(hyperparams['lstm_units'],)
                                    )(sent_representation)

        
    else:
        sent_representation = lstm_layers
        
    
    sent_representation = Dropout(hyperparams['dropout'], name='lstm_att_dropout')(sent_representation)
    if hyperparams['dense_sentence_units']:
        sent_representation = Dense(units=hyperparams['dense_sentence_units'],
                                   name='dense_sent_representation')(sent_representation)
    numerical_features = Input(shape=(len(emotions) + 1 + len(liwc_categories),), name='numeric_input') # emotions and pronouns
    dense_layer = Dense(units=1,
                        kernel_regularizer=regularizers.l2(hyperparams['l2_dense']),
                        name='numerical_dense_layer',
                       )(numerical_features)
    sparse_features = Input(shape=(len(stopwords_list),), name='sparse_input') # stopwords

    if hyperparams['dense_bow_units']:
        dense_layer_sparse = Dense(units=hyperparams['dense_bow_units'],
                              name='sparse_feat_dense_layer',
                                kernel_regularizer=regularizers.l2(hyperparams['l2_dense']),
                              )(sparse_features)
    else:
        dense_layer_sparse = sparse_features
    
    if 'batchnorm' not in ignore_layer:
        numerical_features_norm = BatchNormalization(axis=1, momentum=hyperparams['norm_momentum'],
                                                     name='numerical_features_norm')(numerical_features)
        sent_representation_norm = BatchNormalization(axis=1, momentum=hyperparams['norm_momentum'],
                                                      name='sent_repr_norm')(sent_representation)

        dense_layer_sparse_norm = BatchNormalization(axis=1, momentum=hyperparams['norm_momentum'],
                                                         name='sparse_features_norm')(dense_layer_sparse)
        
    subjects = Input(shape=(1,), name='subjects')
    

    all_layers = {
        'lstm_layers': sent_representation,
        'numerical_dense_layer': numerical_features,
        'sparse_feat_dense_layer': dense_layer_sparse
    }
    if 'batchnorm' not in ignore_layer:
        all_layers = {
            'lstm_layers': sent_representation_norm,
            'numerical_dense_layer': numerical_features_norm,
            'sparse_feat_dense_layer': dense_layer_sparse_norm
        }
    layers_to_merge = []
    for n, l in all_layers.items():
        if n in ignore_layer:
            continue
        layers_to_merge.append(l)
        
    if len(layers_to_merge) == 1:
        merged_layers = layers_to_merge[0]
    else:
        merged_layers = concatenate(layers_to_merge)
    output_layers = []
    for label in range(nr_questions):
        output_layer = Dense(1, activation='softmax',
                         name='output_layer%d' % label,
                        kernel_regularizer=regularizers.l2(hyperparams['l2_dense']))(merged_layers)
        output_layers.append(output_layer)

    # Compile model
    model = Model(inputs=[tokens_features, numerical_features, sparse_features, subjects], 
                  outputs=output_layers)

    model.compile(hyperparams['optimizer'], {'output_layer%d'%i: 'mean_squared_error' for i in range(nr_questions)},
                  metrics={'output_layer%d' % label: ['accuracy', 'mean_squared_error'] for label in range(nr_questions)})
    return model



In [99]:
model = build_model(hyperparams, hyperparams_features, embedding_matrix, emotions, stopword_list,
                    liwc_categories=[c for c in categories if c in writings_df.columns],
                    nr_classes=nr_questions,
                   ignore_layer=hyperparams['ignore_layer'])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
sparse_input (InputLayer)       (None, 179)          0                                            
__________________________________________________________________________________________________
numeric_input (InputLayer)      (None, 75)           0                                            
__________________________________________________________________________________________________
sparse_feat_dense_layer (Dense) (None, 20)           3600        sparse_input[0][0]               
__________________________________________________________________________________________________
concatenate_10 (Concatenate)    (None, 95)           0           numeric_input[0][0]              
                                                                 sparse_feat_dense_layer[0][0]    
__________

In [100]:
def train_model(model, 
                data_generator_train, data_generator_valid,
                epochs, start_epoch=0, workers=4,
                callback_list = [],
                model_path='/tmp/model',
               verbose=1):
    logging.info('Train...')
    experiment.log_parameter('callbacks', callbacks)

    history = model.fit_generator(data_generator_train,
#               batch_size=batch_size,
#                 steps_per_epoch=steps_per_epoch,
              epochs=epochs, initial_epoch=start_epoch, 
              validation_data=data_generator_valid,
                        verbose=verbose,
#               validation_split=0.3,
                       workers=workers,
            callbacks = [
                callbacks.ModelCheckpoint(filepath='%s_best' % model_path, verbose=1, 
                                          save_best_only=True),
                callbacks.EarlyStopping(patience=500), *callback_list
            ])
    model.save(model_path)
    experiment.log_parameter('model_path', model_path)
    return model, history

In [101]:
experiment = Experiment(api_key="eoBdVyznAhfg3bK9pZ58ZSXfv",
                        project_name="mental", workspace="ananana", disabled=False)

experiment.log_parameters(hyperparams_features)

experiment.log_parameter('emotion_lexicon', nrc_lexicon_path)
experiment.log_parameter('emotions', emotions)
experiment.log_parameter('embeddings_path', pretrained_embeddings_path)

experiment.add_tag('T2')
experiment.log_parameters(hyperparams)

COMET INFO: ----------------------------
COMET INFO: Comet.ml Experiment Summary:
COMET INFO:   Data:
COMET INFO:     url: https://www.comet.ml/ananana/mental/e5cb467425c84e0cbd402308db11a519
COMET INFO:   Metrics [count] (min, max):
COMET INFO:     sys.cpu.percent.01 [11] : (33.4, 100.0)
COMET INFO:     sys.cpu.percent.02 [11] : (25.4, 63.0)
COMET INFO:     sys.cpu.percent.03 [11] : (31.9, 65.0)
COMET INFO:     sys.cpu.percent.04 [11] : (30.0, 64.6)
COMET INFO:     sys.cpu.percent.avg [11]: (31.799999999999997, 67.0)
COMET INFO:     sys.gpu.0.total_memory  : (1073414144.0, 1073414144.0)
COMET INFO:     sys.load.avg [11]       : (1.55, 3.55)
COMET INFO:     sys.ram.total [11]      : (8277311488.0, 8277311488.0)
COMET INFO:     sys.ram.used [11]       : (6389862400.0, 7427514368.0)
COMET INFO:   Other [count]:
COMET INFO:     trainable_params: 5616
COMET INFO: ----------------------------
COMET INFO: old comet version (3.0.2) detected. current: 3.1.0 please update your comet lib with co

In [None]:
test_user_index = np.random.randint(len(user_level_data))
data_generator_train = DataGenerator(user_level_data, set_type='train', test_user_index=test_user_index)
data_generator_valid = DataGenerator(user_level_data, set_type='test',  test_user_index=test_user_index)
model, history = train_model(model, data_generator_train, data_generator_valid,
           epochs=1000, start_epoch=0,
                      callback_list = [],
                      model_path='models/mlp_t21', workers=1)

# Extra analysis


### Extract LIWC

In [42]:
def merge_tokens(row):
    tokens = []
    if row.tokenized_text:
        tokens += row.tokenized_text
    if row.tokenized_title:
        tokens += row.tokenized_title
    return tokens
writings_df['all_tokens'] = writings_df.apply (lambda row: merge_tokens(row), axis=1)

In [43]:
# TODO: include the title
def extract_emotions(tokens, emotion, relative=True):
    if not tokens:
        return None
    emotion_words = [t for t in tokens 
                     if t in nrc_lexicon[emotion]]
    if relative:
        return len(emotion_words) / len(tokens)
    else:
        return len(emotion_words)
    
    return encoded_emotions

from functools import partial
for emotion in emotions:
    writings_df[emotion] = writings_df['all_tokens'].apply(partial(extract_emotions, emotion=emotion, 
                                                                   relative=True))

In [44]:
writings_df['pronouns'] = writings_df['all_tokens'].apply(partial(encode_pronouns, relative=True))

In [46]:
writings_df[['label%i'%i for i in range(21)] + ['text', 'pronouns', 'text_len'] + emotions].corr('spearman')

Unnamed: 0,label0,label1,label2,label3,label4,label5,label6,label7,label8,label9,...,anger,anticipation,disgust,fear,joy,negative,positive,sadness,surprise,trust
label0,1.0,0.780066,0.394792,0.660577,0.327117,0.345621,0.455499,0.735686,0.309227,0.635479,...,0.034671,0.015535,0.017945,0.039971,0.0044,0.021333,-0.029398,0.049543,-0.004456,-0.027378
label1,0.780066,1.0,0.400175,0.726685,0.425165,0.533214,0.535219,0.860917,0.664714,0.775183,...,0.07413,0.077976,0.064694,0.09635,0.070177,0.044671,0.011881,0.095107,0.053556,0.022792
label2,0.394792,0.400175,1.0,0.422413,0.842698,0.596555,0.534411,0.641306,0.258487,0.236198,...,-0.100216,-0.101196,-0.048849,-0.109288,-0.039814,-0.102508,-0.059662,-0.072564,-0.090881,-0.094225
label3,0.660577,0.726685,0.422413,1.0,0.401902,0.601494,0.658633,0.779561,0.710596,0.562745,...,0.047958,0.025483,0.043743,0.052658,0.031434,0.024096,0.011029,0.05228,0.009147,0.012804
label4,0.327117,0.425165,0.842698,0.401902,1.0,0.458988,0.506148,0.601865,0.190326,0.073146,...,-0.092402,-0.097064,-0.041111,-0.094764,-0.04275,-0.086652,-0.035937,-0.060032,-0.083494,-0.061473
label5,0.345621,0.533214,0.596555,0.601494,0.458988,1.0,0.421742,0.620465,0.601454,0.546038,...,-0.026898,-0.01271,-0.02851,-0.026102,0.01203,-0.037177,-0.001477,-0.005002,-0.02543,-0.018877
label6,0.455499,0.535219,0.534411,0.658633,0.506148,0.421742,1.0,0.691181,0.394583,0.205096,...,0.006807,0.005744,0.063409,-0.008144,0.067497,-0.003552,0.032044,0.026211,-0.005417,0.01468
label7,0.735686,0.860917,0.641306,0.779561,0.601865,0.620465,0.691181,1.0,0.665261,0.682315,...,0.061217,0.041191,0.067184,0.065648,0.056784,0.039552,0.017343,0.080756,0.023736,0.014293
label8,0.309227,0.664714,0.258487,0.710596,0.190326,0.601454,0.394583,0.665261,1.0,0.701349,...,0.09839,0.074703,0.075011,0.117817,0.07412,0.072291,0.043375,0.090285,0.062752,0.047916
label9,0.635479,0.775183,0.236198,0.562745,0.073146,0.546038,0.205096,0.682315,0.701349,1.0,...,0.085899,0.075637,0.04698,0.09993,0.053006,0.05527,-0.003065,0.088706,0.046994,0.007802


In [59]:
# writings_df['label15'] = writings_df['label15'].apply(lambda l: encode_labels([l])[0])

In [61]:
# writings_df['label17'] = writings_df['label17'].apply(lambda l: encode_labels([l])[0])

In [75]:
writings_df.groupby('subject').mean()[
    ['label%i'%i for i in range(21)] + ['pronouns', 'text_len'] + emotions].corr(
    'spearman')[['pronouns', 'text_len'] + emotions]

Unnamed: 0,pronouns,text_len,anger,anticipation,disgust,fear,joy,negative,positive,sadness,surprise,trust
label0,0.516969,-0.072057,0.342691,-0.045245,0.08295,0.571431,0.109762,0.276499,0.017595,0.567242,-0.17344,0.042732
label1,0.258942,-0.184847,0.258162,0.078775,0.070195,0.502286,-0.00156,0.203566,-0.049917,0.474208,0.042897,-0.023398
label2,0.418806,-0.46534,0.077557,0.155113,0.224914,0.349005,0.41105,0.124091,0.302471,0.449829,0.349005,0.23267
label3,0.342212,-0.177381,0.251011,0.039325,-0.033468,0.621671,0.249338,0.2234,0.264399,0.628365,-0.025101,0.296193
label4,0.01872,-0.243359,-0.003256,0.026045,0.013836,0.275101,0.241731,0.076508,0.354051,0.425675,0.132667,0.345098
label5,0.156537,-0.437907,0.33487,0.295241,0.299204,0.426018,0.303167,0.382426,0.390352,0.536981,0.354685,0.330907
label6,0.531552,-0.111129,0.285981,-0.07305,0.190395,0.442183,0.446068,0.335717,0.4212,0.520672,-0.086261,0.355922
label7,0.464098,-0.210246,0.476557,0.247623,0.211803,0.728852,0.420491,0.445409,0.412705,0.80672,0.132377,0.395573
label8,0.321705,-0.278811,0.385221,0.210345,0.164977,0.546898,0.168276,0.293659,0.158378,0.561746,0.115484,0.080014
label9,0.459068,-0.293339,0.653799,0.301626,0.362945,0.626454,0.310741,0.489728,0.21379,0.628111,-0.008286,0.228705


Unnamed: 0_level_0,label0,label1,label2,label3,label4,label5,label6,label7,label8,label9,...,label11,label12,label13,label14,label15,label16,label17,label18,label19,label20
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
subject1272,1,2,2,1,0,1,3,2,1,3,...,1,2,2,2,-3,0,3,1,2,2
subject2341,1,2,3,2,3,2,3,3,1,0,...,2,3,3,2,-2,2,-3,2,2,0
subject2432,1,3,3,2,3,2,2,2,2,1,...,3,1,3,2,1,2,1,2,1,0
subject2827,1,3,3,2,3,2,2,3,2,2,...,1,2,3,1,-2,1,0,2,2,1
subject2903,0,0,1,1,0,0,1,0,0,0,...,0,0,0,1,0,0,-1,0,1,2
subject2961,1,1,1,1,0,0,1,1,1,1,...,1,1,0,2,-1,0,-3,1,1,1
subject3707,1,3,0,1,0,0,0,1,1,2,...,0,0,2,2,-3,1,-1,2,1,3
subject3993,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,-1,0,0,1
subject4058,0,0,0,1,0,0,0,0,1,0,...,0,0,0,3,2,1,0,0,3,0
subject436,2,1,2,3,2,0,3,3,1,2,...,3,2,2,3,2,1,-2,1,3,2


In [65]:
writings_df.corrwith?

In [76]:
from liwc_readDict import readDict

liwc = readDict('/home/ana/resources/FakeOrFact/features/LIWC/LIWC/liwc.dic')
categories = [c for (w,c) in liwc]
set(categories)
liwc_dict = {}
for (w, c) in liwc:
    if c not in liwc_dict:
        liwc_dict[c] = []
    liwc_dict[c].append(w)

In [77]:
def encode_liwc_categories(tokens, category_words, relative=True):
    category_cnt = 0
    if not tokens:
        return None
    text_len = len(tokens)
    for t in tokens:
        for word in category_words:
            if t==word or (word[-1]=='*' and t.startswith(word[:-1])) \
            or (t==word.split("'")[0]):
                category_cnt += 1
                break # one token cannot belong to more than one word in the category
    if relative:
        return category_cnt/text_len
    else:
        return category_cnt

In [78]:
%%time
from functools import partial
# for categ in ['negemo', 'posemo', 'affect', 'sad', 'anx', 'pronoun']:#liwc_dict.keys():
for categ in liwc_dict.keys():
    if categ in writings_df.columns:
        continue
    print("Encoding for category %s..." % categ)
    writings_df[categ] = writings_df['all_tokens'].apply(partial(encode_liwc_categories, 
                                                                   category_words=liwc_dict[categ], 
                                                                   relative=True))


Encoding for category funct...
Encoding for category article...
Encoding for category affect...
Encoding for category negemo...
Encoding for category sad...
Encoding for category cogmech...
Encoding for category inhib...
Encoding for category bio...
Encoding for category body...
Encoding for category achieve...
Encoding for category health...
Encoding for category sexual...
Encoding for category adverb...
Encoding for category preps...
Encoding for category space...
Encoding for category relativ...
Encoding for category time...
Encoding for category work...
Encoding for category certain...
Encoding for category assent...
Encoding for category posemo...
Encoding for category insight...
Encoding for category verb...
Encoding for category past...
Encoding for category money...
Encoding for category percept...
Encoding for category social...
Encoding for category friend...
Encoding for category motion...
Encoding for category cause...
Encoding for category leisure...
Encoding for category 

In [106]:
relevant_categs = ['posemo', 'negemo', 'anx', 'sad', 'affect', 'feel', 'social', 'health', 
                   'sexual', 'present', 'cogmech', 'inhib']
writings_df.groupby('subject').mean()[
    ['label%i'%i for i in range(21)] + relevant_categs].corr(
    'spearman')[relevant_categs]

Unnamed: 0,posemo,negemo,anx,sad,affect,feel,social,health,sexual,present,cogmech,inhib
label0,0.01173,0.326771,0.21701,0.172602,0.222875,0.238794,-0.268958,0.483454,0.266444,0.366152,0.27985,0.073733
label1,0.035878,0.316658,0.159109,0.258942,0.209026,0.372815,0.253483,0.513205,0.281561,0.44301,0.209806,0.14585
label2,0.426562,0.240426,0.023267,0.085312,0.46534,0.294715,-0.046534,0.224914,0.434317,0.255937,0.124091,-0.062045
label3,0.153954,0.291173,0.250175,0.188258,0.278623,0.114628,0.039325,0.355599,0.058569,0.459351,0.371497,0.118812
label4,0.174177,0.157085,0.022789,0.097669,0.290566,0.109878,-0.100111,0.035812,0.109064,0.139992,0.153015,0.048021
label5,0.354685,0.453759,-0.017833,0.215981,0.517166,0.247685,0.4815,0.156537,0.219944,0.358648,0.001981,-0.105018
label6,0.12434,0.404104,0.207492,0.125117,0.384676,0.36447,0.156979,0.33494,0.254119,0.399441,0.319397,-0.246348
label7,0.267869,0.524836,0.327049,0.30836,0.53418,0.404918,0.101229,0.543524,0.25541,0.576229,0.378442,0.101229
label8,0.190548,0.332428,0.300258,0.128682,0.358825,0.345627,0.308507,0.478433,0.179,0.623613,0.284585,0.118783
label9,0.043918,0.543589,0.365431,0.081207,0.461554,0.265166,0.189759,0.635569,0.339743,0.503815,0.03066,0.031488


In [105]:
list(writings_df.groupby('subject').min()[
    ['label%i'%i for i in range(21)] + list(liwc_dict.keys())].corr()[list(liwc_dict.keys())].mean().sort_values().index)

['funct',
 'article',
 'affect',
 'negemo',
 'sad',
 'cogmech',
 'inhib',
 'bio',
 'body',
 'achieve',
 'health',
 'sexual',
 'adverb',
 'preps',
 'space',
 'relativ',
 'time',
 'work',
 'certain',
 'assent',
 'anger',
 'posemo',
 'insight',
 'verb',
 'past',
 'money',
 'percept',
 'social',
 'friend',
 'motion',
 'cause',
 'leisure',
 'incl',
 'home',
 'present',
 'humans',
 'anx',
 'relig',
 'auxverb',
 'negate',
 'ingest',
 'death',
 'quant',
 'tentat',
 'conj',
 'pronoun',
 'ipron',
 'swear',
 'hear',
 'family',
 'see',
 'discrep',
 'number',
 'filler',
 'feel',
 'excl',
 'future',
 'nonfl',
 'ppron',
 'shehe',
 'i',
 'we',
 'you',
 'they']

In [80]:
pickle.dump(writings_df, open('writings_df_T2_liwc.pkl', 'wb+'))