In [1]:
import pandas as pd
import xml.etree.ElementTree as ET
import glob, os
import numpy as np
from comet_ml import Experiment, Optimizer
import pickle

In [2]:
os.environ['TF_KERAS'] = '1'
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, Input, concatenate
from tensorflow.keras import regularizers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import callbacks, optimizers
from tensorflow.keras import backend as K
from tensorflow.keras.utils import plot_model

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

import tensorflow as tf

In [3]:
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
# only reserve 1 GPU

In [4]:
tf.enable_eager_execution()

# Read data

In [5]:
def read_subject_writings(subject_file):
    writings = []
    with open(subject_file) as sf:
        contents = sf.read()
        root = ET.fromstring(contents)
        try:
            subject = root.findall('ID')[0].text.strip()
        except Exception:
            print('Cannot extract ID', contents[:500], '\n-------\n')        
        for w in root.iter('WRITING'):
            subject_writings = {'subject': subject}
            for title in w.findall('TITLE'):
                subject_writings['title'] = title.text
            for text in w.findall('TEXT'):
                subject_writings['text'] = text.text
            for date in w.findall('DATE'):
                subject_writings['date'] = date.text
            writings.append(subject_writings)
    return writings

### eRisk 2020 T1

In [6]:
datadir_T1 = '/home/ana/eRisk/data/eRisk2020_T1_train/eRISK2020_T1_training_data/eRISK2020_training_data/data/'
labels_file_T1 = '/home/ana/eRisk/data/eRisk2020_T1_train/eRISK2020_T1_training_data/eRISK2020_training_data/golden_truth.txt'

In [7]:
def read_texts_2020(datadir_T1, labels_file_T1):
    writings = []
    for subject_file in os.listdir(datadir_T1):
        print(subject_file)
        writings.extend(read_subject_writings(os.path.join(datadir_T1, subject_file)))
    writings_df = pd.DataFrame(writings)

    labels_T1 = pd.read_csv(labels_file_T1, delimiter=' ', names=['subject', 'label'])
    labels_T1 = labels_T1.set_index('subject')

    writings_df['label'] = writings_df['subject'].apply(
    lambda s: labels_T1.loc[s, 'label'])
    
    return writings_df



### eRisk 2019 T1 (Anorexia)

In [8]:
datadirs_T1_2019 = {
    'train': ['2018 test/', '2018 train/positive_examples/', '2018 train/negative_examples/'],
    'test': ['data/']
}
datadir_root_T1_2019 = {
    'train': '/home/ana/eRisk/data/past/eRisk2019_T1/training data - t1/',
    'test': '/home/ana/eRisk/data/past/eRisk2019_T1/test data - T1/'
}
    
labels_files_T1_2019 = {
    'train': ['2018 train/risk_golden_truth.txt', '2018 test/risk-golden-truth-test.txt'],
    'test': ['T1_erisk_golden_truth.txt']
}

In [10]:
def read_texts_2019(datadir_root_T1_2019,
                   datadirs_T1_2019,
                   labels_files_T1_2019):
    writings = {'train': [], 'test': []}
    writings_df = pd.DataFrame()
    labels_df = pd.DataFrame()

    for subset in ('train', 'test'):
        for subdir in [os.path.join(datadir_root_T1_2019[subset], subp) for subp in datadirs_T1_2019[subset]]:
            if subset=='train':
                chunkdirs = [os.path.join(datadir_root_T1_2019[subset], subdir, chunkdir) 
                             for chunkdir in os.listdir(subdir)]
            else:
                chunkdirs = [os.path.join(datadir_root_T1_2019[subset], subdir)]
                
            for chunkdir in chunkdirs:
                if not os.path.isdir(chunkdir):
                    continue
                for subject_file in os.listdir(chunkdir):
                    print(subset, subject_file)
                    writings[subset].extend(read_subject_writings(os.path.join(chunkdir, subject_file)))
        writings_df_part = pd.DataFrame(writings[subset])
        writings_df_part['subset'] = subset
        writings_df = pd.concat([writings_df, writings_df_part])

        for label_file in labels_files_T1_2019[subset]:
            labels = pd.read_csv(os.path.join(datadir_root_T1_2019[subset], label_file), 
                                 delimiter='\s+', names=['subject', 'label'])
            labels_df = pd.concat([labels_df, labels])
    
    labels_df = labels_df.set_index('subject')

    writings_df['label'] = writings_df['subject'].apply(
    lambda s: labels_df.loc[s, 'label'])
    
    return writings_df

## Preprocess text

In [None]:
# writings_df = read_texts_2020(datadir_T1, labels_file_T1)
writings_df = read_texts_2019(datadir_root_T1_2019,
                   datadirs_T1_2019,
                   labels_files_T1_2019)

train subject6083_10.xml
train subject6333_10.xml
train subject9978_10.xml
train subject6154_10.xml
train subject5461_10.xml
train subject7468_10.xml
train subject4371_10.xml
train subject4838_10.xml
train subject508_10.xml
train subject9195_10.xml
train subject8247_10.xml
train subject6951_10.xml
train subject6731_10.xml
train subject6394_10.xml
train subject811_10.xml
train subject9589_10.xml
train subject8384_10.xml
train subject8338_10.xml
train subject96_10.xml
train subject4994_10.xml
train subject4351_10.xml
train subject6407_10.xml
train subject9716_10.xml
train subject8377_10.xml
train subject6292_10.xml
train subject7278_10.xml
train subject6910_10.xml
train subject5776_10.xml
train subject5984_10.xml
train subject5517_10.xml
train subject2568_10.xml
train subject7831_10.xml
train subject7710_10.xml
train subject4283_10.xml
train subject6334_10.xml
train subject7778_10.xml
train subject8127_10.xml
train subject4836_10.xml
train subject6302_10.xml
train subject9015_10.xml
trai

train subject7320_3.xml
train subject4994_3.xml
train subject8016_3.xml
train subject4103_3.xml
train subject5776_3.xml
train subject6412_3.xml
train subject545_3.xml
train subject9597_3.xml
train subject5173_3.xml
train subject6214_3.xml
train subject9090_3.xml
train subject7925_3.xml
train subject5311_3.xml
train subject7001_3.xml
train subject8054_3.xml
train subject7616_3.xml
train subject8607_3.xml
train subject6863_3.xml
train subject4276_3.xml
train subject5118_3.xml
train subject6072_3.xml
train subject5297_3.xml
train subject6414_3.xml
train subject4283_3.xml
train subject8799_3.xml
train subject4916_3.xml
train subject5236_3.xml
train subject4351_3.xml
train subject8900_3.xml
train subject7937_3.xml
train subject5067_3.xml
train subject5984_3.xml
train subject5802_3.xml
train subject4588_3.xml
train subject9218_3.xml
train subject8200_3.xml
train subject2338_3.xml
train subject2568_3.xml
train subject8932_3.xml
train subject9499_3.xml
train subject8486_3.xml
train subject9928

train subject9597_5.xml
train subject6951_5.xml
train subject8597_5.xml
train subject8371_5.xml
train subject7251_5.xml
train subject5663_5.xml
train subject5517_5.xml
train subject8607_5.xml
train subject4450_5.xml
train subject8032_5.xml
train subject467_5.xml
train subject531_5.xml
train subject9654_5.xml
train subject3162_5.xml
train subject6075_5.xml
train subject896_5.xml
train subject5325_5.xml
train subject5177_5.xml
train subject8338_5.xml
train subject7433_5.xml
train subject4569_5.xml
train subject6386_5.xml
train subject8236_5.xml
train subject8401_5.xml
train subject9716_5.xml
train subject5830_5.xml
train subject9166_5.xml
train subject9225_5.xml
train subject5078_5.xml
train subject8167_5.xml
train subject6755_5.xml
train subject4916_5.xml
train subject8254_5.xml
train subject7469_5.xml
train subject8054_5.xml
train subject6410_5.xml
train subject7087_5.xml
train subject6695_5.xml
train subject959_5.xml
train subject7925_5.xml
train subject4160_5.xml
train subject4103_5.

train subject6714_2.xml
train subject7478_2.xml
train subject5297_2.xml
train subject7301_2.xml
train subject2845_2.xml
train subject5759_2.xml
train subject4351_2.xml
train subject5173_2.xml
train subject8240_2.xml
train subject8296_2.xml
train subject6168_2.xml
train subject5095_2.xml
train subject7779_2.xml
train subject5233_2.xml
train subject9195_2.xml
train subject9244_2.xml
train subject1152_2.xml
train subject5577_2.xml
train subject4588_2.xml
train subject9153_2.xml
train subject7371_2.xml
train subject9003_2.xml
train subject6072_2.xml
train subject6310_2.xml
train subject7529_2.xml
train subject6921_2.xml
train subject4071_2.xml
train subject6731_2.xml
train subject9499_2.xml
train subject8401_2.xml
train subject9773_2.xml
train subject5452_2.xml
train subject5078_2.xml
train subject21_2.xml
train subject6863_2.xml
train subject4276_2.xml
train subject7617_2.xml
train subject7927_2.xml
train subject7483_2.xml
train subject959_2.xml
train subject3076_2.xml
train subject7248_2

train subject6412_7.xml
train subject7616_7.xml
train subject6792_7.xml
train subject9218_7.xml
train subject4406_7.xml
train subject9499_7.xml
train subject6947_7.xml
train subject7710_7.xml
train subject5173_7.xml
train subject6956_7.xml
train subject7249_7.xml
train subject5236_7.xml
train subject6863_7.xml
train subject9436_7.xml
train subject8371_7.xml
train subject4169_7.xml
train subject6216_7.xml
train subject6288_7.xml
train subject7639_7.xml
train subject9166_7.xml
train subject7001_7.xml
train subject6334_7.xml
train subject3135_7.xml
train subject5984_7.xml
train subject5221_7.xml
train subject4196_7.xml
train subject5095_7.xml
train subject8173_7.xml
train subject6731_7.xml
train subject9654_7.xml
train subject8932_7.xml
train subject6407_7.xml
train subject8254_7.xml
train subject6461_7.xml
train subject8340_7.xml
train subject898_7.xml
train subject9230_7.xml
train subject4556_7.xml
train subject8597_7.xml
train subject5854_7.xml
train subject7529_7.xml
train subject9928

train subject6714_4.xml
train subject6620_4.xml
train subject7469_4.xml
train subject6072_4.xml
train subject6037_4.xml
train subject6334_4.xml
train subject6755_4.xml
train subject4155_4.xml
train subject6837_4.xml
train subject531_4.xml
train subject9654_4.xml
train subject1518_4.xml
train subject8133_4.xml
train subject5028_4.xml
train subject7320_4.xml
train subject5297_4.xml
train subject6114_4.xml
train subject4371_4.xml
train subject5562_4.xml
train subject7617_4.xml
train subject5802_4.xml
train subject5825_4.xml
train subject9950_4.xml
train subject3275_4.xml
train subject6758_4.xml
train subject5085_4.xml
train subject6168_4.xml
train subject4187_4.xml
train subject4244_4.xml
train subject8173_4.xml
train subject4569_4.xml
train subject4072_4.xml
train subject545_4.xml
train subject4112_4.xml
train subject6269_4.xml
train subject6807_4.xml
train subject5984_4.xml
train subject9090_4.xml
train subject5364_4.xml
train subject8841_4.xml
train subject7442_4.xml
train subject5808_

train subject7422_1.xml
train subject5469_1.xml
train subject5854_1.xml
train subject8701_1.xml
train subject8338_1.xml
train subject5808_1.xml
train subject4443_1.xml
train subject6333_1.xml
train subject6334_1.xml
train subject5663_1.xml
train subject5562_1.xml
train subject896_1.xml
train subject6114_1.xml
train subject6994_1.xml
train subject5802_1.xml
train subject7778_1.xml
train subject6216_1.xml
train subject8247_1.xml
train subject7632_1.xml
train subject8900_1.xml
train subject5660_1.xml
train subject7165_1.xml
train subject8016_1.xml
train subject4982_1.xml
train subject559_1.xml
train subject8932_1.xml
train subject7710_1.xml
train subject9153_1.xml
train subject4196_1.xml
train subject4858_1.xml
train subject874_1.xml
train subject8392_1.xml
train subject5532_1.xml
train subject6168_1.xml
train subject9098_1.xml
train subject6620_1.xml
train subject536_1.xml
train subject545_1.xml
train subject5103_1.xml
train subject4153_1.xml
train subject6947_1.xml
train subject6800_1.x

train subject6302_8.xml
train subject2529_8.xml
train subject5426_8.xml
train subject4073_8.xml
train subject5115_8.xml
train subject8754_8.xml
train subject5802_8.xml
train subject5984_8.xml
train subject9371_8.xml
train subject5969_8.xml
train subject9597_8.xml
train subject4244_8.xml
train subject9218_8.xml
train subject803_8.xml
train subject9744_8.xml
train subject4588_8.xml
train subject4916_8.xml
train subject5325_8.xml
train subject9982_8.xml
train subject8561_8.xml
train subject6333_8.xml
train subject4421_8.xml
train subject7351_8.xml
train subject6168_8.xml
train subject6755_8.xml
train subject8841_8.xml
train subject1518_8.xml
train subject874_8.xml
train subject5236_8.xml
train subject4994_8.xml
train subject8607_8.xml
train subject7597_8.xml
train subject5776_8.xml
train subject6994_8.xml
train subject6978_8.xml
train subject8411_8.xml
train subject8200_8.xml
train subject2062_8.xml
train subject6293_8.xml
train subject7483_8.xml
train subject5779_8.xml
train subject6522_

train subject1169_9.xml
train subject4153_9.xml
train subject6114_9.xml
train subject2529_9.xml
train subject901_9.xml
train subject9337_9.xml
train subject8841_9.xml
train subject9078_9.xml
train subject8032_9.xml
train subject6154_9.xml
train subject7066_9.xml
train subject5452_9.xml
train subject536_9.xml
train subject8247_9.xml
train subject4072_9.xml
train subject992_9.xml
train subject4071_9.xml
train subject6292_9.xml
train subject6168_9.xml
train subject6731_9.xml
train subject9225_9.xml
train subject7077_9.xml
train subject4155_9.xml
train subject4410_9.xml
train subject6139_9.xml
train subject9716_9.xml
train subject8562_9.xml
train subject8587_9.xml
train subject5426_9.xml
train subject8701_9.xml
train subject4838_9.xml
train subject6790_9.xml
train subject6910_9.xml
train subject2992_9.xml
train subject973_9.xml
train subject6800_9.xml
train subject7248_9.xml
train subject8740_9.xml
train subject7778_9.xml
train subject9499_9.xml
train subject7616_9.xml
train subject9090_9.

train subject6472_6.xml
train subject9218_6.xml
train subject3162_6.xml
train subject2062_6.xml
train subject8587_6.xml
train subject1169_6.xml
train subject4071_6.xml
train subject8512_6.xml
train subject6288_6.xml
train subject9903_6.xml
train subject6462_6.xml
train subject8444_6.xml
train subject4072_6.xml
train subject7831_6.xml
train subject8254_6.xml
train subject6863_6.xml
train subject5808_6.xml
train subject5719_6.xml
train subject5236_6.xml
train subject6292_6.xml
train subject6358_6.xml
train subject803_6.xml
train subject2568_6.xml
train subject8340_6.xml
train subject5103_6.xml
train subject8247_6.xml
train subject3813_6.xml
train subject7371_6.xml
train subject4896_6.xml
train subject5085_6.xml
train subject6695_6.xml
train subject4283_6.xml
train subject6446_10.xml
train subject1113_10.xml
train subject1953_10.xml
train subject1637_10.xml
train subject5127_10.xml
train subject5711_10.xml
train subject7221_10.xml
train subject3132_10.xml
train subject845_10.xml
train sub

train subject3183_10.xml
train subject3247_10.xml
train subject2346_10.xml
train subject1323_10.xml
train subject3835_10.xml
train subject3901_10.xml
train subject3659_10.xml
train subject2840_10.xml
train subject2777_10.xml
train subject2129_10.xml
train subject398_10.xml
train subject2101_10.xml
train subject2865_10.xml
train subject2069_10.xml
train subject3274_3.xml
train subject2728_3.xml
train subject3364_3.xml
train subject3039_3.xml
train subject1143_3.xml
train subject1417_3.xml
train subject1101_3.xml
train subject3278_3.xml
train subject1499_3.xml
train subject244_3.xml
train subject2746_3.xml
train subject1358_3.xml
train subject2482_3.xml
train subject3750_3.xml
train subject1193_3.xml
train subject3504_3.xml
train subject3313_3.xml
train subject1549_3.xml
train subject3318_3.xml
train subject2418_3.xml
train subject3772_3.xml
train subject1497_3.xml
train subject1563_3.xml
train subject2947_3.xml
train subject1369_3.xml
train subject1772_3.xml
train subject2662_3.xml
trai

train subject3183_7.xml
train subject2469_7.xml
train subject1474_7.xml
train subject1772_7.xml
train subject2257_7.xml
train subject2037_7.xml
train subject1397_7.xml
train subject1875_7.xml
train subject2513_7.xml
train subject3097_7.xml
train subject1563_7.xml
train subject3763_7.xml
train subject2894_7.xml
train subject3240_7.xml
train subject3313_7.xml
train subject2472_7.xml
train subject1271_7.xml
train subject2423_7.xml
train subject2728_7.xml
train subject3605_7.xml
train subject3220_7.xml
train subject1143_7.xml
train subject1074_7.xml
train subject31_7.xml
train subject3659_7.xml
train subject3247_7.xml
train subject1549_7.xml
train subject1358_7.xml
train subject1404_7.xml
train subject2923_7.xml
train subject3750_7.xml
train subject2577_7.xml
train subject3269_7.xml
train subject354_7.xml
train subject366_7.xml
train subject2797_7.xml
train subject2662_7.xml
train subject1789_7.xml
train subject1369_7.xml
train subject3339_7.xml
train subject1120_7.xml
train subject2997_7.

train subject2994_1.xml
train subject3530_1.xml
train subject3323_1.xml
train subject3313_1.xml
train subject3278_1.xml
train subject195_1.xml
train subject3944_1.xml
train subject1577_1.xml
train subject1137_1.xml
train subject3274_1.xml
train subject1789_1.xml
train subject3364_1.xml
train subject2482_1.xml
train subject1875_1.xml
train subject3614_1.xml
train subject2101_1.xml
train subject163_1.xml
train subject3659_1.xml
train subject2840_1.xml
train subject2525_1.xml
train subject2337_1.xml
train subject2167_1.xml
train subject1397_1.xml
train subject3835_1.xml
train subject2797_1.xml
train subject322_1.xml
train subject1474_1.xml
train subject2419_1.xml
train subject1101_1.xml
train subject244_1.xml
train subject331_1.xml
train subject2240_1.xml
train subject1193_1.xml
train subject3727_1.xml
train subject2129_1.xml
train subject3232_1.xml
train subject366_1.xml
train subject2359_1.xml
train subject3394_1.xml
train subject3640_1.xml
train subject1565_1.xml
train subject1120_1.xm

train subject2013_6.xml
train subject398_6.xml
train subject1496_6.xml
train subject3359_6.xml
train subject1272_6.xml
train subject146_6.xml
train subject2504_6.xml
train subject3097_6.xml
train subject3145_6.xml
train subject350_6.xml
train subject1417_6.xml
train subject1563_6.xml
train subject1271_6.xml
train subject3339_6.xml
train subject2865_6.xml
train subject1604_6.xml
train subject3428_6.xml
train subject2746_6.xml
train subject331_6.xml
train subject3278_6.xml
train subject3944_6.xml
train subject3727_6.xml
train subject3313_6.xml
train subject1143_6.xml
train subject2359_6.xml
train subject2797_6.xml
train subject1397_6.xml
train subject1323_6.xml
train subject3530_6.xml
train subject3323_6.xml
train subject3183_6.xml
train subject2519_6.xml
train subject2879_6.xml
train subject3274_6.xml
train subject2159_6.xml
train subject2997_6.xml
train subject2947_6.xml
train subject2923_6.xml
train subject2577_6.xml
train subject3288_6.xml
train subject1423_6.xml
train subject3504_6.

test subject4152.xml
test subject1053.xml
test subject2214.xml
test subject5873.xml
test subject8502.xml
test subject5483.xml
test subject3390.xml
test subject8586.xml
test subject7762.xml
test subject5548.xml
test subject5787.xml
test subject2180.xml
test subject5180.xml
test subject1776.xml
test subject8812.xml
test subject493.xml
test subject1078.xml
test subject8740.xml
test subject3518.xml
test subject743.xml
test subject7324.xml
test subject6728.xml
test subject2786.xml
test subject4716.xml
test subject3648.xml
test subject1360.xml
test subject9858.xml
test subject3567.xml
test subject7833.xml
test subject5886.xml
test subject5170.xml
test subject1167.xml
test subject3785.xml
test subject1537.xml
test subject5133.xml
test subject8993.xml
test subject678.xml
test subject3773.xml
test subject8073.xml
test subject7513.xml
test subject4502.xml
test subject3588.xml
test subject8399.xml
test subject772.xml
test subject2513.xml
test subject9489.xml
test subject612.xml
test subject4849.x

test subject1919.xml
test subject3562.xml
test subject5294.xml
test subject1902.xml
test subject4303.xml
test subject3432.xml
test subject3240.xml
test subject1171.xml
test subject8694.xml
test subject6353.xml
test subject8949.xml
test subject5212.xml
test subject9898.xml
test subject7882.xml
test subject2977.xml
test subject6200.xml
test subject391.xml
test subject7809.xml
test subject9810.xml
test subject7812.xml
test subject5757.xml
test subject8119.xml
test subject5760.xml
test subject4027.xml
test subject5955.xml
test subject8510.xml
test subject66.xml
test subject6074.xml
test subject9675.xml
test subject2807.xml
test subject1805.xml
test subject9777.xml
test subject893.xml
test subject6337.xml
test subject9637.xml
test subject6904.xml
test subject3967.xml
test subject9926.xml
test subject9856.xml
test subject4612.xml
test subject5342.xml
test subject2698.xml
test subject3919.xml
test subject2239.xml
test subject3420.xml
test subject9803.xml
test subject1183.xml
test subject1469.

In [None]:
writings_df[writings_df['subset']=='test']

In [81]:
writings_df[writings_df['text'].isna()][~writings_df['title'].isna()]

  """Entry point for launching an IPython kernel.


Unnamed: 0,subject,title,text,date,subset,label


In [None]:
writings_df.label.values

In [12]:
tokenizer = RegexpTokenizer(r'\w+')

def tokenize(t):
    return tokenizer.tokenize(t.lower())

In [13]:
tokenize("I wasn't ready to leave! buh-buw(dd). Sasa .")

['i', 'wasn', 't', 'ready', 'to', 'leave', 'buh', 'buw', 'dd', 'sasa']

In [14]:
writings_df['tokenized_title'] = writings_df['title'].apply(lambda t: tokenize(t) if type(t)==str else None)
writings_df['title_len'] = writings_df['tokenized_title'].apply(lambda t: len(t) if type(t)==list else None)
writings_df['tokenized_text'] = writings_df['text'].apply(lambda t: tokenize(t) if type(t)==str else None)
writings_df['text_len'] = writings_df['tokenized_text'].apply(lambda t: len(t) if type(t)==list else None)

In [15]:
writings_df.text_len.describe()

count    127941.000000
mean         32.268929
std          82.590713
min           0.000000
25%           6.000000
50%          13.000000
75%          31.000000
max        7201.000000
Name: text_len, dtype: float64

In [16]:
writings_df.title_len.describe()

count    49762.000000
mean        10.699771
std          9.282454
min          0.000000
25%          4.000000
50%          8.000000
75%         14.000000
max        149.000000
Name: title_len, dtype: float64

In [17]:
writings_df.groupby('subject').mean()

Unnamed: 0_level_0,label,title_len,text_len
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
subject0,0,20.285714,31.711712
subject1027,0,7.769231,1.190476
subject1055,0,16.666667,79.983193
subject1064,1,13.000000,68.410256
subject1089,0,9.823529,13.254902
...,...,...,...
subject9917,1,8.983607,95.806897
subject9918,0,5.000000,11.865269
subject992,0,5.872928,19.876190
subject9949,0,10.609756,42.346979


In [18]:
writings_df.groupby('subject').max().groupby('label').count()

Unnamed: 0_level_0,date,title_len,text_len
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,299,296,299
1,41,40,41


In [19]:
print("Average number of posts per user", writings_df.groupby('subject').count().title.mean())
print("Average number of comments per user", writings_df.groupby('subject').count().text.mean())


Average number of posts per user 146.35882352941175
Average number of comments per user 376.2970588235294


In [20]:
writings_df[(~writings_df['text_len'].isnull()) & (writings_df['text_len'] > 100)]


Unnamed: 0,subject,title,date,text,label,tokenized_title,title_len,tokenized_text,text_len
122,subject8292,Okay friends so I messed up and posted to do a...,2017-04-25 22:37:57,"Sorry for that, I truly didn't think it was go...",0,"[okay, friends, so, i, messed, up, and, posted...",34.0,"[sorry, for, that, i, truly, didn, t, think, i...",120.0
390,subject8292,,2017-09-16 06:29:13,You've got plenty of time to fix that. You can...,0,,,"[you, ve, got, plenty, of, time, to, fix, that...",104.0
498,subject8292,,2017-11-24 01:33:22,"LCD, Glass animals, Kendrick, The Weeknd, Jack...",0,,,"[lcd, glass, animals, kendrick, the, weeknd, j...",127.0
752,subject8292,Getting that coachella bod,2018-01-09 00:54:06,First I want to say whatever skin is your skin...,0,"[getting, that, coachella, bod]",4.0,"[first, i, want, to, say, whatever, skin, is, ...",149.0
904,subject8292,,2018-03-12 17:14:03,Not the same but me and my wife saw a man and ...,0,,,"[not, the, same, but, me, and, my, wife, saw, ...",151.0
...,...,...,...,...,...,...,...,...,...
170652,subject217,,2018-05-28 12:23:00,/r/keto /r/ketorecipes /r/ketodessert all are ...,0,,,"[r, keto, r, ketorecipes, r, ketodessert, all,...",197.0
170653,subject217,,2018-05-28 12:32:36,its okay dont worry . as long as you don't exc...,0,,,"[its, okay, dont, worry, as, long, as, you, do...",109.0
170662,subject217,,2018-06-20 00:33:57,the national number is :1919 here are more com...,0,,,"[the, national, number, is, 1919, here, are, m...",115.0
170693,subject217,,2018-08-19 11:29:21,"this is my personal experience ,it may not ref...",0,,,"[this, is, my, personal, experience, it, may, ...",153.0


# Recurrent NN

## Extract features and encode data

In [45]:
hyperparams_features = {
    "max_features": 20000,
    # cut texts after this number of words
    # (among top max_features most common words)
    "maxlen": 100,
    "embedding_dim": 100,
    "user_level": False,
}


### Emotions

In [46]:
def load_NRC(nrc_path):
    word_emotions = {}
    emotion_words = {}
    with open(nrc_path) as in_f:
        for line in in_f:
            line = line.strip()
            if not line:
                continue
            word, emotion, label = line.split()
            if word not in word_emotions:
                word_emotions[word] = set()
            if emotion not in emotion_words:
                emotion_words[emotion] = set()
            label = int(label)
            if label:
                word_emotions[word].add(emotion)
                emotion_words[emotion].add(word)
    return emotion_words

nrc_lexicon_path = '/home/ana/resources/NRC-Sentiment-Emotion-Lexicons/NRC-Sentiment-Emotion-Lexicons/NRC-Emotion-Lexicon-v0.92/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt'
nrc_lexicon = load_NRC(nrc_lexicon_path)
emotions = list(nrc_lexicon.keys())


In [47]:
def encode_emotions(tokens, emotion_lexicon, emotions, relative=True):
    text_len = len(tokens)
    encoded_emotions = [0 for e in emotions]
    for i, emotion in enumerate(emotions):
        try:
            emotion_words = [t for t in tokens if t in emotion_lexicon[emotion]]
            if relative:
                encoded_emotions[i] = len(emotion_words) / len(tokens)
            else:
                encoded_emotions[i] = len(emotion_words)
        except ValueError:
            print("Emotion not found.")
    return encoded_emotions

#### Sentiment

### Style features

#### Char n-grams

In [48]:
def extract_ngrams(tokens):
    pass

#### Personal pronouns

In [49]:
first_person_pronouns = {"i", "me", "my", "mine"}
def encode_pronouns(tokens, pronouns={"i", "me", "my", "mine"}, relative=True):
    if not tokens:
        return np.nan
    text_len = len(tokens)
    nr_pronouns = len([t for t in tokens if t in pronouns])
    if relative:
        return nr_pronouns/text_len
    else:
        return nr_pronouns

#### Stopwords

In [50]:
stopword_list = stopwords.words("english")
def encode_stopwords(tokens, stopwords=stopword_list):
    encoded_stopwords = [0 for s in stopword_list]
    if not tokens:
        return encoded_stopwords
    for i, stopword in enumerate(stopwords):
        if stopword in tokens:
            encoded_stopwords[i] += 1
    return encoded_stopwords

### Topics

### Encode data

In [69]:
from collections import Counter
def load_erisk_data(writings_df, voc_size, emotion_lexicon, seq_len, emotions =  
                    ['anger', 'anticipation', 'disgust', 'fear', 'joy', 
                     'negative', 'positive', 'sadness', 'surprise', 'trust'],
                    pronouns = ["i", "me", "my", "mine", "myself"],
                    train_prop=0.7, min_post_len=3, min_word_len=1, 
                    user_level=True, vocabulary=None):
    print("Loading data...")
    if not vocabulary:
        vocabulary = {}
        word_freqs = Counter()
        for words in writings_df.tokenized_text:
            word_freqs.update(words)
        for words in writings_df.tokenized_title:
            word_freqs.update(words)
        i = 1
        for w, f in word_freqs.most_common(voc_size-2): # keeping voc_size-1 for unk
            if len(w) < min_word_len:
                continue
            vocabulary[w] = i
            i += 1
    tokens_data_train = []
    categ_data_train = []
    sparse_data_train = []
    tokens_data_test = []
    categ_data_test = []
    sparse_data_test = []
    labels_train = []
    users_train = []
    users_test = []
    labels_test = []
    all_subjects = sorted(list(set(writings_df.subject)))
    training_subjects_size = int(len(all_subjects) * train_prop)
    training_subjects = all_subjects[:training_subjects_size]
    print(training_subjects_size, "training users, ", len(all_subjects)-training_subjects_size, " test users.")
#     training_rows = writings_df[writings_df['subject'].isin(training_subjects)].sample(frac=1) # shuffling
#     test_rows = writings_df[~writings_df['subject'].isin(training_subjects)].sample(frac=1)
#     positive_training_users = training_rows.groupby('subject').max().groupby('label').count().date[1]
#     positive_test_users = test_rows.groupby('subject').max().groupby('label').count().date[1]
#     print("Positive training users: ", positive_training_users, ", positive test users: ", positive_test_users)
    def encode_text(tokens):
        # Using voc_size-1 value for OOV token
        encoded_tokens = [vocabulary.get(w, voc_size-1) for w in tokens]
        encoded_emotions = encode_emotions(tokens, emotion_lexicon, emotions)
        encoded_pronouns = encode_pronouns(tokens, pronouns)
        encoded_stopwords = encode_stopwords(tokens)
        return (encoded_tokens, encoded_emotions, encoded_pronouns, encoded_stopwords)
    user_level_texts = {}
    for row in writings_df.itertuples():
        words = []
        if row.tokenized_title:
            words.extend(row.tokenized_title)
        if row.tokenized_text:
            words.extend(row.tokenized_text)
        if not words or len(words)<min_post_len:
            continue
        label = row.label
        if row.subject not in user_level_texts.keys():
            user_level_texts[row.subject] = {}
            user_level_texts[row.subject]['texts'] = [words]
            user_level_texts[row.subject]['label'] = label
        else:
            user_level_texts[row.subject]['texts'].append(words) # TODO: sort datapoints chronologically
    for subject in user_level_texts.keys():
        texts = user_level_texts[subject]['texts']
        label = user_level_texts[subject]['label']
        if user_level:
            all_words = [sum(texts, [])] # merge all texts in one list
        else:
            all_words = texts
        for words in all_words:
            encoded_tokens, encoded_emotions, encoded_pronouns, encoded_stopwords = encode_text(words)
            subject_id = int(subject.split('t')[1])
            if subject in training_subjects:
                tokens_data_train.append(encoded_tokens)
                categ_data_train.append(encoded_emotions + [encoded_pronouns])
                sparse_data_train.append(encoded_stopwords)
                labels_train.append(label)
                users_train.append(subject_id)
            else:
                tokens_data_test.append(encoded_tokens)
                categ_data_test.append(encoded_emotions + [encoded_pronouns])
                sparse_data_test.append(encoded_stopwords)
                labels_test.append(label)
                users_test.append(subject_id)
        
    # using zeros for padding
    tokens_data_train_padded = sequence.pad_sequences(tokens_data_train, maxlen=seq_len)
    tokens_data_test_padded = sequence.pad_sequences(tokens_data_test, maxlen=seq_len)
        
    return ([np.array(tokens_data_train_padded), np.array(categ_data_train), np.array(sparse_data_train),
            np.array(users_train)],
            np.array(labels_train)), \
            ([np.array(tokens_data_test_padded), np.array(categ_data_test), np.array(sparse_data_test),
             np.array(users_test)],
             np.array(labels_test)), vocabulary

In [162]:
(x_train, y_train), (x_test, y_test), voc = load_erisk_data(writings_df, 
                                                            seq_len=hyperparams_features['maxlen'],
                                                            voc_size=hyperparams_features['max_features'],
                                                           emotion_lexicon=nrc_lexicon,
                                                           emotions=emotions,
                                                           user_level=hyperparams_features['user_level'],
                                                           vocabulary=pickle.load(open('vocabulary20K_selfharm.pkl', 'rb')))

Loading data...
237 training users,  103  test users.


In [54]:
x_train_seq, x_train_categ, x_train_sparse, x_train_users = x_train
x_test_seq, x_test_categ, x_test_sparse, x_test_users = x_test
print(len(x_train_seq), 'train sequences')
print(len(x_test_seq), 'test sequences')

111375 train sequences
41455 test sequences


In [55]:
print(pd.Series(y_train).sum(), "positive training examples")
print(pd.Series(y_test).sum(), "positive test examples")

4430 positive training examples
2150 positive test examples


In [56]:
x_train

[array([[    8, 11922,    13, ...,     8, 19999,   149],
        [    0,     0,     0, ...,    25,    21,   393],
        [    0,     0,     0, ...,   975,    19,   409],
        ...,
        [   39,    19,   374, ...,     1,   709,   809],
        [    0,     0,     0, ...,   137,     3,    38],
        [    0,     0,     0, ...,     1,   276,  8212]], dtype=int32),
 array([[0.03448276, 0.02298851, 0.01149425, ..., 0.01149425, 0.02298851,
         0.03448276],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.4       ],
        ...,
        [0.00505051, 0.01010101, 0.01010101, ..., 0.        , 0.03030303,
         0.08080808],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.16666667],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.18181818]]),
 array([[1, 0, 1, ..., 0, 0, 0],
      

In [57]:
from sklearn.utils import class_weight

class_weights = class_weight.compute_class_weight('balanced',
                                                 np.unique(y_train),
                                                 y_train)
class_weights

array([ 0.52071158, 12.57054176])

In [58]:
def load_embeddings(path, embedding_dim, voc):
    # random matrix with mean value = 0
    embedding_matrix = np.random.random((len(voc)+2, embedding_dim)) - 0.5 # voc + unk + pad value(0)
#     embedding_matrix = np.zeros((len(voc)+1, embedding_dim))

    f = open(path)
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_i = voc.get(word)
        if word_i is not None:
            embedding_matrix[word_i] = coefs
    f.close()

    print('Total %s word vectors.' % len(embedding_matrix))

 
    return embedding_matrix

# pretrained_embeddings_path = '/home/ana/resources/glove.6B/glove.6B.%dd.txt' % hyperparams_features['embedding_dim']
pretrained_embeddings_path = '/home/ana/resources/glove.twitter.27B/glove.twitter.27B.%dd.txt' % hyperparams_features['embedding_dim']
embedding_matrix = load_embeddings(pretrained_embeddings_path, hyperparams_features['embedding_dim'], voc)


Total 20000 word vectors.


In [59]:
y_test.shape

(41455,)

## Define model

In [60]:
hyperparams = {
    'lstm_units': 100,
    'dense_bow_units': 15,
    'dropout': 0.14,
    'l2_dense': 0.00011,
    'optimizer': 'adam', #None,
    'decay': 0.001,
    'lr': 0.00001,
    "batch_size": 128,
    "trainable_embeddings": False,
    "reduce_lr_factor": 0.2,
    "reduce_lr_patience": 2,
    "freeze_patience": 50,

}
if not hyperparams['optimizer']:
    hyperparams['optimizer'] = optimizers.Adam(lr=hyperparams['lr'], beta_1=0.9, beta_2=0.999, epsilon=0.0001,
                                   decay=hyperparams['decay'])

In [61]:
def recall_m(y_true, y_pred):
        y_labels = y_true#tf.reshape(y_true[:], (-1,1))
#         y_pred = tf.reshape(y_pred, (-1,1))
        print("Labels", y_labels)
        print("Predictions", y_pred)
        true_positives = K.sum(K.round(K.clip(y_labels * y_pred, 0, 1)))
        print("True positives", true_positives)
        possible_positives = K.sum(K.round(K.clip(y_labels, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

def precision_m(y_true, y_pred):
        y_labels = y_true#tf.reshape(y_true[:], (-1,1))
#         y_pred = tf.reshape(y_pred, (-1,1))
        true_positives = K.sum(K.round(K.clip(y_labels * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

def binary_crossentropy_custom(y_true, y_pred):
    y_labels = y_true#tf.reshape(y_true[0],(1,-1))
    return K.binary_crossentropy(y_labels, 
                                 y_pred)

In [62]:
def build_model(hyperparams, hyperparams_features, embedding_matrix, emotions, stopwords_list,
               ignore_layer=[]):

    tokens_features = Input(shape=(hyperparams_features['maxlen'],), name='word_seq')
    embedding_layer = Embedding(hyperparams_features['max_features'], 
                                hyperparams_features['embedding_dim'], 
                                input_length=hyperparams_features['maxlen'],
                                mask_zero=True,
                                embeddings_regularizer=regularizers.l2(hyperparams['l2_dense']),
                                weights=[embedding_matrix], 
                                trainable=hyperparams['trainable_embeddings'],
                               name='embeddings_layer')(
        tokens_features)
#     lstm_layers = Bidirectional(LSTM(hyperparams['lstm_units']))(embedding_layer)
    lstm_layers = LSTM(hyperparams['lstm_units'], dropout=hyperparams['dropout'],
                      recurrent_dropout=hyperparams['dropout'],
                      name='LSTM_layer')(embedding_layer)
    
    numerical_features = Input(shape=(len(emotions) + 1,), name='numeric_input') # emotions and pronouns
    dense_layer = Dense(units=1,
                        kernel_regularizer=regularizers.l2(hyperparams['l2_dense']),
                        name='numerical_dense_layer',
                       )(numerical_features)
    sparse_features = Input(shape=(len(stopwords_list),), name='sparse_input') # stopwords
    subjects = Input(shape=(1,), name='subjects')
    dense_layer_sparse = Dense(units=hyperparams['dense_bow_units'],
                              name='sparse_feat_dense_layer',
                                kernel_regularizer=regularizers.l2(hyperparams['l2_dense']),
                              )(sparse_features)
#     # TODO: this is getting out of hand. Refactor this ablation part.
    if 'lstm_layers' in ignore_layer:
        output_layer = Dense(1, activation='sigmoid')(numerical_features)
    elif 'numerical_dense_layer' in ignore_layer and 'sparse_feat_dense_layer' in ignore_layer:
        output_layer = Dense(1, activation='sigmoid')(lstm_layers)
    elif 'numerical_dense_layer' in ignore_layer:
        merged_layers = concatenate([lstm_layers, dense_layer_sparse])
        output_layer = Dense(1, activation='sigmoid')(merged_layers)

    elif 'sparse_feat_dense_layer' in ignore_layer:
        merged_layers = concatenate([lstm_layers, dense_layer])
        output_layer = Dense(1, activation='sigmoid')(merged_layers)

    else:
        merged_layers = concatenate([lstm_layers, dense_layer, dense_layer_sparse])
        output_layer = Dense(1, activation='sigmoid')(merged_layers)

    # Compile model
    model = Model(inputs=[tokens_features, numerical_features, sparse_features], 
                  outputs=output_layer)
    model.compile(hyperparams['optimizer'], binary_crossentropy,
                  metrics=[f1_m, precision_m, recall_m])
    
    return model


In [63]:
model = build_model(hyperparams, hyperparams_features, embedding_matrix, emotions, stopword_list,
                   ignore_layer=[])
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


NameError: name 'binary_crossentropy' is not defined

In [193]:
plot_model(model, 'models/lstm_plus9.png')

In [194]:
experiment = Experiment(api_key="eoBdVyznAhfg3bK9pZ58ZSXfv",
                        project_name="mental", workspace="ananana")

experiment.log_parameters(hyperparams_features)

experiment.log_parameter('emotion_lexicon', nrc_lexicon_path)
experiment.log_parameter('emotions', emotions)
experiment.log_parameter('embeddings_path', pretrained_embeddings_path)

experiment.log_parameters(hyperparams)

COMET INFO: ----------------------------
COMET INFO: Comet.ml Experiment Summary:
COMET INFO:   Data:
COMET INFO:     url: https://www.comet.ml/ananana/mental/549c18d5fe2847c2962932af4e0b8004
COMET INFO:   Metrics [count] (min, max):
COMET INFO:     batch_f1_m [5]           : (0.0, 0.045454543083906174)
COMET INFO:     batch_loss [5]           : (0.3589591681957245, 2.49874210357666)
COMET INFO:     batch_precision_m [5]    : (0.0, 0.03030303120613098)
COMET INFO:     batch_recall_m [5]       : (0.0, 0.09090907871723175)
COMET INFO:     epoch_duration           : (307.80801238399, 307.80801238399)
COMET INFO:     f1_m                     : (0.0208333320915699, 0.0208333320915699)
COMET INFO:     loss                     : (0.5987520306547985, 0.5987520306547985)
COMET INFO:     lr                       : (0.0010000000474974513, 0.0010000000474974513)
COMET INFO:     precision_m              : (0.013888888992369175, 0.013888888992369175)
COMET INFO:     recall_m                 : (0.041

In [195]:
voc

{'the': 1,
 'i': 2,
 'to': 3,
 'a': 4,
 'and': 5,
 'of': 6,
 'it': 7,
 'in': 8,
 'you': 9,
 'that': 10,
 'is': 11,
 's': 12,
 'for': 13,
 'this': 14,
 't': 15,
 'on': 16,
 'with': 17,
 'but': 18,
 'my': 19,
 '8217': 20,
 'be': 21,
 'was': 22,
 'have': 23,
 'are': 24,
 'not': 25,
 'they': 26,
 'as': 27,
 'if': 28,
 'so': 29,
 'just': 30,
 'what': 31,
 'can': 32,
 'like': 33,
 'he': 34,
 'or': 35,
 'at': 36,
 'we': 37,
 'me': 38,
 'from': 39,
 'your': 40,
 'm': 41,
 'do': 42,
 'com': 43,
 'all': 44,
 'about': 45,
 'an': 46,
 'one': 47,
 'there': 48,
 'would': 49,
 'out': 50,
 'up': 51,
 'when': 52,
 'more': 53,
 'get': 54,
 'don': 55,
 'people': 56,
 'by': 57,
 'will': 58,
 'no': 59,
 'how': 60,
 'https': 61,
 'gt': 62,
 'has': 63,
 'them': 64,
 'his': 65,
 'time': 66,
 'some': 67,
 're': 68,
 'know': 69,
 'think': 70,
 'who': 71,
 'their': 72,
 'because': 73,
 'had': 74,
 'she': 75,
 'here': 76,
 'good': 77,
 'really': 78,
 'www': 79,
 'r': 80,
 'now': 81,
 've': 82,
 'been': 83,
 'only

## Train

In [196]:
class WeightsHistory(callbacks.Callback):
    def on_train_begin(self, logs={}):
        self.log_weights(0)

    def on_epoch_end(self, epoch, logs={}):
        self.log_weights(epoch)
        
    def log_weights(self, step):
        experiment.log_histogram_3d(self.model.get_layer('LSTM_layer').get_weights()[0], name='lstm_weights',
                                   step=step)  
        experiment.log_histogram_3d(model.get_layer('embeddings_layer').get_weights()[0], 
                            name='embedding_weights',
                           step=step)
        experiment.log_histogram_3d(model.get_layer('numerical_dense_layer').get_weights()[0], 
                                    name='numerical_dense_weights',
                                   step=step)
        experiment.log_histogram_3d(model.get_layer('sparse_feat_dense_layer').get_weights()[0], 
                            name='sparse_dense_weights',
                           step=step)
        
        
class FreezeLayer(callbacks.Callback):
    def __init__(self, logs={}, patience=5, layer='embeddings_layer', verbose=1, set_to=False):
        super(FreezeLayer, self).__init__()
        self.freeze_epoch = patience
        self.freeze_layer = layer
        self.verbose = verbose
        self.set_to = set_to

    def on_epoch_begin(self, epoch, logs={}):
        if epoch == self.freeze_epoch:
            layer = model.get_layer(self.freeze_layer)
            old_value = layer.trainable
            layer.trainable = self.set_to
            if self.verbose:
                print("Setting %s layer from %s to trainable=%s..." % (layer.name, old_value,
                                                               model.get_layer(self.freeze_layer).trainable))


In [197]:
def train_model(model, 
                x_train, y_train, x_test, y_test, 
                batch_size, epochs, class_weight, start_epoch=0, workers=4,
                callback_list = [],
                model_path='/tmp/model'):
    print('Train...')
    experiment.log_parameter('class_weight', class_weight.values())
    experiment.log_parameter('callbacks', callbacks)

    history = model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=epochs, initial_epoch=start_epoch, 
#               class_weight=class_weight,
              validation_data=[x_test, y_test],
#               validation_split=0.3,
                       workers=workers,
            callbacks = [
                callbacks.ModelCheckpoint(filepath='%s_best' % model_path, verbose=1, 
                                          save_best_only=True, save_weights_only=True),
                callbacks.EarlyStopping(patience=5), *callback_list
            ])
    model.save(model_path, save_weights_only=True)
    experiment.log_parameter('model_path', model_path)
    return history

In [None]:
%%time

freeze_layer = FreezeLayer(hyperparams['freeze_patience'], set_to=not hyperparams['trainable_embeddings'])
weights_history = WeightsHistory()
reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=hyperparams['reduce_lr_factor'],
                          patience=hyperparams['reduce_lr_patience'], min_lr=0.000001, verbose=1)

history = train_model(model, x_train, y_train, x_test, y_test,
           epochs=15, batch_size=hyperparams['batch_size'],
                      class_weight={0:0.5, 1:5}, 
                      callback_list = [freeze_layer, weights_history, reduce_lr],
                      model_path='models/lstm_plus9', workers=2)

Train...
Train on 111375 samples, validate on 41455 samples
Epoch 1/15
Epoch 00001: val_loss improved from inf to 0.38412, saving model to models/lstm_plus9_best

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 2/15
Epoch 00002: val_loss improved from 0.38412 to 0.37932, saving model to models/lstm_plus9_best

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 3/15
Epoch 00003: val_loss improved from 0.37932 to 0.37418, saving model to models/lstm_plus9_best

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 4/15
Epoch 00004: val_loss did not improve from 0.37418
Epoch 5/15
Epoch 00005: val_loss did not improve from 0.37418

Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Setting embeddings_layer layer from False to trainable=True...
Epoch 6/15
Epoch 00006: val_loss did not improve from 0.37418
Epoch 7/15

In [165]:
model.get_weights()

[array([[ 1.5279915e-02,  4.7590178e-01, -4.8616432e-02, ...,
          4.0647617e-01,  4.0538330e-02,  2.4765652e-01],
        [ 9.5151998e-02,  3.7024000e-01,  5.4290998e-01, ...,
         -5.1082999e-01,  4.6877000e-01,  3.4882000e-01],
        [-3.9621000e-04,  4.5670000e-01,  3.3890000e-01, ...,
         -4.2910001e-01,  1.0746000e+00, -3.6550000e-01],
        ...,
        [-7.0870000e-01, -6.0604000e-01, -4.4224000e-01, ...,
         -8.3099999e-02,  5.5158000e-02,  2.5769001e-01],
        [ 3.9036000e-01, -5.7444000e-01, -6.5509999e-01, ...,
         -5.7653999e-01,  9.5931001e-02,  1.0454000e+00],
        [-4.4363201e-01,  1.0039248e-01,  6.3836761e-02, ...,
          2.6907095e-01,  7.8583762e-02, -3.8874221e-01]], dtype=float32),
 array([[ 0.05118892,  0.01830989, -0.0848855 , ...,  0.14976056,
          0.04705308, -0.06339781],
        [ 0.00329915, -0.05218123,  0.13718773, ..., -0.03093715,
         -0.07668457, -0.17208682],
        [ 0.01844802,  0.02189548, -0.00731012

In [107]:
plot_model(model, to_file='models/lstm_plus4.png')

In [161]:
dependencies = {
    'f1_m': f1_m,
    'precision_m': precision_m,
    'recall_m': recall_m,
    'binary_crossentropy_custom': binary_crossentropy_custom
}
model = load_model('models/lstm_plus8_remote', custom_objects=dependencies)

Labels Tensor("dense_1_target_1:0", shape=(?, ?), dtype=float32)
Predictions Tensor("Sigmoid_2:0", shape=(?, 1), dtype=float32)
True positives Tensor("metrics_1/f1_m/Sum_2:0", shape=(), dtype=float32)
Labels Tensor("dense_1_target_1:0", shape=(?, ?), dtype=float32)
Predictions Tensor("Sigmoid_2:0", shape=(?, 1), dtype=float32)
True positives Tensor("metrics_1/f1_m/Sum_7:0", shape=(), dtype=float32)
Labels Tensor("dense_1_target_1:0", shape=(?, ?), dtype=float32)
Predictions Tensor("Sigmoid_2:0", shape=(?, 1), dtype=float32)
True positives Tensor("metrics_1/recall_m/Sum:0", shape=(), dtype=float32)
Labels Tensor("dense_1_target_1:0", shape=(?, ?), dtype=float32)
Predictions Tensor("Sigmoid_2:0", shape=(?, 1), dtype=float32)
True positives Tensor("metrics_1/recall_m/Sum_3:0", shape=(), dtype=float32)


In [163]:
model.evaluate(x_test, y_test)



[5.413350227315928, 0.024855131, 0.057499193, 0.017912848]

In [None]:
predictions = model.predict(x_test)

In [155]:
voc['prediction']

5631

In [157]:
import pickle
voc2=pickle.load(open('vocabulary20K_selfharm.pkl', 'rb'))
voc2['prediction']

5644

In [None]:
pd.Series(predictions.flatten()).hist()

In [None]:
sum(predictions>0.5)

In [None]:
sum(predictions<0.5)

## Evaluate per user

In [139]:
def get_data_for_point(subject, voc, hyperparams_features=hyperparams_features, nrc_lexicon=nrc_lexicon,
                      emotions=emotions):
    eval_writings_df = writings_df[writings_df['subject']==subject]
    correct_label = eval_writings_df.label.values[0]
    (x_train, y_train), (x_test, y_test), voc = load_erisk_data(eval_writings_df,
                        seq_len=hyperparams_features['maxlen'],
                        voc_size=hyperparams_features['max_features'],
                        emotion_lexicon=nrc_lexicon,
                        emotions=emotions, user_level=False,
                        train_prop=0.0, vocabulary=voc)
    return x_test, y_test, correct_label

In [151]:
def predict_per_user(writings_df, majority_prop=0.2, majority_nr=0, validate=False, voc=None):
    all_predictions = []
    all_labels = []
    tp = 0
    tn = 0
    fp = 0
    fn = 0
    thresh=0.5
    majority_proportion=majority_prop
    validation_data_prop = 0.4
    train_prop = 0.7
    all_subjects = sorted(list(set(writings_df.subject)))
    training_subjects_size = int(len(all_subjects) * train_prop)
    training_subjects = all_subjects[:training_subjects_size]
    test_subjects = [s for s in all_subjects if s not in training_subjects]
    validation_subjects = test_subjects[:int(validation_data_prop*len(test_subjects))]
    if validate:
        subjects = validation_subjects
    else:
        subjects = [s for s in test_subjects if s not in validation_subjects]
    for subject in subjects:
        x_test_user, y_test_user, label = get_data_for_point(subject, voc=voc)
        outputs = model.predict(x_test_user)
        positive_pred = sum(outputs>=thresh)
        negative_pred = sum(outputs<thresh)
        majority_pred = 0
        if majority_proportion and positive_pred >= majority_proportion*negative_pred:
            majority_pred = 1
        if majority_nr and positive_pred>=majority_nr:
            majority_pred = 1
        if label == 1:
            if majority_pred == 1:
                tp+=1
            else:
                fn+=1
        else:
            if majority_pred == 0:
                tn+=1
            else:
                fp+=1
        print(negative_pred, positive_pred, majority_pred)
        all_predictions.append(majority_pred)
        all_labels.append(label)
    def prec_recall_f1(tp, fp, tn, fn):
        recall = tp/(tp+fn+0.0000001)
        precision = tp/(tp+fp+0.0000001)
        f1 = 2*precision*recall/(precision+recall+0.0000001)
        print("Recall", recall, "Precision", precision, "F1", f1)
    if majority_prop:
        print("Vote proportion", majority_prop)
    if majority_nr:
        print("Vote points", majority_nr)
    prec_recall_f1(tp, fp, tn, fn)

        

In [164]:
predict_per_user(writings_df=writings_df, voc=voc, majority_prop=0.2)

Loading data...
0 training users,  1  test users.
[391] [3] 0
Loading data...
0 training users,  1  test users.
[21] [1] 0
Loading data...
0 training users,  1  test users.
[775] [115] 0
Loading data...
0 training users,  1  test users.
[1279] [101] 0
Loading data...
0 training users,  1  test users.
[1156] [16] 0
Loading data...
0 training users,  1  test users.
[24] [5] 1
Loading data...
0 training users,  1  test users.
[18] [1] 0
Loading data...
0 training users,  1  test users.
[44] [2] 0
Loading data...
0 training users,  1  test users.
[13] [3] 1
Loading data...
0 training users,  1  test users.
[47] [9] 0
Loading data...
0 training users,  1  test users.
[1108] [45] 0
Loading data...
0 training users,  1  test users.
[281] [28] 0
Loading data...
0 training users,  1  test users.
[115] [4] 0
Loading data...
0 training users,  1  test users.
[203] [75] 1
Loading data...
0 training users,  1  test users.
[248] [20] 0
Loading data...
0 training users,  1  test users.
[29] [26] 1
Lo

## Extra analysis


In [61]:
def merge_tokens(row):
    tokens = []
    if row.tokenized_text:
        tokens += row.tokenized_text
    if row.tokenized_title:
        tokens += row.tokenized_title
    return tokens
writings_df['all_tokens'] = writings_df.apply (lambda row: merge_tokens(row), axis=1)

In [123]:
# TODO: include the title
def extract_emotions(tokens, emotion, relative=True):
    if not tokens:
        return None
    emotion_words = [t for t in tokens 
                     if t in nrc_lexicon[emotion]]
    if relative:
        return len(emotion_words) / len(tokens)
    else:
        return len(emotion_words)
    
    return encoded_emotions

from functools import partial
for emotion in emotions:
    writings_df[emotion] = writings_df['all_tokens'].apply(partial(extract_emotions, emotion=emotion, 
                                                                   relative=True))


In [124]:
writings_df['pronouns'] = writings_df['all_tokens'].apply(partial(encode_pronouns, relative=True))

In [100]:
writings_df[['text', 'label', 'pronouns', 'text_len'] + emotions].corr()

Unnamed: 0,label,pronouns,text_len,anger,anticipation,disgust,fear,joy,negative,positive,sadness,surprise,trust
label,1.0,0.104269,0.011986,0.020197,0.031982,0.031271,0.019335,0.040782,0.023853,0.023621,0.032969,0.020421,0.02359
pronouns,0.104269,1.0,0.636745,0.449384,0.567496,0.452098,0.464899,0.54857,0.513029,0.571303,0.524614,0.461328,0.538335
text_len,0.011986,0.636745,1.0,0.708853,0.791715,0.64298,0.738146,0.728836,0.823974,0.867609,0.723653,0.65042,0.834939
anger,0.020197,0.449384,0.708853,1.0,0.643459,0.762591,0.858442,0.564162,0.835345,0.681573,0.774846,0.583704,0.671042
anticipation,0.031982,0.567496,0.791715,0.643459,1.0,0.573916,0.668326,0.834784,0.684882,0.849864,0.668269,0.727331,0.818885
disgust,0.031271,0.452098,0.64298,0.762591,0.573916,1.0,0.729799,0.526733,0.765865,0.603013,0.737717,0.540439,0.589641
fear,0.019335,0.464899,0.738146,0.858442,0.668326,0.729799,1.0,0.570632,0.862778,0.706676,0.824782,0.569688,0.687232
joy,0.040782,0.54857,0.728836,0.564162,0.834784,0.526733,0.570632,1.0,0.604964,0.850961,0.603296,0.72271,0.811529
negative,0.023853,0.513029,0.823974,0.835345,0.684882,0.765865,0.862778,0.604964,1.0,0.735431,0.840379,0.597634,0.706808
positive,0.023621,0.571303,0.867609,0.681573,0.849864,0.603013,0.706676,0.850961,0.735431,1.0,0.702751,0.678778,0.916526


In [99]:
writings_df[['text', 'label', 'pronouns', 'text_len'] + emotions].groupby('label').mean()

Unnamed: 0_level_0,pronouns,text_len,anger,anticipation,disgust,fear,joy,negative,positive,sadness,surprise,trust
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,0.868213,32.031615,0.386069,0.58984,0.263683,0.478014,0.479908,0.8188,1.280788,0.385315,0.28479,0.83056
1,2.484271,36.398389,0.529232,0.86985,0.416203,0.654371,0.769766,1.152422,1.717428,0.627088,0.375418,1.128341


In [108]:
from nltk.sentiment import SentimentAnalyzer, SentimentIntensityAnalyzer

In [112]:
sid = SentimentIntensityAnalyzer()


In [115]:
sid.polarity_scores("We are here today happiness is all around")

{'neg': 0.0, 'neu': 0.66, 'pos': 0.34, 'compound': 0.5574}

In [117]:
writings_df['neg_vader'] = writings_df.text.apply(lambda t: sid.polarity_scores(t)['neg']
                                                 if type(t)==str else 0)

In [118]:
writings_df

Unnamed: 0,subject,title,date,text,label,tokenized_title,title_len,tokenized_text,text_len,emotions,...,fear,joy,negative,positive,sadness,surprise,trust,pronouns,all_tokens,neg_vader
0,subject8292,If anyone could help with which sub to put thi...,2016-08-02 09:22:12,,0,"[if, anyone, could, help, with, which, sub, to...",11.0,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[if, anyone, could, help, with, which, sub, to...",0.000
1,subject8292,I'm literally never gonna stop waiting...,2016-08-05 09:35:55,,0,"[i, m, literally, never, gonna, stop, waiting]",7.0,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,"[i, m, literally, never, gonna, stop, waiting]",0.000
2,subject8292,This is a really interesting study! Makes sens...,2016-08-05 21:36:24,,0,"[this, is, a, really, interesting, study, make...",9.0,,,,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,"[this, is, a, really, interesting, study, make...",0.000
3,subject8292,The only thing Frank is building ...,2016-08-07 23:35:23,"... Is hype. Think about it, every time he wor...",0,"[the, only, thing, frank, is, building]",6.0,"[is, hype, think, about, it, every, time, he, ...",26.0,0.000000,...,0.0,0.0,3.0,3.0,0.0,0.0,1.0,0.0,"[is, hype, think, about, it, every, time, he, ...",0.000
4,subject8292,Mostly always me during this whole charade,2016-08-09 08:39:41,,0,"[mostly, always, me, during, this, whole, char...",7.0,,,,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,"[mostly, always, me, during, this, whole, char...",0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170693,subject217,,2018-08-19 11:29:21,"this is my personal experience ,it may not ref...",0,,,"[this, is, my, personal, experience, it, may, ...",153.0,0.026144,...,1.0,1.0,1.0,7.0,0.0,1.0,4.0,4.0,"[this, is, my, personal, experience, it, may, ...",0.089
170694,subject217,,2018-08-19 16:17:34,stop looking at 20 million saudis as one entit...,0,,,"[stop, looking, at, 20, million, saudis, as, o...",15.0,0.000000,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,"[stop, looking, at, 20, million, saudis, as, o...",0.145
170695,subject217,,2018-08-19 20:00:31,i am aware of stats now and then. i was just s...,0,,,"[i, am, aware, of, stats, now, and, then, i, w...",198.0,0.030303,...,2.0,3.0,4.0,11.0,3.0,0.0,6.0,16.0,"[i, am, aware, of, stats, now, and, then, i, w...",0.070
170696,subject217,WHAT DID YOU SAY TO ME?,2018-08-20 10:54:11,,0,"[what, did, you, say, to, me]",6.0,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,"[what, did, you, say, to, me]",0.000


In [119]:
writings_df['pos_vader'] = writings_df.text.apply(lambda t: sid.polarity_scores(t)['pos']
                                                 if type(t)==str else 0)

In [120]:
writings_df[['text', 'label', 'pronouns', 'text_len', 'neg_vader', 'pos_vader'] + emotions].groupby('label').mean()

Unnamed: 0_level_0,pronouns,text_len,neg_vader,pos_vader,anger,anticipation,disgust,fear,joy,negative,positive,sadness,surprise,trust
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,0.868213,32.031615,0.054259,0.109981,0.386069,0.58984,0.263683,0.478014,0.479908,0.8188,1.280788,0.385315,0.28479,0.83056
1,2.484271,36.398389,0.079191,0.148154,0.529232,0.86985,0.416203,0.654371,0.769766,1.152422,1.717428,0.627088,0.375418,1.128341


In [125]:
writings_df[['text', 'label', 'pronouns', 'text_len', 'neg_vader', 'pos_vader'] + emotions].corr('spearman')

Unnamed: 0,label,pronouns,text_len,neg_vader,pos_vader,anger,anticipation,disgust,fear,joy,negative,positive,sadness,surprise,trust
label,1.0,0.0978,0.033477,0.06717,0.065211,0.022057,0.025666,0.030664,0.019114,0.033977,0.022934,0.01959,0.032641,0.018109,0.024014
pronouns,0.0978,1.0,0.332071,0.193938,0.221419,0.076345,0.12803,0.094069,0.063176,0.144011,0.07667,0.106055,0.100827,0.10679,0.122914
text_len,0.033477,0.332071,1.0,0.343154,0.159673,0.36046,0.386351,0.312393,0.38141,0.339398,0.37025,0.330075,0.384031,0.349498,0.38962
neg_vader,0.06717,0.193938,0.343154,1.0,0.169624,0.38451,0.141868,0.362582,0.339245,0.126042,0.431111,0.099767,0.374256,0.159302,0.14306
pos_vader,0.065211,0.221419,0.159673,0.169624,1.0,0.079693,0.225925,0.087309,0.07145,0.323148,0.058266,0.270687,0.09504,0.186243,0.231954
anger,0.022057,0.076345,0.36046,0.38451,0.079693,1.0,0.196795,0.583864,0.58746,0.157202,0.631708,0.128169,0.52898,0.273195,0.169261
anticipation,0.025666,0.12803,0.386351,0.141868,0.225925,0.196795,1.0,0.164649,0.241958,0.583107,0.178827,0.452457,0.198972,0.460851,0.469028
disgust,0.030664,0.094069,0.312393,0.362582,0.087309,0.583864,0.164649,1.0,0.440376,0.152731,0.552021,0.116588,0.490181,0.232166,0.153723
fear,0.019114,0.063176,0.38141,0.339245,0.07145,0.58746,0.241958,0.440376,1.0,0.159907,0.576962,0.141985,0.583703,0.24816,0.18424
joy,0.033977,0.144011,0.339398,0.126042,0.323148,0.157202,0.583107,0.152731,0.159907,1.0,0.1134,0.645827,0.17644,0.477317,0.58292


### LIWC

In [41]:
from liwc_readDict import readDict

liwc = readDict('/home/ana/resources/FakeOrFact/features/LIWC/LIWC/liwc.dic')

In [48]:
categories = [c for (w,c) in liwc]
set(categories)

{'achieve',
 'adverb',
 'affect',
 'anger',
 'anx',
 'article',
 'assent',
 'auxverb',
 'bio',
 'body',
 'cause',
 'certain',
 'cogmech',
 'conj',
 'death',
 'discrep',
 'excl',
 'family',
 'feel',
 'filler',
 'friend',
 'funct',
 'future',
 'health',
 'hear',
 'home',
 'humans',
 'i',
 'incl',
 'ingest',
 'inhib',
 'insight',
 'ipron',
 'leisure',
 'money',
 'motion',
 'negate',
 'negemo',
 'nonfl',
 'number',
 'past',
 'percept',
 'posemo',
 'ppron',
 'preps',
 'present',
 'pronoun',
 'quant',
 'relativ',
 'relig',
 'sad',
 'see',
 'sexual',
 'shehe',
 'social',
 'space',
 'swear',
 'tentat',
 'they',
 'time',
 'verb',
 'we',
 'work',
 'you'}

In [49]:
liwc

[['a', 'funct'],
 ['a', 'article'],
 ['abandon*', 'affect'],
 ['abandon*', 'negemo'],
 ['abandon*', 'sad'],
 ['abandon*', 'cogmech'],
 ['abandon*', 'inhib'],
 ['abdomen*', 'bio'],
 ['abdomen*', 'body'],
 ['abilit*', 'achieve'],
 ['able*', 'achieve'],
 ['abortion*', 'bio'],
 ['abortion*', 'health'],
 ['abortion*', 'sexual'],
 ['about', 'funct'],
 ['about', 'adverb'],
 ['about', 'preps'],
 ['above', 'funct'],
 ['above', 'preps'],
 ['above', 'space'],
 ['above', 'relativ'],
 ['abrupt*', 'time'],
 ['abrupt*', 'relativ'],
 ['abs', 'bio'],
 ['abs', 'body'],
 ['absent*', 'work'],
 ['absolute', 'cogmech'],
 ['absolute', 'certain'],
 ['absolutely', 'funct'],
 ['absolutely', 'adverb'],
 ['absolutely', 'cogmech'],
 ['absolutely', 'certain'],
 ['absolutely', 'assent'],
 ['abstain*', 'cogmech'],
 ['abstain*', 'inhib'],
 ['abuse*', 'affect'],
 ['abuse*', 'negemo'],
 ['abuse*', 'anger'],
 ['abusi*', 'affect'],
 ['abusi*', 'negemo'],
 ['abusi*', 'anger'],
 ['academ*', 'work'],
 ['accept', 'affect'],
 

In [50]:
liwc_dict = {}
for (w, c) in liwc:
    if c not in liwc_dict:
        liwc_dict[c] = []
    liwc_dict[c].append(w)


In [86]:
liwc_dict['pronoun']

['anybod*',
 'anyone*',
 'anything',
 'everybod*',
 'everyone*',
 'everything*',
 'he',
 "he'd",
 "he's",
 'hed',
 'her',
 'hers',
 'herself',
 'hes',
 'him',
 'himself',
 'his',
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'id',
 'im',
 'it',
 "it'd",
 "it'll",
 "it's",
 'itd',
 'itll',
 'its',
 'itself',
 'ive',
 "let's",
 'lets',
 'me',
 'mine',
 'my',
 'myself',
 'nobod*',
 'oneself',
 'other',
 'others',
 'our',
 'ours',
 'ourselves',
 'she',
 "she'd",
 "she'll",
 "she's",
 'shes',
 'somebod*',
 'someone*',
 'something*',
 'somewhere',
 'stuff',
 'that',
 "that'd",
 "that'll",
 "that's",
 'thatd',
 'thatll',
 'thats',
 'thee',
 'their*',
 'them',
 'themselves',
 'these',
 'they',
 "they'd",
 "they'll",
 "they've",
 'theyd',
 'theyll',
 'theyve',
 'thine',
 'thing*',
 'this',
 'those',
 'thou',
 'thoust',
 'thy',
 'us',
 'we',
 "we'd",
 "we'll",
 "we're",
 "we've",
 'weve',
 'what',
 "what's",
 'whatever',
 'whats',
 'which',
 'whichever',
 'who',
 "who'd",
 "who'll",
 'whod',
 'wholl'

In [55]:
def encode_liwc_categories(tokens, category_words, relative=True):
    category_cnt = 0
    if not tokens:
        return None
    text_len = len(tokens)
    for word in category_words:
        for t in tokens:
            if t==word or (word[-1]=='*' and t.startswith(word[:-1])):
                category_cnt += 1
    if relative:
        return category_cnt/text_len
    else:
        return category_cnt

In [83]:
%%time
from functools import partial
# for categ in ['negemo', 'posemo', 'affect', 'sad', 'anx', 'pronoun']:#liwc_dict.keys():
for categ in ['pronoun']:#liwc_dict.keys():
    print("Computing for category %s..." % categ)
    writings_df[categ] = writings_df['all_tokens'].apply(partial(encode_liwc_categories, 
                                                                   category_words=liwc_dict[categ], 
                                                                   relative=True))


Computing for category pronoun...
CPU times: user 1min 13s, sys: 37.9 ms, total: 1min 13s
Wall time: 1min 33s


In [84]:
writings_df[['label', 'negemo', 'posemo', 'affect', 'sad', 'anx', 'pronoun']].corr()

Unnamed: 0,label,negemo,posemo,affect,sad,anx,pronoun
label,1.0,0.007687,0.008943,0.012005,0.00377,0.009488,0.071618
negemo,0.007687,1.0,-0.058048,0.456381,0.356025,0.303006,-0.011327
posemo,0.008943,-0.058048,1.0,0.860121,-0.020983,-0.024652,-0.01562
affect,0.012005,0.456381,0.860121,1.0,0.162322,0.13199,-0.019918
sad,0.00377,0.356025,-0.020983,0.162322,1.0,0.00473,0.006895
anx,0.009488,0.303006,-0.024652,0.13199,0.00473,1.0,-0.005346
pronoun,0.071618,-0.011327,-0.01562,-0.019918,0.006895,-0.005346,1.0


In [85]:
writings_df[['label', 'negemo', 'posemo', 'affect', 'sad', 'anx', 'pronoun']].groupby('label').mean()

Unnamed: 0_level_0,negemo,posemo,affect,sad,anx,pronoun
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.023493,0.0508,0.074548,0.003242,0.002606,0.120154
1,0.026116,0.056145,0.082611,0.003706,0.003591,0.16231


In [66]:
writings_df.columns

Index(['subject', 'title', 'date', 'text', 'label', 'tokenized_title',
       'title_len', 'tokenized_text', 'text_len', 'all_tokens', 'funct',
       'article', 'negemo'],
      dtype='object')

## Hyperparameter tuning

In [133]:
# Declare your hyperparameters search:
tune_epochs=15
config = {
      "algorithm": "random",
      "parameters": {
          "lstm_units": {"type": "integer", "min": 10, "max": 100},
          "dense_bow_units": {"type": "integer", "min": 1, "max": 20},
          "lr": {"type": "float", "min": 0.00001, "max": 0.05, "scalingType": "loguniform"},
          "l2_dense": {"type": "float", "min": 0.00001, "max": 0.5, "scalingType": "loguniform"},
          "dropout": {"type": "float", "min": 0, "max": 0.7, "scalingType": "uniform"},
          "optimizer": {"type": "categorical", "values": ["adam", "adagrad", ""]},
          "batch_size": {"type": "integer", "min": 10, "max": 512, "scalingType": "loguniform"},
          "positive_class_weight": {"type": "integer", "min": 1, "max": 25},
          "trainable_embeddings": {"type": "discrete", "values": [True, False]},
          "freeze_patience": {"type": "integer", "min": 2, "max": tune_epochs+1},
          "lr_reduce_factor": {"type": "float", "min": 0.0001, "max": 0.8},
          "lr_reduce_patience": {"type": "integer", "min": 2, "max": tune_epochs+1},
          "decay": {"type": "float", "min": 0.00000001, "max": 0.5, "scalingType": "loguniform"},          
      },
      "spec": {
          "metric": "loss",
          "objective": "minimize",
      },
  }
optimizer = Optimizer(config, api_key="eoBdVyznAhfg3bK9pZ58ZSXfv")

for experiment in optimizer.get_experiments(project_name="mental"):
    experiment.add_tag("tune")
    
    # Test the model
    hyperparams_config = {
        param: experiment.get_parameter(param) for param in config['parameters'].keys()}
    if not hyperparams_config['optimizer']:
        hyperparams_config['optimizer'] = optimizers.Adam(lr=hyperparams_config['lr'], 
                                   decay=hyperparams_config['decay'])
    model = build_model(hyperparams=hyperparams_config,
                        hyperparams_features=hyperparams_features, 
                        embedding_matrix=embedding_matrix, emotions=emotions,
                       stopwords_list=stopword_list)
    freeze_layer = FreezeLayer(patience=experiment.get_parameter('freeze_patience'),
                              set_to=not experiment.get_parameter('trainable_embeddings'))
    reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_loss', 
                                            factor=experiment.get_parameter('lr_reduce_factor'),
                                            patience=experiment.get_parameter('lr_reduce_patience'), 
                                            min_lr=0.000001, verbose=1)
    history = train_model(model, 
            x_train, y_train, x_test, y_test,
            epochs=tune_epochs, batch_size=experiment.get_parameter('batch_size'),
                      class_weight={0:1, 1:experiment.get_parameter('positive_class_weight')}, 
                          workers=2,
                          callback_list = [freeze_layer, reduce_lr],
                      model_path='models/experiment')
    loss = history.history['loss'][-1]
    
    # Report the loss, if not auto-logged:
    experiment.log_metric("loss", loss)

COMET INFO: COMET_OPTIMIZER_ID=786fc2b3654047e69f492db122f55b95
COMET INFO: Using optimizer config: {'algorithm': 'random', 'configSpaceSize': 600000000000, 'endTime': None, 'id': '786fc2b3654047e69f492db122f55b95', 'lastUpdateTime': None, 'maxCombo': 0, 'name': '786fc2b3654047e69f492db122f55b95', 'parameters': {'batch_size': {'max': 512, 'min': 10, 'scalingType': 'loguniform', 'type': 'integer'}, 'decay': {'max': 0.5, 'min': 1e-08, 'scalingType': 'loguniform', 'type': 'float'}, 'dense_bow_units': {'max': 20, 'min': 1, 'scalingType': 'uniform', 'type': 'integer'}, 'dropout': {'max': 0.7, 'min': 0, 'scalingType': 'uniform', 'type': 'float'}, 'freeze_patience': {'max': 16, 'min': 2, 'scalingType': 'uniform', 'type': 'integer'}, 'l2_dense': {'max': 0.5, 'min': 1e-05, 'scalingType': 'loguniform', 'type': 'float'}, 'lr': {'max': 0.05, 'min': 1e-05, 'scalingType': 'loguniform', 'type': 'float'}, 'lr_reduce_factor': {'max': 0.8, 'min': 0.0001, 'scalingType': 'uniform', 'type': 'float'}, 'lr_r

Train...
Train on 111375 samples, validate on 31863 samples
Epoch 1/15
Epoch 00001: val_loss improved from inf to 0.48204, saving model to models/experiment_best
Epoch 2/15

KeyboardInterrupt: 