# An LSTM Neural Net to predict depression, or depression symptoms based on the PHQ-8 from transcribed verbal data (utterances)

* this code is exploratory, may contain errors!
* python 2 with Tensorflow
* prediction options include: depression level (continuous, low/med/high, or binary), level for each depression symptom based on PHQ-8 (low/med/high, or binary). Need to change the cost function in the code depending on the outcome chosen. 
* data is unbalanced, so the code includes options for 1) undersampling the majority class (lower levels of depression) and 2) cost-sensitive learning
* various options for dropout and L1/L2 regularization

Unique features of this model compared to other frameworks explored:
* includes an embedding layer prior to the LSTM layer. Here, the embeddings are learned along the way, not pre-trained. 
* set up for multi-task learning, to try predicting multiple outcomes (e.g., predict two symptoms rather than just one). The goal of this is to improve generalizability of the model. 
* sequences of various lengths may be used, or a sliding window of words (to increase sample size and variability of utterances)
* debug_sentences allows us to write the sentences in each part of the confusion matrix to a file for later examination of the errors


Alina notes to self:
* data corresponding to IDs without meta-information and data corresponding to missing target information are excluded in Load_Data()
* for variable coding see codebook excel

Helpful tutorials:
* https://github.com/nfmcclure/tensorflow_cookbook/blob/master/09_Recurrent_Neural_Networks/02_Implementing_RNN_for_Spam_Prediction/02_implementing_rnn.py
* Long Short Term Memory paper: http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf; Author: Aymeric Damien; Project: https://github.com/aymericdamien/TensorFlow-Examples/    

In [1]:
import os
import re
import io
import requests
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import random
from zipfile import ZipFile
from tensorflow.python.framework import ops
ops.reset_default_graph()

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import sys
import csv
from collections import deque
from itertools import islice


import matplotlib.mlab as mlab
import matplotlib.pyplot as plt

cwd= os.getcwd()

In [2]:
# ==========
#   MODEL
# Some LSTM code and code structure reused from M. Morales
# ==========
class BaseConfiguration:
    """Container for the training, network and data-preparation settings."""

    def __init__(self):
        # ---- Training ----
        # Also try .001; accuracy and the confusion matrix may be more stable
        # with a lower learning rate.
        self.learning_rate = 0.0001
        # Was 50,000,000. Consider reducing further if testing accuracy
        # becomes stable (or starts dropping) earlier, to prevent overfitting.
        self.training_iters = 3000000
        self.batch_size = 50
        self.display_step = 1
        self.val_step = 50
        # Beta term on the L2 weight penalty; 0.0 disables the L2 penalty.
        self.L2_penalty = 0.4

        # ---- Network ----
        # Sequence max length. With rolling windows this is also the window
        # size (i.e., how many words are in a window).
        self.seq_max_len = 115
        self.seq_min_len = 7
        self.embedding_size = 30
        # Hidden-layer number of features, per layer. 15 seemed too small
        # (did not learn beyond 50%).
        self.n_hidden = 30
        self.n_classes = 2
        # Keep at 1 for now; the multi-layer path should be double-checked
        # before use.
        # https://r2rt.com/recurrent-neural-networks-in-tensorflow-ii.html
        self.num_layers = 1
        # Use 1 for no dropout. Dropout is applied on the LSTM layer only
        # (inputs AND outputs), and on every layer when there are several.
        self.keep_prob = 0.9

        # ---- Data preparation / debugging ----
        # If True, sentences for each confusion-matrix cell are written to
        # the folder "sentence_validation".
        self.debug_sentences = False
        # Won't work well below 15.
        self.min_word_frequency = 15
        # Proportion of IDs (those present in the meta data with the chosen
        # target value) held out as validation data.
        self.validation_IDs_proportion = 0.15
        # Balance the training classes by undersampling.
        self.balance_classes = True
        # Number of words to jump to reach the next rolling window.
        self.step_windows = 1
        # Skip the first `toss` words of each transcript to get past the
        # small-talk; 50-75 looked best after reading a few transcripts.
        self.toss = 75

    def printConfiguration(self):
        """Print the main training and network settings, one per line."""
        reported = (
            ('learning_rate', self.learning_rate),
            ('training_iters', self.training_iters),
            ('batch_size', self.batch_size),
            ('L2_penalty', self.L2_penalty),
            # Network parameters
            ('seq_max_len', self.seq_max_len),
            ('seq_min_len', self.seq_min_len),
            ('embedding_size', self.embedding_size),
            ('n_hidden', self.n_hidden),
            ('n_classes', self.n_classes),
            ('num_layers', self.num_layers),
            ('keep_prob (dropout = 1-keep_prob)', self.keep_prob),
        )
        print('---------- Configuration: ----------')
        for label, value in reported:
            print(label, value)
        print('------------------------------------')


# Module-wide configuration instance read by the cells that follow.
config = BaseConfiguration()

Reset the tensorflow graph

In [3]:
tf.reset_default_graph() # reset the TensorFlow graph if one has already been built; after a reset, the cells below must be re-run to recreate the graph variables.

Set up Tensorflow Graph Input

In [4]:
# tf Graph input: placeholders fed at run time, plus the output-layer
# parameters for the two prediction tasks (suffix 1 and suffix 2).
#x = tf.placeholder(tf.float32, [None, config.seq_max_len]) # alternative placeholder for x, for use with pre-trained embeddings
y1 = tf.placeholder(tf.float32, [None, config.n_classes])
x = tf.placeholder(tf.int32, [None, config.seq_max_len]) # placeholder for x used when the embedding layer is trained (integer inputs)
y2= tf.placeholder(tf.float32, [None, config.n_classes])

#x = tf.placeholder(tf.float32, [None, config.seq_max_len, config.embedding_size]) # number of examples, number of inputs, dimension of each input

keep_prob = tf.placeholder(tf.float32)
# A placeholder for indicating each sequence length
seqlen = tf.placeholder(tf.int32, [None])

# Define the output-layer weights and biases for the first task
weights1 = {
    'out': tf.Variable(tf.random_normal([config.n_hidden, config.n_classes], seed= 43)) # weights are initialized from a normal random variable
}
biases1 = {
    'out': tf.Variable(tf.random_normal([config.n_classes], seed= 43))  # biases are initialized from a normal random variable
}


# Same shapes and seed for the second task
weights2 = {
    'out': tf.Variable(tf.random_normal([config.n_hidden, config.n_classes], seed= 43)) # weights are initialized from a normal random variable
}
biases2 = {
    'out': tf.Variable(tf.random_normal([config.n_classes], seed= 43))  # biases are initialized from a normal random variable
}


In [5]:
def random_batch(x, y1, y2,  sequence_length, part_size=0, sentences=None):
    """Draw a random batch of aligned examples, without replacement.

    The same randomly chosen positions are taken from every input so that
    the returned lists stay aligned with each other.

    Args:
        x: indexable collection of inputs.
        y1: labels for the first task, aligned with ``x``.
        y2: labels for the second task, aligned with ``x``.
        sequence_length: per-example sequence lengths, aligned with ``x``.
        part_size: slice bound for the batch. NOTE: the slice end is
            ``part_size - 1`` (kept as-is so existing callers see the same
            batch sizes), so ``part_size - 1`` examples are returned, and
            the default ``0`` returns every example except one.
        sentences: optional raw sentences aligned with ``x``.

    Returns:
        Tuple ``(batch_x, batch_y1, batch_y2, batch_sequence_length,
        batch_sentences)`` of lists; ``batch_sentences`` is empty when
        ``sentences`` is None.
    """
    # list(...) is required on Python 3, where range() is immutable and
    # cannot be shuffled in place.
    indexes = list(range(len(x)))
    random.shuffle(indexes)
    chosen = indexes[0:part_size-1]

    batch_x = [x[index] for index in chosen]
    batch_y1 = [y1[index] for index in chosen]
    batch_y2 = [y2[index] for index in chosen]
    batch_sequence_length = [sequence_length[index] for index in chosen]
    if sentences is not None:
        batch_sentences = [sentences[index] for index in chosen]
    else:
        batch_sentences = []
    return batch_x, batch_y1, batch_y2, batch_sequence_length, batch_sentences

def last_relevant(output, length):
    """Select, for every example, the RNN output at its last valid time step.

    ``output`` is flattened to rows of ``output_size`` features and row
    ``b * max_length + (length[b] - 1)`` is gathered for each batch entry
    ``b``.

    Args:
        output: rank-3 tensor whose dimensions 1 and 2 are statically known;
            indexed here as (batch, time step, feature).
        length: per-example sequence lengths (presumably an int32 tensor of
            shape [batch] -- confirm against the caller).

    Returns:
        Tensor with one row of ``output_size`` features per batch entry.
    """
    # Use a local for the dynamic batch size. The previous code assigned it
    # to config.batch_size, replacing the integer setting with a tensor for
    # everything that reads the configuration afterwards.
    batch_size = tf.shape(output)[0]
    max_length = int(output.get_shape()[1])
    output_size = int(output.get_shape()[2])
    index = tf.range(0, batch_size) * max_length + (length - 1)
    flat = tf.reshape(output, [-1, output_size])
    relevant = tf.gather(flat, index)
    return relevant

In [6]:
# Number of comma-separated columns expected in every meta-data row.
_META_COLUMN_COUNT = 22


def _level_to_int(level, n_classes):
    """Map a 'low'/'mid'/'high' level string to its integer class code.

    'low' -> 0 and 'mid' -> 1. 'high' -> 1 when ``n_classes`` is 2 (mid and
    high are merged) and 2 when ``n_classes`` is 3. Any other value (e.g.
    'NA'), or 'high' under another ``n_classes``, is returned unchanged.
    """
    if level == 'low':
        return 0
    if level == 'mid':
        return 1
    if level == 'high':
        if n_classes == 2:
            return 1
        if n_classes == 3:
            return 2
    return level


def import_meta_data(path):
    """Import the per-participant labels from the meta-data CSV.

    The first line of the file is treated as a header and skipped. Every
    other non-blank line must hold 22 comma-separated columns: ID,
    depression, dep_binary2, depression_value, depression_level_official,
    Morethan7PHQsympt_Available, eight raw PHQ symptom columns and eight
    PHQ symptom ``*_level`` columns.

    Level strings are converted to integer class codes according to
    ``config.n_classes`` (see ``_level_to_int``); values that cannot be
    mapped, such as 'NA', stay as strings.

    Args:
        path: path of the CSV file.

    Returns:
        Dict mapping the stripped ID string to the list
        ``[depression, dep_binary2, depression_value,
        depression_level_official, NoInterest_level, Depressed_level,
        Sleep_level, Tired_level, Appetite_level, Failure_level,
        Concentrating_level, Moving_level]``; the caller picks a target by
        its index in this list.

    Raises:
        ValueError: if a non-blank row does not have 22 columns.
    """
    n_classes = config.n_classes
    labels = {}
    # The handle is not called `csv`, so it no longer shadows the csv module.
    with open(path, 'r') as meta_file:
        dataset = meta_file.readlines()

    for row in dataset[1:]:
        row = row.strip()
        if not row:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        fields = row.split(',')
        if len(fields) != _META_COLUMN_COUNT:
            raise ValueError('expected %d columns in meta-data row, got %d: %r'
                             % (_META_COLUMN_COUNT, len(fields), row))

        ID, depression, dep_binary2, depression_value = fields[0:4]

        # The overall level is only recoded for 2- or 3-class setups.
        depression_level_official = fields[4].strip()
        if n_classes in (2, 3):
            depression_level_official = _level_to_int(depression_level_official, n_classes)

        # There are NAs in this column, so it is read as a string, not a number.
        if dep_binary2 == '0':
            dep_binary2 = 0
        elif dep_binary2 == '1':
            dep_binary2 = 1

        # Columns 14..21 are the eight PHQ symptom levels, in file order.
        symptom_levels = [_level_to_int(level.strip(), n_classes) for level in fields[14:22]]

        labels[ID.strip()] = [depression, dep_binary2, depression_value, depression_level_official] + symptom_levels
    return labels
        

In [7]:
#meta_data = import_meta_data('Meta_Data_with_Symptoms.csv')
#meta_data['311']

Load Transcript Data, to eventually reshape into moving windows of utterances

In [8]:
data_file_path= cwd + '/REcleaned_utterances_data_July2017/Cleaned_Utterances_by_ID/'

# Load the utterances of every participant transcript.
# utterances[k] is the word list of one transcript (after skipping the first
# config.toss words), IDs_list[k] is the matching participant ID and
# lengths[k] is the full transcript length in words (before skipping).
utterances=[]
IDs_list= []

files= os.listdir(data_file_path)
#files= ['Utterances771.txt', 'Utterances300.txt']
lengths=[]

for ppt_file in files:
    path= data_file_path + str(ppt_file)
    # `with` guarantees the file is closed even if reading fails.
    with open(path,'r') as f:
        lines = f.read().split() # all whitespace-separated words of the transcript
    # Keep only transcripts that still have at least one full window left
    # after the first config.toss words are dropped.
    if len(lines) > ((config.toss+config.seq_max_len) -1):
        skip=config.toss
        utterances.append(lines[skip:]) # skip the first 'toss' words to get past the small-talk
        ID=re.sub('Utterances', '', ppt_file)
        ID=re.sub(r'\.txt', '', ID) # escaped dot: remove the literal ".txt" only
        IDs_list.append(ID)
        lengths.append(len(lines))

#Plot distribution of transcript lengths
'''
plt.hist(lengths, 300, normed=1, facecolor='green', alpha=0.75)
plt.xlabel('Transcript Length')
plt.ylabel('Probability')
plt.title('Distribution of Transcript Lengths')
plt.show()

#statistics on transcript lengths
import statistics
from fractions import Fraction as F
from decimal import Decimal as D

low=0
print(statistics.mean(lengths), statistics.median(lengths), min(lengths), max(lengths))
for i in lengths:
    if i<200:
        low=low+1
'''

"\nplt.hist(lengths, 300, normed=1, facecolor='green', alpha=0.75)\nplt.xlabel('Transcript Length')\nplt.ylabel('Probability')\nplt.title('Distribution of Transcript Lengths')\nplt.show()\n\n#statistics on transcript lengths\nimport statistics\nfrom fractions import Fraction as F\nfrom decimal import Decimal as D\n\nlow=0\nprint(statistics.mean(lengths), statistics.median(lengths), min(lengths), max(lengths))\nfor i in lengths:\n    if i<200:\n        low=low+1\n"

Get Transcript Data into moving windows of utterances. Window size and jump between windows is set in config. 

In [9]:
#adjusted from: https://stackoverflow.com/questions/6822725/rolling-or-sliding-window-iterator-in-python
def sliding_window(iterable, size, step, fillvalue=' '):
    """Yield successive windows of ``size`` items taken from ``iterable``.

    The first window holds the first ``size`` items, padded at the end with
    ``fillvalue`` if the input is shorter. Each later window is advanced by
    up to ``step`` items; when the input runs out part-way through a step,
    one final, partially advanced window is still yielded.

    Each yielded value is an iterator over the internal buffer, so consume
    it (e.g. with ``list(...)``) before requesting the next window.

    Args:
        iterable: source items (here: the words of a transcript).
        size: window length; ``0`` or an empty input yields nothing.
        step: how many items to advance between windows (>= 1).
        fillvalue: padding used when the input has fewer than ``size`` items.

    Raises:
        ValueError: if ``size < 0`` or ``step < 1`` (raised when the
            generator is first advanced).
    """
    if size < 0 or step < 1:
        raise ValueError('size must be >= 0 and step must be >= 1')
    it = iter(iterable)
    q = deque(islice(it, size), maxlen=size)
    if not q:
        return  # empty iterable or size == 0
    q.extend(fillvalue for _ in range(size - len(q)))  # pad to size
    while True:
        yield iter(q)  # iter() to avoid accidental outside modifications
        try:
            q.append(next(it))
        except StopIteration:
            return
        # islice stops quietly when the source is exhausted. Calling next(it)
        # inside a generator expression here raises RuntimeError under
        # PEP 479 (Python 3.7+) once the input runs out mid-step.
        q.extend(islice(it, step - 1))
        
'''Testing:

utts=[]
for i in sliding_window(trial, 10, 5):
    utts.append(list(i))

trial='um my family moved to the u_s and then i moved down here eventually for college uh it took a long time to ive been living here'
trial=trial.split( )
len(trial)

for i in utts:
    print(i)
'''
# Cut every transcript into rolling windows of config.seq_max_len words,
# advancing config.step_windows words at a time. Each entry of
# utterances_seqs is [window text, participant ID].
utterances_seqs=[]
for i, transcript_words in enumerate(utterances):
    for j in sliding_window(transcript_words, config.seq_max_len, config.step_windows):
        # Build the pair directly. The previous join-then-split(',') round
        # trip put the ID in the wrong position whenever the window text
        # itself contained a comma.
        utterances_seqs.append([' '.join(j), str(IDs_list[i])])

Function for merging utterance window data and meta, and shaping for the RNN

In [None]:
missing_meta=[] # only for inspecting IDs that aren't in the meta-data file (one entry per window), and so are not used in the RNN code
IDs_NAs_inMeta=[] # only for inspecting IDs that are in the meta-data file but lack a target value (one entry per window), so are not used in the RNN code


def _capped_word_count(text, max_len):
    """Return the number of single-space-separated tokens in ``text``, capped at ``max_len``."""
    return min(len(text.split(" ")), max_len)


def _class_index(label, n_classes):
    """Return ``k`` if ``label`` equals an integer class ``k`` in ``range(n_classes)``, else None."""
    for k in range(n_classes):
        if label == k:
            return k
    return None


def _one_hot(index, n_classes):
    """Return a one-hot list of floats of length ``n_classes`` with 1.0 at ``index``."""
    vector = [0.0] * n_classes
    vector[index] = 1.0
    return vector


def _undersample(pool, k):
    """Return ``k`` items drawn without replacement from ``pool``.

    The pool is returned unchanged when it holds fewer than ``k`` items
    (the classes are then already balanced enough).
    """
    if k > len(pool):
        return pool
    return random.sample(pool, k)


def load_data(target1=None, target2= None, meta_file_path='Meta_Data_with_Symptoms.csv', LIWC_file_path=cwd + '/LIWC/LIWC_features_per_ID.csv' ):
    """Merge the utterance windows with the meta data and shape them for the RNN.

    Reads the module-level ``utterances_seqs`` (list of ``[text, ID]``) and
    ``config``. Participants missing from the meta data, or whose value for
    either target is 'NA', are excluded and recorded in the module-level
    ``missing_meta`` / ``IDs_NAs_inMeta`` lists. A proportion
    ``config.validation_IDs_proportion`` of the remaining participant IDs is
    held out as validation data, so no participant appears in both sets.
    Training windows are grouped by their ``target1`` class (0, 1 or 2) and,
    when ``config.balance_classes`` is set, the lower classes are
    undersampled.

    Args:
        target1: index of the first target in the meta-data label list:
            depression_binary=0, dep_binary2=1, depression value=2,
            depression_level_official=3, PHQ_9NoInterest_level=4,
            PHQ_9Depressed_level=5, PHQ_9Sleep_level=6, PHQ_9Tired_level=7,
            PHQ_9Appetite_level=8, PHQ_9Failure_level=9,
            PHQ_9Concentrating_level=10, PHQ_9Moving_level=11.
        target2: index of the second target (same coding).
        meta_file_path: path of the meta-data CSV.
        LIWC_file_path: currently unused; kept for interface compatibility.

    Returns:
        Tuple ``(data_x, data_y1, data_y2, sequence_length, vocab_size,
        IDs_training, n_low, n_mid, n_high, validation_x, validation_y1,
        validation_y2, sequence_length_validation,
        reshaped_utterances_validation)``. ``data_x`` / ``validation_x`` are
        arrays of word-id sequences, the ``*_y*`` values are lists of one-hot
        label lists, ``n_low`` / ``n_mid`` / ``n_high`` count the training
        windows per ``target1`` class, and the last item is a list of
        ``'text, ID'`` strings.

    Raises:
        ValueError: if ``target1`` or ``target2`` is not given.
    """
    if target1 is None or target2 is None:
        # A bare `raise` has no active exception to re-raise, so raise a
        # real error that carries the message.
        raise ValueError("Target not specified: possible target classes: depression_binary=0, dep_binary2=1, depression value=2, depression_level_official=3, PHQ_9NoInterest_level=4,PHQ_9Depressed_level=5,PHQ_9Sleep_level=6,PHQ_9Tired_level=7,PHQ_9Appetite_level=8,PHQ_9Failure_level=9,PHQ_9Concentrating_level=10,PHQ_9Moving_level=11")

    meta_data = import_meta_data(meta_file_path)
    n_classes = config.n_classes
    seq_max_len = config.seq_max_len

    # Pass 1: IDs that have meta data and a value for both targets.
    # A set keeps the membership tests in pass 2 O(1).
    eligible_ids = set()
    for L in utterances_seqs:
        ID = L[1].strip()
        try:
            meta = meta_data[ID]
        except KeyError:
            missing_meta.append(ID)
            continue
        if meta[target1] != 'NA' and meta[target2] != 'NA':
            eligible_ids.add(ID)
        else:
            IDs_NAs_inMeta.append(ID) # just in case we want to track who was excluded

    # Hold out whole participants for validation; the count is the configured
    # proportion rounded to the nearest integer. Sampling from a sorted list
    # because newer Python versions reject sampling from a set.
    n_validation_ids = int(round(config.validation_IDs_proportion * len(eligible_ids)))
    validation_ids = set(random.sample(sorted(eligible_ids), n_validation_ids))

    # Pass 2: route every sufficiently long window of an eligible ID either
    # to the validation set or to the training pool of its target1 class.
    rows_low = []   # target1 == 0, may be undersampled below
    rows_mid = []   # target1 == 1, may be undersampled below
    rows_high = []  # target1 == 2, always kept in full
    utterances_validation = []
    for L in utterances_seqs:
        ID = L[1].strip()
        if _capped_word_count(L[0], seq_max_len) < config.seq_min_len:
            continue # seq_min_len itself is the shortest length kept
        if ID not in eligible_ids:
            continue
        label = meta_data[ID][target1]
        if label == 0:
            pool = rows_low
        elif label == 1:
            pool = rows_mid
        elif label == 2:
            pool = rows_high
        else:
            continue # no usable class for this target
        if ID in validation_ids:
            utterances_validation.append(L)
        else:
            pool.append(L)

    # Optional undersampling of the lower classes.
    if config.balance_classes:
        if n_classes == 2:
            # Some PHQ symptoms already have more "mid" than "low" windows;
            # the low pool is then left as it is.
            rows_low = _undersample(rows_low, len(rows_mid))
        elif n_classes == 3:
            rows_low = _undersample(rows_low, len(rows_high))
            rows_mid = _undersample(rows_mid, len(rows_high))

    # Training order: class 2 first, then class 0, then class 1.
    utterances_withIDs = rows_high + rows_low + rows_mid
    sentences = [L[0] for L in utterances_withIDs]
    IDs_training = [L[1].strip() for L in utterances_withIDs]
    sequence_length = [_capped_word_count(L[0], seq_max_len) for L in utterances_withIDs]

    sentences_validation = [L[0] for L in utterances_validation]
    sequence_length_validation = [_capped_word_count(L[0], seq_max_len) for L in utterances_validation]

    # One-hot labels for both targets. class_counts holds the number of
    # training windows in target1 classes 0, 1 and 2.
    data_y1 = []
    data_y2 = []
    class_counts = [0, 0, 0]
    for L in utterances_withIDs:
        meta = meta_data[L[1].strip()]
        k1 = _class_index(meta[target1], n_classes)
        if k1 is not None:
            data_y1.append(_one_hot(k1, n_classes))
            if k1 < len(class_counts):
                class_counts[k1] += 1
        k2 = _class_index(meta[target2], n_classes)
        if k2 is not None:
            data_y2.append(_one_hot(k2, n_classes))

    validation_y1 = []
    validation_y2 = []
    for L in utterances_validation:
        meta = meta_data[L[1].strip()]
        k1 = _class_index(meta[target1], n_classes)
        if k1 is not None:
            validation_y1.append(_one_hot(k1, n_classes))
        k2 = _class_index(meta[target2], n_classes)
        if k2 is not None:
            validation_y2.append(_one_hot(k2, n_classes))

    # Change each sentence into a sequence of word ids, padded at the end
    # with 0s up to seq_max_len. Words below the minimum frequency also
    # become 0; that should not matter here because the sequence length is
    # fed explicitly (see
    # https://r2rt.com/recurrent-neural-networks-in-tensorflow-iii-variable-length-sequences.html).
    vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(config.seq_max_len,min_frequency=config.min_word_frequency)
    data_x = np.array(list(vocab_processor.fit_transform(sentences)))

    # Validation sentences are encoded with the vocabulary learned from the
    # training data: transform(), not fit_transform(), so the word-to-id
    # mapping is not re-fitted on the validation text.
    validation_x = np.array(list(vocab_processor.transform(sentences_validation)))

    # Vocabulary size, needed to specify the embedding dimensions in the RNN.
    vocab_dict = vocab_processor.vocabulary_._mapping
    vocab_size = len(vocab_dict)

    # Flatten each validation [text, ID] pair into a single 'text, ID' string.
    reshaped_utterances_validation = [", ".join(L) for L in utterances_validation]

    return data_x, data_y1, data_y2, sequence_length, vocab_size, IDs_training, class_counts[0], class_counts[1], class_counts[2], validation_x, validation_y1, validation_y2, sequence_length_validation, reshaped_utterances_validation

Now run this function to get the data

In [None]:
# load_data must run before the RNN is created: the vocabulary size it returns
# is needed to size the embedding matrix.
(dcaps_data, dcaps_label1, dcaps_label2, dcaps_sequence_lengths, vocab_size,
 dcaps_IDs, N_low, N_mid, N_high,
 validation_x, validation_y1, validation_y2,
 sequence_length_validation, utterances_validation) = load_data(target1=3, target2=5)

# Sanity check: print the length of every per-example collection so that
# mismatched sizes are easy to spot.
check_numbers = (
    dcaps_data, dcaps_label1, dcaps_label2, dcaps_sequence_lengths, dcaps_IDs,
    validation_x, validation_y1, validation_y2, utterances_validation,
    sequence_length_validation,
)
print('Check numbers:')
for collection in check_numbers:
    print(len(collection))

In [None]:
# Class counts returned by load_data (low / mid / high); they are reused later
# to derive the class weights for cost-sensitive learning.
print(N_low, N_mid, N_high)

In [None]:
'''
#plot out training data sequence lengths
plt.hist(dcaps_sequence_lengths, 50, normed=1, facecolor='green', alpha=0.75)
plt.xlabel('Sequence Length')
plt.ylabel('Probability')
plt.title('Distribution of truncated training sequence lengths')

plt.show()
'''

In [None]:
def write_debug_sentences(val_predictions1, valBatch_y1, iteration, val_probs1, valBatch_sentences):
    """Write every validation sentence to the log of its confusion-matrix cell.

    A directory ``sentence_validation<iteration>`` is created (if missing) and
    one file ``conf_<p><t>.log`` is opened per cell, where ``p`` is the
    predicted class + 1 and ``t`` is the true class + 1.  Each line written is
    ``<sentence>,<prob class 0>,<prob class 1>[,...]``.

    Args:
        val_predictions1: predicted class index for each example.
        valBatch_y1: one-hot (or score) label row per example; the true class
            is taken as its argmax.
        iteration: value appended to the output directory name.
        val_probs1: per-example class probabilities, indexable as
            ``val_probs1[i][k]``.
        valBatch_sentences: per-example string written at the start of a line.

    The number of classes is read from the module-level ``config.n_classes``.
    """
    n_classes = config.n_classes
    out_dir = 'sentence_validation' + str(iteration)
    # Tolerate re-runs: os.makedirs raises if the directory is already there.
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    files = []
    try:
        # Row-major order (prediction, then truth) so that the flat index
        # n_classes * prediction + truth selects the matching file.
        for pred_cls in range(n_classes):
            for true_cls in range(n_classes):
                name = 'conf_' + str(pred_cls + 1) + str(true_cls + 1) + '.log'
                files.append(open(out_dir + '/' + name, 'w'))

        for i, val in enumerate(val_predictions1):
            file_idx = n_classes * val + np.argmax(valBatch_y1[i])
            # Use the probabilities passed in by the caller, not a global.
            probs = ','.join(str(val_probs1[i][k]) for k in range(n_classes))
            files[file_idx].write(valBatch_sentences[i] + ',' + probs + '\n')
    finally:
        # Close whatever was opened, even if opening or writing failed midway.
        for thefile in files:
            thefile.close()

In [None]:
def write_debug_sentences_training(all_training_predictions1, all_training_y1, iteration, all_training_prob_predictions1, all_training_IDs):
    """Write every training example's ID to the log of its confusion-matrix cell.

    A directory ``sentence_training<iteration>`` is created (if missing) and
    one file ``conf_<p><t>.log`` is opened per cell, where ``p`` is the
    predicted class + 1 and ``t`` is the true class + 1.  Each line written is
    ``<ID>,<prob class 0>,<prob class 1>[,...]``.

    Args:
        all_training_predictions1: predicted class index for each example.
        all_training_y1: one-hot (or score) label row per example; the true
            class is taken as its argmax.
        iteration: value appended to the output directory name.
        all_training_prob_predictions1: per-example class probabilities,
            indexable as ``[i][k]``.
        all_training_IDs: per-example identifier, written via ``str()``.

    The number of classes is read from the module-level ``config.n_classes``.
    """
    n_classes = config.n_classes
    out_dir = 'sentence_training' + str(iteration)
    # Tolerate re-runs: os.makedirs raises if the directory is already there.
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    files = []
    try:
        # Every file lives in the same directory; row-major order (prediction,
        # then truth) matches the flat index n_classes * prediction + truth.
        for pred_cls in range(n_classes):
            for true_cls in range(n_classes):
                name = 'conf_' + str(pred_cls + 1) + str(true_cls + 1) + '.log'
                files.append(open(out_dir + '/' + name, 'w'))

        for i, val in enumerate(all_training_predictions1):
            file_idx = n_classes * val + np.argmax(all_training_y1[i])
            probs = ','.join(str(all_training_prob_predictions1[i][k]) for k in range(n_classes))
            # Same row layout for any class count: ID first, then probabilities.
            files[file_idx].write(str(all_training_IDs[i]) + ',' + probs + '\n')
    finally:
        # Close whatever was opened, even if opening or writing failed midway.
        for thefile in files:
            thefile.close()

In [None]:
#write_debug_sentences_training(all_training_predictions1 , all_training_y1, step*batch_siz, all_training_prob_predictions1, all_training_sentences)


Now that the data is loaded, begin building the RNN.

In [None]:
def dynamicRNN(x, seqlen, weights1, biases1,weights2, biases2, vocab_size, keep_prob):
    """Build the embedding + LSTM graph and return logits for both tasks.

    Args:
        x: integer tensor of word ids (assumed [batch, n_steps] -- confirm
            against the placeholder definition).
        seqlen: true (unpadded) length of each sequence in the batch.
        weights1, biases1: dicts whose 'out' entries form the output layer of
            task 1.
        weights2, biases2: dicts whose 'out' entries form the output layer of
            task 2.
        vocab_size: number of rows in the learned embedding matrix.
        keep_prob: dropout keep probability applied to cell inputs and outputs
            when ``config.keep_prob < 1``.

    Returns:
        A pair (logits_task1, logits_task2), both computed from the LSTM
        output at the last valid timestep of every sequence.
    """
    # Embeddings are learned from scratch; the lookup is pinned to the CPU
    # (see http://www.wildml.com/2015/12/implementing-a-cnn-for-text-classification-in-tensorflow/).
    with tf.device("/cpu:0"):
        # Matrix of shape vocab_size x embedding_size.
        embedding_mat = tf.Variable(tf.random_uniform([vocab_size, config.embedding_size], -1.0, 1.0))
        embedding_output = tf.nn.embedding_lookup(embedding_mat, x)
    # embedding_output has one embedding vector per word id in x.

    use_dropout = config.keep_prob < 1

    def _make_cell(**cell_kwargs):
        # One LSTM layer, wrapped with dropout only when it is enabled in config.
        cell = tf.contrib.rnn.BasicLSTMCell(config.n_hidden, **cell_kwargs)
        if use_dropout:
            cell = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=keep_prob, input_keep_prob=keep_prob)
        return cell

    if config.num_layers == 1:
        lstm_cell = _make_cell()
    else:
        # Build the stack once, with a fresh cell object per layer.
        layers = [_make_cell(reuse=tf.get_variable_scope().reuse) for _ in range(config.num_layers)]
        lstm_cell = tf.contrib.rnn.MultiRNNCell(layers, state_is_tuple = True)

    # Providing 'sequence_length' makes the computation dynamic: outputs past
    # each sequence's own length are zero vectors.
    outputs, states = tf.nn.dynamic_rnn(lstm_cell, embedding_output, dtype=tf.float32, sequence_length=seqlen)

    # Retrieve the last *dynamically computed* output of every sequence, i.e.
    # if a sequence length is 10 we need the 10th output.  Taking the final
    # padded timestep instead would return zeros for every shorter sequence.
    batch_size = tf.shape(outputs)[0]
    max_steps = tf.shape(outputs)[1]
    # Clamp at 0 so an empty sequence maps to a valid index.
    last_step = tf.maximum(tf.cast(seqlen, tf.int32) - 1, 0)
    # Flatten [batch, time, hidden] to [batch * time, hidden] and pick row
    # b * max_steps + last_step[b] for each batch element b.
    flat_outputs = tf.reshape(outputs, [-1, config.n_hidden])
    last = tf.gather(flat_outputs, tf.range(0, batch_size) * max_steps + last_step)

    return tf.matmul(last, weights1['out']) + biases1['out'], tf.matmul(last, weights2['out']) + biases2['out']

In [None]:
# Build the network. pred1 and pred2 hold the LOGITS for each class of the two
# tasks; they only become probabilities once fed into a softmax.
pred1, pred2 = dynamicRNN(
    x, seqlen,
    weights1, biases1,
    weights2, biases2,
    vocab_size, keep_prob,
)

In [None]:
# ---- Predictions and confusion matrices ------------------------------------
# Probabilities are kept as graph nodes so they can be fetched and analysed later.
prob1 = tf.nn.softmax(pred1)
predictions1 = tf.argmax(pred1, 1)  # index of the largest logit, i.e. the predicted class
labels1 = tf.argmax(y1, 1)          # index of the largest label entry, i.e. the expected class
# The arguments are swapped on purpose: we WANT each row to be a prediction and
# each column a true label.
conf_mat1 = tf.confusion_matrix(labels=predictions1, predictions=labels1)

prob2 = tf.nn.softmax(pred2)
predictions2 = tf.argmax(pred2, 1)
labels2 = tf.argmax(y2, 1)
conf_mat2 = tf.confusion_matrix(labels=predictions2, predictions=labels2)

# ---- Loss -------------------------------------------------------------------
# COST-SENSITIVE LEARNING: use when the class proportions are very unbalanced
# (under-sampling the majority class is another approach). Each example's loss
# is scaled by a weight attached to its true class, so mistakes on rare classes
# cost more; otherwise the most 'accurate' strategy on unbalanced data is to
# predict everything as the majority class.
# Example followed: https://github.com/jakeret/tf_unet/blob/master/tf_unet/unet.py#L191
if config.n_classes == 2:
    if config.balance_classes == True:
        # float() forces true division: under Python 2, int / int truncates.
        total_utterances = float(N_low + N_mid)
        prop_low = total_utterances / N_low  # weight = inverse class proportion
        prop_mid = total_utterances / N_mid
        classes_weights = tf.constant([prop_low, prop_mid], dtype=np.float32)
    else:
        classes_weights = tf.constant([1.0, 1.4], dtype=np.float32)
elif config.n_classes == 3:
    if config.balance_classes == True:
        total_utterances = float(N_low + N_mid + N_high)
        prop_low = total_utterances / N_low
        prop_mid = total_utterances / N_mid
        prop_high = total_utterances / N_high  # consider increasing the penalty, e.g. N_high*.7
        classes_weights = tf.constant([prop_low, prop_mid, prop_high], dtype=np.float32)
    else:
        classes_weights = tf.constant([1.2, 1.3, 1.5], dtype=np.float32)
else:
    # Fail with a clear message instead of a NameError on classes_weights below.
    raise ValueError('class weights are only defined for config.n_classes of 2 or 3')

#weighted_logits = tf.multiply(pred, classes_weights) #could use weighted_logits instead of weighted preds in cost function

# Only the task-1 loss (depression status, what we care about most) is weighted.
flat_pred = tf.reshape(pred1, [-1, config.n_classes])
flat_labels = tf.reshape(y1, [-1, config.n_classes])
weight_map1 = tf.multiply(flat_labels, classes_weights)
# Summing over the class axis leaves, per example, the weight of its true class
# (assuming one-hot labels).
weight_map = tf.reduce_sum(weight_map1, axis=1)

loss_map = tf.nn.softmax_cross_entropy_with_logits(labels=flat_labels, logits=flat_pred)  # per-example loss
weighted_cost1 = tf.multiply(loss_map, weight_map)  # rarer classes get a larger multiplier
cost1 = tf.reduce_mean(weighted_cost1)  # scalar: mean of the weighted losses

# L2 penalty on the two output layers; a config.L2_penalty of 0 disables it.
reg1 = tf.nn.l2_loss(weights1['out'])
reg2 = tf.nn.l2_loss(weights2['out'])

# Total cost: weighted task-1 loss + task-2 loss down-weighted by .6 + L2 terms.
cost = cost1 + .6*tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y2, logits=pred2)) + config.L2_penalty*reg1 + config.L2_penalty*reg2

# NON-COST-SENSITIVE alternative (kept for reference): replace cost1 with the
# unweighted task-1 loss:
#cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y1, logits=pred1)) + .6*tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y2, logits=pred2)) + config.L2_penalty*reg1 + config.L2_penalty*reg2

# ---- Optimizer --------------------------------------------------------------
optimizer = tf.train.RMSPropOptimizer(learning_rate=config.learning_rate).minimize(cost)

# ---- Evaluation -------------------------------------------------------------
#pred = tf.Print(pred, [tf.argmax(pred, 1), tf.argmax(y,1)], message='crisp_prediction', summarize=100)
correct_pred1 = tf.equal(predictions1, labels1)
accuracy1 = tf.reduce_mean(tf.cast(correct_pred1, tf.float32))

# Task-2 predictions must be scored against the task-2 labels (y2), not y1.
correct_pred2 = tf.equal(predictions2, labels2)
accuracy2 = tf.reduce_mean(tf.cast(correct_pred2, tf.float32))

# Initializing the variables
init = tf.global_variables_initializer() #tf.initialize_all_variables()

In [None]:
# Launch the graph: train on random mini-batches, periodically reporting batch
# metrics and evaluating on one fixed validation batch.
with tf.Session() as sess:
    sess.run(init)
    saver = tf.train.Saver() #for saving models at each step where validation accuracy is assessed
    step = 1
    moving_avg_loss = 0.0
    moving_avg_acc = 0.0
    moving_avg_started = False  # becomes True once the averages have been seeded
    discount = 0.01
    batch_siz = config.batch_size

    config.printConfiguration()
    print(len(dcaps_data), len(dcaps_label1), len(dcaps_sequence_lengths))

    # The validation batch is drawn once so every evaluation sees the same examples.
    valBatch_x, valBatch_y1,valBatch_y2, valBatch_seqlen, valBatch_sentences = random_batch(validation_x, validation_y1,validation_y2, sequence_length_validation, part_size=7500, sentences=utterances_validation) #sentences_validation
    # Dropout is switched off (keep_prob 1) for evaluation.
    val_feed = {x: valBatch_x, y1: valBatch_y1, y2: valBatch_y2,
                seqlen: valBatch_seqlen, keep_prob:1}

    # Keep training until reach max iterations
    while step * batch_siz < config.training_iters:
        batch_x, batch_y1,batch_y2, batch_seqlen, __ = random_batch(dcaps_data, dcaps_label1, dcaps_label2,dcaps_sequence_lengths, part_size=batch_siz)
        train_feed = {x: batch_x, y1: batch_y1, y2: batch_y2,
                      seqlen: batch_seqlen, keep_prob:config.keep_prob}

        #Run optimization op (backprop)
        sess.run(optimizer, feed_dict=train_feed)
        if step % config.display_step == 0:
            # Fetch batch accuracy and loss in a single run so both come from
            # the same forward pass (and the same dropout mask).
            acc, loss = sess.run([accuracy1, cost], feed_dict=train_feed)

            # Seed the moving averages with the first displayed values. Testing
            # `step == 1` here would only ever fire when display_step is 1,
            # leaving the averages to crawl up from 0.0 otherwise.
            if not moving_avg_started:
                moving_avg_loss = loss
                moving_avg_acc = acc
                moving_avg_started = True
            else:
                moving_avg_loss = (1.0-discount) * moving_avg_loss + discount * loss
                moving_avg_acc = (1.0-discount) * moving_avg_acc + discount * acc
            print("Iter " + str(step*batch_siz) + ", Minibatch Loss= " + \
                "{:.6f}".format(loss) + ", Training Accuracy= " + \
                "{:.5f}".format(acc)  + ", Moving Average Loss= " + \
                "{:.6f}".format(moving_avg_loss) + ", Moving Average Acc= " + \
                "{:.5f}".format(moving_avg_acc))
        if step % config.val_step == 0:
            # One forward pass yields every validation quantity needed below.
            val_acc1, val_acc2, val_conf_mat1, val_predictions1, val_prob_predictions1 = sess.run(
                [accuracy1, accuracy2, conf_mat1, predictions1, prob1], feed_dict=val_feed)

            print ("Testing Accuracy:", \
                val_acc1)
            print ("Testing Accuracy2:", \
                val_acc2)
            print ("Test Confmatrix: \n", \
                val_conf_mat1)

            if config.debug_sentences:
                write_debug_sentences(val_predictions1, valBatch_y1, step*batch_siz, val_prob_predictions1, valBatch_sentences)

                # Score the whole training set too; only IDs (not full
                # sentences) are written, to save computational time.
                all_training_x, all_training_y1,all_training_y2, all_training_seqlen, all_training_IDs = random_batch(dcaps_data, dcaps_label1, dcaps_label2,dcaps_sequence_lengths, part_size=len(dcaps_data), sentences= dcaps_IDs)
                all_training_feed = {x: all_training_x, y1: all_training_y1, y2: all_training_y2,
                                     seqlen: all_training_seqlen, keep_prob:1}
                # Predicted classes and probabilities for all training examples, in one pass.
                all_training_predictions1, all_training_prob_predictions1 = sess.run(
                    [predictions1, prob1], feed_dict=all_training_feed)
                write_debug_sentences_training(all_training_predictions1 , all_training_y1, step*batch_siz, all_training_prob_predictions1, all_training_IDs)
            #save a model
            #saver.save(sess, 'my_test_model',global_step=step)
        step += 1
    print ("Optimization Finished!")