# Measuring Engagement and Satisfaction in Online Mental Health Platform Conversations

## Data preprocessing

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from math import log10, floor, ceil
import math
import nltk
import string
from nltk.tokenize import sent_tokenize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import spacy
from spacy.matcher import PhraseMatcher
import joblib
import sklearn.externals
import joblib
from profanity_check import predict, predict_prob
import sarcastic

In [7]:
# VADER analyzer shared by the sentence-level sentiment scoring further down
sid = SentimentIntensityAnalyzer()

# spaCy English pipeline; its vocab backs the PhraseMatcher lookups below
nlp = spacy.load('en_core_web_sm')

# Silence pandas' chained-assignment warning (helpers below assign into filtered frames)
pd.set_option('mode.chained_assignment',None)

# Display long column text
pd.options.display.max_colwidth = 100

In [8]:
# Load the annotated dialogues and rename the space-separated column headers
# to snake_case so the rest of the notebook can refer to them consistently.
df = pd.read_csv("data/RED/annotated/100_annotated_dialogues.csv")
df = df.rename(columns={'conversation id': 'conversation_id', 'post title': 'post_title', 'dialog turn': 'dialog_turn', 'emotion prediction': 'emotion_prediction'})

In [9]:
### --- CLEANING OUT CONVERSATIONS WITH ONLY 1 OR 2 TURN(S) AND/OR WITH A SINGLE AUTHOR --- ###
# NOTE: everything below is wrapped in a triple-quoted string, so none of it
# executes; `df` is NOT filtered by this cell.
'''
# Group data by conversation id and calculate count of each conversation id
df_conv_len = df.groupby("conversation_id").count()
df_conv_len = df_conv_len.drop(columns=["subreddit", "post_title", "author", "text", "compound", "sentiment", "emotion_prediction"])
print("Number of conversations in subreddit: ", len(df_conv_len))

# Separate conversation id's with a single occurrence as monologues
df_mono = df_conv_len[df_conv_len["dialog_turn"] == 1]
#print("Number of conversations with a single turn in subreddit: ", len(df_mono))
df_mono_ids = df_mono.reset_index()
df_mono_ids = df_mono_ids["conversation_id"]

# Separate conversation id's with multiple occurrences as dialogues
df_dia = df_conv_len[df_conv_len["dialog_turn"] > 2]
print("Number of conversations longer than 2 turns in subreddit: ", len(df_dia))
df_dia = df_dia.reset_index()
df_dia = df_dia.drop(columns=['dialog_turn'])

# Join dialogue conversation id's with original data such that only dialogues remain in the dataset
df = df.join(df_dia.set_index('conversation_id'), on='conversation_id', how="right") 

# Separate conversations that have more than one author
df_conv_authors = df.groupby("conversation_id")["author"].unique().reset_index()
df_conv_authors["author"] = df_conv_authors["author"].apply(lambda x: x.size)
df_conv_authors = df_conv_authors[df_conv_authors["author"] > 1]
df_conv_authors = df_conv_authors.drop(columns=['author'])

# Join dialogue conversation id's with original data such that only conversations that have more than one author remain in the dataset
df = df.join(df_conv_authors.set_index('conversation_id'), on='conversation_id', how="right") 
print("Number of conversations longer than 2 turns with more than a single author in subreddit: ", len(df_conv_authors))

### ---------------------------------------------- ###
'''

'\n# Group data by conversation id and calculate count of each conversation id\ndf_conv_len = df.groupby("conversation_id").count()\ndf_conv_len = df_conv_len.drop(columns=["subreddit", "post_title", "author", "text", "compound", "sentiment", "emotion_prediction"])\nprint("Number of conversations in subreddit: ", len(df_conv_len))\n\n# Separate conversation id\'s with a single occurrence as monologues\ndf_mono = df_conv_len[df_conv_len["dialog_turn"] == 1]\n#print("Number of conversations with a single turn in subreddit: ", len(df_mono))\ndf_mono_ids = df_mono.reset_index()\ndf_mono_ids = df_mono_ids["conversation_id"]\n\n# Separate conversation id\'s with multiple occurrences as dialogues\ndf_dia = df_conv_len[df_conv_len["dialog_turn"] > 2]\nprint("Number of conversations longer than 2 turns in subreddit: ", len(df_dia))\ndf_dia = df_dia.reset_index()\ndf_dia = df_dia.drop(columns=[\'dialog_turn\'])\n\n# Join dialogue conversation id\'s with original data such that only dialogues rem

In [10]:
def round_sig(x, sig=2):
    """Round ``x`` to ``sig`` significant figures (default 2).

    Parameters
    ----------
    x : int or float
        Value to round.
    sig : int, optional
        Number of significant figures to keep.

    Returns
    -------
    int or float
        The rounded value. Zero, NaN and +/-inf have no leading significant
        digit (``log10`` is undefined or non-finite for them), so they are
        returned unchanged instead of ``None`` / raising.
    """
    if x == 0 or not math.isfinite(x):
        return x
    # Position of the leading digit decides how many decimals survive
    return round(x, sig - int(floor(log10(abs(x)))) - 1)

## Extracting conversation, speaker, and listener

In [11]:
def extract_responses(conversation_id, subreddit=None):
    """Pull one dyadic conversation out of the global ``df``.

    Parameters
    ----------
    conversation_id
        Value matched against ``df["conversation_id"]``.
    subreddit : optional
        Value matched against ``df["subreddit"]``. When omitted, rows are
        selected by ``conversation_id`` alone, so the id must be unambiguous.

    Returns
    -------
    tuple
        ``(conversation, speaker, listener, speaker_responses,
        listener_responses, num_speaker_responses, num_listener_responses)``
        where ``speaker`` is the author of the first row and ``listener`` is
        the single other author.

    Raises
    ------
    IndexError
        If no row matches the selection.
    ValueError
        If the conversation does not have exactly one author besides the
        speaker.
    """
    mask = df["conversation_id"] == conversation_id
    if subreddit is not None:
        mask = mask & (df["subreddit"] == subreddit)
    conversation = df[mask].reset_index(drop=True)

    if conversation.empty:
        raise IndexError(
            "no rows for conversation_id={!r}, subreddit={!r}".format(conversation_id, subreddit)
        )

    # The author of the opening turn is treated as the speaker
    speaker = conversation["author"].iloc[0]
    other_authors = conversation.loc[conversation["author"] != speaker, "author"].unique()
    if len(other_authors) != 1:
        raise ValueError(
            "conversation {!r} has {} authors besides the speaker; expected exactly 1".format(
                conversation_id, len(other_authors)
            )
        )
    listener = other_authors[0]

    speaker_responses = conversation[conversation["author"] == speaker]
    listener_responses = conversation[conversation["author"] == listener]
    num_speaker_responses = len(speaker_responses)
    num_listener_responses = len(listener_responses)
    # TOIMPROVE: you can only keep conversation, speaker, and listener (because others are dynamic and can be calculated from these 3)
    return conversation, speaker, listener, speaker_responses, listener_responses, num_speaker_responses, num_listener_responses

## Measuring the level of engagement

In [12]:
def calculate_speaker_token_length(conversation, speaker):
    """Count word tokens in every speaker turn except the first row.

    Adds a ``token_length`` column to ``conversation`` (NaN for rows that are
    not counted) and returns the frame together with the sum of the counts.

    Tokens made up solely of punctuation characters (e.g. ``...``, `` `` ``,
    ``''``) are not counted. Rows are addressed by position throughout, so the
    function does not depend on the frame having a 0..n-1 index.

    Parameters
    ----------
    conversation : pandas.DataFrame
        Must contain ``author`` and ``text`` columns.
    speaker
        Author value identifying the speaker.

    Returns
    -------
    tuple
        ``(conversation, sum_token_length)``.
    """
    # Create an empty dataframe column 'token_length'
    conversation.loc[:, 'token_length'] = np.nan

    punctuation_chars = set(string.punctuation)
    sum_token_length = 0

    # Start at 1: the opening post is excluded from the count
    for i in range(1, len(conversation)):
        if conversation['author'].iloc[i] != speaker:
            continue

        tokens = nltk.word_tokenize(conversation['text'].iloc[i])
        # Drop tokens that consist only of punctuation characters
        words = [tok for tok in tokens if not all(ch in punctuation_chars for ch in tok)]

        conversation.at[conversation.index[i], 'token_length'] = len(words)
        sum_token_length += len(words)

    return conversation, sum_token_length

In [13]:
def merge_consecutive_speaker_responses(conversation, speaker, listener):
    """Merge back-to-back speaker rows whose ``dialog_turn`` values are adjacent.

    When rows ``i`` and ``i+1`` are both by ``speaker`` and the second turn
    number is exactly one more than the first, the second text is appended to
    the first (separated by a space) and the second row is removed. The frame
    is modified in place and re-indexed 0..n-1.

    Only the merged row's text is rewritten and only row ``i+1`` is dropped,
    so other rows that happen to share the same text or the same
    ``dialog_turn`` value are left untouched.

    Parameters
    ----------
    conversation : pandas.DataFrame
        Must contain ``author``, ``dialog_turn`` and ``text`` columns.
    speaker, listener
        Author values identifying the two participants.

    Returns
    -------
    tuple
        ``(conversation, num_speaker_responses, num_listener_responses)``
        counted after merging.
    """
    text_col = conversation.columns.get_loc('text')
    i = 0

    # len(conversation) shrinks as rows are merged, so re-evaluate each pass
    while i < (len(conversation) - 1):
        authors = conversation['author']
        turns = conversation['dialog_turn']
        both_by_speaker = authors.iloc[i] == speaker and authors.iloc[i + 1] == speaker

        if both_by_speaker and (turns.iloc[i] + 1) == turns.iloc[i + 1]:
            merged_text = conversation['text'].iloc[i] + " " + conversation['text'].iloc[i + 1]

            # Write the merged text into row i only (positional, no value lookup)
            conversation.iloc[i, text_col] = merged_text

            # Remove exactly the absorbed row, then restore a 0..n-1 index
            conversation.drop(conversation.index[i + 1], inplace=True)
            conversation.reset_index(drop=True, inplace=True)

        i += 1

    num_speaker_responses = len(conversation[conversation["author"] == speaker])
    num_listener_responses = len(conversation[conversation["author"] == listener])

    return conversation, num_speaker_responses, num_listener_responses

In [14]:
def is_interleaved_conversation(speaker, listener, conversation):
    """Tell whether speaker and listener strictly alternate, speaker first.

    Returns True when every row at a 0-based even position (0, 2, 4, ...) is
    authored by ``speaker`` and every row at an odd position (1, 3, 5, ...) is
    authored by ``listener``; otherwise returns False.
    """
    authors = conversation['author'].tolist()

    speaker_slots_ok = all(author == speaker for author in authors[0::2])
    if not speaker_slots_ok:
        return False

    return all(author == listener for author in authors[1::2])

In [15]:
def calculate_engagement_score(num_turns, interleaved, sum_token_length, diff,
                               num_turns_weight=1, interleaved_weight=1,
                               token_length_weight=0.05, diff_weight=-1,
                               max_token_length=30):
    """Combine the engagement features into one weighted score.

    Parameters
    ----------
    num_turns : int
        Number of turns in the conversation.
    interleaved : bool
        Whether speaker and listener alternate; contributes +1 when true and
        -1 when false (before weighting).
    sum_token_length : int
        Total speaker token count; capped at ``max_token_length`` so very long
        replies cannot dominate the score.
    diff : int
        Absolute difference between speaker and listener turn counts.
    num_turns_weight, interleaved_weight, token_length_weight, diff_weight : float, optional
        Weights of the four terms. Defaults reproduce the fixed weights this
        function used before they were exposed.
    max_token_length : int, optional
        Upper limit applied to ``sum_token_length``.

    Returns
    -------
    float
        The weighted engagement score.
    """
    # Set an upper limit to token length's effect on engagement
    token_length_score = min(sum_token_length, max_token_length)

    # Convert boolean to a signed indicator so a non-interleaved conversation
    # is penalised rather than merely contributing nothing
    interleaved_int = 1 if interleaved else -1

    return (num_turns_weight * num_turns
            + interleaved_weight * interleaved_int
            + token_length_weight * token_length_score
            + diff_weight * diff)

In [16]:
def classify_engagement(speaker, listener, conversation):
    """Score a conversation's engagement and turn the score into a 0/1 label.

    Consecutive speaker turns are merged first, then speaker token counts are
    added, and the resulting features are passed to
    ``calculate_engagement_score``. The label is 1 when the score reaches the
    fixed threshold of 4, otherwise 0.

    Returns
    -------
    tuple
        ``(num_turns, interleaved, sum_token_length, round_sig(diff),
        round_sig(engagement_score), engagement, conversation, threshold)``.
    """
    threshold = 4

    conversation, speaker_turn_count, listener_turn_count = merge_consecutive_speaker_responses(
        conversation, speaker, listener
    )
    conversation, sum_token_length = calculate_speaker_token_length(conversation, speaker)

    # Imbalance between the two participants, irrespective of direction
    diff = abs(speaker_turn_count - listener_turn_count)
    interleaved = is_interleaved_conversation(speaker, listener, conversation)
    num_turns = len(conversation)

    engagement_score = calculate_engagement_score(num_turns, interleaved, sum_token_length, diff)
    engagement = 1 if engagement_score >= threshold else 0

    return (num_turns, interleaved, sum_token_length, round_sig(diff),
            round_sig(engagement_score), engagement, conversation, threshold)

## Measuring the level of satisfaction

In [17]:
def plot_emotion_sentiment(conversation, speaker):
    """Scatter the speaker's strongest sentence compound against dialog turn.

    Points are coloured by the ``sentiment`` column and shaped by the
    ``emotion_prediction`` column; only rows authored by ``speaker`` are drawn.
    """
    sns.set_theme(style="white")
    speaker_rows = conversation[conversation["author"] == speaker]

    grid = sns.relplot(
        data=speaker_rows,
        x="dialog_turn",
        y="strongest_compound",
        hue="sentiment",
        style="emotion_prediction",
        palette="Set1",
        s=200,
    )
    figure = grid.fig
    figure.suptitle('Sentimental and Emotional Shift in Speaker Responses with Conversation Progression', fontsize=16)
    # Leave headroom so the suptitle does not overlap the axes
    figure.subplots_adjust(top=0.9)
    plt.show()

In [18]:
def get_emotion_prediction(speaker_responses):
    """Return the ``emotion_prediction`` column and its last entry."""
    emotion_column = speaker_responses["emotion_prediction"]
    return emotion_column, emotion_column.iloc[-1]

In [19]:
def get_sentiment(speaker_responses):
    """Return the ``sentiment`` column and its last entry."""
    sentiment_column = speaker_responses["sentiment"]
    return sentiment_column, sentiment_column.iloc[-1]

In [20]:
def is_tagged_grateful_positive(speaker_responses):
    """Return True when the last response is tagged positive and grateful.

    Looks only at the final row: its ``sentiment`` must equal ``"positive"``
    and its ``emotion_prediction`` must equal ``"grateful"``.
    """
    final_sentiment = get_sentiment(speaker_responses)[1]
    final_emotion = get_emotion_prediction(speaker_responses)[1]

    if final_sentiment != "positive":
        return False
    if final_emotion != "grateful":
        return False
    return True

In [21]:
def is_toward_listener(speaker_response):
    """Return True when the text contains the token 'you' or 'your'.

    The phrases are matched with a spaCy ``PhraseMatcher`` built with its
    default settings; the only caller visible in this notebook passes
    lower-cased sentences.

    Parameters
    ----------
    speaker_response : str
        Text (typically one sentence) to inspect.

    Returns
    -------
    bool
        True if at least one phrase matched, False otherwise (never ``None``).
    """
    phrase_matcher = PhraseMatcher(nlp.vocab)
    phrases = ['you', 'your']
    patterns = [nlp(text) for text in phrases]
    # NOTE(review): `add(key, None, *patterns)` looks like the spaCy v2 call
    # form - confirm it matches the installed spaCy version.
    phrase_matcher.add('toward_listener', None, *patterns)
    sentence = nlp(speaker_response)
    matched_phrases = phrase_matcher(sentence)

    return len(matched_phrases) > 0

In [22]:
def contains_profanity(conversation, speaker):
    """Return True if a later speaker sentence is profane and addresses the listener.

    Every row after the first that is authored by ``speaker`` is scanned
    sentence by sentence (``sentences`` column). A sentence counts when
    ``predict`` flags it with 1 and ``is_toward_listener`` returns True.
    """
    for turn_idx in range(1, len(conversation)):
        if conversation['author'].iloc[turn_idx] != speaker:
            continue

        sentence_count = len(conversation['sentences'].iloc[turn_idx])
        for sent_idx in range(0, sentence_count):
            # turn_idx'th dialogue turn, sent_idx'th sentence
            sentence = conversation['sentences'][turn_idx][sent_idx]
            if predict([sentence]) == 1 and is_toward_listener(sentence) == True:
                # uncomment to print the sentence that contains profanity
                #print(sentence)
                return True

    return False

In [23]:
def contains_gratitude(conversation, speaker):
    """Return True if a later speaker response contains a gratitude phrase.

    Speaker rows whose ``dialog_turn`` is not 1 are rendered to one
    lower-cased string and searched with a spaCy ``PhraseMatcher``.

    Parameters
    ----------
    conversation : pandas.DataFrame
        Must contain ``author``, ``dialog_turn`` and ``text`` columns.
    speaker
        Author value identifying the speaker.

    Returns
    -------
    bool
    """
    # Take the speaker responses except first one
    speaker_responses = conversation[conversation['author'] == speaker]
    speaker_responses = speaker_responses[speaker_responses['dialog_turn'] != 1]
    speaker_responses = speaker_responses['text']
    # NOTE(review): to_string() renders the Series including its index labels;
    # confirm that is acceptable input for the matcher.
    speaker_responses = speaker_responses.to_string()[1:].lower()

    phrase_matcher = PhraseMatcher(nlp.vocab)

    # 'concern' and 'your help' are separate phrases; without the comma between
    # them Python concatenates the literals into 'concernyour help'.
    phrases = ['thank', 'means a lot to me', 'thanks', 'appreciate', 'support', 'concern',
               'your help', 'means so much to me', 'grateful', 'kind of you', 'repay you',
               'taking the time']

    patterns = [nlp(text) for text in phrases]
    phrase_matcher.add('gratitude', None, *patterns)
    sentence = nlp(speaker_responses)
    matched_phrases = phrase_matcher(sentence)

    # uncomment this part if you want to print the matched phrases
    #for match_id, start, end in matched_phrases:
        #string_id = nlp.vocab.strings[match_id]
        #span = sentence[start:end]
        #print(match_id, string_id, start, end, span.text)

    return len(matched_phrases) > 0

In [24]:
def contains_sarcasm(conversation, speaker, tokenizer, model):
    """Return True if any later speaker response is scored as sarcastic.

    The ``sentences`` entries of speaker rows whose ``dialog_turn`` is not 1
    are handed to ``sarcastic.proba`` together with ``tokenizer`` and
    ``model``; the result is True when any returned value exceeds 0.6.

    Approach reference:
    https://www.linkedin.com/pulse/you-being-sarcastic-deep-learning-answers-code-ibrahim-sobh-phd/?articleId=6662861432498987008
    """
    # Speaker rows only, skipping the opening turn
    later_speaker_rows = (conversation['author'] == speaker) & (conversation['dialog_turn'] != 1)
    speaker_sentences = conversation.loc[later_speaker_rows, 'sentences']

    sarcastic_probas = sarcastic.proba(speaker_sentences, tokenizer, model)
    #print(sarcastic_probas)

    # Can be optimized
    return bool((sarcastic_probas > 0.6).any())

In [25]:
def contains_disagreement(conversation, speaker):
    """Return True if a later speaker response contains a disagreement phrase.

    Speaker rows whose ``dialog_turn`` is not 1 are rendered to one
    lower-cased string and searched with a spaCy ``PhraseMatcher`` for a fixed
    list of disagreement expressions.
    """
    # Speaker rows only, skipping the opening turn
    later_speaker_rows = (conversation['author'] == speaker) & (conversation['dialog_turn'] != 1)
    response_text = conversation.loc[later_speaker_rows, 'text'].to_string()[1:].lower()

    disagreement_phrases = [
        "i don't think so",
        "no way",
        "disagree",
        "i beg to differ",
        "i'd say the exact opposite",
        "not necessarily",
        "that's not always true",
        "that's not always the case",
        "i'm not so sure about that",
        "that doesn’t make much sense to me",
        "i don’t share your view",
        "i don’t agree with you",
    ]

    matcher = PhraseMatcher(nlp.vocab)
    matcher.add('disagreement', None, *[nlp(phrase) for phrase in disagreement_phrases])
    doc = nlp(response_text)
    matches = matcher(doc)

    # uncomment this part if you want to print the matched phrases
    #for match_id, start, end in matches:
        #string_id = nlp.vocab.strings[match_id]
        #span = doc[start:end]
        #print(match_id, string_id, start, end, span.text)

    return len(matches) > 0

In [26]:
def sentence_level_sentiment(conversation):
    """Add per-sentence text and VADER compound scores to ``conversation``.

    Two columns are written:

    * ``sentences`` - the sentences of ``text`` (via ``sent_tokenize``),
      lower-cased;
    * ``sentence_compounds`` - a float array per row holding the ``compound``
      score of each sentence, computed on the original-case sentences.

    Each row's text is tokenised once and whole columns are assigned, so the
    result does not rely on chained ``conversation[col][i] = ...`` writes or
    on the frame having a 0..n-1 index. A row with no sentences gets an empty
    float array.

    Parameters
    ----------
    conversation : pandas.DataFrame
        Must contain a ``text`` column of strings.

    Returns
    -------
    pandas.DataFrame
        The same frame with the two columns added.
    """
    raw_sentences = conversation['text'].apply(sent_tokenize)

    conversation['sentences'] = raw_sentences.apply(
        lambda sentences: [sentence.lower() for sentence in sentences]
    )
    # Sentiment compound for each sentence of each dialogue turn
    conversation['sentence_compounds'] = raw_sentences.apply(
        lambda sentences: np.array(
            [sid.polarity_scores(sentence)['compound'] for sentence in sentences],
            dtype=float,
        )
    )

    return conversation

In [27]:
def strongest_sentiment(conversation):
    """Add ``strongest_compound``: the largest-magnitude sentence compound per row.

    For each row's ``sentence_compounds`` array the minimum is chosen when its
    absolute value equals the largest absolute value in the array (so a tie
    between a negative and a positive score resolves to the negative one);
    otherwise the maximum is chosen.
    """
    def _dominant(compounds):
        lowest = np.min(compounds)
        if np.max(abs(compounds)) == abs(lowest):
            return lowest
        return np.max(compounds)

    conversation['strongest_compound'] = conversation['sentence_compounds'].apply(_dominant)

    return conversation

In [28]:
# TODO IN LATER WEEKS?
# Unfinished draft: intended to store one predicted emotion per sentence in a
# 'sentence_level_emotion_prediction' column.
# NOTE(review): as written this looks non-functional - `.swifter` is accessed on
# a single sentence string, and neither `swifter` nor `emobert_predict_emotion`
# is imported/defined in the visible part of this notebook. Confirm before use.
def emobert_predict_sentence_emotion(conversation):
    # Start from a copy of the sentence lists; entries are overwritten below
    conversation['sentence_level_emotion_prediction'] = conversation['sentences']
    
    for i in range(0,len(conversation)):
        num_sentences = len(sent_tokenize(conversation['text'].iloc[i]))
        # emotion for each sentence
        # NOTE(review): np.zeros yields a float array - presumably unsuitable if
        # the predictor returns string labels; verify.
        emotions = np.zeros(num_sentences) 
        for j in range(0,num_sentences):
            # i'th dialogue turn, j'th sentence
            emotions[j] = sent_tokenize(conversation['text'][i])[j].swifter.apply(emobert_predict_emotion)
            conversation['sentence_level_emotion_prediction'][i] = emotions
            
    return conversation

In [29]:
def calculate_satisfaction_score(slope, sentiment_change, grateful_bonus, profanity_penalty, sarcasm_penalty, disagreement_penalty):
    """Return the weighted sum of the six satisfaction features.

    Weights: slope 1, sentiment change 1, grateful bonus 1.5, profanity
    penalty 1, sarcasm penalty 1, disagreement penalty 1.5. Penalties are
    expected to arrive already signed (they are added, not subtracted).

    Only the score is returned; no decision threshold is part of the result.
    """
    sentiment_part = 1 * slope + 1 * sentiment_change
    gratitude_part = 1.5 * grateful_bonus
    profanity_part = 1 * profanity_penalty
    sarcasm_part = 1 * sarcasm_penalty
    disagreement_part = 1.5 * disagreement_penalty

    return sentiment_part + gratitude_part + profanity_part + sarcasm_part + disagreement_part

In [30]:
def classify_satisfaction(conversation, speaker, tokenizer, model):
    """Score the speaker's satisfaction and turn the score into a 0/1 label.

    Sentence-level sentiment columns are added to ``conversation`` first. The
    score combines the slope of the speaker's ``compound`` values over
    ``dialog_turn``, the change in ``strongest_compound`` between the
    speaker's first and last response, a gratitude bonus, and penalties for
    profanity, sarcasm and disagreement. The score is rounded up to an integer
    and compared with a fixed threshold of 2.

    Parameters
    ----------
    conversation : pandas.DataFrame
        One conversation; must contain ``author``, ``text``, ``dialog_turn``,
        ``compound``, ``sentiment`` and ``emotion_prediction`` columns.
    speaker
        Author value identifying the speaker.
    tokenizer, model
        Passed through unchanged to ``contains_sarcasm``.

    Returns
    -------
    tuple
        ``(round_sig(slope), round_sig(sentiment_change), grateful_bonus,
        profanity_penalty, sarcasm_penalty, disagreement_penalty,
        satisfaction_score, satisfaction, threshold)``.
    """
    # calculate_satisfaction_score returns only the score, so the decision
    # threshold is defined here.
    threshold = 2

    conversation = sentence_level_sentiment(conversation)
    conversation = strongest_sentiment(conversation)
    speaker_responses = conversation[conversation["author"] == speaker]

    grateful_bonus = 0
    profanity_penalty = 0
    sarcasm_penalty = 0
    disagreement_penalty = 0

    # Change in sentiment from the first to the last turn
    sentiment_change = speaker_responses['strongest_compound'].iloc[-1] - speaker_responses['strongest_compound'].iloc[0]

    # Slope of the compounds of speaker responses; a line cannot be fitted
    # through fewer than two points, so treat that case as "no trend".
    if len(speaker_responses) >= 2:
        slope = np.polyfit(speaker_responses['dialog_turn'], speaker_responses['compound'], deg=1)[0]
    else:
        slope = 0.0

    if is_tagged_grateful_positive(speaker_responses) == True or contains_gratitude(speaker_responses, speaker) == True:
        grateful_bonus = 1

    if contains_profanity(conversation, speaker) == True:
        profanity_penalty = -1

    if contains_sarcasm(conversation, speaker, tokenizer, model) == True:
        sarcasm_penalty = -1

    if contains_disagreement(conversation, speaker) == True:
        disagreement_penalty = -1

    satisfaction_score = math.ceil(
        calculate_satisfaction_score(slope, sentiment_change, grateful_bonus,
                                     profanity_penalty, sarcasm_penalty, disagreement_penalty)
    )

    satisfaction = 1 if satisfaction_score >= threshold else 0

    return (round_sig(slope), round_sig(sentiment_change), grateful_bonus, profanity_penalty,
            sarcasm_penalty, disagreement_penalty, satisfaction_score, satisfaction, threshold)

## Testing the measures on a few dialogues

In [31]:
# Train sarcastic model once
# The resulting tokenizer/model pair is passed to classify_satisfaction by the
# test helpers below.
tokenizer, model = sarcastic.train()

In [32]:
def test_examples(conversation_id, subreddit=None):
    """Run both classifiers on one conversation, plot it and print a summary.

    Parameters
    ----------
    conversation_id
        Value matched against ``df["conversation_id"]``.
    subreddit : optional
        Subreddit of the conversation. When omitted it is looked up in ``df``
        and must be unique for the given id.

    Returns
    -------
    pandas.DataFrame
        The conversation with the columns added by the classifiers.

    Raises
    ------
    ValueError
        If ``subreddit`` is omitted and the id maps to zero or several
        subreddits.
    """
    if subreddit is None:
        # extract_responses needs the subreddit, so resolve it from the data
        candidates = df.loc[df["conversation_id"] == conversation_id, "subreddit"].unique()
        if len(candidates) != 1:
            raise ValueError(
                "conversation_id {!r} maps to {} subreddits; pass subreddit explicitly".format(
                    conversation_id, len(candidates)
                )
            )
        subreddit = candidates[0]

    conversation, speaker, listener, *_unused = extract_responses(conversation_id, subreddit)
    turns, interleaved, num_speak_tok, diff, eng_score, eng, conversation, thres_e = classify_engagement(speaker, listener, conversation)
    slope, change, grateful, profanity, sarcasm, disagree, sat_score, sat, thres_s = classify_satisfaction(conversation, speaker, tokenizer, model)

    plot_emotion_sentiment(conversation, speaker)

    print("Number of turns: ", turns, ", interleaved: ", interleaved, ", number of speaker tokens: ",  num_speak_tok,
          ", speaker-listener turn diff: ",  diff, ", threshold: ", thres_e, ", ENGAGEMENT SCORE: ",  eng_score, ", ENGAGEMENT: ",  eng)

    print("Sentiment slope: ", slope, ", sentiment change: ", change, ", grateful bonus: ", grateful,
          ", profanity penalty: ", profanity, ", sarcasm penalty: ", sarcasm, ", disagreement penalty: ", disagree,
          ", threshold: ", thres_s, ", SATISFACTION SCORE: ", sat_score, ", SATISFACTION : ", sat)

    print("Classification summary: ", eng, ", ", sat)

    return conversation

Highly engaging, highly satisfying:

In [33]:
#test_examples(1732) 

Highly engaging, highly satisfying (profanity & gratitude):

In [34]:
#test_examples(1003) 

Highly engaging, less satisfying:

In [35]:
#test_examples(854) 

Less engaging, highly satisfying:

In [36]:
#test_examples(35) 

Highly engaging, less satisfying (sarcasm):

In [37]:
#test_examples(24)

## Testing on the randomly selected 100 dialogues from 8 subreddits

In [38]:
def performance(TP,FP,TN,FN):
    """Compute precision, recall, F1 and accuracy from confusion-matrix counts.

    Any metric whose denominator is zero (e.g. precision when nothing was
    predicted positive) is reported as 0.0 instead of raising
    ``ZeroDivisionError``.

    Parameters
    ----------
    TP, FP, TN, FN : int
        True-positive, false-positive, true-negative and false-negative counts.

    Returns
    -------
    tuple of float
        ``(precision, recall, f1, accuracy)``.
    """
    def _ratio(numerator, denominator):
        # Undefined ratios are reported as 0.0
        return numerator / denominator if denominator else 0.0

    P = _ratio(TP, TP + FP)
    R = _ratio(TP, TP + FN)
    f1 = _ratio(2 * P * R, P + R)
    acc = _ratio(TP + TN, TP + TN + FP + FN)

    return P, R, f1, acc

In [44]:
# Candidate values for tuning the satisfaction score weights and threshold.
# NOTE(review): every value is a string, not a number - presumably these need
# converting with float() before being used as weights; confirm.
param_grid_satisfaction = {'slope_weight': ['0.2', '0.4', '0.6', '0.8', '1.0'],
                          'sentiment_change_weight': ['0.2', '0.4', '0.6', '0.8', '1.0'],
                          'grateful_bonus_weight': ['0.2', '0.4', '0.6', '0.8', '1.0'], 
                          'profanity_penalty_weight': ['0.2', '0.4', '0.6', '0.8', '1.0'],
                          'sarcasm_penalty_weight': ['0.2', '0.4', '0.6', '0.8', '1.0'],
                          'disagreement_penalty_weight': ['0.2', '0.4', '0.6', '0.8', '1.0'],
                          'sat_threshold': ['1.25', '1.5', '1.75', '2.0', '2.25']}

# Candidate values for tuning the engagement score weights and threshold
# (same caveat: values are strings).
param_grid_engagement = {'num_turns_weight': ['0.2', '0.4', '0.6', '0.8', '1.0'],
                        'interleaved_weight': ['0.2', '0.4', '0.6', '0.8', '1.0'],
                        'token_length_weight': ['0.02', '0.04', '0.06', '0.08', '0.10'],
                        'diff_weight': ['-0.2', '-0.4', '-0.6', '-0.8', '-1.0'],
                        'eng_threshold': ['3.25', '3.5', '3.75', '4.0', '4.25']}

In [None]:
# Group keys are (conversation_id, subreddit) pairs; test() iterates over them
grouped = df.groupby(['conversation_id','subreddit']).groups

def test(grouped): 
    """Evaluate both classifiers against the annotated ground truth.

    ``grouped`` is iterated as ``(conversation_id, subreddit)`` pairs. Each
    conversation is classified for engagement and satisfaction, the
    predictions are compared with the ``ground_truth_engagement`` and
    ``ground_truth_satisfaction`` columns, and confusion counts are tallied.
    Precision, recall, F1 and accuracy are printed (satisfaction first, then
    engagement) and returned.

    Returns
    -------
    tuple
        ``(P_s, R_s, f1_s, acc_s, P_e, R_e, f1_e, acc_e)``.
    """
    def _outcome(ground_truth, predicted):
        # Anything that is not a clear TP / FP / FN is counted as TN
        if ((ground_truth == 1) & (predicted == 1)).all():
            return 'TP'
        if ((ground_truth == 0) & (predicted == 1)).all():
            return 'FP'
        if ((ground_truth == 1) & (predicted == 0)).all():
            return 'FN'
        return 'TN'

    sat_counts = {'TP': 0, 'FP': 0, 'TN': 0, 'FN': 0}
    eng_counts = {'TP': 0, 'FP': 0, 'TN': 0, 'FN': 0}

    for conv_id, subreddit in grouped:
        (conversation, speaker, listener, _speaker_rows, _listener_rows,
         _n_speaker, _n_listener) = extract_responses(conv_id, subreddit)
        (_turns, _interleaved, _num_tokens, _diff, _eng_score, eng,
         conversation, _thres_e) = classify_engagement(speaker, listener, conversation)
        (_slope, _change, _grateful, _profanity, _sarcasm, _disagree, _sat_score,
         sat, _thres_s) = classify_satisfaction(conversation, speaker, tokenizer, model)

        sat_counts[_outcome(conversation['ground_truth_satisfaction'], sat)] += 1
        eng_counts[_outcome(conversation['ground_truth_engagement'], eng)] += 1

    P_s, R_s, f1_s, acc_s = performance(sat_counts['TP'], sat_counts['FP'], sat_counts['TN'], sat_counts['FN'])
    P_e, R_e, f1_e, acc_e = performance(eng_counts['TP'], eng_counts['FP'], eng_counts['TN'], eng_counts['FN'])

    print(round_sig(P_s), round_sig(R_s), round_sig(f1_s), round_sig(acc_s))
    print(round_sig(P_e), round_sig(R_e), round_sig(f1_e), round_sig(acc_e))

    return P_s, R_s, f1_s, acc_s, P_e, R_e, f1_e, acc_e

Satisfaction and engagement are first tested separately so that each can be tuned correctly and more easily; they will be combined later.

#### Things to note / ask:
1. I had trouble loading the larger files into memory, which took a lot of time. There are ~60 annotated dialogues currently, but I have managed to solve the issue and the rest will follow.
2. I couldn't apply cross-validation yet because the code needs to be modified so that the measures can run on the whole dataframe (currently they run on one conversation at a time, selected by conversation_id).
3. The test function needs to be modified as well, because I will need to predict for all conversations at once and then test their performance, whereas right now I predict and test one sample at a time.
4. Depression_help conversation_id 9610 is in Portuguese. Should we remove conversations in foreign languages?
5. Suicidewatch contains posts that are spam (repeated hundreds of times), and they are missing their last few columns. Should we remove spam posts? (Some of them are mean.)
6. There are conversations with incorrect dialogue turns, e.g. 1, 2, 1, 2, 1, 2, 3, 4, 5, ... Should we correct these?
7. "Dyadic" conversation files contain multi-party conversations. Should we remove such dialogues?
8. There are conversations with only 1 speaker turn. I should remove those.
9. I have been using only dyadic conversations. Should I include multi-party ones as well? The algorithms would need to change.
10. What would be the best way to combine the satisfaction and engagement performances? If even one of the engagement / satisfaction measures is low, the conversation is disqualified. Should we judge the classifier on whether it correctly disqualifies a conversation (predicts that at least one of the measures is low) or on whether it predicts both measures correctly?