In [12]:
import time
import pandas as pd
import csv
import helpers
import sarcastic
from engagement import engagement_preprocessing
from satisfaction import satisfaction_preprocessing
from helpers import round_sig

In [13]:
# Silence pandas' chained-assignment check (SettingWithCopyWarning) for the
# whole notebook; new columns are assigned onto per-conversation frames below.
pd.options.mode.chained_assignment = None

In [15]:
# Fit the sarcasm classifier once up front. The returned tokenizer/model pair
# is handed to satisfaction_preprocessing for every conversation later on.
(tokenizer, model) = sarcastic.train()

In [14]:
# Subreddit whose cleaned conversations get labelled in this run.
subreddit_name = "depressed"

# Input: cleaned conversations. Output: the same rows plus predicted labels.
in_path = f"data/RED/clean/{subreddit_name}_clean.csv"
out_path = f"data/RED/clean/labeled/{subreddit_name}_clean_labeled.csv"

In [16]:
# Read the cleaned conversations and convert the space-separated column
# headers to snake_case, which is how the rest of the notebook refers to them.
df = pd.read_csv(in_path).rename(
    columns={
        'conversation id': 'conversation_id',
        'post title': 'post_title',
        'dialog turn': 'dialog_turn',
        'emotion prediction': 'emotion_prediction',
    }
)

In [17]:
# Map every (conversation_id, subreddit) pair to the row labels belonging to it.
# Iterating over `grouped` yields those key pairs, one per conversation.
grouped = df.groupby(by=['conversation_id', 'subreddit']).groups

In [18]:
# BEST HYPERPARAMETERS
# Weights and decision thresholds consumed by the prediction cell below.

# Engagement: each weight multiplies the matching feature returned by
# engagement_preprocessing; the weighted sum is labelled 1 when it is
# >= eng_threshold, otherwise 0.
num_turns_weight = 0.75
interleaved_weight = 0.75
token_length_weight = 0.025
diff_weight = -0.25
eng_threshold = 2.75

# Satisfaction: each weight multiplies the matching term returned by
# satisfaction_preprocessing; the weighted sum is labelled 1 when it is
# >= sat_threshold, otherwise 0.
slope_weight = 0.5
sentiment_change_weight = 0.5
grateful_bonus_weight = 3.25
profanity_penalty_weight = 0.5
sarcasm_penalty_weight = 0.5
disagreement_penalty_weight = 0.5
sat_threshold = 0.6

In [None]:
# PREDICT ENGAGEMENT AND SATISFACTION USING BEST HYPERPARAMETERS ON DATASET
#
# For every (conversation_id, subreddit) group:
#   1. pull the conversation and its speaker/listener responses out of `df`,
#   2. score engagement as a weighted sum of the engagement features and
#      threshold it into a 0/1 label,
#   3. score satisfaction the same way from the satisfaction terms,
#   4. store both labels as new columns on the conversation frame.
# The labelled conversations are then stacked and written to `out_path`.

start = time.time()
cols = df.columns.tolist()

# Columns written to the labelled CSV, in output order.
output_cols = [
    'conversation_id', 'subreddit', 'post_title', 'author', 'dialog_turn',
    'text', 'predicted_satisfaction', 'predicted_engagement', 'compound',
    'sentiment', 'emotion_prediction', 'token_length', 'sentences',
    'sentence_compounds', 'strongest_compound',
]

# Collect one labelled frame per conversation and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside the
# loop re-copies every accumulated row on each iteration.
labeled_conversations = []

for conv_id, subreddit in grouped:
    conversation, speaker, listener = helpers.extract_responses(conv_id, subreddit, df)

    # Predict engagement
    # (engagement_preprocessing also returns the conversation frame that is
    # used from here on.)
    num_turns, interleaved, token_length_score, num_turn_diff, conversation = engagement_preprocessing(
        speaker, listener, conversation
    )
    engagement_score = (
        num_turns_weight * num_turns
        + interleaved_weight * interleaved
        + token_length_weight * token_length_score
        + diff_weight * num_turn_diff
    )
    engagement = 1 if engagement_score >= eng_threshold else 0
    conversation['predicted_engagement'] = engagement

    # Predict satisfaction
    (
        slope,
        sentiment_change,
        grateful_bonus,
        profanity_penalty,
        sarcasm_penalty,
        disagreement_penalty,
    ) = satisfaction_preprocessing(conversation, speaker, tokenizer, model)
    satisfaction_score = (
        slope_weight * slope
        + sentiment_change_weight * sentiment_change
        + grateful_bonus_weight * grateful_bonus
        + profanity_penalty_weight * profanity_penalty
        + sarcasm_penalty_weight * sarcasm_penalty
        + disagreement_penalty_weight * disagreement_penalty
    )
    satisfaction = 1 if satisfaction_score >= sat_threshold else 0
    conversation['predicted_satisfaction'] = satisfaction

    labeled_conversations.append(conversation)

if labeled_conversations:
    # Select the output columns after stacking; this also fixes their order.
    df_preds = pd.concat(labeled_conversations)[output_cols]
else:
    # pd.concat raises on an empty list; with no conversations, write a
    # header-only CSV instead.
    df_preds = pd.DataFrame(columns=output_cols)

df_preds.to_csv(out_path, index=False)

end = time.time()
minutes = (end - start) / 60
print('Time it takes to predict for dataset (in minutes):', minutes)