In [1]:
import time
import pandas as pd
import csv
import helpers
import sarcastic
from engagement import engagement_preprocessing
from satisfaction import satisfaction_preprocessing
from helpers import round_sig



In [2]:
# Turn off pandas' chained-assignment check for the whole notebook
# (None disables it entirely rather than warning or raising).
pd.options.mode.chained_assignment = None

In [3]:
# Train sarcasm classification model.
# The call yields two objects, bound here as `tokenizer` and `model`; both are
# handed to satisfaction_preprocessing() in the prediction cell.
# NOTE(review): presumably this retrains from scratch on every notebook run --
# confirm whether loading a cached model is possible to keep re-runs cheap.
tokenizer, model = sarcastic.train()

In [4]:
# Which subreddit dump to label; the input and output CSV locations are both
# derived from this name.
subreddit_name = "sad"
in_path = "".join(["data/RED/clean/", subreddit_name, "_clean.csv"])
out_path = "".join(["data/RED/clean/labeled/", subreddit_name, "_clean_labeled.csv"])

In [5]:
# Load the cleaned conversations and switch the space-separated column
# headers to snake_case so the rest of the notebook can use one naming style.
df = pd.read_csv(in_path).rename(
    columns={
        'conversation id': 'conversation_id',
        'post title': 'post_title',
        'dialog turn': 'dialog_turn',
        'emotion prediction': 'emotion_prediction',
    }
)

In [6]:
# Group conversations by conversation_id only (the subreddit is not part of
# the grouping key). `grouped` maps each conversation_id to the row labels of
# `df` that belong to that conversation.
# The key is passed as a scalar rather than a one-element list so the group
# labels stay plain ids: pandas has been moving list-style groupers toward
# tuple keys, which would turn every conv_id into a 1-tuple downstream.
grouped = df.groupby('conversation_id').groups

In [7]:
# Debugging aid (disabled): restrict the run to a hand-picked pair of
# conversation ids instead of every group in `grouped`.
# NOTE(review): `subset` is not referenced by any visible cell; the prediction
# loop would have to iterate over it instead of `grouped` for this to matter.
#s = [101, 8786]

#subset = {x: grouped[x] for x in s}

In [8]:
# BEST HYPERPARAMETERS
# Consumed by the prediction cell: each score is a weighted sum of the
# preprocessing features, and a score at or above its threshold is labeled 1.

# --- Decision thresholds ---
eng_threshold = 2.75
sat_threshold = 0.6

# --- Engagement score weights ---
num_turns_weight = 0.75
interleaved_weight = 0.75
token_length_weight = 0.025
diff_weight = -0.25  # the only negative weight: multiplies num_turn_diff

# --- Satisfaction score weights ---
slope_weight = 0.5
sentiment_change_weight = 0.5
grateful_bonus_weight = 3.25
profanity_penalty_weight = 0.5
sarcasm_penalty_weight = 0.5
disagreement_penalty_weight = 0.5

In [11]:
# PREDICT ENGAGEMENT AND SATISFACTION USING BEST HYPERPARAMETERS ON DATASET
#
# For every conversation id in `grouped`:
#   1. pull the conversation plus its speaker / listener turns,
#   2. skip it when there are fewer than 2 speaker responses or fewer than 3 turns,
#   3. label engagement and satisfaction (1 when the weighted score reaches the
#      corresponding threshold, else 0),
# then write all labeled conversations to `out_path` and report the wall time.

start = time.time()
cols = df.columns.tolist()
# The subreddit is read from the first row only, i.e. the input file is
# treated as containing a single subreddit.
subreddit = df.iloc[0]['subreddit']

# Columns (and their order) written to the labeled CSV.
output_cols = [
    'conversation_id', 'subreddit', 'post_title', 'author', 'dialog_turn',
    'text', 'predicted_satisfaction', 'predicted_engagement', 'compound',
    'sentiment', 'emotion_prediction', 'token_length', 'sentences',
    'sentence_compounds', 'strongest_compound',
]

# Labeled conversations are gathered in a list and concatenated once after the
# loop. DataFrame.append no longer exists in pandas >= 2.0, and appending inside
# the loop re-copied the whole accumulated frame on every iteration.
labeled_conversations = []

for conv_id in grouped:
    conversation, speaker, listener = helpers.extract_responses(conv_id, subreddit, df)

    # Progress trace; printed before the filter below, so skipped ids show up too.
    print(conv_id)

    # Predict engagement
    (num_turns, interleaved, token_length_score, num_turn_diff,
     num_speaker_responses, num_listener_responses,
     conversation) = engagement_preprocessing(speaker, listener, conversation)

    # Too short to label: leave the conversation out of the output entirely.
    if num_speaker_responses < 2 or num_turns < 3:
        continue

    engagement_score = (
        num_turns_weight*num_turns
        + interleaved_weight*interleaved
        + token_length_weight*token_length_score
        + diff_weight*num_turn_diff
    )
    engagement = 1 if engagement_score >= eng_threshold else 0
    # Scalar assignment: every row of the conversation carries the same label.
    conversation['predicted_engagement'] = engagement

    # Predict satisfaction
    (slope, sentiment_change, grateful_bonus, profanity_penalty,
     sarcasm_penalty, disagreement_penalty) = satisfaction_preprocessing(conversation, speaker, tokenizer, model)
    satisfaction_score = (
        slope_weight*slope
        + sentiment_change_weight*sentiment_change
        + grateful_bonus_weight*grateful_bonus
        + profanity_penalty_weight*profanity_penalty
        + sarcasm_penalty_weight*sarcasm_penalty
        + disagreement_penalty_weight*disagreement_penalty
    )
    satisfaction = 1 if satisfaction_score >= sat_threshold else 0
    conversation['predicted_satisfaction'] = satisfaction

    labeled_conversations.append(conversation)

if labeled_conversations:
    # Row labels are kept as they are in each conversation frame
    # (same as the default behaviour of the former append call).
    df_preds = pd.concat(labeled_conversations)
else:
    # Every conversation was filtered out: pd.concat rejects an empty list, so
    # fall back to an empty frame that still has the expected header.
    df_preds = pd.DataFrame(columns=output_cols)

df_preds = df_preds[output_cols]

df_preds.to_csv(out_path, index=False)

end = time.time()
minutes = (end - start) / 60
print('Time it takes to predict for dataset (in minutes):', minutes)

1
11
18
21
26
32
60
62
73
75
77
79
85
94
97
99
106
112
119
120
130
138
155
188
190
204
208
211
219
224
225
228
270
271
279
298
305
307
311
312
342
350
357
374
383
400
404
416
426
429
435
440
447
453
457
462
463
469
479
495
499
515
535
540
541
550
558
568
576
601
609
611
612
613
616
617
622
627
630
632
636
645
654
665
666
673
684
714
720
722
735
736
770
772
782
787
788
789
799
801
818
825
833
838
846
872
876
877
882
898
900
907
911
916
918
923
929
934
938
960
961
963
964
966
971
978
979
981
987
997
998
1011
1018
1028
1044
1048
1052
1054
1067
1073
1075
1091
1094
1101
1106
1109
1127
1129
1134
1137
1140
1141
1143
1144
1158
1166
1194
1203
1236
1243
1257
1269
1307
1311
1312
1321
1331
1332
1342
1347
1349
1351
1371
1381
1385
1387
1389
1400
1408
1411
1413
1422
1429
1444
1448
1474
1476
1477
1480
1481
1482
1489
1511
1512
1521
1533
1534
1536
1545
1550
1555
1556
1561
1571
1572
1592
1597
1618
1620
1625
1629
1633
1659
1664
1676
1686
1687
1694
1699
1715
1743
1752
1756
1759
1762
1763
1768
1770
1771
177