In [153]:
import pandas as pd
import csv
import random
import warnings
from random import seed
from random import sample
from pandas.core.common import SettingWithCopyWarning

warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

pd.options.display.max_colwidth = 1000

In [189]:
col_names = ["conversation id","subreddit","post title","author","dialog turn","text","compound","sentiment","emotion prediction"]
df = pd.read_csv("data/RED/offmychest_dyadic_convs_clean_emotion.csv", names=col_names, engine='python', dtype={'conversation id': 'unicode'})
df = df.rename(columns={'conversation id': 'conversation_id', 'post title': 'post_title', 'dialog turn': 'dialog_turn', 'emotion prediction': 'emotion_prediction'})
print("Number of conversations in raw dataset: ", df['conversation_id'].nunique())
print("Number of rows in raw dataset: ", len(df))

# Drop rows which have same text and conversation_id
df = df.drop_duplicates(subset=['text','conversation_id'],keep="first")
print("Number of rows after duplicates are dropped: ", len(df))

# Group data by conversation id and calculate count of each conversation id
df_conv_len = df.groupby("conversation_id").size().reset_index(name='num_dialog_turns')

# Separate conversation id's with multiple occurrences as dialogues
df_dia = df_conv_len[df_conv_len["num_dialog_turns"] > 2]
print("Number of conversations longer than 2 turns: ", len(df_dia))
df_dia = df_dia.reset_index()
df_dia = df_dia.drop(columns=['num_dialog_turns'])

# Only conversations with multiple turns remain in the dataset
df = df.join(df_dia.set_index('conversation_id'), on='conversation_id', how="right") 

# Separate conversations that have exactly 2 authors
df_conv_authors = df.groupby("conversation_id")["author"].unique().reset_index()
df_conv_authors["author"] = df_conv_authors["author"].apply(lambda x: x.size)
df_conv_authors = df_conv_authors[df_conv_authors["author"] == 2]
df_conv_authors = df_conv_authors.drop(columns=['author'])

# Only conversations that have 2 authors remain in the dataset
df = df.join(df_conv_authors.set_index('conversation_id'), on='conversation_id', how="right") 
print("Number of conversations longer than 2 turns with 2 authors: ", len(df_conv_authors))
df.reset_index(drop=True, inplace=True)

# Number of turns of each author per conversation id
df_num_author_turns = df.groupby(['conversation_id','author']).size().reset_index(name="author_num_turns")

# Speakers of each conversation with their number of speaker turns
df_speakers = df.groupby('conversation_id').first()['author'].reset_index(name='author')
df_num_speaker_turns = pd.merge(df_speakers, df_num_author_turns)
df_num_speaker_turns = df_num_speaker_turns[df_num_speaker_turns['author_num_turns'] > 1]
df_num_speaker_turns = df_num_speaker_turns['conversation_id']
df_num_speaker_turns = df_num_speaker_turns.drop(columns=['author','author_num_turns'])

df = pd.merge(df, df_num_speaker_turns, on="conversation_id")
print("Number of conversations longer than 2 turns with 2 authors and with multiple speaker turns: ", df['conversation_id'].nunique())

df = df.dropna(subset=['compound','sentiment','emotion_prediction'])
print("Number of conversations in cleaned dataset: ", df['conversation_id'].nunique())
print("Number of rows in cleaned df: ", len(df))

Number of conversations in raw dataset:  437746
Number of rows in raw dataset:  1375164
Number of rows after duplicates are dropped:  1063771
Number of conversations longer than 2 turns:  94890
Number of conversations longer than 2 turns with 2 authors:  67039
Number of conversations longer than 2 turns with 2 authors and with multiple speaker turns:  60986
Number of conversations in cleaned dataset:  60986
Number of rows in cleaned df:  239056


In [241]:
# select a random sample without replacement
# seed random number generator
seed(3)
# prepare a sequence
sequence = [i for i in range(len(df))]
# select a subset without replacement
subset = sample(sequence, 30)
print(subset)

[62380, 155357, 142666, 34189, 96981, 158314, 124270, 164028, 152266, 17177, 158754, 3451, 238241, 219537, 123006, 67988, 144384, 61428, 50264, 187996, 123276, 141813, 219273, 144082, 124873, 104106, 167527, 225711, 39483, 60796]


In [252]:
# Take conversation with given row number
df.take([104106]) 

Unnamed: 0,conversation_id,subreddit,post_title,author,dialog_turn,text,compound,sentiment,emotion_prediction,index
104107,249638,offmychest,I am going to commit suicide if I fail this course,throwaway19910410,1.0,"I want to start by saying this is **not** a call for help. Currently I'm part, along with 11 other guys, of a very exclusive one-year-long course provided by the government of my country in order to be part of a very exclusive branch of said government. This course is very demanding, imagine *The Pursuit of Happyness*, but with less homelessness and with more available positions (10). I've always wanted to be part of this specific branch, it is my dream, I went to the university specifically to be there, and now, at 29, I finally have a shot at it. I'm doing quite good so far, and the competition for a position is getting lighter (originally we were 13, and one of the current guys is probably dropping out soon), but the real problem of course is the competition against myself, there is always the possibility of having a Fail grade somewhere that can take me out of the race. I'm doing my best in order to avoid such outcome, but shit happens, and some ugly shit has happened to me ...",-0.7507,negative,embarrassed,166273


In [256]:
df1 = df[df.conversation_id == '319473']
df2 = df[df.conversation_id == '147367']
df3 = df[df.conversation_id == '239904']
df4 = df[df.conversation_id == '27765']
df5 = df[df.conversation_id == '315328']
df6 = df[df.conversation_id == '104480']
df7 = df[df.conversation_id == '98827']
df8 = df[df.conversation_id == '197802']
df9 = df[df.conversation_id == '17108']
df10 = df[df.conversation_id == '36561']
df11 = df[df.conversation_id == '304034']
df12 = df[df.conversation_id == '278531']
df13 = df[df.conversation_id == '249638']

df13

Unnamed: 0,conversation_id,subreddit,post_title,author,dialog_turn,text,compound,sentiment,emotion_prediction,index
104107,249638,offmychest,I am going to commit suicide if I fail this course,throwaway19910410,1.0,"I want to start by saying this is **not** a call for help. Currently I'm part, along with 11 other guys, of a very exclusive one-year-long course provided by the government of my country in order to be part of a very exclusive branch of said government. This course is very demanding, imagine *The Pursuit of Happyness*, but with less homelessness and with more available positions (10). I've always wanted to be part of this specific branch, it is my dream, I went to the university specifically to be there, and now, at 29, I finally have a shot at it. I'm doing quite good so far, and the competition for a position is getting lighter (originally we were 13, and one of the current guys is probably dropping out soon), but the real problem of course is the competition against myself, there is always the possibility of having a Fail grade somewhere that can take me out of the race. I'm doing my best in order to avoid such outcome, but shit happens, and some ugly shit has happened to me ...",-0.7507,negative,embarrassed,166273
104108,249638,offmychest,I am going to commit suicide if I fail this course,SugarPie89,2.0,Couldn't you just take the course again instead of killing yourself that is just stupid,-0.8316,negative,furious,166273
104109,249638,offmychest,I am going to commit suicide if I fail this course,throwaway19910410,3.0,"If you fail you have to wait 5 years to take it again, so not an option. Also sorry for being stupid.",-0.802,negative,furious,166273
104110,249638,offmychest,I am going to commit suicide if I fail this course,throwaway19910410,4.0,"If you fail you have to wait 5 years to take it again, so not an option.",-0.5423,negative,furious,166273
104111,249638,offmychest,I am going to commit suicide if I fail this course,SugarPie89,5.0,"There are people actually suffering and he wants to throw his life away for his own failure at something he could try again later. I think it is stupid. He is not mentally ill, just desperate and obviously doesn't value life. What happened to people actually trying again after failing?",-0.9338,negative,furious,166273


In [257]:
df1["ground_truth_satisfaction"] = 1
df1["ground_truth_engagement"] = 1

df2["ground_truth_satisfaction"] = 1
df2["ground_truth_engagement"] = 1

df3["ground_truth_satisfaction"] = 1
df3["ground_truth_engagement"] = 1

df4["ground_truth_satisfaction"] = 1
df4["ground_truth_engagement"] = 1

df5["ground_truth_satisfaction"] = 1
df5["ground_truth_engagement"] = 1

df6["ground_truth_satisfaction"] = 1
df6["ground_truth_engagement"] = 1

df7["ground_truth_satisfaction"] = 0
df7["ground_truth_engagement"] = 1

df8["ground_truth_satisfaction"] = 1
df8["ground_truth_engagement"] = 1

df9["ground_truth_satisfaction"] = 0
df9["ground_truth_engagement"] = 1

df10["ground_truth_satisfaction"] = 1
df10["ground_truth_engagement"] = 1

df11["ground_truth_satisfaction"] = 1
df11["ground_truth_engagement"] = 1

df12["ground_truth_satisfaction"] = 1
df12["ground_truth_engagement"] = 1

df13["ground_truth_satisfaction"] = 0
df13["ground_truth_engagement"] = 1

In [258]:
df_annotated = df1.append(df2, ignore_index = True)
df_annotated = df_annotated.append(df3, ignore_index = True)
df_annotated = df_annotated.append(df4, ignore_index = True)
df_annotated = df_annotated.append(df5, ignore_index = True)
df_annotated = df_annotated.append(df6, ignore_index = True)
df_annotated = df_annotated.append(df7, ignore_index = True)
df_annotated = df_annotated.append(df8, ignore_index = True)
df_annotated = df_annotated.append(df9, ignore_index = True)
df_annotated = df_annotated.append(df10, ignore_index = True)
df_annotated = df_annotated.append(df11, ignore_index = True)
df_annotated = df_annotated.append(df12, ignore_index = True)
df_annotated = df_annotated.append(df13, ignore_index = True)

In [259]:
# number of selected conversations
len(df_annotated.groupby("conversation_id").count())

13

In [260]:
df_annotated.head(2)

Unnamed: 0,conversation_id,subreddit,post_title,author,dialog_turn,text,compound,sentiment,emotion_prediction,index,ground_truth_satisfaction,ground_truth_engagement
0,319473,offmychest,"Have been having self-esteem issues, today a guy asked me out.",SuicideVN,1.0,"I recently got out of a 3 month relationship with someone I thought was ""the one"" since we had known each other for a long time and had similar personalities but for some reason or another we were never able to date until recently we tried it out, things were good but once we had our first argument be showed me his true colors and I decided to end it. It destroyed my self-esteem since I already battle with depression and anxiety, mainly around trusting and meeting people. Today I hardly wore makeup and went to my local store for some groceries. This random guy out of the blue began making conversation with me and ended asking me for a date, for some reason I felt a good vibe coming off him and accepted. I am excited and genuinely feel good about the possibilities. Thanks for reading. TLDR: title says it all",0.8847,positive,prepared,243868,1,1
1,319473,offmychest,"Have been having self-esteem issues, today a guy asked me out.",daughter_of_tides,2.0,"Yay! Wonderful to hear you’re healing, and I hope your date goes well.",0.908,positive,neutral,243868,1,1


In [261]:
df_annotated.to_csv("data/RED/annotated/offmychest_annotated.csv", index=False)