In [8]:

import pandas as pd
import numpy as np
import json
import re
import ast

In [9]:
# Emotion -> its three intensity labels, stored at positions 0, 1 and 2.
# 'na' is a placeholder emotion with placeholder labels so that every
# column has the same length.
plutchik_wheel = {
    'na': ['na', 'na2', 'na3'],
    'fear': ['apprehension', 'fear', 'terror'],
    'trust': ['acceptance', 'trust', 'admiration'],
    'joy': ['serenity', 'joy', 'ecstasy'],
    'anticipation': ['interest', 'anticipation', 'vigilance'],
    'anger': ['annoyance', 'anger', 'rage'],
    'disgust': ['boredom', 'disgust', 'loathing'],
    'sadness': ['pensiveness', 'sadness', 'grief']
}

# Emotion name -> integer id, numbered in dictionary order.
# enumerate() follows the size of the wheel (no hard-coded count that could
# silently truncate it) and avoids shadowing the builtin `id`.
emotions_mapping = {emotion: emotion_id for emotion_id, emotion in enumerate(plutchik_wheel)}

# Same table with the emotion names replaced by their ids as column labels;
# the row index (0, 1, 2) is the position of a label within its emotion.
intensities_df = pd.DataFrame(plutchik_wheel).rename(columns = emotions_mapping)

# Intensity label -> (emotion id, position within that emotion).
intensities_mapping = {
    label: (emotion_id, level)
    for emotion_id, levels in intensities_df.items()
    for level, label in levels.items()
}

intensities_mapping, emotions_mapping

({'na': (0, 0),
  'na2': (0, 1),
  'na3': (0, 2),
  'apprehension': (1, 0),
  'fear': (1, 1),
  'terror': (1, 2),
  'acceptance': (2, 0),
  'trust': (2, 1),
  'admiration': (2, 2),
  'serenity': (3, 0),
  'joy': (3, 1),
  'ecstasy': (3, 2),
  'interest': (4, 0),
  'anticipation': (4, 1),
  'vigilance': (4, 2),
  'annoyance': (5, 0),
  'anger': (5, 1),
  'rage': (5, 2),
  'boredom': (6, 0),
  'disgust': (6, 1),
  'loathing': (6, 2),
  'pensiveness': (7, 0),
  'sadness': (7, 1),
  'grief': (7, 2)},
 {'na': 0,
  'fear': 1,
  'trust': 2,
  'joy': 3,
  'anticipation': 4,
  'anger': 5,
  'disgust': 6,
  'sadness': 7})

In [10]:
def clean_df_to_csv(path_name, appraisals_path='../data/CovidET_appraisals.csv'):
    """Flatten an annotation JSON file into CSV files.

    Reads ``{path_name}.json`` (transposed after loading, so each top-level
    JSON entry becomes one row) and writes two files:

    * ``{path_name}.csv`` -- one row per entry.
    * ``{path_name}_w_appraisal.csv`` -- those rows merged with the
      appraisals table on ``id`` == ``Reddit ID`` (``pd.merge`` default
      join, so rows without a matching appraisal are not written).

    Columns produced for each entry:

    * ``id``, ``t``, ``post`` -- copied from ``Reddit ID``, ``Time Created``
      and ``Reddit Post``.
    * For every key of ``Annotations`` that matches
      ``Annotation <n> | Assignment ID = <a> | Worker ID = <w>``:
      ``annot_<n>_assignment_id``, ``annot_<n>_worker_id`` and one
      ``annot_<n>_<category>`` column per category, holding the list of
      lower-cased labels in the order they were encountered.
    * ``gold_emotions`` / ``gold_intensities`` -- sorted, de-duplicated
      labels collected over all annotations from categories whose name
      contains ``emotion`` / ``intensity``.
    * ``gold_emotions_ids`` / ``gold_intensities_ids`` -- the same labels
      translated through the module-level ``emotions_mapping`` and
      ``intensities_mapping`` and sorted.

    Parameters
    ----------
    path_name : str
        Path of the input file without the ``.json`` extension; also used
        as the stem of both output files.
    appraisals_path : str, optional
        CSV file with a ``Reddit ID`` column to merge against. Defaults to
        the location that was previously hard-coded.

    Raises
    ------
    KeyError
        If a collected label is missing from ``emotions_mapping`` or
        ``intensities_mapping``.
    """
    # Compiled once per call rather than once per annotation.
    annot_pattern = re.compile(r'Annotation (\d+) \| Assignment ID = (\w+) \| Worker ID = (\w+)')

    raw_df = pd.read_json(f'{path_name}.json').T

    # Rows are accumulated as plain dicts and turned into a DataFrame once at
    # the end; concatenating a one-row frame per iteration copies the whole
    # accumulated frame every time (quadratic in the number of rows).
    row_labels = []
    records = []

    for row_i, row in raw_df.iterrows():
        record = {
            'id': row['Reddit ID'],
            't': row['Time Created'],
            'post': row['Reddit Post'],
        }

        gold_emotions = set()
        gold_intensities = set()

        for annot_name, annot in row['Annotations'].items():
            match = annot_pattern.match(annot_name)
            if match is None:
                # Best-effort: report the malformed key and keep going.
                print("ERROR: No match found.")
                continue

            annotation_index, assignment_id, worker_id = match.groups()
            record[f'annot_{annotation_index}_assignment_id'] = assignment_id
            record[f'annot_{annotation_index}_worker_id'] = worker_id

            for entry in annot:
                for category, label in entry.items():
                    category = category.lower()
                    label = label.lower()
                    col_name = f'annot_{annotation_index}_{category}'
                    record.setdefault(col_name, []).append(label)

                    # The 'annot_<n>_' prefix contains neither keyword, so
                    # testing the category is the same as testing col_name.
                    if 'emotion' in category:
                        gold_emotions.add(label)
                    if 'intensity' in category:
                        gold_intensities.add(label)

        gold_emotions = sorted(gold_emotions)
        gold_intensities = sorted(gold_intensities)

        record['gold_emotions'] = gold_emotions
        record['gold_intensities'] = gold_intensities
        record['gold_emotions_ids'] = sorted(emotions_mapping[emotion] for emotion in gold_emotions)
        record['gold_intensities_ids'] = sorted(intensities_mapping[intensity] for intensity in gold_intensities)

        row_labels.append(row_i)
        records.append(record)

    # Columns missing from some records (e.g. fewer annotations) become NaN.
    cleaned_df = pd.DataFrame(records, index=row_labels)

    cleaned_df.to_csv(f'{path_name}.csv')
    appraisals_df = pd.read_csv(appraisals_path)
    pd.merge(cleaned_df, appraisals_df, left_on = 'id', right_on = 'Reddit ID').to_csv(f'{path_name}_w_appraisal.csv')


In [11]:
# Extension-less stems of every JSON file to convert: the full dataset
# followed by its test / train / val files.
dataset_paths = [
    'CovidET-ALL',
    'CovidET-ALL-train_val_test/test',
    'CovidET-ALL-train_val_test/train',
    'CovidET-ALL-train_val_test/val',
]

# Each call writes '<stem>.csv' and '<stem>_w_appraisal.csv'.
for path in dataset_paths:
    clean_df_to_csv(path)
    

In [14]:
# Write consecutive, fixed-size row blocks of the combined appraisal file to
# the per-split '<stem>_w_appraisal.csv' files: the first block goes to test,
# the second to train, the third to val.
# NOTE(review): these are the same file names clean_df_to_csv() writes for
# the split stems, so this cell overwrites those files -- confirm that is
# intended.
model_dataset_paths = [
    'CovidET-ALL-train_val_test/test',
    'CovidET-ALL-train_val_test/train',
    'CovidET-ALL-train_val_test/val',
]

# Number of rows copied into each split file.
ROWS_PER_SPLIT = 100

# index_col=0 restores the index column saved by to_csv(). Without it the
# saved index is read back as a data column named 'Unnamed: 0' and then
# written out again next to a fresh index, giving these files an extra column
# that the files written by clean_df_to_csv() do not have.
all_df_w_appraisal = pd.read_csv('CovidET-ALL_w_appraisal.csv', index_col=0)

for i, path in enumerate(model_dataset_paths):
    start = i * ROWS_PER_SPLIT
    all_df_w_appraisal.iloc[start:start + ROWS_PER_SPLIT].to_csv(f'{path}_w_appraisal.csv')