# Processing of Hand Scored Data

This notebook processes the data set that was scored by hand in Label Studio and prepares it for statistical analysis in R.

In [16]:
# import packages
import pandas as pd
import re
import ast
from collections import Counter

In [None]:
# import data
data = pd.read_csv("../data/scored/hand_scored_raw.csv")

# create dataframe with relevant variables
# .copy() makes data_selected an independent dataframe, so the column
# assignments that follow modify it directly instead of a selection of `data`
# (this avoids pandas' SettingWithCopyWarning)
selected = ['ID', 'text', 'lbl', 'group', 'WC']
data_selected = data[selected].copy()

# convert variable lbl to dictionary list (if necessary):
# string values are parsed as Python literals, all other values are kept as is
data_selected['lbl'] = data_selected['lbl'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

In [18]:
# defining a function to count the labels 
def count_labels(row):
    """Count how often each label occurs in one row of annotations.

    Parameters
    ----------
    row : list of dict, None or NaN
        The annotations of one text. Every dict must have a 'labels' entry
        holding the labels assigned to that annotation.

    Returns
    -------
    collections.Counter
        Maps each label to its number of occurrences in the row; empty when
        the row is missing or holds no annotations.

    Raises
    ------
    KeyError
        If an annotation has no 'labels' entry.
    """
    import math

    # a missing value (None or NaN) holds no annotations, so nothing is counted
    if row is None or (isinstance(row, float) and math.isnan(row)):
        return Counter()

    counts = Counter()
    for item in row:
        counts.update(item['labels'])
    return counts

# apply function to every entry of 'lbl'
# count_labels takes a single argument (the row), so it is handed to apply directly
data_selected['label_counts'] = data_selected['lbl'].apply(count_labels)

In [19]:
# extract counts from label_counts as plain dictionaries (one per row)
label_data = data_selected['label_counts'].apply(dict)

# mutate dictionaries into columns: one column per label,
# NaN where a row does not contain that label
label_df = pd.json_normalize(label_data.tolist())

# reuse the row index of data_selected so that the concat below pairs
# every row with its own counts
label_df.index = data_selected.index

# rename variables
# NOTE(review): the key "emotion\\/thought" is a backslash followed by '/';
# confirm that it matches the label exactly as it appears in label_df.columns
label_df = label_df.rename(columns={
    "event": "int_event",
    "time": "int_time",
    "place": "int_place",
    "perception": "int_perc",
    "emotion\\/thought": "int_emo",
    "ext:event": "ext_event",
    "ext:semantic": "ext_sem",
    "ext:repetition": "ext_rep",
    "ext:other": "ext_other",
})

# replace NA values with 0, in the label columns only:
# a label missing from a row is a count of 0, whereas missing values in
# ID, group or WC must stay missing
label_df = label_df.fillna(0)

## add columns to dataframe
data_labelled = pd.concat([data_selected, label_df], axis=1)

In [20]:
# now we add two new variables with counts for internal and external details

# pick the label columns by prefix: 'int_' for internal, 'ext_' for external details
internal_columns = list(filter(lambda name: name.startswith('int_'), label_df.columns))
external_columns = list(filter(lambda name: name.startswith('ext_'), label_df.columns))

# row-wise totals over each group of columns, stored as new variables
data_labelled['internal_details'] = data_labelled.loc[:, internal_columns].sum(axis='columns')
data_labelled['external_details'] = data_labelled.loc[:, external_columns].sum(axis='columns')

In [None]:
# save data

# remove the columns that are not part of the exported data set
dropped = ['text', 'lbl', 'label_counts']
data_final = data_labelled.drop(labels=dropped, axis='columns')

# write the result to a csv file, leaving out the row index
data_final.to_csv(path_or_buf='../data/scored/hand_scored.csv', index=False)