# Processing of Hand Scored Data

This code processes the data set that was scored by hand in Label Studio and prepares it for statistical analysis in R.

In [16]:
# import packages
import pandas as pd
import re
import ast
from collections import Counter

In [None]:
# import data
data = pd.read_csv("../data/scored/hand_scored_raw.csv")

# keep only the variables needed below; .copy() makes the selection an
# independent dataframe, so assigning to one of its columns afterwards does
# not raise pandas' SettingWithCopyWarning
selected = ['ID', 'text', 'lbl']
data = data[selected].copy()

# parse string cells of lbl into Python objects with ast.literal_eval
# (expected to yield a list of dictionaries); non-string cells, e.g. missing
# values, are passed through unchanged
data['lbl'] = data['lbl'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

In [18]:
# define a list of labels as a reference
# NOTE: the emotion/thought label contains a literal backslash before the
# slash (presumably an escaped "/" carried over from the export -- confirm).
# The backslash is written as "\\" because a bare "\/" is an invalid escape
# sequence, which newer Python versions warn about; the string value itself
# is unchanged.
ref_labels = ["internal: event", "internal: time", "internal: place",
              "internal: perceptual", "internal: emotion\\/thought",
              "external"]

# defining a function to count the labels 
def count_labels(row, ref_labels):
    """Count how often each label occurs in one annotated text.

    Parameters
    ----------
    row : iterable of dict, None or NaN
        The annotations of one text. Every item must have a 'labels' entry
        holding an iterable of label names. None or NaN (e.g. an empty cell
        of the input file) is treated as "no annotations".
    ref_labels : iterable of str
        Labels that must always be present in the result; those that do not
        occur in ``row`` get a count of 0.

    Returns
    -------
    collections.Counter
        Mapping of label -> number of occurrences. Labels found in ``row``
        come first (in order of first appearance), followed by the
        remaining reference labels.
    """
    # NaN is the only float that is not equal to itself; without this guard a
    # missing annotation would raise "TypeError: 'float' object is not iterable"
    if row is None or (isinstance(row, float) and row != row):
        row = []

    counts = Counter(label for item in row for label in item['labels'])

    # make sure every reference label has an entry, even if it never occurred
    for label in ref_labels:
        counts.setdefault(label, 0)
    return counts

# count the labels of every text; ref_labels is handed to count_labels as its
# second positional argument
data['label_counts'] = data['lbl'].apply(count_labels, args=(ref_labels,))


In [19]:
# extract counts from label_counts as plain dictionaries
label_data = data['label_counts'].apply(lambda x: dict(x))

# turn the dictionaries into one column per label
label_df = pd.json_normalize(label_data)

# pd.concat(axis=1) pairs rows by index label, and json_normalize is not
# guaranteed to keep the index of its input; give label_df the index of data
# explicitly so the counts always end up next to the text they belong to
label_df.index = data.index

## add columns to dataframe
data = pd.concat([data, label_df], axis=1)

In [20]:
# derive two summary variables: the total number of internal details and the
# total number of external details per text

# select the label columns of each category by their name prefix
internal_columns = list(filter(lambda name: name.startswith('internal'), label_df.columns))
external_columns = list(filter(lambda name: name.startswith('external'), label_df.columns))

# row-wise totals over each group of columns, stored as new variables
data['internal_details'] = data.loc[:, internal_columns].sum(axis='columns')
data['external_details'] = data.loc[:, external_columns].sum(axis='columns')

In [21]:
# add a variable with a word count for each text
def _count_words(text):
    """Return the number of words (runs of word characters) in text.

    Missing values count as 0 words; without this check a missing text would
    be converted to the string "nan" and counted as one word.
    """
    if pd.isna(text):
        return 0
    return len(re.findall(r'\w+', str(text)))


data['total_words_ger'] = data['text'].apply(_count_words)

In [None]:
# save data

# remove the raw text and the intermediate annotation columns, keeping only
# the variables that go into the output file
dropped = ['text', 'lbl', 'label_counts']
data = data.drop(dropped, axis='columns')

# write the processed data without the dataframe index
data.to_csv(path_or_buf='../data/scored/hand_scored_processed.csv', index=False)

Unnamed: 0,ID,internal: time,internal: event,external,internal: perceptual,internal: place,internal: emotion\/thought,internal_details,external_details,total_words_ger
0,101,1,2,1,1,0,0,4,1,43
1,102,2,4,0,1,0,1,8,0,49
2,103,3,5,3,1,0,0,9,3,94
3,104,1,4,0,0,0,0,5,0,31
4,105,1,5,0,1,1,2,10,0,51
5,106,1,2,0,0,0,1,4,0,20
6,107,0,2,0,0,0,0,2,0,23
7,108,0,0,1,0,0,0,0,1,7
8,109,2,6,0,0,0,0,8,0,47
9,110,2,4,5,2,0,2,10,5,131
