# Preparing the data for the R analysis

In this notebook, we merge the human subjects' regressions / number of fixations with each model's revisions / number of edits.

In [1]:
import os
import pandas as pd

In [2]:
# Strings that read_csv should treat as missing values when it is called with
# keep_default_na=False. The literal "null" is intentionally absent so that a
# token spelled "null" is kept as text instead of being read as NaN.
NA_VALUES = [
    "",
    "#N/A",
    "#N/A N/A",
    "#NA",
    "-1.#IND",
    "-1.#QNAN",
    "-NaN",
    "-nan",
    "1.#IND",
    "1.#QNAN",
    "<NA>",
    "N/A",
    "NA",
    "NULL",
    "NaN",
    "None",
    "n/a",
    "nan",
]

In [3]:
# Corpora handled by this notebook, written as '<corpus>_<language>'.
corpora = ['rastros_ptbr', 'potec_de', 'provo_en', 'nicenboim_es', 'mecol1_du', 'mecol2_enl2']

# (corpus, language) -> {measure name -> dataframe indexed by 'Identifier'}
human_data = {tuple(corpus_name.split('_')): {} for corpus_name in corpora}

# Directory scan adapted from https://stackoverflow.com/a/56469905
with os.scandir('preprocessed/human_data/') as directory:
    for entry in directory:
        # only regular .tsv files are of interest
        if not (entry.name.endswith('.tsv') and entry.is_file()):
            continue
        # file names are split into three '_'-separated parts; the last one carries the extension
        corpus, language, measure_file = entry.name.split('_')
        frame = (
            pd.read_csv(
                entry.path,
                sep='\t',
                # pandas would interpret the token "null" as NaN by default, so the
                # default list is disabled and replaced by a copy without "null"
                keep_default_na=False,
                na_values=NA_VALUES,
            )
            .drop(columns='Unnamed: 0')
            .set_index('Identifier')
            .sort_index(axis=0)
        )
        human_data[(corpus, language)][measure_file.removesuffix('.tsv')] = frame

In [4]:
# (corpus, language) -> {'revisions' | 'edits' -> {model name -> dataframe indexed by 'Identifier'}}
model_data = {tuple(name.split('_')): {'revisions': {}, 'edits': {}} for name in corpora}

# from https://stackoverflow.com/a/56469905
with os.scandir('preprocessed/model_data/') as directory:
    # os.scandir yields entries in arbitrary order; sorting by file name makes the
    # insertion order of the models (and the downstream column/print order) reproducible.
    for entry in sorted(directory, key=lambda item: item.name):
        if not (entry.name.endswith('.tsv') and entry.is_file()):
            continue
        # file names are split into four '_'-separated parts; the extension is
        # stripped first, so no part carries '.tsv'
        name, language, model, measure = entry.name.removesuffix('.tsv').split('_')
        df = pd.read_csv(entry.path, sep='\t',
                         keep_default_na=False,
                         na_values=NA_VALUES)
        # the first (unnamed) column holds the identifiers; use it as a sorted index
        df = df.set_index('Unnamed: 0').sort_index(axis=0)
        df.index.name = 'Identifier'
        # tag every revision/edit column with the model name so that the columns
        # of different models stay distinguishable once they are merged
        tagged_columns = {column: f'{column}_{model}'
                          for column in df.columns
                          if 'revision' in column or 'edit' in column}
        model_data[(name, language)][measure][model] = df.rename(columns=tagged_columns)

## Revisions and regressions

In [5]:
# Key of the human measure looked up in human_data[(corpus, language)] in this section
MEASURE = 'first-pass-regression-out'
# Key of the model measure ('revisions' or 'edits') looked up in model_data[(corpus, language)]
MODEL_MEASURE = 'revisions'

In [6]:
# Sanity checks: every human and model dataframe of a corpus must share the
# identifiers of the reference human measure, contain no missing tokens, and
# agree on the tokens (up to a few known, deliberate differences).

# Number of tokens that are known to differ between model and human data:
#   rastros_ptbr: we changed the space encoding and a quotation mark in 3 tokens
#   potec_de:     we changed the ; in 2 tokens
expected_token_mismatches = {'rastros_ptbr': 3, 'potec_de': 2}

for corpus_name in corpora:
    key = tuple(corpus_name.split('_'))
    if MEASURE not in human_data[key]:
        # nothing to compare against; the merge step skips such corpora as well
        continue
    ref_index = human_data[key][MEASURE].index
    ref_tokens = human_data[key][MEASURE]['Token']

    for measure_name, dataframe in human_data[key].items():
        # the length is compared first so that a size mismatch fails the assertion
        # instead of raising from the element-wise comparison
        assert len(dataframe.index) == len(ref_index) and (dataframe.index == ref_index).all(), \
            f'{corpus_name}/{measure_name}: identifiers differ from {MEASURE}'
        assert (dataframe['Token'] == ref_tokens).all(), \
            f'{corpus_name}/{measure_name}: tokens differ from {MEASURE}'
        assert dataframe['Token'].isna().sum() == 0, \
            f'{corpus_name}/{measure_name}: missing tokens'

    n_expected = expected_token_mismatches.get(corpus_name, 0)
    for model, dataframe in model_data[key][MODEL_MEASURE].items():
        assert len(dataframe.index) == len(ref_index) and (dataframe.index == ref_index).all(), \
            f'{corpus_name}/{model}: identifiers differ from {MEASURE}'
        assert dataframe['Token'].isna().sum() == 0, \
            f'{corpus_name}/{model}: missing tokens'
        n_mismatches = int((dataframe['Token'] != ref_tokens).sum())
        assert n_mismatches == n_expected, \
            f'{corpus_name}/{model}: {n_mismatches} token mismatches, expected {n_expected}'

In [7]:
# corpus name -> human MEASURE dataframe extended with the revision columns of every model
merged = {}

for corpus_name in corpora:
    corpus, lang = corpus_name.split('_')
    human_measures = human_data[(corpus, lang)]
    # corpora without the selected human measure are left out
    if MEASURE in human_measures:
        aux = human_measures[MEASURE]
        model_frames = model_data[(corpus, lang)][MODEL_MEASURE]

        for model, data in model_frames.items():
            # only the revision-related columns of the model are attached
            revision_columns = [label for label in data.columns if 'revision' in label]
            aux = aux.merge(
                data[revision_columns],
                on='Identifier',
                suffixes=("", '_' + model),
            )

        merged[corpus] = aux

In [9]:
# Export one CSV per corpus and per plain revision column (i.e. per model+task),
# pairing the model's revisions with the regressions of every human subject.
output_dir = "preprocessed/models-humans/revisions-regressions"
# to_csv does not create missing directories, so make sure the target exists
os.makedirs(output_dir, exist_ok=True)

for corpus, data in merged.items():
    # plain revisions columns
    model_cols = [c for c in data.columns if 'Subj' not in c and 'effective' not in c and 'convenient' not in c and 'Token' not in c]
    # other columns
    default_cols = [c for c in data.columns if 'revision' not in c]
    # we will generate one dataframe for each column (i.e., each model+task)
    for col in model_cols:
        aux_data = data[[c for c in data.columns if c in default_cols or c == col]].copy()
        # keep only subject identifier (the part after ':' in the column name)
        aux_data = aux_data.rename(columns={c: c.split(':')[1] for c in aux_data.columns if c not in (col, 'Token')})
        # standard name for the dependent variable
        aux_data = aux_data.rename(columns={col: 'revision'})
        # the model name never contains '_', so split on the last one only;
        # this also supports task names that contain underscores
        task, model = col.replace('revision:', '').rsplit('_', 1)
        if 'large' in task or 'chunk' in task:
            task = task.split('-')[0]
        aux_data = aux_data.reset_index()

        # put dataframe into R-friendly format (one human observation per row, with the dependent variable repeated for all subjects)
        aux_data = aux_data.melt(id_vars=['Identifier', 'Token', 'revision'], value_name='regression', var_name='subjectid')
        # split identifier into two columns with text id and token position
        aux_data[['n1', 'textid', 'n2', 'token_position']] = aux_data['Identifier'].str.split('_', expand=True)
        aux_data = aux_data.drop(columns=['n1', 'n2'])
        print(corpus, model, task)
        aux_data.to_csv(f"{output_dir}/{corpus}_{model}_{task}.csv")

rastros stanza-bilstm upos
rastros stanza-bilstm deprel
rastros stanza-bilstm head
rastros hf-trf pos
rastros hf-trf deprel
rastros hf-trf head
potec hf-trf pos
potec hf-trf deprel
potec hf-trf head
potec stanza-bilstm upos
potec stanza-bilstm xpos
potec stanza-bilstm ner
potec stanza-bilstm deprel
potec stanza-bilstm head
provo stanza-bilstm upos
provo stanza-bilstm xpos
provo stanza-bilstm ner
provo stanza-bilstm deprel
provo stanza-bilstm head
provo hf-trf pos
provo hf-trf deprel
provo hf-trf head
nicenboim hf-trf pos
nicenboim hf-trf deprel
nicenboim hf-trf head
nicenboim stanza-bilstm upos
nicenboim stanza-bilstm xpos
nicenboim stanza-bilstm ner
nicenboim stanza-bilstm deprel
nicenboim stanza-bilstm head
mecol1 stanza-bilstm upos
mecol1 stanza-bilstm xpos
mecol1 stanza-bilstm ner
mecol1 stanza-bilstm deprel
mecol1 stanza-bilstm head
mecol1 hf-trf pos
mecol1 hf-trf deprel
mecol1 hf-trf head
mecol2 hf-trf pos
mecol2 hf-trf deprel
mecol2 hf-trf head
mecol2 stanza-bilstm upos
mecol2 s

## Effective revisions and regressions

In [10]:
# Export one CSV per corpus and per effective-revision column (i.e. per model+task),
# pairing the model's effective revisions with the regressions of every human subject.
output_dir = "preprocessed/models-humans/effective_revisions-regressions"
# to_csv does not create missing directories, so make sure the target exists
os.makedirs(output_dir, exist_ok=True)

for corpus, data in merged.items():
    # effective revisions columns
    model_cols = [c for c in data.columns if 'effective' in c]
    # other columns
    default_cols = [c for c in data.columns if 'revision' not in c]
    # one dataframe is generated for each column (i.e., each model+task)
    for col in model_cols:
        aux_data = data[[c for c in data.columns if c in default_cols or c == col]].copy()
        # keep only subject identifier (the part after ':' in the column name)
        aux_data = aux_data.rename(columns={c: c.split(':')[1] for c in aux_data.columns if c not in (col, 'Token')})
        # standard name for the dependent variable
        aux_data = aux_data.rename(columns={col: 'revision'})
        # the model name never contains '_', so split on the last one only;
        # this also supports task names that contain underscores
        task, model = col.replace('effective-revision:', '').rsplit('_', 1)
        if 'large' in task or 'chunk' in task:
            task = task.split('-')[0]
        aux_data = aux_data.reset_index()

        # R-friendly format: one human observation per row, with the dependent variable repeated for all subjects
        aux_data = aux_data.melt(id_vars=['Identifier', 'Token', 'revision'], value_name='regression', var_name='subjectid')

        # split identifier into two columns with text id and token position
        aux_data[['n1', 'textid', 'n2', 'token_position']] = aux_data['Identifier'].str.split('_', expand=True)
        aux_data = aux_data.drop(columns=['n1', 'n2'])
        # Missing values are written as 0. Assign the result back: calling
        # fillna(..., inplace=True) on the selected column is a chained in-place
        # update that leaves aux_data unchanged under pandas Copy-on-Write.
        aux_data['revision'] = aux_data['revision'].fillna(0)
        print(corpus, model, task)
        aux_data.to_csv(f"{output_dir}/{corpus}_{model}_{task}.csv")

rastros stanza-bilstm upos
rastros stanza-bilstm deprel
rastros stanza-bilstm head
rastros hf-trf pos
rastros hf-trf deprel
rastros hf-trf head
potec hf-trf pos
potec hf-trf deprel
potec hf-trf head
potec stanza-bilstm upos
potec stanza-bilstm xpos
potec stanza-bilstm ner
potec stanza-bilstm deprel
potec stanza-bilstm head
provo stanza-bilstm upos
provo stanza-bilstm xpos
provo stanza-bilstm ner
provo stanza-bilstm deprel
provo stanza-bilstm head
provo hf-trf pos
provo hf-trf deprel
provo hf-trf head
nicenboim hf-trf pos
nicenboim hf-trf deprel
nicenboim hf-trf head
nicenboim stanza-bilstm upos
nicenboim stanza-bilstm xpos
nicenboim stanza-bilstm ner
nicenboim stanza-bilstm deprel
nicenboim stanza-bilstm head
mecol1 stanza-bilstm upos
mecol1 stanza-bilstm xpos
mecol1 stanza-bilstm ner
mecol1 stanza-bilstm deprel
mecol1 stanza-bilstm head
mecol1 hf-trf pos
mecol1 hf-trf deprel
mecol1 hf-trf head
mecol2 hf-trf pos
mecol2 hf-trf deprel
mecol2 hf-trf head
mecol2 stanza-bilstm upos
mecol2 s