## Translate agents' text-based responses to Likert-scale scores.

In [None]:
import pandas as pd

# Select the population to process by leaving exactly one of the labels
# below uncommented:
# population_label = 'popc'
population_label = 'popp'

# The responses are stored as six numbered CSV parts (1..6). Read each part,
# using its first column as the index, then stack the parts row-wise.
parts = [
    pd.read_csv(f'{population_label}_responses_{i+1}.csv', index_col=0)
    for i in range(6)
]
df_responses = pd.concat(parts)

# Preview the first two rows.
df_responses[:2]

In [None]:
# The combined responses file is too big for GitHub, so it can be split into
# six smaller CSV parts. Set the flag to True to (re)generate those parts.
break_file_up = False
if break_file_up:
    # Parts 1-5 receive 50 rows each; part 6 receives every remaining row.
    for part_no in range(1, 7):
        start = (part_no - 1) * 50
        stop = start + 50 if part_no < 6 else None
        df_responses[start:stop].to_csv(f'{population_label}_responses_{part_no}.csv')

    # Read the parts back in and write the re-assembled frame to a separate
    # TEST file.
    parts = [
        pd.read_csv(f'{population_label}_responses_{i+1}.csv', index_col=0)
        for i in range(6)
    ]
    combined = pd.concat(parts)
    combined.to_csv(f'{population_label}_responses_TEST.csv')

In [None]:
def check_responses(df, specific_adjective="niggardly"):
    """Count the cells of ``df`` that hold the ``'[error]'`` marker.

    Each cell equal to the string ``'[error]'`` is reported with a printed
    line naming its row label (agent) and column label (adjective).

    Cells are visited by position instead of being looked up by label, so a
    frame with repeated index or column labels is handled. A label lookup
    such as ``df.loc[agent, adj]`` returns a Series for a repeated label,
    and the truth value of its comparison is ambiguous.

    Args:
        df: DataFrame of raw responses; rows are treated as agents and
            columns as adjectives.
        specific_adjective: Column label whose errors are additionally
            counted on their own. Defaults to ``"niggardly"``.

    Returns:
        A tuple ``(error_count, specific_error_count)``: the number of
        ``'[error]'`` cells in the whole frame, and the number of those
        found in the ``specific_adjective`` column.
    """
    error_count = 0
    specific_error_count = 0

    # Row-major traversal keeps the printed report in the same order as a
    # nested loop over index labels and then column labels.
    for agent, row in df.iterrows():
        for adj, answer in row.items():
            if answer == '[error]':
                print(f"Error: {agent}, found '[error]' in results for {adj}.")
                error_count += 1
                if adj == specific_adjective:
                    specific_error_count += 1

    return error_count, specific_error_count

# Scan the loaded responses and report every '[error]' cell.
check_responses(df_responses)

In [None]:
# Tally, per adjective, how many responses are the '[error]' marker.
errors = {}

for adj in df_responses.columns:
    values = df_responses[adj].value_counts()
    # value_counts() is indexed by the distinct cell values, so membership
    # tells whether the marker occurred in this column at all.
    if '[error]' in values:
        errors[adj] = values['[error]']

# List only the adjectives that failed more than once. A plain loop is used
# because printing is a side effect, not a collection being built.
for adj, count in errors.items():
    if count > 1:
        print(f"'{adj}' {count}")

In [None]:
import re

# The 9-point accuracy scale, ordered from the lowest score (1) to the
# highest (9).
expected_answers = ['Extremely Inaccurate',
                    'Very Inaccurate',
                    'Moderately Inaccurate',
                    'Slightly Inaccurate',
                    'Neither Accurate Nor Inaccurate',
                    'Slightly Accurate',
                    'Moderately Accurate',
                    'Very Accurate',
                    'Extremely Accurate']

# Built once instead of on every call: a case-insensitive alternation of the
# scale labels, and the lower-cased label -> score lookup derived from the
# position of each label in the scale.
_ANSWER_PATTERN = re.compile(
    '|'.join(re.escape(level) for level in expected_answers),
    re.IGNORECASE,
)
_SCORE_BY_ANSWER = {
    level.lower(): score
    for score, level in enumerate(expected_answers, start=1)
}


def match_accuracy(text):
    """Find the first scale label that occurs anywhere in ``text``.

    Matching is case-insensitive and the leftmost occurrence wins.

    Args:
        text: The free-text response to search. Anything that is not a
            ``str`` (for example the NaN pandas yields for an empty CSV
            cell) is treated as containing no label instead of raising
            ``TypeError`` inside the regex search.

    Returns:
        The matched label in lower case, or ``None`` when no label is found.
    """
    if not isinstance(text, str):
        return None
    match = _ANSWER_PATTERN.search(text)
    return match.group().lower() if match else None


def translate_response(answer, error_value=5):
    """Translate a free-text response into its 1-9 Likert score.

    Args:
        answer: The response text; see ``match_accuracy`` for how the scale
            label is located in it.
        error_value: Score returned when no scale label can be found
            (including non-string input). Defaults to 5, the score of
            'Neither Accurate Nor Inaccurate'.

    Returns:
        An int from 1 ('Extremely Inaccurate') to 9 ('Extremely Accurate'),
        or ``error_value`` when the response holds no recognisable label.
    """
    return _SCORE_BY_ANSWER.get(match_accuracy(answer), error_value)

# Convert every response cell to a numeric score with translate_response.
# NOTE: DataFrame.map applies a function element-wise and was added in
# pandas 2.1; earlier releases expose the same operation as
# DataFrame.applymap.
df_scores = df_responses.map(translate_response)
df_scores
    

In [None]:
# Save the numeric scores to '<population_label>_results.csv'.
df_scores.to_csv(f'{population_label}_results.csv')

In [None]:
def ipsatise_scores(df, excluded_adjectives=('niggardly',), midpoint=5):
    """Ipsatise a frame of Likert scores (one row per agent, one column per adjective).

    Steps:
      1. Remove the ``excluded_adjectives`` columns and every column whose
         standard deviation across agents is 0.
      2. Split the remaining adjectives by their mean score across all
         agents: "negative" when the mean is below ``midpoint``, "positive"
         when it is at or above it.
      3. For each agent, take the mean of their positive-adjective scores
         and the mean of their negative-adjective scores, and average the
         two. If one group is empty its mean is NaN and is skipped by the
         averaging, leaving the other group's mean.
      4. Subtract that per-agent value from each of the agent's scores and
         divide by the agent's standard deviation across all remaining
         adjectives.

    Note:
        ``df`` is modified in place: the columns removed in step 1 are also
        removed from the caller's frame. An agent whose remaining scores are
        all identical has a standard deviation of 0, so their row divides by
        zero.

    Args:
        df: DataFrame of numeric scores with string column labels.
        excluded_adjectives: Column labels to remove up front. Labels that
            are not present are ignored, so the function can be run again on
            a frame it has already reduced. Defaults to ``('niggardly',)``.
        midpoint: Threshold separating negative from positive adjectives.
            Defaults to 5.

    Returns:
        A DataFrame of ipsatised scores with the same index as ``df`` and
        the columns that remain after step 1.
    """
    # drop 'niggardly' due to high content filter hit rate. errors='ignore'
    # avoids a KeyError when the column is absent or was already removed.
    df.drop(columns=list(excluded_adjectives), errors='ignore', inplace=True)

    # drop any other adjective that has stdev of 0.
    to_drop = df.columns[df.std() == 0]
    print(f'Dropping {len(to_drop)} adjectives (stdev==0): '+', '.join(to_drop))
    df.drop(columns=to_drop, inplace=True)
    print(f'shape = {df.shape}')

    # ipsatise, as follows:
    df_mean = df.mean()

    # (a) his/her mean self-ratings across adjectives where
    # the mean self-rating score (of everyone's response)
    # is less than the midpoint
    df_negs = df[df_mean[df_mean < midpoint].index]
    print(f"df_negs shape = {df_negs.shape}")

    # (b) his/her mean self-ratings across adjectives where
    # the mean self-rating score (of everyone's response)
    # is *greater than or equal to* the midpoint
    df_pos = df[df_mean[df_mean >= midpoint].index]
    print(f"df_pos shape = {df_pos.shape}")

    df_self_rating_means = pd.DataFrame({'positive': df_pos.mean(axis=1), 'negative': df_negs.mean(axis=1)})
    df_self_rating_means['average'] = df_self_rating_means.mean(axis=1)
    print(f"df_self_rating_means shape = {df_self_rating_means.shape}")

    # now make the ipsatised data frame: for each adjective, subtract the
    # agent's balanced mean (the average of their positive and negative
    # means computed above) from their score on that adjective, then divide
    # the difference by the agent's standard deviation of scores across all
    # remaining adjectives.
    adjusted_mean = df_self_rating_means['average']

    stdevs = df.std(axis=1)
    ipsatized_data = (df.sub(adjusted_mean, axis=0)).div(stdevs, axis=0)
    print(f"ipsatized_data shape = {ipsatized_data.shape}")
    return ipsatized_data

# Ipsatise the scores and save them. NOTE: ipsatise_scores deletes columns
# from the frame it is given, so df_scores itself loses those columns here.
df_ipsatised = ipsatise_scores(df_scores)
df_ipsatised.to_csv(f'{population_label}_ipsatised_results.csv')