Answer cleaning for Mistral results

In [1]:
import ast
import json
import re

import numpy as np
import pandas as pd

In [2]:
# Load the ANES 2020 time-series survey. low_memory=False makes pandas parse each
# column in a single pass, so every column gets one consistently inferred dtype.
anes = pd.read_csv('data/anes_timeseries_2020_csv_20220210.csv', low_memory=False)

# Load the question definitions stored next to the survey data.
filename = "data/anes_questions.json"
with open(filename, "r") as f:
    questions = json.load(f)

# V201336 has an option for a free-text answer (code 5), which is not useful for
# the analysis. It is recoded to a negative value so that filters keeping only
# positive codes leave it out.
# The result is assigned back to the column: calling replace(..., inplace=True) on
# `anes['V201336']` acts on an intermediate object and leaves `anes` unchanged
# when pandas copy-on-write is active.
anes['V201336'] = anes['V201336'].replace(5, -7)

# Maps each ANES variable code to a short topic label.
questcode_topic = {"V201336": "abortion1",
              "V201342x": "abortion2",
              "V202332": "climate change1",
              "V202333": "climate change2",
              "V202337": "gun control1",
              "V202341x": "gun control2",
              "V202344x": "gun control3",
              "V202232": "immigration1",
              "V202233": "immigration2",
              "V202236x":"immigration3",
              "V202380x": "healthcare1"}

  anes = pd.read_csv('data/anes_timeseries_2020_csv_20220210.csv')


In [10]:
def _read_json(path):
    """Return the parsed content of the JSON file at ``path``."""
    with open(path, 'r') as handle:
        return json.load(handle)


# One answer file per Mistral run; each is cleaned further down the notebook.
mistral_few_ans = _read_json('outputs/mistral_few_ans.json')
mistral_SE_few_ans = _read_json('outputs/mistralSE_few_ans.json')
mistral_zero_ans = _read_json('outputs/answers_mistral_zero.json')
mistral_few_strict = _read_json('outputs/mistral_few_strict.json')
mistral_SE_few_strict = _read_json('outputs/mistralSE_few_strict.json')

In [4]:
def clean_mistral(data, survey=None):
    """Extract the answer dictionaries contained in raw model responses.

    Every response is stripped of newlines and percent signs, lightly repaired
    (missing closing brace, dangling "." or ":" before the closing brace) and
    the first ``{...}`` span is parsed as a Python literal. Responses without
    such a span, responses that are not valid literals, and literals that are
    not dictionaries are skipped instead of aborting the whole run.

    Parameters
    ----------
    data : dict
        Maps a survey column name to a list of raw response strings.
    survey : pandas.DataFrame, optional
        Table whose column ``key`` provides the answer codes; only codes
        greater than 0 count as valid. Defaults to the notebook-level ``anes``
        frame, which keeps existing ``clean_mistral(data)`` calls working.

    Returns
    -------
    dict
        Maps each column name to the parsed dictionaries that contain exactly
        as many entries as the column has distinct valid codes.
    """
    if survey is None:
        survey = anes

    brace_pattern = re.compile(r'\{.*?\}')
    final_ans_dict = {}

    for key, value in data.items():
        answer_list = []
        for item in value:
            item = item.replace('\n', '').replace('%', '')
            # Add a closing brace if it is missing
            if item.count('{') > item.count('}'):
                item += "}"
            # Correct wrong number format
            item = item.replace('.}', '.0}').replace(':}', ':0}')
            # Only match the braces and their content, not any additional text
            match = brace_pattern.search(item)
            if match is None:
                # The response contains no dictionary at all
                continue
            try:
                # literal_eval accepts literals only, so text produced by the
                # model is never executed as code (unlike eval)
                final_ans = ast.literal_eval(match.group(0))
            except (ValueError, SyntaxError, TypeError):
                # Malformed or non-literal content: nothing usable to keep
                continue
            if isinstance(final_ans, dict):
                answer_list.append(final_ans)

        # Number of distinct valid (positive) answer codes in the survey column
        number_answers = survey.loc[survey[key] > 0, key].nunique()
        # Keep only the responses that cover exactly that many options
        final_ans_dict[key] = [ans for ans in answer_list if len(ans) == number_answers]

    return final_ans_dict
        
# Swap every raw answer set for its cleaned counterpart; the variable names are
# reused by the cells that follow.
(
    mistral_few_ans,
    mistral_SE_few_ans,
    mistral_zero_ans,
    mistral_few_strict,
    mistral_SE_few_strict,
) = [
    clean_mistral(raw_answers)
    for raw_answers in (
        mistral_few_ans,
        mistral_SE_few_ans,
        mistral_zero_ans,
        mistral_few_strict,
        mistral_SE_few_strict,
    )
]

In [5]:
def check_and_normalize(answer_object):
    """Rescale answers to sum to 100 and append the average answer per question.

    For every question, each answer whose values do not add up to 100 is
    rescaled in place to percentages rounded to one decimal. The average over
    all usable answers is then appended to that question's list as a dict keyed
    1..n. Answers whose values add up to 0 cannot be rescaled; they are left
    untouched and excluded from the average.

    Parameters
    ----------
    answer_object : dict
        Maps a question key to a list of answer dictionaries. The object is
        modified in place.

    Returns
    -------
    dict
        The same ``answer_object`` that was passed in.
    """
    for key, answers in answer_object.items():
        valid_answer = []
        for answer in answers:
            total = sum(answer.values())
            if total == 0:
                # Dividing by the total is impossible; skip this answer
                continue
            if total != 100:
                # Rescale so the shares add up to (approximately) 100
                answer.update(
                    {option: round(share * 100 / total, 1) for option, share in answer.items()}
                )
            # Each answer is counted exactly once so that all answers carry the
            # same weight in the average
            valid_answer.append(answer)

        if valid_answer:  # If there are valid answers
            # NOTE(review): values are averaged by position, which presumes every
            # answer lists its options in the same order and has the same length
            # - confirm against the upstream cleaning step.
            array_data = np.array([list(dic.values()) for dic in valid_answer])
            averages = np.round(np.mean(array_data, axis=0), 1)
            av_dict = dict(enumerate(averages.tolist(), 1))
            answers.append(av_dict)
        else:
            print("No valid answers found.")

    return answer_object


In [6]:
# Normalise every answer set and attach its per-question average.
(
    mistral_few_ans,
    mistral_SE_few_ans,
    mistral_zero_ans,
    mistral_few_strict,
    mistral_SE_few_strict,
) = [
    check_and_normalize(answer_set)
    for answer_set in (
        mistral_few_ans,
        mistral_SE_few_ans,
        mistral_zero_ans,
        mistral_few_strict,
        mistral_SE_few_strict,
    )
]

# save to JSON
cleaned_outputs = {
    'outputs/mistral_clean/mistral_few_ans.json': mistral_few_ans,
    'outputs/mistral_clean/mistral_SE_few_ans.json': mistral_SE_few_ans,
    'outputs/mistral_clean/mistral_zero_ans.json': mistral_zero_ans,
    'outputs/mistral_clean/mistral_few_strict.json': mistral_few_strict,
    'outputs/mistral_clean/mistral_SE_few_strict.json': mistral_SE_few_strict,
}
for target_path, payload in cleaned_outputs.items():
    with open(target_path, 'w') as f:
        json.dump(payload, f)

No valid answers found.
No valid answers found.
No valid answers found.


In [9]:
# Inspect the cleaned answers of the strict few-shot run. For every question with
# at least one valid answer, check_and_normalize appended the average as the
# last entry of the list.
mistral_few_strict

{'V201336': [{1: 10, 2: 15, 3: 25, 4: 50},
  {1: 25.0, 2: 25.0, 3: 25.0, 4: 25.0},
  {1: 25.0, 2: 25.0, 3: 25.0, 4: 25.0},
  {1: 25.0, 2: 25.0, 3: 25.0, 4: 25.0},
  {1: 10.0, 2: 11.0, 3: 12.9, 4: 66.2},
  {1: 20.6, 2: 21.1, 3: 22.2, 4: 36.2}],
 'V201342x': [],
 'V202332': [{'1': 20.0, '2': 20.0, '3': 20.0, '4': 20.0, '5': 20.0},
  {'1': 20.0, '2': 20.0, '3': 20.0, '4': 20.0, '5': 20.0},
  {'1': 20.0, '2': 20.0, '3': 20.0, '4': 20.0, '5': 20.0},
  {'1': 20.0, '2': 20.0, '3': 20.0, '4': 20.0, '5': 20.0},
  {1: 20.0, 2: 20.0, 3: 20.0, 4: 20.0, 5: 20.0},
  {1: 20.0, 2: 20.0, 3: 20.0, 4: 20.0, 5: 20.0}],
 'V202333': [{'1': 11.9, '2': 13.1, '3': 17.9, '4': 25.0, '5': 32.1},
  {'1': 20.0, '2': 20.0, '3': 20.0, '4': 20.0, '5': 20.0},
  {'1': 20.0, '2': 20.0, '3': 20.0, '4': 20.0, '5': 20.0},
  {'1': 20.0, '2': 20.0, '3': 20.0, '4': 20.0, '5': 20.0},
  {'1': 20.0, '2': 20.0, '3': 20.0, '4': 20.0, '5': 20.0},
  {1: 18.4, 2: 18.6, 3: 19.6, 4: 21.0, 5: 22.4}],
 'V202337': [{'1': 12.0, '2': 12.0, '