## Import packages and data

In [None]:
import json
import yaml

import pandas as pd

In [None]:
# Show every column when a dataframe is displayed: None removes the column limit.
pd.set_option('display.max_columns', None)


In [None]:
# Load the yaml of questions from the Brexit checker, downloaded from
# https://github.com/alphagov/finder-frontend/blob/master/app/lib/brexit_checker/questions.yaml
# The encoding is passed explicitly: the file is expected to be UTF-8, and relying on
# the platform default encoding could garble non-ASCII characters in the question text.
with open('questions.yaml', encoding='utf-8') as file:
    # safe_load is shorthand for yaml.load(..., Loader=yaml.SafeLoader)
    questions_list = yaml.safe_load(file)

In [None]:
# The checker has four question types at the moment:
# 'multiple', 'multiple_grouped', 'single', 'single_wrapped'
{question_definition['type'] for question_definition in questions_list['questions']}

In [None]:
# Read a sample of accounts data (the answers given to the checker) into a dataframe;
# timestamps and account IDs have been removed from this extract.
answers_sample = pd.read_csv('answers_sample.csv')

In [None]:
# display the sample of checker answers as loaded
answers_sample

#### Work on questions with a single answer

In [None]:
def collapse_single_answer_q(row_of_accounts_data, question_option_values):
    """
    find which answer was chosen for a single-answer question
    the accounts data has one column per possible answer; each of those columns is looked up
    in the row, in order, and the name of the first one whose value equals True is returned
    row_of_accounts_data: a row of accounts data (from a pandas dataframe), indexable by column name
    question_option_values: for a single answer question, the column names the answer can take
    returns None when none of the answer columns is True for this row
    """
    chosen_answers = (
        answer_column
        for answer_column in question_option_values
        # compare with == so that NaN / None / False cells do not count as chosen
        if row_of_accounts_data[answer_column] == True
    )
    return next(chosen_answers, None)

In [None]:
def collapse_all_single_answer_questions(single_answer_questions_list, accounts_df, new_accounts_columns_dict):
    """
    take a dataframe of checker responses, for questions that only want one answer, get all the potential answer columns,
    bring the answers into one column (named after the question key) for using later on,
    and drop the one-hot encoding style columns
    single_answer_questions_list: list of the dictionaries describing each question that takes a single answer
        (each needs 'key', 'text' and 'options'; every option needs 'value' and 'label')
    accounts_df: pandas dataframe containing accounts data - the responses to the brexit checker questions;
        it is not modified, a new dataframe is returned
    new_accounts_columns_dict: dict describing the output columns; updated in place with one entry per question,
        question key -> [key, text, 'single answer', option labels, option column names]
    returns the tuple (new dataframe, new_accounts_columns_dict)
    """
    # work on a copy: assigning the collapsed column below would otherwise
    # add that column to the dataframe the caller passed in
    accounts_df = accounts_df.copy()
    for question in single_answer_questions_list:
        question_key = question['key']
        # column names use underscores where the yaml option values use hyphens
        question_option_values = [option['value'].replace('-', '_') for option in question['options']]
        accounts_df[question_key] = accounts_df.apply(
            collapse_single_answer_q, axis=1, args=(question_option_values,))
        # drop those one-hot style columns for a narrower DF, could remove this if you want to keep them though
        accounts_df = accounts_df.drop(question_option_values, axis=1)
        new_accounts_columns_dict[question_key] = [
            question_key,
            question['text'],
            'single answer',
            [option['label'] for option in question['options']],
            question_option_values,
        ]
    return accounts_df, new_accounts_columns_dict

#### Look at single-wrapped questions

In [None]:
def collapse_all_single_wrapped_answer_questions(single_wrapped_questions_list, accounts_df, new_accounts_columns_dict):
    """
    take a dataframe of checker responses, for questions that only want one answer, but then multiple follow ups,
    get all the potential answer columns,
    bring the mutually exclusive top-level answers into one column (named after the question key) for using later on,
    drop the one-hot encoding style columns for the top-level answers,
    and rename the follow-up answer columns to '<question key>--<column name>'
    single_wrapped_questions_list: list of the dictionaries describing each question that takes a single answer with multiple follow-ups
    accounts_df: pandas dataframe containing accounts data - the responses to the brexit checker questions;
        it is not modified, a new dataframe is returned
    new_accounts_columns_dict: dict describing the output columns; updated in place with one entry per
        follow-up column and one entry per question
    returns the tuple (new dataframe, new_accounts_columns_dict)
    raises KeyError if a follow-up answer column to rename is missing from the dataframe
    """
    # work on a copy: assigning the collapsed column below would otherwise
    # add that column to the dataframe the caller passed in
    accounts_df = accounts_df.copy()
    column_renaming_mapper = {}
    for question in single_wrapped_questions_list:
        question_key = question['key']
        question_option_values = []
        for option in question['options']:
            # column names use underscores where the yaml option values use hyphens
            question_option_values.append(option['value'].replace('-', '_'))
            if 'options' not in option:
                # this top-level answer has no follow-up choices
                continue
            for choice in option['options']:
                old_column_name = choice['value'].replace('-', '_')
                new_column_name = f"{question_key}--{old_column_name}"
                column_renaming_mapper[old_column_name] = new_column_name
                new_accounts_columns_dict[new_column_name] = [
                    question_key,
                    question['text'],
                    'single wrapped - further options',
                    choice['label'],
                    ['True', 'NaN'],
                ]
        accounts_df[question_key] = accounts_df.apply(
            collapse_single_answer_q, axis=1, args=(question_option_values,))
        # drop those one-hot style columns for a narrower DF, could remove this if you want to keep them though
        accounts_df = accounts_df.drop(question_option_values, axis=1)
        new_accounts_columns_dict[question_key] = [
            question_key,
            question['text'],
            'single wrapped - top level',
            [option['label'] for option in question['options']],
            question_option_values,
        ]
    # errors="raise" makes a missing follow-up column fail loudly instead of being skipped
    accounts_df = accounts_df.rename(columns=column_renaming_mapper, errors="raise")
    return accounts_df, new_accounts_columns_dict

#### Rename multi-answer columns to add reference to the question asked?

In [None]:
def rename_multiple_answer_questions(multiple_questions_list, accounts_df, new_accounts_columns_dict):
    """
    for questions that can have multiple answers, prefix each answer column of the checker responses
    with the key of the question it belongs to, giving '<question key>--<column name>'
    multiple_questions_list: list of the dictionaries describing each question that takes multiple answers
    accounts_df: pandas dataframe containing accounts data - the responses to the brexit checker questions
    new_accounts_columns_dict: dict describing the output columns; gains one entry per renamed column
    returns the renamed dataframe and new_accounts_columns_dict
    """
    old_to_new_names = {}
    for question in multiple_questions_list:
        prefix = question['key']
        for choice in question['options']:
            # column names use underscores where the yaml option values use hyphens
            old_name = choice['value'].replace('-', '_')
            prefixed_name = f"{prefix}--{old_name}"
            old_to_new_names[old_name] = prefixed_name
            new_accounts_columns_dict[prefixed_name] = [
                question['key'],
                question['text'],
                'multiple',
                choice['label'],
                ['True', 'NaN'],
            ]
    # errors="raise": an answer column missing from the dataframe is an error
    renamed_df = accounts_df.rename(columns=old_to_new_names, errors="raise")
    return renamed_df, new_accounts_columns_dict

#### Rename multiple grouped answer columns to add reference to the question asked?

In [None]:
def rename_multiple_grouped_answer_questions(multiple_grouped_questions_list, accounts_df, new_accounts_columns_dict):
    """
    for questions that can have multiple (grouped) answers, prefix each answer column of the checker responses
    with the key of the question it belongs to, giving '<question key>--<column name>'
    the answers sit one level down, in the 'options' of each group; groups without 'options' are skipped
    multiple_grouped_questions_list: list of the dictionaries describing each question that takes multiple grouped answers
    accounts_df: pandas dataframe containing accounts data - the responses to the brexit checker questions
    new_accounts_columns_dict: dict describing the output columns; gains one entry per renamed column
    returns the renamed dataframe and new_accounts_columns_dict
    """
    old_to_new_names = {}
    for question in multiple_grouped_questions_list:
        prefix = question['key']
        for group in question['options']:
            if 'options' not in group:
                continue
            for choice in group['options']:
                # column names use underscores where the yaml option values use hyphens
                old_name = choice['value'].replace('-', '_')
                prefixed_name = f"{prefix}--{old_name}"
                old_to_new_names[old_name] = prefixed_name
                new_accounts_columns_dict[prefixed_name] = [
                    question['key'],
                    question['text'],
                    'multiple grouped',
                    choice['label'],
                    ['True', 'NaN'],
                ]

    # errors="raise": an answer column missing from the dataframe is an error
    renamed_df = accounts_df.rename(columns=old_to_new_names, errors="raise")
    return renamed_df, new_accounts_columns_dict


In [None]:
def neaten_accounts_df(questions_list, accounts_df):
    """
    tidy a dataframe of checker responses using the question definitions:
    single and single-wrapped questions have their answer columns collapsed into one column per question,
    multiple and multiple-grouped questions have their answer columns prefixed with the question key
    questions_list: the parsed questions yaml - a dict whose 'questions' entry is a list of question dictionaries,
        each with a 'type'
    accounts_df: pandas dataframe containing accounts data - the responses to the brexit checker questions
    returns the tidied dataframe and a dict describing each of its question columns
    raises ValueError if any question has a type other than
        'multiple', 'multiple_grouped', 'single' or 'single_wrapped'
    """
    # check we only have expected question types: anything found in the
    # questions that is not one of the known types cannot be processed
    known_question_types = {'multiple', 'multiple_grouped', 'single', 'single_wrapped'}
    found_question_types = {question['type'] for question in questions_list['questions']}
    unknown_question_types = found_question_types - known_question_types
    if unknown_question_types:
        raise ValueError(
            'unknown question types in questions_list that we cannot process: '
            f'{sorted(unknown_question_types, key=str)}')

    new_accounts_columns_dict = {}

    # one tidy-up step per question type, run in this fixed order;
    # each step takes the dataframe produced by the previous one
    processing_steps = (
        ('single', collapse_all_single_answer_questions),
        ('single_wrapped', collapse_all_single_wrapped_answer_questions),
        ('multiple', rename_multiple_answer_questions),
        ('multiple_grouped', rename_multiple_grouped_answer_questions),
    )
    for question_type, process_questions in processing_steps:
        questions_of_type = [
            question for question in questions_list['questions'] if question['type'] == question_type]
        accounts_df, new_accounts_columns_dict = process_questions(
            questions_of_type, accounts_df, new_accounts_columns_dict)
    return accounts_df, new_accounts_columns_dict

In [None]:
# Tidy the sample answers: returns the neatened dataframe and a dict describing
# each of its question columns (turned into a schema table below).
neater_accounts_sample, new_schema = neaten_accounts_df(questions_list, answers_sample)

In [None]:
# display the tidied dataframe
neater_accounts_sample

In [None]:
# Turn the schema dict into a table: one row per dict key (the key becomes the index),
# with one named column per element of the list stored against that key.
new_schema_df = pd.DataFrame.from_dict(new_schema, orient='index', 
                       columns=['question-key', 'question-text', 'question type', 'options', 'values'])

In [None]:
# save the schema table to new_schema.csv (relative to the working directory)
new_schema_df.to_csv('new_schema.csv')