## Import packages

In [1]:
import pandas as pd
from IPython.display import display, HTML

import numpy as np
import math
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
%matplotlib inline

## Multiple answer questions

In [3]:
# Function to generate the results from a multiple answer question.
# Produces the count (N) for each answer and the percentage (%) of the total
# sample size, displays the table and hands it to the grapher function.
#
# Inputs:
#   df - (pd.DataFrame) the cleaned dataframe
#   q - (str) the question string; the column name of the first answer option
#   n_options - (int) number of options the question has, i.e. how many
#       consecutive columns (starting at q) the answers span across
#   graph_type - (callable) grapher function, called as
#       graph_type(results, [percentage column name], title, file_location)
#   file_location - (str) the folder location passed to the grapher function
#
# Outputs:
#   No outputs, all generated results are displayed via print()/display()
#
# Raises:
#   ValueError - if n_options is smaller than 1

def multiple_answer_userzoom(df, q, n_options, graph_type, file_location):
    if n_options < 1:
        raise ValueError('n_options must be at least 1, got ' + str(n_options))

    n_col = q + ' (N)'
    pct_col = q + ' (%)'

    # Sample size is the number of rows, taken before the answers are split up.
    sample_size = len(df[q])

    # Position of the first option column; the remaining options are read
    # from the columns immediately to its right.
    first_col = df.columns.get_loc(q)

    # Select by position with .iloc. Indexing with df[[number]] looks the
    # number up as a column *label*, which fails when columns are named by
    # question text.
    counts = pd.concat(
        [df.iloc[:, first_col + offset].value_counts() for offset in range(n_options)]
    )

    # One row per distinct answer: raw count and share of the sample size
    all_results = pd.concat([counts, counts / sample_size], axis=1)
    all_results.columns = [n_col, pct_col]

    # print sample size and total number of answers received
    print('SAMPLE SIZE: ' + str(sample_size))
    total_answers = all_results[n_col].sum(axis=0)
    print('NO. ANSWERS: ' + str(total_answers))

    # Display the results DataFrame
    display(all_results)

    # Graph the results. The title is the question text without quotes and
    # question marks, cut to 100 characters plus '...' when longer.
    title = q.replace('"', '').replace('?', '')
    if len(title) > 100:
        title = title[:100] + '...'
    graph_type(all_results, [pct_col], title, file_location)

##  Display other results (for relevant questions)

In [4]:
# Show every non-null entry of one column, e.g. the free-text "Other" answers
# Inputs:
#   df - (pd.DataFrame) the cleaned dataframe
#   question - (str) the column name for the Other data
#
# Outputs:
#   No outputs, all generated results are displayed via print()/display()

def other_responses(df, question):
    column = df[question]

    # Keep only the rows where something was actually entered
    answered = column[column.notnull()]

    print('NUMBER OF RESPONSES: ' , len(answered))

    # One display() call per response, in dataframe row order
    for answer in answered:
        display(answer)

## Single answer questions

In [5]:
# Summarise a single answer question.
# Builds a table with the count (N) of each answer and its share (%) of all
# non-null answers, prints two quality-control totals, displays the table and
# hands it to the grapher function.
#
# Inputs:
#   df - (pd.DataFrame) the cleaned dataframe
#   q - (str) the question string that is the column name
#   graph_type - grapher function, called as
#       graph_type(results, [percentage column name], title, file_location)
#   file_location - (str) the folder location for the grapher function
#
# Outputs:
#   No outputs, all generated results are displayed via print()/display()

def single_answer(df, q, graph_type, file_location):
    n_col = q + ' (N)'
    pct_col = q + ' (%)'

    answers = df[q]

    # Counts and normalised counts side by side; missing answers are excluded
    tallies = answers.value_counts(dropna=True).to_frame()
    shares = answers.value_counts(normalize=True, dropna=True).to_frame()
    results = pd.concat([tallies, shares], axis=1)
    results.columns = [n_col, pct_col]

    # Quality control: the sample size is the number of non-null answers and
    # the % total should be 1.0 if the question was required.
    sample_size = results[n_col].sum(axis=0)
    print('SAMPLE SIZE: %s (%s)' % (sample_size, q))
    total_count = results[pct_col].sum(axis=0)
    print('TOTAL %%: %s (%s)' % (total_count, q))

    # Order the rows by answer label
    results = results.sort_index(axis=0, ascending=True)

    # Display the results DataFrame
    display(results)

    # Graph title: the question without quotes and question marks, cut to
    # 100 characters plus '...' when longer.
    title = q.replace('"', '').replace('?', '')
    if len(title) > 100:
        title = title[:100] + '...'
    graph_type(results, [pct_col], title, file_location)


## Likert scale questions

In [None]:
# Function to generate the results from a likert scale question
#
# ***** Note that the only scales are agree/disagree and satisfied/unsatisfied
# If you require another scale, update the function *****
#
# Produces descriptive statistics on the 1-5 coded answers, then counts and
# percentages on a compacted three-point scale, displays both and hands the
# compacted table to the grapher function.
#
# Inputs:
#   df - (pd.DataFrame) the cleaned dataframe
#   q - (str) the question string that is the column name
#   graph_type - (callable) grapher function, called as
#       graph_type(results, [percentage column name], title, file_location)
#   file_location - (str) the folder location for the grapher function
#
# Outputs:
#   No outputs, all generated results are displayed via print()/display()

def likert_results(df, q, graph_type, file_location):
    n_col = q + ' (N)'
    pct_col = q + ' (%)'

    # Label -> score, applied in this order with regex=True.
    # 1 = SD, 2 = D, 3 = N A/D, 4 = A, 5 = SA
    to_score = [
        # Agree/Disagree scale
        ('Strongly agree', 5),
        ('Agree', 4),
        ('Neither agree nor disagree', 3),
        ('Disagree', 2),
        ('Strongly disagree', 1),
        # Satisfaction scale
        ('Very satisfied', 5),
        ('Satisfied', 4),
        ('Neutral', 3),
        ('Unsatisfied', 2),
        ('Very unsatisfied', 1),
        # Literal 'NaN' text is scored as the midpoint
        ('NaN', 3),
    ]

    # Collapse the two outer answers of each scale into their neighbours.
    # NOTE(review): 'NaN' is replaced by the integer 3 here as well, which
    # mixes a number into the string labels - confirm which neutral label
    # is intended for each scale.
    to_compact = [
        ('Strongly agree', 'Agree'),
        ('Strongly disagree', 'Disagree'),
        ('Very satisfied', 'Satisfied'),
        ('Very unsatisfied', 'Unsatisfied'),
        ('NaN', 3),
    ]

    # Convert to numerical data for calculation of mean and std dev.
    scores = df[q]
    for label, score in to_score:
        scores = scores.replace(label, score, regex=True)

    # Force a numeric dtype so describe() reports mean/std rather than the
    # count/unique/top/freq summary used for object columns. Anything that
    # is not a recognised label becomes NaN and is left out of the stats.
    scores = pd.to_numeric(scores, errors='coerce')
    display(pd.DataFrame(scores).describe())

    # Compact standard Likert answers to three-point scale.
    compact = df[q]
    for label, merged in to_compact:
        compact = compact.replace(label, merged, regex=True)

    # Question value counts, both as number counts and as a share of the total
    results_compact = pd.concat(
        [compact.value_counts().to_frame(),
         compact.value_counts(normalize=True).to_frame()],
        axis=1,
    )
    results_compact.columns = [n_col, pct_col]

    # Sort dataframe rows
    results_compact.sort_index(axis=0, ascending=True, inplace=True)

    total_count = results_compact[pct_col].sum(axis=0)
    print('TOTAL %: '  + str(total_count) + ' (' + q + ')')

    display(results_compact)

    # Same title clean-up as single_answer and multiple_answer_userzoom:
    # drop quotes and question marks, cut to 100 characters plus '...'.
    title = q.replace('"', '').replace('?', '')
    if len(title) > 100:
        title = title[:100] + '...'
    graph_type(results_compact, [pct_col], title, file_location)