#### Import packages.

In [3]:
import pandas as pd

#### Import the raw data CSV exported from Loop11.

In [10]:
# Load the raw results exported from the testing tool into a DataFrame.
# The path is relative, so the CSV must sit in the notebook's working directory.
raw_df = pd.read_csv('ATOapp2_RAW.csv')

In [11]:
# Show every column name of the raw export as a plain Python list.
raw_df.columns.tolist()

['Participant No.',
 'CustomID',
 'IP Address',
 'Date Started',
 'Date Completed',
 'User Agent',
 'Total Time Spent',
 'Avg Time Taken',
 'Task 1 - Onboarding',
 'Page Views',
 'Time Spent',
 'Task 2 - Personalisation',
 'Page Views.1',
 'Time Spent.1',
 'Task 3 - Help',
 'Page Views.2',
 'Time Spent.2',
 '13. Task. Task 4 - Settings',
 'Page Views.3',
 'Time Spent.3',
 'As an employee I can use this feature to record: (select all that apply)',
 'As a sole trader I can use this feature to record: (select all that apply)',
 'Where would you expect to be able to upload your information when "uploading your records to your tax return"? (select all that apply)',
 'Other, please specify',
 'I am confident I selected the right settings to set up the app.',
 'This feature (myDeductions) can be used by: (select all that apply)',
 'All small business owners can use this app to upload their records to their tax return.',
 'Let us know if you encountered any issues selecting your personalisatio

In [12]:
# Number of rows in the raw data (one row per participant).
raw_df.shape[0]

102

#### Remove NaN values from open text field columns to avoid type errors.

In [14]:
# Names of the free-text (open response) columns that need NaN handling
# before any string operations are applied to them.
open_text_cols = [
    'Other, please specify',
    'Other, please specify.1',
    'Other, please specify.2',
    'Let us know if you encountered any issues selecting your personalisation options.',
]

In [7]:
# Replace missing values in the given columns with the literal string 'nan'.
def remove_nan(dataframe, columns):
    """Fill missing entries of the selected columns with the string ``'nan'``.

    The frame is modified in place; nothing is returned.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are overwritten with their filled versions.
    columns : iterable of str
        Names of the columns to fill. Each must exist in ``dataframe``.
    """
    for col_name in columns:
        filled = dataframe[col_name].fillna('nan')
        dataframe[col_name] = filled
        
# Fill the open-text columns of the raw data in place.
remove_nan(dataframe=raw_df, columns=open_text_cols)

#### Define function to check for testers identified in the open text questions.

In [15]:
# Function to find rows where any of the selected columns contains a partial string.
def remove_testers(dataframe, fragment, columns):
    """Return the rows of ``dataframe`` where any of ``columns`` contains ``fragment``.

    Despite its name, this function does not remove anything: it returns the
    matching rows so they can be reviewed before being excluded.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to search. It is not modified.
    fragment : str
        Pattern to look for. It is interpreted as a regular expression and
        matched case-insensitively.
    columns : iterable of str
        Names of the string columns to search.

    Returns
    -------
    pandas.DataFrame
        Rows with a match in at least one of ``columns``, keeping their
        original index and column layout. Empty when nothing matches or when
        ``columns`` is empty.
    """
    # Start with "no row matches" and OR in each column's result, so a hit in
    # any column is kept (previously only the last column's matches survived).
    matches = pd.Series(False, index=dataframe.index)
    for item in columns:
        # na=False treats missing values as non-matches instead of propagating NaN.
        matches |= dataframe[item].str.contains(fragment, case=False, na=False)
    return dataframe[matches]

# Collect the rows whose open-text answers mention 'test'.
testers_df = remove_testers(
    dataframe=raw_df,
    fragment='test',
    columns=open_text_cols,
)

In [16]:
# Display the flagged rows so each comment can be checked by eye:
# a match on 'test' is only a candidate, not proof that the participant was a tester.
testers_df

Unnamed: 0,Participant No.,CustomID,IP Address,Date Started,Date Completed,User Agent,Total Time Spent,Avg Time Taken,Task 1 - Onboarding,Page Views,...,Tell us about any language used in the tool that was unclear or confusing:,"On this screen, what do you understand the term ""Cash flow position"" to mean? (select all that apply)","Other, please specify.1","The new design of the myDeductions feature will provide businesses with a ""Cash flow position"" which is the balance of their recorded income, less any expenses. This amount may not match your tax return as it doesn't include the tax impact of any car expenses you record. * Please respond to the following statement: As a business I would find the ""Cash flow position"" feature useful.","If you are a sole trader, do you use your own name as your business name?",How can we improve the features you just tested?,Are you an ATO employee?,Which of the following best describes your role? (select all that apply),"Other, please specify.2",Which age group do you belong to?
0,Participant 1,,220.101.113.10,11/15/16 12:19,11/15/16 12:23,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5...,213 secs,16.4,success,4,...,tester,The balance of my recorded income less any exp...,,Disagree,No,tester,No,Small business,,26-45
1,Participant 5,,180.149.192.132,11/15/16 13:02,11/15/16 13:17,Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7....,887 secs,66.2,fail,1,...,test,How much I can expect to earn after tax,,Agree,Yes,test,Yes,Sole trader/self-employed,,
2,Participant 6,,180.149.192.132,11/15/16 13:07,11/15/16 13:18,Mozilla/5.0 (Windows NT 6.1; WOW64; rv:41.0) G...,662 secs,51.4,success,4,...,test,The balance of my recorded income less any exp...,,Agree,Not applicable (I'm not a sole trader),test,Yes,Individual taxpayer,,
16,Participant 41,,203.173.9.79,11/16/16 9:48,11/16/16 10:04,Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; ...,960 secs,57.4,success,4,...,,The balance of my recorded income less any exp...,,Disagree,Yes,Provide a help button which provides definitio...,No,Sole trader/self-employed,,46-65


In [17]:
# Exclude the participants that were confirmed as testers after manual review.
tester_ids = ['Participant 1', 'Participant 5', 'Participant 6']
is_tester = raw_df['Participant No.'].isin(tester_ids)
clean_df = raw_df[~is_tester]

clean_df

Unnamed: 0,Participant No.,CustomID,IP Address,Date Started,Date Completed,User Agent,Total Time Spent,Avg Time Taken,Task 1 - Onboarding,Page Views,...,Tell us about any language used in the tool that was unclear or confusing:,"On this screen, what do you understand the term ""Cash flow position"" to mean? (select all that apply)","Other, please specify.1","The new design of the myDeductions feature will provide businesses with a ""Cash flow position"" which is the balance of their recorded income, less any expenses. This amount may not match your tax return as it doesn't include the tax impact of any car expenses you record. * Please respond to the following statement: As a business I would find the ""Cash flow position"" feature useful.","If you are a sole trader, do you use your own name as your business name?",How can we improve the features you just tested?,Are you an ATO employee?,Which of the following best describes your role? (select all that apply),"Other, please specify.2",Which age group do you belong to?
3,Participant 8,,203.22.30.47,11/15/16 13:47,11/15/16 16:01,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0...,8042 secs,768.8,success,4,...,,"My taxable business income, Balance of your bu...",,Disagree,No,,No,"Small business , Individual taxpayer",,
4,Participant 13,,180.149.192.133,11/15/16 14:59,11/15/16 15:17,Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7....,1100 secs,82.0,success,4,...,All language was fine,The balance of my recorded income less any exp...,,Agree,Not applicable (I'm not a sole trader),,Yes,Individual taxpayer,,46-65
5,Participant 15,,180.149.192.135,11/15/16 15:42,11/15/16 15:48,Mozilla/5.0 (Windows NT 6.1; WOW64; rv:41.0) G...,367 secs,28.8,success,4,...,,The balance of my recorded income less any exp...,,Strongly agree,Not applicable (I'm not a sole trader),,Yes,Individual taxpayer,,18-25
6,Participant 17,,180.149.192.136,11/15/16 16:55,11/15/16 17:00,Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7....,323 secs,24.0,success,4,...,,Balance of your business bank account,,Strongly agree,No,,Yes,"Sole trader/self-employed , Individual taxpayer",,46-65
7,Participant 19,,121.208.243.222,11/16/16 8:33,11/16/16 8:40,Mozilla/5.0 (iPhone; CPU iPhone OS 10_1_1 like...,412 secs,27.6,fail,1,...,,"My taxable business income, The balance of my ...",,Disagree,No,,No,"Small business , Tax professional",,46-65
8,Participant 22,,123.243.185.18,11/16/16 8:47,11/16/16 8:54,Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKi...,413 secs,32.8,success,4,...,,Balance of your business bank account,,Agree,No,,No,Tax professional,,46-65
9,Participant 25,,124.171.221.132,11/16/16 8:50,11/16/16 9:07,Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) ...,1031 secs,77.4,success,4,...,I did not see muh language so this is hard to ...,Balance of your business bank account,,Agree,No,This is very hard to test on sreen. I would pr...,No,Sole trader/self-employed,,46-65
10,Participant 30,,101.190.72.123,11/16/16 8:56,11/16/16 9:05,Mozilla/5.0 (iPhone; CPU iPhone OS 10_1_1 like...,500 secs,24.2,fail,1,...,,The balance of my recorded income less any exp...,,Strongly agree,Yes,,No,Small business,,26-45
11,Participant 32,,103.245.219.11,11/16/16 9:11,11/16/16 9:56,Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7....,2661 secs,454.2,success,4,...,Cash flow position is misleading. Should it no...,The balance of my recorded income less any exp...,,Disagree,No,cash flow position is deceptive as it doesn't ...,No,Tax professional,,26-45
12,Participant 35,,210.50.113.107,11/16/16 9:24,11/16/16 9:31,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,413 secs,43.0,success,4,...,,The balance of my recorded income less any exp...,,Agree,No,,No,"Sole trader/self-employed , Small business",,26-45


In [18]:
# Number of participants left after the testers were removed.
clean_df.shape[0]

99

#### Outliers

In [21]:
# Keep only the rows whose value in a column reaches a minimum threshold.
def remove_outliers(df, col, min):
    """Return the rows of ``df`` where ``df[col]`` is greater than or equal to ``min``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    col : str
        Name of the column to compare against the threshold.
    min : number
        Inclusive lower bound. The name shadows the ``min`` builtin inside
        this function; it is kept so keyword callers keep working.

    Returns
    -------
    pandas.DataFrame
        The rows that meet the threshold, with their original index labels.
    """
    meets_threshold = df[col] >= min
    return df.loc[meets_threshold]

#### Check Total Time Spent for outliers, rather than Avg Time Taken.

In [30]:
# Parse the leading integer out of strings such as '213 secs'.
# This is vectorised and does not depend on the exact length of the unit suffix.
# The result keeps clean_df's index, so each time can be traced back to its row.
# A value with no leading digits becomes NaN and makes the int cast fail loudly.
total_secs = (
    clean_df['Total Time Spent']
    .str.extract(r'^\s*(\d+)', expand=False)
    .astype('int64')
)

# Build the timing frame (seconds and minutes) and use describe() to get stats.
times_df = pd.DataFrame({
    'Total Time Spent': total_secs,
    'Total Time (minutes)': total_secs / 60,
})
times_df.describe()
    

Unnamed: 0,Total Time Spent,Total Time (minutes)
count,99.0,99.0
mean,977.060606,16.284343
std,1337.646142,22.294102
min,196.0,3.266667
25%,413.0,6.883333
50%,631.0,10.516667
75%,1008.5,16.808333
max,8510.0,141.833333


In [31]:
# Lower outlier bound: two standard deviations below the mean total time.
# With the recorded data this bound comes out negative, so no participant falls below it.
time_spent_secs = times_df['Total Time Spent']
times_mean = time_spent_secs.mean()
times_std = time_spent_secs.std()

mean_less_2xStd = times_mean - 2 * times_std
mean_less_2xStd

-1698.231677314096

In [32]:
# Upper outlier bound: two standard deviations above the mean total time.
mean_plus_2xStd = times_mean + 2 * times_std
mean_plus_2xStd

3652.352889435308

In [33]:
# TODO: drop rows from the clean data that are more than two std below the mean.

In [34]:
# TODO: drop rows from the clean data that are more than two std above the mean.

#### Export clean data to csv.

In [35]:
# Export the cleaned data (testers removed) to CSV.
# The separator is passed by keyword: positional arguments to to_csv other than
# the path are deprecated and become keyword-only in newer pandas releases.
clean_df.to_csv('ATOapp_2_clean.csv', sep=',')