# Data Checker

## get_jobs function

In [3]:
import re

occupations = ["Sports and Personal Service Workers", "Storepersons", "Automotive and Engineering Trades Workers"]
pattern = re.compile(" and |, ")

for occupation in occupations:
    print(pattern.split(occupation))

['Sports', 'Personal Service Workers']
['Storepersons']
['Automotive', 'Engineering Trades Workers']


In [4]:
def get_jobs(occupation):
    return pattern.split(occupation)

## API Test

In [5]:
import requests
from pprint import pprint

def get_data_for(job):
    ROOT_URL = 'https://data.gov.au/data'
    API_URL = ROOT_URL + '/api/3/action/datastore_search?resource_id=bfa7ef04-e9f2-46ff-a959-84f005dfd17b%s'
    request =  API_URL % ('&q=%s' % job)

    response = requests.get(request).json()

    total_records = response['result']['total']
    
    data = response['result']['records']
    while len(data) < total_records:
        next_request = ROOT_URL + response['result']['_links']['next']
        response = requests.get(next_request).json()
        data += response['result']['records']
    return data

pprint(get_data_for('technician')[:5])

[{'Average salary or wage income3 \n$': 12549.2,
  'Average taxable income3\n$': 54872.49,
  'Average total income3 \n$': 55835.24,
  'Median salary or wage income3 \n$': 13274,
  'Median taxable income3\n$': 58991,
  'Median total income3 \n$': 59437,
  'Number of individuals': 41,
  'Occupation - unit group4': '3111 Agricultural technician',
  'State2': 'ACT',
  '_id': 1157,
  'rank': 0.0573088},
 {'Average salary or wage income3 \n$': 12822.05,
  'Average taxable income3\n$': 55145.04,
  'Average total income3 \n$': 57426.67,
  'Median salary or wage income3 \n$': 11065,
  'Median taxable income3\n$': 53589,
  'Median total income3 \n$': 54771,
  'Number of individuals': 457,
  'Occupation - unit group4': '3111 Agricultural technician',
  'State2': 'NSW',
  '_id': 1158,
  'rank': 0.0573088},
 {'Average salary or wage income3 \n$': 15126.06,
  'Average taxable income3\n$': 63617.26,
  'Average total income3 \n$': 66244.47,
  'Median salary or wage income3 \n$': 13477,
  'Median taxab

## get_average_income_for method

In [6]:
AVERAGE_TOTAL_INCOME = 'Average total income3 \n$'

def get_average_income_for(job):
    data = get_data_for(job)
    
    data_total_income = 0
    for point in data:
        data_total_income += point[AVERAGE_TOTAL_INCOME]
        
    return data_total_income/max(1, len(data))

pprint(get_average_income_for('technician'))

67005.76781250001


## Process occupations.csv

In [7]:
import csv
from collections import defaultdict

occupations_average_income = defaultdict(float)

with open('./occupations.csv', 'r', encoding='utf-8') as f:
    reader = csv.reader(f)
    for row in reader:
        occupation = row[0]
        jobs = get_jobs(str(occupation))

        cleaned_jobs = []
        results = []
        for job in jobs:
            job = job.lower()
            if not job.endswith('ss') and job.endswith('s'):
                job = job.rstrip('s')
            cleaned_jobs.append(job)
            results.append(get_average_income_for(job))

        average_income = sum(results)/len(jobs)
        
        occupations_average_income[occupation] = average_income

pprint(occupations_average_income)

defaultdict(<class 'float'>,
            {'AFSA': 0.0,
             'Arts and Media Professionals': 0.0,
             'Automotive and Engineering Trades Workers': 47662.66645833333,
             'Business, Human Resource and Marketing Professionals': 77583.54833333332,
             'Carers and Aides': 37123.9640625,
             'Chief Executives, General Managers and Legislators': 121835.31916666667,
             'Cleaners and Laundry Workers': 33535.71453125,
             'Clerical and Office Support Workers': 45200.33734375,
             'Construction Trades Workers': 0.0,
             'Construction and Mining Labourers': 85313.40778716217,
             'Design, Engineering, Science and Transport Professionals': 70794.60455940316,
             'Education Professionals': 0.0,
             'Electrotechnology and Telecommunications Trades Workers': 29159.781875,
             'Engineering, ICT and Science Technicians': 49180.281287537546,
             'Factory Process Workers': 43905.36

## Process provided data

In [8]:
def get_income_range(income_range):
    if re.match(r'^\$-?[0-9]+-\$-?[0-9]+$', income_range):
        tokens = income_range.split('-$')
        return [int(el.strip('$')) for el in tokens]
    return [200000]

print(get_income_range('$0-$99999'))
print(get_income_range('$-10000-$-5000'))

[0, 99999]
[-10000, -5000]


In [17]:
with open('./data.csv', 'r', newline='') as f:
    with open('./output.csv', 'w', newline='') as g:
        reader = csv.reader(f)
        writer = csv.writer(g, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

        first = True
        count_wrong = 0
        for row in reader:
            if not first:
                occupation = row[10]
                income = row[13]
                income_range = get_income_range(income)
                max_income_range = income_range[0]

                expected = occupations_average_income[occupation]
                if income_range[-1] < 50000 and income_range[-1] < expected and expected - income_range[-1] > 5000:
                    writer.writerow(row + ['INCOME_BELOW_EXPECTED', 'Average $%.2f, got %s' % (expected, income)])
                else:
                    writer.writerow(row)
            else:
                writer.writerow(row + ['Alert Category', 'Alert Reason'])
            first = False