In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw records. Later cells read the PINCP column (printed as "income" in the
# summaries) and the OCCP column (occupation code) from this frame.
data = pd.read_csv('data/data.csv')

In [5]:
# Keep only records with positive PINCP; .copy() makes `subset` independent of `data`.
subset = data[data.PINCP > 0].copy()
# Missing occupation codes get the sentinel 9999. The filled column is assigned back
# instead of calling fillna(inplace=True) on `subset.OCCP`: that chained in-place form
# acts on an intermediate Series, triggers a warning on pandas 2.x, and leaves `subset`
# unchanged when Copy-on-Write is enabled.
subset['OCCP'] = subset['OCCP'].fillna(9999)
# Denominator for the "% of records" figure printed by show().
total_n = len(subset)

In [6]:
# Load the occupation code table; the lookup built from it reads the `code` and
# `occupation` columns.
codes = pd.read_csv('data/codes.csv')

In [7]:
# Lookup from numeric occupation code (as float) to occupation name.
code_name = {}
for row_idx, entry in codes.iterrows():
    try:
        numeric_code = float(entry['code'])
    except ValueError:
        # The code cell is text that does not parse as a number: leave the row out.
        continue
    else:
        code_name[numeric_code] = entry['occupation']

In [31]:
def show(low, high, subset=subset):
    """Print what share of all records has low <= PINCP < high.

    Args:
        low: Inclusive lower bound on PINCP.
        high: Exclusive upper bound on PINCP.
        subset: DataFrame with a PINCP column. The default is the module-level
            `subset` as it existed when this cell was executed (default values
            are bound at definition time).

    The share is relative to the module-level `total_n`, not to len(subset).
    Returns None.

    Note: a later cell defines `show` again with the same signature; whichever
    definition ran last is the one in effect.
    """
    in_band = (low <= subset.PINCP) & (subset.PINCP < high)
    # Only the row count is printed, so the per-occupation tally and the
    # code_name lookups that used to be built here (and then discarded) are gone;
    # they could also raise KeyError for a code the summary never displays.
    n = len(subset[in_band])

    # Guard the denominator so an empty base frame reports 0.0 instead of raising.
    share = 100 * n / total_n if total_n else 0.0
    print(f'{round(share, 2)}% of records {low} <= income < {high}\n')

In [48]:
# Seven consecutive [low, high) income bands, reported in ascending order.
for bounds in (
    (0, 10_000),
    (10_000, 30_000),
    (30_000, 50_000),
    (50_000, 100_000),
    (100_000, 200_000),
    (200_000, 400_000),
    (400_000, 9e9),  # float upper bound, so it is echoed as 9000000000.0
):
    show(*bounds)

16.27% of records 0 <= income < 10000

29.22% of records 10000 <= income < 30000

20.19% of records 30000 <= income < 50000

22.68% of records 50000 <= income < 100000

8.58% of records 100000 <= income < 200000

2.09% of records 200000 <= income < 400000

0.96% of records 400000 <= income < 9000000000.0



In [53]:
def show(low, high, subset=subset):
    """Print an HTML list of the most common occupations in one income band.

    Selects the rows of `subset` with low <= PINCP < high and prints, in order:
      1. the band's share of `total_n` records,
      2. the number of OCCP values counted in the band,
      3. a <ul> with one <li> per occupation that holds at least 1% of the band,
         most common first,
      4. the combined percentage of the listed occupations whose name does not
         contain 'Unspecified'.

    Args:
        low: Inclusive lower bound on PINCP.
        high: Exclusive upper bound on PINCP.
        subset: DataFrame with PINCP and OCCP columns. The default is the
            module-level `subset` as it existed when this cell was executed
            (default values are bound at definition time).

    Raises:
        KeyError: If an OCCP value in the band has no entry in `code_name`.

    Reads the module-level `total_n` and `code_name`. Returns None.
    """
    # No .copy(): the selection is only read, never modified.
    band = subset[(low <= subset.PINCP) & (subset.PINCP < high)]
    n = len(band)

    # Occupation code -> number of records in this band.
    counts = dict(band.OCCP.value_counts())

    # Rows keep the [name, count, first_code, last_code] layout used elsewhere in
    # the notebook; for a single occupation both codes are the same value.
    results = [[code_name[code], count, code, code] for code, count in counts.items()]

    # Guard the denominator so an empty base frame reports 0.0 instead of raising.
    share_of_all = 100 * n / total_n if total_n else 0.0
    print(f'{round(share_of_all, 2)}% of records {low} <= income < {high}\n')

    # Every row here is a single-code row, so the old `row[2] == row[3]` filter
    # selected all of them; the length is the same number.
    print(len(results), '\n')

    # Most common first. sorted() is stable, so ties keep their value_counts() order.
    ordered = sorted(results, key=lambda row: -row[1])

    print('<ul>')
    total = 0
    for name, number, *_codes in ordered:
        # Rows are in descending order, so the first one under 1% ends the list.
        # `not n` covers a band with no records, where the ratio is undefined.
        if not n or number / n < 0.01:
            break
        if 'Unspecified' not in name:
            total += number
        percent = f'{100 * number / n: >4.1f}'
        print(f'<li>{percent}% {name}</li>')
    print('</ul>')

    # An empty band lists nothing; report 0.0 rather than dividing by zero.
    listed_share = 100 * total / n if n else 0.0
    print('\n', listed_share, '\n')

In [54]:
# Occupation breakdown for seven consecutive [low, high) income bands, ascending.
for bounds in (
    (0, 10_000),
    (10_000, 30_000),
    (30_000, 50_000),
    (50_000, 100_000),
    (100_000, 200_000),
    (200_000, 400_000),
    (400_000, 9e9),  # float upper bound, so it is echoed as 9000000000.0
):
    show(*bounds)

16.27% of records 0 <= income < 10000

531 

<ul>
<li>36.1% Unspecified; added by Aaron</li>
<li> 4.5% Cashiers</li>
<li> 2.7% Retail Salespersons</li>
<li> 2.4% Waiters And Waitresses</li>
<li> 2.1% Cooks</li>
<li> 1.7% Laborers And Freight, Stock, And Material Movers, Hand</li>
<li> 1.7% Janitors And Building Cleaners</li>
<li> 1.5% Childcare Workers</li>
<li> 1.4% Customer Service Representatives</li>
<li> 1.3% Food Preparation Workers</li>
<li> 1.2% Stockers And Order Fillers</li>
<li> 1.2% Fast Food And Counter Workers</li>
<li> 1.1% Landscaping And Groundskeeping Workers</li>
<li> 1.1% Teaching Assistants</li>
<li> 1.1% Maids And Housekeeping Cleaners</li>
</ul>

 24.945670204640088 

29.22% of records 10000 <= income < 30000

531 

<ul>
<li>34.8% Unspecified; added by Aaron</li>
<li> 2.2% Cashiers</li>
<li> 2.0% Retail Salespersons</li>
<li> 1.8% Cooks</li>
<li> 1.7% Janitors And Building Cleaners</li>
<li> 1.5% Laborers And Freight, Stock, And Material Movers, Hand</li>
<li> 1.

In [57]:
# Coarser bands: everything below 50,000 is reported as a single band.
for bounds in ((0, 50_000), (50_000, 100_000), (100_000, 200_000)):
    show(*bounds)

65.68% of records 0 <= income < 50000

531 

<ul>
<li>29.7% Unspecified; added by Aaron</li>
<li> 2.3% Cashiers</li>
<li> 2.0% Retail Salespersons</li>
<li> 1.6% Driver/Sales Workers And Truck Drivers</li>
<li> 1.6% Janitors And Building Cleaners</li>
<li> 1.6% Secretaries And Administrative Assistants, Except Legal, Medical, And Executive</li>
<li> 1.5% Cooks</li>
<li> 1.5% Customer Service Representatives</li>
<li> 1.5% Laborers And Freight, Stock, And Material Movers, Hand</li>
<li> 1.4% Waiters And Waitresses</li>
<li> 1.3% First-Line Supervisors Of Retail Sales Workers</li>
<li> 1.2% Elementary And Middle School Teachers</li>
<li> 1.1% Teaching Assistants</li>
<li> 1.0% Stockers And Order Fillers</li>
</ul>

 19.589538453698868 

22.68% of records 50000 <= income < 100000

531 

<ul>
<li>11.6% Unspecified; added by Aaron</li>
<li> 3.7% Registered Nurses</li>
<li> 3.0% Elementary And Middle School Teachers</li>
<li> 3.0% Other Managers</li>
<li> 2.5% Driver/Sales Workers And Truck 

In [3]:
# Start again from every record with positive PINCP; .copy() makes `subset` an
# independent frame rather than a selection of `data`.
subset = data[data.PINCP > 0].copy()

In [4]:
# Replace missing occupation codes with the sentinel 9999. The filled column is
# assigned back instead of calling fillna(inplace=True) on `subset.OCCP`: that chained
# in-place form acts on an intermediate Series, triggers a warning on pandas 2.x, and
# leaves `subset` unchanged when Copy-on-Write is enabled.
subset['OCCP'] = subset['OCCP'].fillna(9999)

In [5]:
# Narrow to records with PINCP above 800,000. This rebinds `subset`, so the wider
# positive-income frame is no longer reachable under this name.
subset = subset[subset.PINCP > 800_000].copy()

In [6]:
# Load the occupation code table; the lookup built from it reads the `code` and
# `occupation` columns.
codes = pd.read_csv('data/codes.csv')

In [7]:
# Lookup from numeric occupation code (as float) to occupation name.
code_name = {}
for row_idx, entry in codes.iterrows():
    try:
        numeric_code = float(entry['code'])
    except ValueError:
        # The code cell is text that does not parse as a number: leave the row out.
        continue
    else:
        code_name[numeric_code] = entry['occupation']

In [8]:
# Number of codes that made it into the lookup.
len(code_name)

531

In [9]:
# Load the category table; the roll-up reads its `category`, `lowest_code` and
# `greatest_code` columns.
categories = pd.read_csv('data/categories.csv')

In [10]:
# Map each OCCP value present in `subset` to its number of records.
counts = dict(subset.OCCP.value_counts())

In [11]:
# Leaf rows, one per occupation code: [name, count, code, code].
results = [[code_name[code], count, code, code] for code, count in counts.items()]

# Category rows: [category, total, lowest_code, greatest_code], where the total adds up
# every occupation whose code lies in the category's inclusive code range.
for row_idx, category in categories.iterrows():
    lowest = category['lowest_code']
    greatest = category['greatest_code']
    in_range_total = sum(count for code, count in counts.items()
                         if lowest <= code <= greatest)
    results.append([category['category'], in_range_total, lowest, greatest])

In [12]:
# Denominator for the percentages printed below: number of rows in `subset`.
n = len(subset)

In [13]:
# Sort by first code ascending, then by last code descending, so a wider range comes
# before the narrower ranges and single codes that start at the same code.
ordered = sorted(results, key=lambda x: (x[2], -x[3]))

In [14]:
# Peek at the first three rows to check the ordering.
ordered[:3]

[['Management, Business, Science, and Arts Occupations', 870, 10, 3550],
 ['Management, Business, and Financial Occupations', 564, 10, 960],
 ['Management Occupations', 429, 10, 440]]

In [15]:
# Render `ordered` as nested HTML: a code range becomes <details><summary>, a single code
# becomes an <li> inside a <ul>. This relies on `ordered` being sorted by first code
# (ascending) and then by last code (descending), so an enclosing range is always seen
# before anything it contains.
stack = []        # last codes of the <details> elements currently open, innermost at the end
in_list = False   # True while a <ul> is open

for name, number, first, last in ordered:
    # Guard the share so an empty `subset` (n == 0) prints 0.00 instead of raising.
    share = 100 * number / n if n else 0.0
    percent = f'{share: >5.2f}'
    is_leaf = first == last
    past_innermost = bool(stack) and first > stack[-1]

    # An open <ul> ends when a new range starts, or when this single code lies beyond
    # the innermost open range (otherwise it would be listed inside the wrong range).
    if in_list and (not is_leaf or past_innermost):
        print('</ul>')
        in_list = False

    # Close every open range that ends before this row begins.
    while stack and first > stack[-1]:
        print('</details>')
        stack.pop()

    if is_leaf:
        if not in_list:
            print('<ul>')
            in_list = True
        print(f'<li>{percent}% {name}</li>')
    else:
        print(f'<details><summary>{percent}% {name}</summary>')
        stack.append(last)

# Close whatever is still open so the emitted markup is balanced.
if in_list:
    print('</ul>')
    in_list = False
while stack:
    print('</details>')
    stack.pop()

<details><summary>86.57% Management, Business, Science, and Arts Occupations</summary>
<details><summary>56.12% Management, Business, and Financial Occupations</summary>
<details><summary>42.69% Management Occupations</summary>
<ul>
<li>20.80% Chief Executives And Legislators</li>
<li> 0.70% General And Operations Managers</li>
<li> 0.10% Advertising And Promotions Managers</li>
<li> 0.60% Marketing Managers</li>
<li> 0.60% Sales Managers</li>
<li> 0.20% Computer And Information Systems Managers</li>
<li> 3.68% Financial Managers</li>
<li> 0.30% Industrial Production Managers</li>
<li> 0.90% Farmers, Ranchers, And Other Agricultural Managers</li>
<li> 1.00% Construction Managers</li>
<li> 0.20% Architectural And Engineering Managers</li>
<li> 0.40% Food Service Managers</li>
<li> 0.10% Lodging Managers</li>
<li> 0.40% Medical And Health Services Managers</li>
<li> 1.69% Property, Real Estate, And Community Association Managers</li>
<li>11.04% Other Managers</li>
</ul>
</details>
<detai

In [16]:
# Ten most frequent single-code rows (first code == last code), highest count first.
[row for row in sorted(results, key=lambda x: -x[1]) if row[2] == row[3]][:10]

[['Chief Executives And Legislators', 209, 10.0, 10.0],
 ['Other Managers', 111, 440.0, 440.0],
 ['Physicians', 94, 3090.0, 3090.0],
 ['Lawyers, And Judges, Magistrates, And Other Judicial Workers',
  78,
  2100.0,
  2100.0],
 ['Personal Financial Advisors', 43, 850.0, 850.0],
 ['Financial Managers', 37, 120.0, 120.0],
 ['Financial And Investment Analysts', 28, 845.0, 845.0],
 ['Management Analysts', 26, 710.0, 710.0],
 ['Securities, Commodities, And Financial Services Sales Agents',
  25,
  4820.0,
  4820.0],
 ['Surgeons', 23, 3100.0, 3100.0]]

In [17]:
# Total rows: one per occupation code counted plus one per category.
len(results)

152

In [20]:
# Single-code rows only (first code == last code); category roll-ups are left out.
leaf_rows = [entry for entry in results if entry[2] == entry[3]]
print(len(leaf_rows), '\n')

# Highest count first; the sort is stable, so ties keep their order from `results`.
ordered = sorted(leaf_rows, key=lambda entry: entry[1], reverse=True)

print('<ul>')
total = 0
for label, tally, code_lo, code_hi in ordered:
    # Rows are in descending order, so the first one under 1% of `n` ends the list.
    if tally / n < 0.01:
        break
    total += tally
    print(f'<li>{100 * tally / n: >4.1f}% {label}</li>')
print('</ul>')

# Combined share of the occupations listed above.
print('\n', 100 * total / n)

119 

<ul>
<li>20.8% Chief Executives And Legislators</li>
<li>11.0% Other Managers</li>
<li> 9.4% Physicians</li>
<li> 7.8% Lawyers, And Judges, Magistrates, And Other Judicial Workers</li>
<li> 4.3% Personal Financial Advisors</li>
<li> 3.7% Financial Managers</li>
<li> 2.8% Financial And Investment Analysts</li>
<li> 2.6% Management Analysts</li>
<li> 2.5% Securities, Commodities, And Financial Services Sales Agents</li>
<li> 2.3% Surgeons</li>
<li> 2.0% Real Estate Brokers And Sales Agents</li>
<li> 1.7% Property, Real Estate, And Community Association Managers</li>
<li> 1.7% Accountants And Auditors</li>
<li> 1.5% Postsecondary Teachers</li>
<li> 1.4% Dentists</li>
<li> 1.1% First-Line Supervisors Of Non-Retail Sales Workers</li>
</ul>

 76.41791044776119


In [19]:
# Display every row of `results`: single-code rows first, in the order they were
# appended, followed by the category rows.
results

[['Chief Executives And Legislators', 209, 10.0, 10.0],
 ['Other Managers', 111, 440.0, 440.0],
 ['Physicians', 94, 3090.0, 3090.0],
 ['Lawyers, And Judges, Magistrates, And Other Judicial Workers',
  78,
  2100.0,
  2100.0],
 ['Personal Financial Advisors', 43, 850.0, 850.0],
 ['Financial Managers', 37, 120.0, 120.0],
 ['Financial And Investment Analysts', 28, 845.0, 845.0],
 ['Management Analysts', 26, 710.0, 710.0],
 ['Securities, Commodities, And Financial Services Sales Agents',
  25,
  4820.0,
  4820.0],
 ['Surgeons', 23, 3100.0, 3100.0],
 ['Real Estate Brokers And Sales Agents', 20, 4920.0, 4920.0],
 ['Property, Real Estate, And Community Association Managers',
  17,
  410.0,
  410.0],
 ['Accountants And Auditors', 17, 800.0, 800.0],
 ['Postsecondary Teachers', 15, 2205.0, 2205.0],
 ['Dentists', 14, 3010.0, 3010.0],
 ['First-Line Supervisors Of Non-Retail Sales Workers', 11, 4710.0, 4710.0],
 ['Construction Managers', 10, 220.0, 220.0],
 ['First-Line Supervisors Of Retail Sales 