In [1]:
import numpy as np
import pandas as pd

In [2]:
# Raw input table. Later cells read its PINCP and OCCP columns
# (presumably personal income and occupation code -- confirm against the data dictionary).
DATA_CSV = 'data/data.csv'
data = pd.read_csv(DATA_CSV)

In [3]:
# Keep only rows with a strictly positive PINCP; copy so later edits never touch `data`.
subset = data.loc[data['PINCP'] > 0].copy()

In [4]:
# Rows without an occupation code get the placeholder code 9999 so they are
# still counted by value_counts() further down.
# Assign the filled column back explicitly: calling fillna(inplace=True) on
# `subset.OCCP` mutates a temporary Series, which warns on pandas 2.x and has
# no effect on `subset` once Copy-on-Write is enabled.
subset['OCCP'] = subset['OCCP'].fillna(9999)

In [5]:
# Narrow the working set to rows whose PINCP exceeds 100,000.
high_income = subset['PINCP'] > 100_000
subset = subset.loc[high_income].copy()

In [6]:
# Lookup table with one row per occupation code (columns used below: 'code', 'occupation').
CODES_CSV = 'data/codes.csv'
codes = pd.read_csv(CODES_CSV)

In [7]:
# Map numeric occupation code -> occupation title.
# Rows whose 'code' cell cannot be parsed as a number are skipped.
code_name = {}
for raw_code, occupation in zip(codes['code'], codes['occupation']):
    try:
        numeric_code = float(raw_code)
    except ValueError:
        # Not a number (e.g. a textual cell); ignore this row.
        pass
    else:
        code_name[numeric_code] = occupation

In [8]:
# Sanity check: how many numeric occupation codes ended up in the lookup.
len(code_name)

531

In [9]:
# Occupation groupings; each row carries 'category', 'lowest_code' and 'greatest_code'.
CATEGORIES_CSV = 'data/categories.csv'
categories = pd.read_csv(CATEGORIES_CSV)

In [10]:
# Number of rows per occupation code, as a plain {code: count} mapping.
occupation_tally = subset['OCCP'].value_counts()
counts = dict(occupation_tally)

In [11]:
# Every entry of `results` is [label, count, lowest_code, greatest_code].

# Leaf rows: one per occupation code present in the data; both bounds equal the code itself.
results = [[code_name[code], count, code, code] for code, count in counts.items()]

# Category rows: the count is the sum over all codes inside the category's inclusive range.
for _, category in categories.iterrows():
    low = category['lowest_code']
    high = category['greatest_code']
    members = [count for code, count in counts.items() if low <= code <= high]
    results.append([category['category'], sum(members), low, high])

In [12]:
# Denominator for every percentage below: number of rows in the working set.
n = subset.shape[0]

In [13]:
def _span_order(row):
    """Sort key: lowest code ascending, then widest range first."""
    _label, _count, lowest, greatest = row
    return (lowest, -greatest)


# With this ordering an enclosing range always precedes the rows it contains.
ordered = list(results)
ordered.sort(key=_span_order)

In [14]:
# Peek at the first three rows of the range-ordered list.
ordered[:3]

[['Management, Business, Science, and Arts Occupations', 177460, 10, 3550],
 ['Management, Business, and Financial Occupations', 90354, 10, 960],
 ['Management Occupations', 66142, 10, 440]]

In [15]:
# Print `ordered` as nested HTML: every range row (first != last) becomes a
# collapsible <details> element and every single-code row (first == last)
# becomes an <li> inside a <ul>.
#
# Expects `ordered` to be sorted by (lowest code ascending, greatest code
# descending) so that an enclosing range is seen before the rows it contains.
# A row whose lowest and greatest codes are equal is always treated as a leaf.
stack = []        # greatest code of each <details> still open, innermost last
in_list = False   # True while a <ul> of leaf rows is open

for name, number, first, last in ordered:
    percent = f'{100 * number / n: >5.2f}'

    # Leave every range that ended before this row starts. This applies to
    # leaf rows as well, otherwise a leaf beyond the innermost range would be
    # rendered inside the wrong <details>.
    while stack and first > stack[-1]:
        if in_list:
            print('</ul>')
            in_list = False
        print('</details>')
        stack.pop()

    if first == last:
        # Leaf row: open the list lazily on the first item.
        if not in_list:
            print('<ul>')
            in_list = True
        print(f'<li>{percent}% {name}</li>')
        continue

    # Range row: a list cannot continue across a new <details>.
    if in_list:
        print('</ul>')
        in_list = False
    print(f'<details><summary>{percent}% {name}</summary>')
    stack.append(last)

# Close whatever is still open so the emitted HTML is well-formed.
if in_list:
    print('</ul>')
    in_list = False
while stack:
    print('</details>')
    stack.pop()

<details><summary>69.24% Management, Business, Science, and Arts Occupations</summary>
<details><summary>35.25% Management, Business, and Financial Occupations</summary>
<details><summary>25.81% Management Occupations</summary>
<ul>
<li> 4.57% Chief Executives And Legislators</li>
<li> 1.38% General And Operations Managers</li>
<li> 0.08% Advertising And Promotions Managers</li>
<li> 0.77% Marketing Managers</li>
<li> 1.03% Sales Managers</li>
<li> 0.14% Public Relations And Fundraising Managers</li>
<li> 0.05% Administrative Services Managers</li>
<li> 0.12% Facilities Managers</li>
<li> 1.63% Computer And Information Systems Managers</li>
<li> 2.06% Financial Managers</li>
<li> 0.03% Compensation And Benefits Managers</li>
<li> 0.40% Human Resources Managers</li>
<li> 0.07% Training And Development Managers</li>
<li> 0.38% Industrial Production Managers</li>
<li> 0.31% Purchasing Managers</li>
<li> 0.17% Transportation, Storage, And Distribution Managers</li>
<li> 0.66% Farmers, Ranc

In [16]:
# Ten most frequent single-code rows (lowest code == greatest code), largest count first.
single_codes = [entry for entry in results if entry[2] == entry[3]]
sorted(single_codes, key=lambda entry: entry[1], reverse=True)[:10]

[['Unspecified; added by Aaron', 21955, 9999.0, 9999.0],
 ['Other Managers', 17761, 440.0, 440.0],
 ['Chief Executives And Legislators', 11723, 10.0, 10.0],
 ['Software Developers', 9614, 1021.0, 1021.0],
 ['Lawyers, And Judges, Magistrates, And Other Judicial Workers',
  8911,
  2100.0,
  2100.0],
 ['Physicians', 7908, 3090.0, 3090.0],
 ['Financial Managers', 5273, 120.0, 120.0],
 ['Accountants And Auditors', 4775, 800.0, 800.0],
 ['Registered Nurses', 4608, 3255.0, 3255.0],
 ['Postsecondary Teachers', 4224, 2205.0, 2205.0]]

In [17]:
# Total number of rows collected: single-code rows plus category rows.
len(results)

563

In [19]:
# Number of single-code rows (lowest code == greatest code) in `results`.
print(sum(row[2] == row[3] for row in results), '\n')

# Single-code rows only, largest count first (ties keep their order from `results`).
ordered = [row for row in results if row[2] == row[3]]
ordered.sort(key=lambda row: row[1], reverse=True)

# Emit an HTML list of every row that accounts for at least 1% of `n`;
# the list is sorted, so the first row below the threshold ends the loop.
print('<ul>')
total = 0
for title, headcount, _lowest, _greatest in ordered:
    if headcount / n < 0.01:
        break
    total += headcount
    share = f'{100 * headcount / n: >4.1f}'
    print(f'<li>{share}% {title}</li>')
print('</ul>')

# Combined share (in percent) of the rows listed above.
print('\n', 100 * total / n)

530 

<ul>
<li> 8.6% Unspecified; added by Aaron</li>
<li> 6.9% Other Managers</li>
<li> 4.6% Chief Executives And Legislators</li>
<li> 3.8% Software Developers</li>
<li> 3.5% Lawyers, And Judges, Magistrates, And Other Judicial Workers</li>
<li> 3.1% Physicians</li>
<li> 2.1% Financial Managers</li>
<li> 1.9% Accountants And Auditors</li>
<li> 1.8% Registered Nurses</li>
<li> 1.6% Postsecondary Teachers</li>
<li> 1.6% Management Analysts</li>
<li> 1.6% Computer And Information Systems Managers</li>
<li> 1.6% Sales Representatives, Wholesale And Manufacturing</li>
<li> 1.4% General And Operations Managers</li>
<li> 1.3% First-Line Supervisors Of Retail Sales Workers</li>
<li> 1.2% Other Engineers</li>
<li> 1.2% First-Line Supervisors Of Non-Retail Sales Workers</li>
<li> 1.1% Construction Managers</li>
<li> 1.1% Real Estate Brokers And Sales Agents</li>
<li> 1.1% Education And Childcare Administrators</li>
<li> 1.0% Sales Managers</li>
</ul>

 51.963637782372906
