In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the main data table; later cells use its PINCP and OCCP columns.
data = pd.read_csv('data/data.csv')

In [3]:
# Keep only the rows whose PINCP is strictly positive, as an independent copy
# so that later edits to `subset` never touch `data`.
subset = data.loc[data['PINCP'].gt(0)].copy()

In [4]:
# Give rows without an occupation code the placeholder 9999 so that they are
# still counted by the later value_counts() call.
# The filled column is assigned back rather than calling
# `subset.OCCP.fillna(..., inplace=True)`: that chained in-place form works on a
# temporary Series under pandas Copy-on-Write (and emits a FutureWarning on
# pandas 2.x), which would leave `subset` unchanged.
subset['OCCP'] = subset['OCCP'].fillna(9999)

In [5]:
# Narrow `subset` further to the rows whose PINCP exceeds 200,000.
subset = subset.loc[subset['PINCP'].gt(200_000)].copy()

In [6]:
# Read the code table; its 'code' and 'occupation' columns feed the lookup built below.
codes = pd.read_csv('data/codes.csv')

In [7]:
# Build the lookup {numeric code -> occupation name} from the code table.
code_name = {}
for raw_code, occupation in zip(codes['code'], codes['occupation']):
    try:
        numeric_code = float(raw_code)
    except ValueError:
        # The code is not a number: leave this row out of the lookup.
        continue
    code_name[numeric_code] = occupation

In [8]:
# How many codes parsed as numbers and were kept in the lookup.
len(code_name)

531

In [9]:
# Read the category table; later cells use its 'category', 'lowest_code' and
# 'greatest_code' columns.
categories = pd.read_csv('data/categories.csv')

In [10]:
# Map every OCCP value in `subset` to the number of rows that carry it.
counts = dict(subset['OCCP'].value_counts())

In [11]:
# Every entry is [name, count, first code, last code].
results = []

# Single occupations: the span starts and ends on the same code.
results.extend([code_name[code], count, code, code]
               for code, count in counts.items())

# Categories: add up the counts of every code that falls inside the
# category's inclusive [lowest_code, greatest_code] range.
for label, low, high in zip(categories['category'],
                            categories['lowest_code'],
                            categories['greatest_code']):
    total = sum(count for code, count in counts.items() if low <= code <= high)
    results.append([label, total, low, high])

In [12]:
# Number of rows in `subset`; used as the denominator for every percentage below.
n = subset.shape[0]

In [13]:
# Order by first code, and for equal first codes put the widest span first, so
# an enclosing category always precedes the entries it contains.
ordered = list(results)
ordered.sort(key=lambda entry: (entry[2], -entry[3]))

In [14]:
# Peek at the first three entries: [name, count, first code, last code].
ordered[:3]

[['Management, Business, Science, and Arts Occupations', 48227, 10, 3550],
 ['Management, Business, and Financial Occupations', 26432, 10, 960],
 ['Management Occupations', 20092, 10, 440]]

In [15]:
# Print the hierarchy as nested HTML: every span (first != last) becomes a
# <details> element and every run of single-code rows becomes a <ul>.
# `ordered` is sorted by (first code, widest span first), so an enclosing span
# is always visited before the rows it contains.
stack = []  # `last` codes of the <details> elements that are currently open
in_list = False  # True while a <ul> is open

for name, number, first, last in ordered:
    percent = f'{100 * number / n: >5.2f}'

    # Close every open span that ends before this row starts. This runs for
    # single-code rows as well, so a code that lies past the open span is not
    # listed inside it. The `stack and` guard replaces the old 99999 sentinel,
    # which a larger code would have popped before raising IndexError.
    while stack and first > stack[-1]:
        if in_list:
            # A list has to be closed before the <details> that contains it.
            print('</ul>')
            in_list = False
        print('</details>')
        stack.pop()

    if first == last:
        # Single code: emit a list item, opening a list first if needed.
        if not in_list:
            print('<ul>')
            in_list = True
        print(f'<li>{percent}% {name}</li>')
        continue

    # A span: finish any list in progress, then open a new <details>.
    if in_list:
        print('</ul>')
        in_list = False
    print(f'<details><summary>{percent}% {name}</summary>')
    stack.append(last)

# Balance the markup: close the trailing list and every span still open.
if in_list:
    print('</ul>')
    in_list = False
while stack:
    print('</details>')
    stack.pop()

<details><summary>71.12% Management, Business, Science, and Arts Occupations</summary>
<details><summary>38.98% Management, Business, and Financial Occupations</summary>
<details><summary>29.63% Management Occupations</summary>
<ul>
<li> 9.02% Chief Executives And Legislators</li>
<li> 1.17% General And Operations Managers</li>
<li> 0.09% Advertising And Promotions Managers</li>
<li> 0.78% Marketing Managers</li>
<li> 1.17% Sales Managers</li>
<li> 0.08% Public Relations And Fundraising Managers</li>
<li> 0.04% Administrative Services Managers</li>
<li> 0.04% Facilities Managers</li>
<li> 1.17% Computer And Information Systems Managers</li>
<li> 2.35% Financial Managers</li>
<li> 0.01% Compensation And Benefits Managers</li>
<li> 0.28% Human Resources Managers</li>
<li> 0.03% Training And Development Managers</li>
<li> 0.24% Industrial Production Managers</li>
<li> 0.18% Purchasing Managers</li>
<li> 0.08% Transportation, Storage, And Distribution Managers</li>
<li> 0.96% Farmers, Ranc

In [16]:
# The ten single-code entries with the largest counts, biggest first.
sorted((entry for entry in results if entry[2] == entry[3]),
       key=lambda entry: -entry[1])[:10]

[['Unspecified; added by Aaron', 7204, 9999.0, 9999.0],
 ['Chief Executives And Legislators', 6119, 10.0, 10.0],
 ['Physicians', 5554, 3090.0, 3090.0],
 ['Other Managers', 5153, 440.0, 440.0],
 ['Lawyers, And Judges, Magistrates, And Other Judicial Workers',
  4048,
  2100.0,
  2100.0],
 ['Software Developers', 1768, 1021.0, 1021.0],
 ['Financial Managers', 1591, 120.0, 120.0],
 ['Management Analysts', 1242, 710.0, 710.0],
 ['Accountants And Auditors', 1239, 800.0, 800.0],
 ['Personal Financial Advisors', 1201, 850.0, 850.0]]

In [17]:
# Total number of entries collected: single-code rows plus category rows.
len(results)

533

In [19]:
# How many entries are single-code rows (first code == last code).
print(len([row for row in results if row[2] == row[3]]), '\n')

# Single-code rows only, largest count first; ties keep their order in `results`.
ordered = sorted((row for row in results if row[2] == row[3]),
                 key=lambda row: -row[1])

# List every single-code row that holds at least 1% of `n`.
print('<ul>')
total = 0
for name, number, first, last in ordered:
    if number / n < 0.01:
        # Counts only go down from here, so no later row can reach 1%.
        break
    percent = f'{100 * number / n: >4.1f}'
    print(f'<li>{percent}% {name}</li>')
    total += number
print('</ul>')

# Combined share (in percent) of the rows listed above.
print('\n', 100 * total / n)

500 

<ul>
<li>10.6% Unspecified; added by Aaron</li>
<li> 9.0% Chief Executives And Legislators</li>
<li> 8.2% Physicians</li>
<li> 7.6% Other Managers</li>
<li> 6.0% Lawyers, And Judges, Magistrates, And Other Judicial Workers</li>
<li> 2.6% Software Developers</li>
<li> 2.3% Financial Managers</li>
<li> 1.8% Management Analysts</li>
<li> 1.8% Accountants And Auditors</li>
<li> 1.8% Personal Financial Advisors</li>
<li> 1.7% Real Estate Brokers And Sales Agents</li>
<li> 1.4% Sales Representatives, Wholesale And Manufacturing</li>
<li> 1.4% Postsecondary Teachers</li>
<li> 1.2% First-Line Supervisors Of Non-Retail Sales Workers</li>
<li> 1.2% General And Operations Managers</li>
<li> 1.2% Sales Managers</li>
<li> 1.2% Computer And Information Systems Managers</li>
<li> 1.2% First-Line Supervisors Of Retail Sales Workers</li>
<li> 1.1% Dentists</li>
</ul>

 63.22356098927839
