In [5]:
import numpy as np
import pandas as pd

In [2]:
# Load the main dataset; later cells read its PINCP and OCCP columns.
data_csv_path = 'data/data.csv'
data = pd.read_csv(data_csv_path)

In [3]:
# Keep an independent copy of the rows whose PINCP is strictly positive
# (rows with a missing PINCP fail the comparison and are dropped too).
has_positive_pincp = data['PINCP'] > 0
subset = data.loc[has_positive_pincp].copy()

In [4]:
# Rows with no OCCP value get the placeholder code 9999 so they are still
# counted later (value_counts() drops missing values by default).
#
# The result is assigned back to the column instead of calling
# `subset.OCCP.fillna(..., inplace=True)`: that form mutates an intermediate
# Series, which under pandas Copy-on-Write does not update `subset` at all
# (and raises a FutureWarning on pandas 2.x).
subset['OCCP'] = subset['OCCP'].fillna(9999)

In [6]:
# Display the 99th-percentile PINCP value among the rows kept so far.
pincp_values = subset['PINCP']
np.quantile(pincp_values, q=0.99)

392400.0

In [7]:
# Keep only rows above the 99th percentile of positive PINCP
# (PINCP is presumably personal income -- confirm against the data dictionary).
#
# The cutoff is recomputed here rather than pasted from a previous cell's
# output, so it stays correct if the input file changes. It is derived from
# `data` (not from `subset`) so that re-running this cell after `subset` has
# already been narrowed still yields the same cutoff.
top_percentile_cutoff = np.quantile(data.PINCP[data.PINCP > 0], 0.99)
subset = subset[subset.PINCP > top_percentile_cutoff].copy()

In [8]:
# Load the lookup table; later cells read its 'code' and 'occupation' columns.
codes_csv_path = 'data/codes.csv'
codes = pd.read_csv(codes_csv_path)

In [9]:
# Map numeric occupation code -> occupation name.
# Entries whose 'code' cannot be parsed by float() (ValueError) are skipped;
# when a code repeats, the later row wins.
code_name = {}
for raw_code, occupation in zip(codes['code'], codes['occupation']):
    try:
        numeric_code = float(raw_code)
    except ValueError:
        pass
    else:
        code_name[numeric_code] = occupation

In [10]:
# Sanity check: how many codes parsed as numbers and ended up in the lookup.
len(code_name)

531

In [11]:
# Load the category table; later cells read its 'category', 'lowest_code'
# and 'greatest_code' columns.
categories_csv_path = 'data/categories.csv'
categories = pd.read_csv(categories_csv_path)

In [12]:
# Number of rows per OCCP code, as a plain dict keyed by code.
occp_tally = subset['OCCP'].value_counts()
counts = {occp_code: occp_tally[occp_code] for occp_code in occp_tally.keys()}

In [13]:
# Every row of `results` is [name, count, lowest code, greatest code].

# One row per individual occupation code: its span is the single code itself.
results = [[code_name[code], count, code, code] for code, count in counts.items()]

# One roll-up row per category: the sum of the counts of all codes that fall
# inside the category's inclusive [lowest_code, greatest_code] range.
for _, category in categories.iterrows():
    low = category['lowest_code']
    high = category['greatest_code']
    total = sum(count for code, count in counts.items() if low <= code <= high)
    results.append([category['category'], total, low, high])

In [14]:
# Number of rows in `subset`; used as the denominator for every percentage below.
n = subset.shape[0]

In [15]:
def _outline_position(row):
    """Sort key: ascending lowest code, then descending greatest code.

    For rows that share a lowest code, the wider span therefore comes first.
    """
    return row[2], -row[3]


ordered = list(results)
ordered.sort(key=_outline_position)

In [16]:
# Peek at the first three rows; each is [name, count, lowest code, greatest code].
ordered[:3]

[['Management, Business, Science, and Arts Occupations', 18552, 10, 3550],
 ['Management, Business, and Financial Occupations', 10270, 10, 960],
 ['Management Occupations', 7908, 10, 440]]

In [17]:
# Print `ordered` as nested HTML:
#   * a row whose span covers more than one code becomes <details><summary>...,
#   * a row for a single code (first == last) becomes an <li> inside a <ul>.
#
# `stack` holds the greatest code of every currently open <details>; the
# bottom entry is an infinite sentinel so `stack[-1]` is always valid and is
# never popped. `in_list` tracks whether a <ul> is currently open.
stack = [float('inf')]
in_list = False

for name, number, first, last in ordered:
    percent = f'{100 * number / n: >5.2f}'

    # Close every span that ends before this row starts. This runs for leaf
    # rows as well, so a code lying outside the current span is not printed
    # inside it. An open <ul> must be closed before its enclosing </details>.
    if first > stack[-1]:
        if in_list:
            print('</ul>')
            in_list = False
        while first > stack[-1]:
            print('</details>')
            stack.pop()

    if first == last:
        # Single occupation: emit a list item, opening the list if needed.
        if not in_list:
            print('<ul>')
            in_list = True
        print(f'<li>{percent}% {name}</li>')
        continue

    # A nested span starts: finish any list of sibling leaves first.
    if in_list:
        print('</ul>')
        in_list = False
    print(f'<details><summary>{percent}% {name}</summary>')
    stack.append(last)

# Close whatever is still open so the emitted HTML is balanced.
if in_list:
    print('</ul>')
    in_list = False
while len(stack) > 1:
    print('</details>')
    stack.pop()

<details><summary>77.63% Management, Business, Science, and Arts Occupations</summary>
<details><summary>42.98% Management, Business, and Financial Occupations</summary>
<details><summary>33.09% Management Occupations</summary>
<ul>
<li>12.87% Chief Executives And Legislators</li>
<li> 1.18% General And Operations Managers</li>
<li> 0.07% Advertising And Promotions Managers</li>
<li> 0.64% Marketing Managers</li>
<li> 1.11% Sales Managers</li>
<li> 0.06% Public Relations And Fundraising Managers</li>
<li> 0.03% Administrative Services Managers</li>
<li> 0.02% Facilities Managers</li>
<li> 0.80% Computer And Information Systems Managers</li>
<li> 2.95% Financial Managers</li>
<li> 0.01% Compensation And Benefits Managers</li>
<li> 0.27% Human Resources Managers</li>
<li> 0.03% Training And Development Managers</li>
<li> 0.23% Industrial Production Managers</li>
<li> 0.13% Purchasing Managers</li>
<li> 0.06% Transportation, Storage, And Distribution Managers</li>
<li> 0.70% Farmers, Ranc

In [18]:
# Ten most common individual occupations (rows whose span is a single code).
# sorted() is stable, also with reverse=True, so ties keep their `results` order.
single_code_rows = [row for row in results if row[2] == row[3]]
sorted(single_code_rows, key=lambda row: row[1], reverse=True)[:10]

[['Chief Executives And Legislators', 3076, 10.0, 10.0],
 ['Physicians', 2861, 3090.0, 3090.0],
 ['Other Managers', 1904, 440.0, 440.0],
 ['Lawyers, And Judges, Magistrates, And Other Judicial Workers',
  1759,
  2100.0,
  2100.0],
 ['Unspecified; added by Aaron', 1009, 9999.0, 9999.0],
 ['Financial Managers', 706, 120.0, 120.0],
 ['Personal Financial Advisors', 578, 850.0, 850.0],
 ['Accountants And Auditors', 450, 800.0, 800.0],
 ['Management Analysts', 447, 710.0, 710.0],
 ['Real Estate Brokers And Sales Agents', 438, 4920.0, 4920.0]]

In [19]:
# Total number of rows in `results` (individual occupations plus category roll-ups).
len(results)

482

In [21]:
# Flat HTML list of every individual occupation that accounts for at least
# 1% of `subset`, followed by the combined percentage of the listed rows.
MIN_SHARE = 0.01

leaf_rows = [row for row in results if row[2] == row[3]]
print(len(leaf_rows), '\n')

# NOTE: this rebinds `ordered` (previously the hierarchy ordering) to the
# single-code rows sorted by descending count.
ordered = sorted(leaf_rows, key=lambda row: row[1], reverse=True)

print('<ul>')
total = 0
for name, number, first, last in ordered:
    # Rows are in descending count order, so the first one below the
    # threshold ends the list.
    if number / n < MIN_SHARE:
        break
    total += number
    print(f'<li>{100 * number / n: >4.1f}% {name}</li>')
print('</ul>')

print('\n', 100 * total / n)

449 

<ul>
<li>12.9% Chief Executives And Legislators</li>
<li>12.0% Physicians</li>
<li> 8.0% Other Managers</li>
<li> 7.4% Lawyers, And Judges, Magistrates, And Other Judicial Workers</li>
<li> 4.2% Unspecified; added by Aaron</li>
<li> 3.0% Financial Managers</li>
<li> 2.4% Personal Financial Advisors</li>
<li> 1.9% Accountants And Auditors</li>
<li> 1.9% Management Analysts</li>
<li> 1.8% Real Estate Brokers And Sales Agents</li>
<li> 1.7% Software Developers</li>
<li> 1.5% Securities, Commodities, And Financial Services Sales Agents</li>
<li> 1.5% Dentists</li>
<li> 1.4% Surgeons</li>
<li> 1.3% Sales Representatives, Wholesale And Manufacturing</li>
<li> 1.3% First-Line Supervisors Of Retail Sales Workers</li>
<li> 1.2% General And Operations Managers</li>
<li> 1.2% First-Line Supervisors Of Non-Retail Sales Workers</li>
<li> 1.1% Sales Managers</li>
<li> 1.0% Postsecondary Teachers</li>
</ul>

 68.5023224672553
