In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the main dataset; later cells read its PINCP and OCCP columns.
data = pd.read_csv('data/data.csv')

In [3]:
# Keep only the rows with a strictly positive PINCP, as an independent copy
# so later edits to `subset` never touch `data`.
subset = data.loc[data['PINCP'] > 0].copy()

In [4]:
# Replace missing OCCP values with the sentinel code 9999.
# Assign the result back instead of calling fillna(inplace=True) on the
# column accessor: that chained in-place form emits a FutureWarning in
# pandas 2.x and silently does nothing once Copy-on-Write is enabled.
subset['OCCP'] = subset['OCCP'].fillna(9999)

In [5]:
# Narrow the subset further to rows whose PINCP exceeds 1,600,000.
subset = subset.loc[subset['PINCP'] > 1_600_000].copy()

In [6]:
# Load the occupation code table; the next cell reads its 'code' and
# 'occupation' columns.
codes = pd.read_csv('data/codes.csv')

In [7]:
# Build a lookup from numeric occupation code -> occupation name.
code_name = {}
for raw_code, occupation in zip(codes['code'], codes['occupation']):
    try:
        code = float(raw_code)
    except ValueError:
        # The code text is not a single number; skip the row.
        continue
    if pd.isna(code):
        # float() accepts NaN without raising, so blank codes would otherwise
        # be stored as NaN keys. NaN never compares equal to itself, so such
        # keys can never be looked up and only inflate len(code_name).
        continue
    code_name[code] = occupation

In [8]:
# Sanity check: number of entries in the code -> occupation lookup.
len(code_name)

531

In [9]:
# Load the category table; a later cell reads its 'category', 'lowest_code'
# and 'greatest_code' columns.
categories = pd.read_csv('data/categories.csv')

In [10]:
# Map each OCCP code present in the subset to the number of rows carrying it.
counts = dict(subset['OCCP'].value_counts())

In [11]:
# Each row of `results` is [name, count, first_code, last_code].
results = []

# Leaf rows: one per occupation code seen in the subset (first == last).
for code, count in counts.items():
    # Fall back to a placeholder label rather than raising KeyError when a
    # code is missing from the lookup (e.g. the 9999 sentinel that replaces
    # missing OCCP values).
    label = code_name.get(code, f'Unknown occupation ({code})')
    results.append([label, count, code, code])

# Category rows: the count is the sum over every observed code that falls
# inside the category's inclusive [lowest_code, greatest_code] range.
for index, row in categories.iterrows():
    low, high = row['lowest_code'], row['greatest_code']
    total = sum(count for code, count in counts.items() if low <= code <= high)
    results.append([row['category'], total, low, high])

In [12]:
# Row count of the filtered subset; used as the denominator for the
# percentages printed in later cells.
n = len(subset)

In [13]:
# Order rows by first code ascending; ties go to the larger last code, so a
# wider span is listed before the narrower spans and leaves that start at
# the same code.
ordered = list(results)
ordered.sort(key=lambda entry: (entry[2], -entry[3]))

In [14]:
# Peek at the first three rows of the sorted list.
ordered[:3]

[['Management, Business, Science, and Arts Occupations', 3, 10, 3550],
 ['Management, Business, and Financial Occupations', 1, 10, 960],
 ['Management Occupations', 1, 10, 440]]

In [15]:
# Print `ordered` as nested HTML: spans (first != last) become
# <details>/<summary> sections and leaf rows (first == last) become <li>
# items grouped in a <ul>.
#
# `stack` holds the last code of every currently open <details>; the bottom
# entry is a sentinel that is never popped.
stack = [99999]
in_list = False

for name, number, first, last in ordered:
    percent = f'{100 * number / n: >5.2f}'
    is_leaf = first == last

    # An open <ul> must be closed before a new span starts, and also before
    # any enclosing span that ended ahead of this row is closed.
    if in_list and (not is_leaf or first > stack[-1]):
        print('</ul>')
        in_list = False

    # Close every open span that ends before this row begins. This runs for
    # leaves too, so a leaf lying past the current span is not nested in it.
    # The length guard keeps the sentinel in place for codes above 99999.
    while len(stack) > 1 and first > stack[-1]:
        print('</details>')
        stack.pop()

    if is_leaf:
        if not in_list:
            print('<ul>')
            in_list = True
        print(f'<li>{percent}% {name}</li>')
    else:
        print(f'<details><summary>{percent}% {name}</summary>')
        stack.append(last)

# Close whatever is still open so the emitted HTML is balanced.
if in_list:
    print('</ul>')
    in_list = False
while len(stack) > 1:
    print('</details>')
    stack.pop()

<details><summary>100.00% Management, Business, Science, and Arts Occupations</summary>
<details><summary>33.33% Management, Business, and Financial Occupations</summary>
<details><summary>33.33% Management Occupations</summary>
<ul>
<li>33.33% Financial Managers</li>
</ul>
</details>
<details><summary> 0.00% Business and Financial Operations Occupations</summary>
</details>
</details>
<details><summary> 0.00% Computer, Engineering, and Science Occupations</summary>
<details><summary> 0.00% Computer and Mathematical Occupations</summary>
</details>
<details><summary> 0.00% Architecture and Engineering Occupations</summary>
</details>
<details><summary> 0.00% Life, Physical, and Social Science Occupations</summary>
</details>
</details>
<details><summary>33.33% Education, Legal, Community Service, Arts, and Media Occupations</summary>
<details><summary> 0.00% Community and Social Service Occupations</summary>
</details>
<details><summary> 0.00% Legal Occupations</summary>
</details>
<de

In [16]:
# Up to ten leaf rows (first code == last code), highest count first.
[row for row in sorted(results, key=lambda x: -x[1]) if row[2] == row[3]][:10]

[['Financial Managers', 1, 120.0, 120.0],
 ['Physicians', 1, 3090.0, 3090.0],
 ['Postsecondary Teachers', 1, 2205.0, 2205.0]]

In [17]:
# Total number of rows in results (leaf rows plus category rows).
len(results)

36

In [20]:
# Number of leaf rows (first code == last code) in results.
print(sum(1 for row in results if row[2] == row[3]), '\n')

# Leaf rows only, highest count first.
ordered = sorted((row for row in results if row[2] == row[3]),
                 key=lambda entry: -entry[1])

# Emit an HTML list of every leaf covering at least 1% of the subset.
# The rows are in descending count order, so the loop can stop at the first
# one that falls below the threshold.
print('<ul>')
total = 0
for name, number, first, last in ordered:
    if number / n < 0.01:
        break
    total += number
    print(f'<li>{100 * number / n: >4.1f}% {name}</li>')
print('</ul>')

# Combined share (in percent) of the occupations listed above.
print('\n', 100 * total / n)

3 

<ul>
<li>33.3% Financial Managers</li>
<li>33.3% Physicians</li>
<li>33.3% Postsecondary Teachers</li>
</ul>

 100.0


In [19]:
# Show the full results list; each row is [name, count, first code, last code].
results

[['Financial Managers', 1, 120.0, 120.0],
 ['Physicians', 1, 3090.0, 3090.0],
 ['Postsecondary Teachers', 1, 2205.0, 2205.0],
 ['Management, Business, Science, and Arts Occupations', 3, 10, 3550],
 ['Management, Business, and Financial Occupations', 1, 10, 960],
 ['Management Occupations', 1, 10, 440],
 ['Business and Financial Operations Occupations', 0, 500, 960],
 ['Computer, Engineering, and Science Occupations', 0, 1005, 1980],
 ['Computer and Mathematical Occupations', 0, 1005, 1240],
 ['Architecture and Engineering Occupations', 0, 1305, 1560],
 ['Life, Physical, and Social Science Occupations', 0, 1600, 1980],
 ['Education, Legal, Community Service, Arts, and Media Occupations',
  1,
  2001,
  2970],
 ['Community and Social Service Occupations', 0, 2001, 2060],
 ['Legal Occupations', 0, 2100, 2180],
 ['Educational Instruction, and Library Occupations', 1, 2205, 2555],
 ['Arts, Design, Entertainment, Sports, and Media Occupations', 0, 2600, 2970],
 ['Healthcare Practitioners and