## CPS Demographics

Data on headship, age, and education, calculated from basic monthly CPS microdata files

In [2]:
import sys
sys.path.append('../src')

import uschartbook.config

from uschartbook.config import *
from uschartbook.utils import *

### Headship

In [3]:
cols = ['QSTNUM', 'AGE', 'YEAR', 'MONTH', 'HHWGT', 'PWSSWGT']
df = pd.concat([pd.read_feather(f'{cps_dir}/cps{year}.ft', columns=cols)
                  .query('AGE > 15') 
                for year in range(1996, 2023)])

headship_rate = (lambda grp: grp.groupby('QSTNUM').HHWGT.first().sum()
                 / grp.PWSSWGT.sum())

data = (df.groupby(['YEAR', 'MONTH']).apply(headship_rate)).reset_index()
data['DATE'] = pd.to_datetime(dict(year=data.YEAR, month=data.MONTH, day=1))
data = data.set_index('DATE').drop(['YEAR', 'MONTH'], axis=1)

sm = x13_arima_analysis(data[0])
result = sm.seasadj * 100
result.name = 'value'
result.to_csv(data_dir / 'headship.csv', index_label='date', header=True)

color = 'purple!70!violet'
node = end_node(result, color, date='m', digits=2, full_year=True)
write_txt(text_dir / 'headship_node.txt', node)

low = result.min()
low_dt = dtxt(result.idxmin())['mon1']
lt = result.iloc[-1]
lt_dt = dtxt(result.index[-1])['mon1']
feb = result.loc['2020-02-01']

cl = c_line(color)
text = (f'The headship rate reached a low of {low:.2f} percent during '+
        f'{low_dt}, and is currently {lt:.2f} percent, as of {lt_dt} '+
        f'{cl}. ')
write_txt(text_dir / 'headship.txt', text)
print(text)
#In February 2020, the headship rate was {feb:.2f} percent.

The headship rate reached a low of 49.19 percent during May 2020, and is currently 50.08 percent, as of January 2022 (see {\color{purple!70!violet}\textbf{---}}). 


### Median Age

In [None]:
def median_age(df, wgt='PWSSWGT', percentile=0.5):
    '''
    Returns age associated with given percentile.
    
    Default is median (0.5).
    '''
    bins=np.arange(-1, 86, 1)
    cdf = (df.groupby(pd.cut(df.AGE, bins))
             [wgt].sum().cumsum() / df[wgt].sum())
    
    return np.interp(percentile, cdf, bins[1:])

cols = ['YEAR', 'MONTH', 'AGE', 'BASICWGT', 'LFS']
dfe = pd.concat([pd.read_feather(f'{cps_dir}/cps{year}.ft', 
                                columns=cols)
                for year in range(1989, 1994)])
dfl = pd.concat([pd.read_feather(f'{cps_dir}/cps{year}.ft', 
                                columns=cols + ['PWSSWGT'])
                for year in range(1994, 2023)])
df = dfe.append(dfl)

grps = [('AGE16PLUS', 'BASICWGT', 'YEAR > 1988 and AGE > 15'), 
        ('PWSSWGT', 'PWSSWGT', 'YEAR > 1988'), 
        ('BASICWGT', 'BASICWGT', 'YEAR < 1998'), 
        ('EMP', 'BASICWGT', 'LFS == "Employed"')]
date = lambda x: pd.to_datetime(dict(year=x.YEAR, 
                                     month=x.MONTH, day=1))
data = pd.concat(
    [df.query(query)
       .groupby(['YEAR', 'MONTH'])
       .apply(lambda x: median_age(x, wgt))
       .rename(name).reset_index()
       .assign(date = date).set_index('date')
       .drop(['YEAR', 'MONTH'], axis=1) 
     for name, wgt, query in grps], axis=1)
data['ALL'] = (data.loc['1989':'1993', 'BASICWGT']
               .append(data.loc['1994':, 'PWSSWGT']))
res = data.loc['1990':, ['AGE16PLUS', 'EMP', 'ALL']]
res.to_csv(data_dir / 'median_age.csv', 
           index_label='date', float_format='%g')

In [2]:
res = pd.read_csv(data_dir / 'median_age.csv', 
                  index_col='date', parse_dates=True)
allcol = 'blue!60!cyan'
a16col = 'violet'
empcol = 'red'
grps = [('ALL', allcol, None, 0), ('AGE16PLUS', a16col, 'm', 0.17),
        ('EMP', empcol, None, 0)]

nodes = '\n'.join([end_node(res[srs], col, date=dt, loc=loc, 
                            offset=offset,
                            full_year=True, colon=False) 
                   for (srs, col, dt, offset), loc 
                   in itertools.product(grps, ['end', 'start'])])
write_txt(text_dir / 'med_age_node.txt', nodes) 

def drop_zero(value):
    '''
    Return text string with value but drop .0 if value ends in .0
    '''
    return (f'{value:.0f}' if round(value, 1) % 1 == 0.0 
            else f'{value:.1f}')

ltval = drop_zero(res.ALL.iloc[-1])
ltdt = dtxt(res.index[-1])['mon1']
prval = drop_zero(res.ALL.iloc[0])
prdt = dtxt(res.index[0])['mon1']
ltwval = drop_zero(res.EMP.iloc[-1])
prwval = drop_zero(res.EMP.iloc[0])
text = ('The \\textbf{median age} is the midpoint for the age of a '+
        'group; half of the group is older and half is younger. '+
        'Tracking this point over time summarizes the age composition '+
        'of the group. As a population ages, the median age will '+
        'increase.\n\nThe median age of the overall civilian non'+
        'institutionalized population, calculated from the Current '+
        f'Population Survey (CPS), is {ltval}, as of {ltdt}, compared '+
        f'to {prval} in {prdt} {c_line(allcol)}. The median worker '+
        f'is {ltwval} in {ltdt}, and {prwval} in {prdt} '+
        f'{c_line(empcol)}.')
write_txt(text_dir / 'med_age.txt', text) 
print(text)

The \textbf{median age} is the midpoint for the age of a group; half of the group is older and half is younger. Tracking this point over time summarizes the age composition of the group. As a population ages, the median age will increase.

The median age of the overall civilian noninstitutionalized population, calculated from the Current Population Survey (CPS), is 38, as of January 2022, compared to 31.3 in January 1990 (see {\color{blue!60!cyan}\textbf{---}}). The median worker is 41 in January 2022, and 35.5 in January 1990 (see {\color{red}\textbf{---}}).


### Age Groups / Composition

In [34]:
grps = [('0--15', '0', '15'), ('16--24', '16', '24'),
        ('25--34', '25', '34'), ('35--44', '35', '44'),
        ('45--54', '45', '54'), ('55--64', '55', '64'),
        ('65--74', '65', '74'), ('0--17', '0', '17'),
        ('25--54', '25', '54'), ('65+', '65', '85')]
col = ['AGE', 'PWSSWGT']
lt = (cps_1mo(cps_dir, cps_date(), col)
      .groupby('AGE')[col[-1]].sum())
col = ['AGE', 'BASICWGT']
py = '1989'
pr = (cps_1mo(cps_dir, pd.to_datetime(f'{py}-01-01'), col)
      .groupby('AGE')[col[-1]].sum())
sh, tot = pd.DataFrame(), pd.DataFrame()
totlt, totpr = lt.sum() / 1_000_000, pr.sum() / 1_000_000
tot['Latest'] = (pd.Series({name: lt.loc[a1:a2].sum() 
                          for name, a1, a2 in grps}) 
                 / 1_000_000)
sh['Latest'] = (tot.Latest / totlt) * 100
tot['Prev'] = (pd.Series({name: pr.loc[a1:a2].sum() 
                          for name, a1, a2 in grps})
               / 1_000_000)
sh['Prev'] = (tot.Prev / totpr) * 100
sh.to_csv(data_dir / 'cps_age.csv', index_label='name', 
          float_format='%g')
ltdt = dtxt(cps_date())['mon1']

In [48]:
sht = sh.applymap('{:.1f} percent'.format)
tott = tot.applymap('{:.1f} million'.format)
text = ('The noninstitutionalized civilian population '+
        'used in most labor statistics totals '+
        f'{totlt:.1f} million in {ltdt}. '+
        f'Of this, {sht.loc["0--15","Latest"]} '+
        'are under the working age of 16, '+
        f'equivalent to {tott.loc["0--15","Latest"]} '+
        'people. In 1989, the under-16 population '+
        f'was {sht.loc["0--15","Prev"]} '+
        'of the total. The juvenile population, those '+
        f'under 18, is {tott.loc["0--17","Latest"]}, '+
        f'equivalent to {sht.loc["0--17","Latest"]} of '+
        f'the population in {ltdt}, and compared to '+
        f'{sht.loc["0--17","Prev"]} in 1989.\n\n'+
        'The core of workforce is '+
        'historically those between the ages of '+
        '25 and 54. The age 25--54 group contains '+
        f'{tott.loc["25--54", "Latest"]} people and '+
        f'comprises {sht.loc["25--54", "Latest"]} of '+
        f'the population in {ltdt}. The group made '+
        f'up {sht.loc["25--54", "Prev"]} of the '+
        'population in 1989. The age 55 to 64 group '+
        f'makes up {sht.loc["55--64", "Latest"]} of '+
        'the total in the latest data and '+
        f'{sht.loc["55--64", "Prev"]} in 1989. Those '+
        'above the age of 65 comprise '+
        f'{sht.loc["65+", "Latest"]} in {ltdt} and '+
        f'{sht.loc["65+", "Prev"]} in 1989. ')
write_txt(text_dir / 'cps_age.txt', text)    
print(text)

The noninstitutionalized civilian population used in most labor statistics totals 327.7 million in January 2022. Of this, 19.7 percent are under the working age of 16, equivalent to 64.5 million people. In 1989, the under-16 population was 23.4 percent of the total. The juvenile population, those under 18, is 73.5 million, equivalent to 22.4 percent of the population in January 2022, and compared to 26.3 percent in 1989.

The core of workforce is historically those between the ages of 25 and 54. The age 25--54 group contains 127.2 million people and comprises 38.8 percent of the population in January 2022. The group made up 42.3 percent of the population in 1989. The age 55 to 64 group makes up 12.9 percent of the total in the latest data and 8.9 percent in 1989. Those above the age of 65 comprise 17.0 percent in January 2022 and 11.9 percent in 1989. 


In [26]:
sh.to_csv(data_dir / 'cps_age.csv', index_label='name', 
          float_format='%g')

In [16]:
sh['Latest'] = pd.Series({name: lt.loc[a1:a2].sum() 
                          for name, a1, a2 in grps})
tot['Latest'] = sh.Latest / totlt
sh['Prev'] = pd.Series({name: pr.loc[a1:a2].sum() 
                          for name, a1, a2 in grps})
tot['Prev'] = sh.Prev / totpr

In [14]:
tot['Latest'] = sh.Latest / totlt

In [10]:
totlt

327677570.0

In [7]:
write_txt(text_dir / 'cps_age_dt.txt', dtxt(cps_date())['mon2'])

dates = [('Latest', cps_date()), 
         ('Prev', cps_date() - pd.DateOffset(years=1)), 
         ('1989', pd.to_datetime('1989-01-01'))]

d = {y: {} for y, d in dates}
t = {y: {} for y, d in dates}

for name, date in dates:
    wgt = 'PWSSWGT' if date.year > 1993 else 'BASICWGT'
    cols = ['AGE', wgt]
    df = cps_1mo(cps_dir, date, cols)

    data = df.groupby('AGE')[wgt].sum()
    tot = data.sum()
    t[name]['tot'] = tot
    d[name]['0--15'] = data.loc[:'15'].sum() / tot
    d[name]['16--24'] = data.loc['16':'24'].sum() / tot
    d[name]['25--34'] = data.loc['25':'34'].sum() / tot
    d[name]['35--44'] = data.loc['35':'44'].sum() / tot
    d[name]['45--54'] = data.loc['45':'54'].sum() / tot
    d[name]['55--64'] = data.loc['55':'64'].sum() / tot
    d[name]['65--74'] = data.loc['65':'74'].sum() / tot
    d[name]['0--17'] = data.loc[:'17'].sum() / tot
    d[name]['25--54'] = data.loc['25':'54'].sum() / tot
    d[name]['65+'] = data.loc['65':].sum() / tot
    
result = pd.DataFrame(d) * 100
result[['1989', 'Latest']].to_csv(data_dir / 'cps_age.csv', 
                                  index_label='name')

In [21]:
result

Unnamed: 0,Latest,Prev,1989
0--15,19.676375,19.748072,23.425871
16--24,11.559856,11.489636,13.436559
25--34,13.505641,13.781972,17.631367
35--44,13.075386,12.802543,14.623985
45--54,12.233938,12.201227,10.07791
55--64,12.927431,12.938403,8.88555
65--74,10.272503,10.145205,7.305468
0--17,22.438267,22.422472,26.346278
25--54,38.814965,38.785741,42.333257
65+,17.021368,17.038155,11.918764


In [None]:
The noninstitutionalized civilian population used in most labor statistics totals {ltval} million in {ltdt}. Of this, XX.X percent are under the working age of 16, equivalent to XX.X million people. In 1989, the under 16 population was XX.X percent of the total. The juvenile population, those under 18, is XX.X million, equivalent to XX.X percent of population, and compared to XX.X percent in 1989. 

The core of workforce is historically those between the ages of 25 and 54. The age 25--54 group contains XXX.X million people and comprises XX.X percent of the population in ltdt. The group makes up XX.X percent in 1989. The age 55 to 64 group makes up XX.X percent of the total in the latest data and XX.X percent in 1989. Those above the age of 65 comprise XX.X percent in ltdt and XX.X percent in 1989. 

In [3]:
write_txt(text_dir / 'cps_age_dt.txt', dtxt(cps_date())['mon2'])

dates = [('Latest', cps_date()), 
         ('Prev', cps_date() - pd.DateOffset(years=1)), 
         ('1989', pd.to_datetime('1989-12-01'))]

d = {y: {} for y, d in dates}
t = {y: {} for y, d in dates}

for name, date in dates:
    wgt = 'PWSSWGT' if date.year > 1993 else 'BASICWGT'
    cols = ['AGE', wgt]
    df = cps_1mo(cps_dir, date, cols)

    data = df.groupby('AGE')[wgt].sum()
    tot = data.sum()
    t[name]['tot'] = tot
    d[name]['0--15'] = data.loc[:'15'].sum() / tot
    d[name]['16--24'] = data.loc['16':'24'].sum() / tot
    d[name]['25--34'] = data.loc['25':'34'].sum() / tot
    d[name]['35--44'] = data.loc['25':'34'].sum() / tot
    d[name]['45--54'] = data.loc['25':'34'].sum() / tot
    d[name]['55--64'] = data.loc['25':'34'].sum() / tot
    d[name]['65--74'] = data.loc['65':'74'].sum() / tot
    d[name]['25--54'] = data.loc['25':'54'].sum() / tot
    d[name]['65+'] = data.loc['65':].sum() / tot
    
result = pd.DataFrame(d) * 100
result[['1989', 'Latest']].to_csv(data_dir / 'cps_age.csv', 
                                  index_label='name')

pop = f'{t["Latest"]["tot"] / 1_000_000:.0f}'
popp = f'{t["1989"]["tot"] / 1_000_000:.0f}'
ldate = dtxt(cps_date())['mon1']
popgr = f'{((t["Latest"]["tot"] / t["Prev"]["tot"]) - 1) * 100:.1f}'

u18 = d['Latest']['0--17'] * 100
prer = d['Latest']['45--64'] * 100
o64 = d['Latest']['65+'] * 100
u18p = d['1989']['0--17'] * 100
prerp = d['1989']['45--64'] * 100
o64p = d['1989']['65+'] * 100

# Population growth estimate retrieved from Census API
census = Path(text_dir / 'pop_growth_percent.txt').read_text()

qual = '; ' if popgr == census.replace(' percent', '') else ', though'

text = (f'The CPS civilian noninstitutionalized population is {pop} '+
        f'million in the year ending {ldate}, with growth of '+
        f'{popgr} percent over the past year{qual} the official Census '+
        f'population growth estimate is {census}. By age, '+
        f'{u18:.1f} percent are under the age of 18 and {o64:.1f} '+
        f'percent are age 65 or older. In 1989, the US population '+
        f'was {popp} million, with {u18p:.1f} percent under 18 and '+
        f'{o64p:.1f} percent 65 or older. The pre-retirement age '+
        '(45--64) share of the population has increased to '+
        f'{prer:.1f} percent in the year ending {ldate} from '+
        f'{prerp:.1f} percent in 1989. ')

write_txt(text_dir / 'cps_age.txt', text)    
print(text)

The CPS civilian noninstitutionalized population is 326 million in the year ending January 2022, with growth of 0.2 percent over the past year, though the official Census population growth estimate is 0.1 percent. By age, 22.3 percent are under the age of 18 and 17.3 percent are age 65 or older. In 1989, the US population was 244 million, with 26.3 percent under 18 and 12.0 percent 65 or older. The pre-retirement age (45--64) share of the population has increased to 25.0 percent in the year ending January 2022 from 18.9 percent in 1989. 


### Education

In [None]:
cols = ['EDUCDT', 'EDUC', 'BASICWGT', 'AGE', 'MONTH', 'YEAR', 'LFS', 'HRSACTT', 'FEMALE']

educdt2 = lambda x: np.where(x.EDUCDT == 'Some college but no degree', 'SCND', 
                    np.where(x.EDUCDT == 'Associate degree-occupational/vocational', 'VOC',
                    np.where(x.EDUCDT == 'Associate degree-academic program', 'AAD', x.EDUC)))

df = cps_12mo(cps_dir, cps_date(), cols).query('AGE > 24').assign(EDUCDT2 = educdt2)
date2000 = pd.to_datetime('2000-12-01')
df2 = cps_12mo(cps_dir, date2000, cols).query('AGE > 24').assign(EDUCDT2 = educdt2)

df3 = df.query('LFS == "Employed"')
df4 = df2.query('LFS == "Employed"')

data = pd.DataFrame()

data['2000'] = df2.groupby('EDUCDT2').BASICWGT.sum() / df2.BASICWGT.sum() * 100
data['latest'] = df.groupby('EDUCDT2').BASICWGT.sum() / df.BASICWGT.sum() * 100

data['2000_emp'] = df4.groupby('EDUCDT2').BASICWGT.sum() / df4.BASICWGT.sum() * 100
data['latest_emp'] = df3.groupby('EDUCDT2').BASICWGT.sum() / df3.BASICWGT.sum() * 100

data = data.loc[['LTHS', 'HS', 'SCND', 'VOC', 'AAD', 'COLL', 'ADV']]

data.index = ['No High School Diploma', 'High School Diploma', 'Some College, No Degree', 
              'Associate Degree,\\\*Vocational',
              'Associate Degree,\\\*Academic', "Bachelor's Degree", 'Advanced Degree']

data.to_csv(data_dir / 'cps_educ.csv', sep=';', index_label='name')

In [None]:
tot = (df.groupby('EDUC').BASICWGT.sum() / 12_000_000).loc[['LTHS', 'HS', 'SC', 'COLL', 'ADV']]
tot.index = ['No High School Diploma', 'High School Diploma', 'Some College or Associate Degree', 
             "Bachelor's Degree", 'Advanced Degree']
tot.to_csv(data_dir / 'cps_educ_tot.csv', index_label='name', header=True)

ltdate = dtxt(cps_date())['mon1']
write_txt(text_dir / 'cps_ltdate.txt', ltdate)
ba_adv_tot = tot["Bachelor's Degree"] + tot['Advanced Degree']
ba_adv_sh = (data.loc["Bachelor's Degree", 'latest'] + 
             data.loc['Advanced Degree', 'latest'])

adv_tot = tot['Advanced Degree']
adv_sh = data.loc['Advanced Degree', 'latest']

sc_tot = tot['Some College or Associate Degree']

hs_tot = tot['High School Diploma']

lths_tot = tot['No High School Diploma']

text = (f'Over the year ending {ltdate}, {ba_adv_tot:.1f} million '+
        f'people over the age of 25, or {ba_adv_sh:.1f} percent of the total, '+
        f"have at least a bachelor's degree, with {adv_tot:.1f} million of "+
        f'those, or {adv_sh:.1f} percent of the total, holding '+
        "an advanced degree such as a master's degree, medical or law degree, or PhD. "+
        f'An additional {sc_tot:.1f} million people have some college coursework '+
        f'but no degree or have an associate degree. A total of {hs_tot:.1f} million '+
        f'have a high school diploma but no college, while {lths_tot:.1f} million '+
        f'have no high school diploma.')
write_txt(text_dir / 'cps_educ.txt', text)
write_txt(text_dir / 'cps_ltdate.txt', ltdate)
print(text)

In [None]:
ba_adv_sh_pr = (data.loc["Bachelor's Degree", '2000'] + 
                data.loc['Advanced Degree', '2000'])
ba_adv_sh_ch = ba_adv_sh - ba_adv_sh_pr
ba_adv_sh_emp = (data.loc["Bachelor's Degree", 'latest_emp'] + 
                 data.loc['Advanced Degree', 'latest_emp'])
ba_adv_sh_emp_pr = (data.loc["Bachelor's Degree", '2000_emp'] + 
                    data.loc['Advanced Degree', '2000_emp'])
ba_adv_sh_emp_ch = ba_adv_sh_emp - ba_adv_sh_emp_pr

text = ("The share of the population with a bachelor's degree or advanced degree "+
        f"increased by {ba_adv_sh_ch:.1f} percentage points since 2000. The increase "+
        "is even more pronounced among those who are employed; "+
        f"{ba_adv_sh_emp:.1f} percent have a college degree or advanced degree in {ltdate}, an "+
        f"increase of {ba_adv_sh_emp_ch:.1f} percentage points since 2000. ")
write_txt(text_dir / 'cps_educ2.txt', text)
print(text)