### bd econ CPS extract

bd_CPS_revisions_reader.ipynb

January 2026 (UPDATE)

Contact: Brian Dew, @bd_econ

-----

Reads in Census revised data and stores it as feather files for merging with the bd CPS. This also separately includes revised union data.

**2000-based revised weights:**

Requires: `2000-2extract.txt` data dictionary and unzipped 2000 Based Public Use Extracts from [Census](https://thedataweb.rm.census.gov/ftp/cps_ftp.html#cpsbasic_extract). 

**2001-2002 revised union data:**

Requires: `2000-2extract.txt` data dictionary and unzipped 2000 Based Public Use Extracts from [Census](https://thedataweb.rm.census.gov/ftp/cps_ftp.html#cpsbasic_extract). 

**2000-2002 recoded occupation and industry:**

Requires: `2000-2extract.txt` data dictionary and unzipped 2000 Based Public Use Extracts from [Census](https://thedataweb.rm.census.gov/ftp/cps_ftp.html#cpsbasic_extract). 

**December 2007 revised weights:**

Requires: `dec07revwgts_dd.txt` data dictionary and unzipped `dec07revwgts.dat` from [Census CPS FTP](https://thedataweb.rm.census.gov/ftp/cps_ftp.html). 

-----

Reads in Certification extracts for 2015 and 2016, which identify people with a professional certification, and go into the bd CPS variable CERT.

**2015-2016 certification data:**

Requires `Certification_extract_file_YYYY_rec_layout.txt` data dictionaries for YYYY in 2015 and 2016, and unzipped data files from [Census CPS FTP](https://thedataweb.rm.census.gov/ftp/cps_ftp.html#cpscert).

-----

Reads disability flag for late 2008 observations.

**June to December 2008 disability flag data:**

Requires the unzipped `disability.dat` data file from [Census CPS FTP](https://thedataweb.rm.census.gov/ftp/cps_ftp.html#cpsbasic_extract). Field positions are hard-coded in the reader cell below, so no data dictionary file is read.

----

**Telework supplement (October 2022 - May 2024):**

Requires: Unzipped telework supplement files from [Census CPS FTP](https://thedataweb.rm.census.gov/ftp/cps_ftp.html).

----

See [related GitHub issue](https://github.com/bdecon/econ_data/issues/82) for explanation.

In [1]:
# Import relevant libraries (python 3.7)
import os, re, struct
import pandas as pd
print('pandas:', pd.__version__)
import numpy as np
print('numpy:', np.__version__)

import sys
sys.path.insert(0, '/home/brian/Documents/econ_data/bd_CPS')
from bd_CPS_utils import id_dtype, create_struct_unpacker, normalize_weights

os.chdir('/home/brian/Documents/CPS/data')

pandas: 2.3.3
numpy: 2.3.5


In [2]:
# User-defined functions
# Note: id_dtype, create_struct_unpacker, normalize_weights imported from bd_CPS_utils

def data_dict_reader(dd_file, var_names):
    '''Read data dictionary and return variable locations

    dd_file: path of the data dictionary text file (read as iso-8859-1).
    var_names: variable names to look up. They are joined with "|" into a
        regex alternation, so each name is treated as a regex fragment.

    Returns a dict keyed by matched variable name. Each value is
    [start, end, width_code, dtype] where, counting the numbers captured
    after the name, start is the second number minus one, end is the third
    number, width_code is the first number followed by "s", and dtype is
    id_dtype() applied to the first number (as a string).
    An empty or non-matching dictionary file yields an empty dict.
    '''
    # Context manager so the handle is closed as soon as the text is read
    with open(dd_file, 'r', encoding='iso-8859-1') as dd:
        data_dict = dd.read()
    names = '|'.join(var_names)
    # Raw f-strings keep \s and \d as regex escapes rather than (invalid)
    # string escapes, which newer Python versions warn about.
    if data_dict.startswith('\t') and dd_file == '2000-2extract.txt':
        # Updated data dictionary from Census (tab-delimited layout)
        p = rf'({names})\s+(\d+)\s+.*?\t+.*?(\d+).*?(\d\d*)'
    else:
        p = rf'({names})\s+(\d+)\s+.*? \s+.*?(\d+).*?(\d\d*)'
    return {s[0]: [int(s[2]) - 1, int(s[3]), f'{s[1]}s', id_dtype(s[1])]
            for s in re.findall(p, data_dict)}

def struct_unpacker(d):
    '''Return struct unpacker from variable locations

    Thin wrapper: forwards the variable-location dict `d` (for example the
    result of data_dict_reader) to create_struct_unpacker, imported from
    bd_CPS_utils, with width_has_suffix=True.
    NOTE(review): presumably the flag signals that the width entries in `d`
    already carry the struct "s" suffix (e.g. '8s') -- confirm in bd_CPS_utils.
    '''
    return create_struct_unpacker(d, width_has_suffix=True)

def data_file_reader(file, unpacker, dtypes, wgt):
    '''Convert raw monthly file to dataframe

    file: path of the raw data file; opened in binary mode and read line
        by line.
    unpacker: callable applied to each raw line, returning one record.
    dtypes: list of (name, dtype) pairs used to build a NumPy structured
        array from the unpacked records.
    wgt: name of the field used to filter rows; only rows where this field
        is greater than zero are kept. Pass the string 'None' (or None) to
        keep every row.

    Returns a pandas DataFrame with one column per entry in dtypes.
    '''
    # Context manager closes the handle even if a row fails to unpack
    with open(file, 'rb') as raw_data:
        data = [unpacker(row) for row in raw_data]
    np_data = np.array(data, dtype=dtypes)
    # The string 'None' is the existing "no filter" sentinel; a real None is
    # accepted as well instead of being used as an array index.
    if wgt is not None and wgt != 'None':
        np_data = np_data[np_data[wgt] > 0]
    return pd.DataFrame(np_data)

def df_adjuster(df, wgt_vars=None):
    '''Rename date columns, make YEAR categorical and reset the index

    HRYEAR4 becomes YEAR (stored as a pandas Categorical) and HRMONTH becomes
    MONTH. When wgt_vars is given, normalize_weights is called on the renamed
    frame with that list. The input frame itself is not modified.
    '''
    renamed = df.rename(columns={'HRYEAR4': 'YEAR', 'HRMONTH': 'MONTH'})
    adjusted = renamed.assign(YEAR=pd.Categorical(renamed['YEAR']))
    if wgt_vars is not None:
        normalize_weights(adjusted, wgt_vars)
    return adjusted.reset_index(drop=True)

  p = f'({"|".join(var_names)})\s+(\d+)\s+.*? \s+.*?(\d+).*?(\d\d*)'
  p = f'({"|".join(var_names)})\s+(\d+)\s+.*?\t+.*?(\d+).*?(\d\d*)'


#### Map person weight to HH weight

In [3]:
# Build hhw: year -> month -> {QSTNUM: OCCURNUM}, taking for each QSTNUM the
# first record whose PWSSWGT equals HWHHWGT.
var_names = ['HWHHWGT', 'QSTNUM', 'PWSSWGT', 'OCCURNUM']
# Raw string so \w, \s and \d reach the regex engine as written
p = r'D (\w+)\s+(\d{1,2})\s+(\d+)\s+'
with open('jan98dd.asc', 'r', encoding='iso-8859-1') as dd:
    data_dict = dd.read()
# name -> [start - 1, start + width - 1, '<width>s'], where width and start
# are the first and second numbers captured after the name
d = {s[0]: [int(s[2])-1, int(s[2])+int(s[1])-1, f'{s[1]}s']
     for s in re.findall(p, data_dict) if s[0] in var_names}
start, end, width = zip(*d.values())
# Pad bytes ('<n>x') from the end of the previous kept field to the start of
# the next one; relies on d being in record order.
skip = ([f'{s - e}x' for s, e in zip(start, [0] + list(end[:-1]))])
unpack_fmt = ''.join([j for i in zip(skip, width) for j in i])
unpacker = struct.Struct(unpack_fmt).unpack_from

# Slice bounds of PWSSWGT do not change between files, so look them up once
wgt = d['PWSSWGT']

hhw = {2000: {}, 2001: {}, 2002: {}}
for date in pd.date_range(start='2000-01-01', end='2002-12-01', freq='MS'):
    file = f'{date.strftime("%b%y").lower()}pub.dat'
    with open(file, 'rb') as f:
        raw_data = f.readlines()
    # Keep only rows with a positive PWSSWGT
    data = [[*map(int, unpacker(row))] for row in raw_data
            if int(row[wgt[0]:wgt[1]]) > 0]
    df = pd.DataFrame(data, columns=d.keys())
    res = (df.query('HWHHWGT == PWSSWGT')
             .drop_duplicates('QSTNUM')
             [['QSTNUM', 'OCCURNUM']])
    hhw[date.year][date.month] = res.set_index('QSTNUM')['OCCURNUM'].to_dict()

  p = 'D (\w+)\s+(\d{1,2})\s+(\d+)\s+'


#### Revised 2000-based weights

In [4]:
# Write the 2000-based revised weights to one feather file per year,
# adding NWHHWGT: the NWSSWGT of the person line recorded in hhw.
dd_file = '2000-2extract.txt'

wgt_vars = ['NWCMPWGT', 'NWORWGT', 'NWSSWGT']
var_names = ['HRMONTH', 'HRYEAR4', 'QSTNUM', 'OCCURNUM'] + wgt_vars

# Rows are kept only when this weight is positive
filter_wgt = 'NWSSWGT'

d = data_dict_reader(dd_file, var_names)
dtypes = [(name, loc[-1]) for name, loc in d.items()]
unpacker = struct_unpacker(d)

rev_data_path = 'pubuse2000_2002/'
file_names = os.listdir(rev_data_path)
files = [rev_data_path + month_file for month_file in file_names]

monthly = [data_file_reader(file, unpacker, dtypes, filter_wgt)
           for file in files]
df = pd.concat(monthly)

for year in [2000, 2001, 2002]:
    dfy = df_adjuster(df.query('HRYEAR4 == @year'), wgt_vars=wgt_vars)

    # Flatten hhw[year] into rows of (MONTH, QSTNUM, HHWLN)
    hhw_rows = []
    for month, by_qstnum in hhw[year].items():
        for qstnum, line_no in by_qstnum.items():
            hhw_rows.append((month, qstnum, line_no))
    hhwy = pd.DataFrame(hhw_rows, columns=['MONTH', 'QSTNUM', 'HHWLN'])
    dfy = dfy.merge(hhwy)

    # NWSSWGT of the row whose OCCURNUM matches HHWLN becomes NWHHWGT
    matched = dfy.query('OCCURNUM == HHWLN')
    hhws = matched[['MONTH', 'QSTNUM', 'NWSSWGT']]
    hhws = hhws.rename({'NWSSWGT': 'NWHHWGT'}, axis=1)
    dfy = dfy.merge(hhws).drop('HHWLN', axis=1)

    dfy.to_feather(f'clean/cps_wgt_rev{year}.ft')

#### Revised union data (2001-2002)

In [5]:
# Write the revised union variables (NEERNLAB, NEERNCOV) to one feather
# file per year for 2001 and 2002.
dd_file = '2000-2extract.txt'

id_vars = ['HRMONTH', 'HRYEAR4', 'QSTNUM', 'OCCURNUM']
var_names = id_vars + ['NEERNLAB', 'NEERNCOV', 'NWSSWGT']

# NWSSWGT is read only to drop rows without a positive weight
filter_wgt = 'NWSSWGT'

d = data_dict_reader(dd_file, var_names)
dtypes = [(name, loc[-1]) for name, loc in d.items()]
unpacker = struct_unpacker(d)

rev_data_path = 'pubuse2000_2002/'

# Skip files whose name has '00' at positions 3-4
file_names = []
for file in os.listdir(rev_data_path):
    if file[3:5] != '00':
        file_names.append(file)

files = [rev_data_path + month_file for month_file in file_names]

monthly = [data_file_reader(file, unpacker, dtypes, filter_wgt)
           for file in files]
df = pd.concat(monthly)

for year in [2001, 2002]:
    dfy = df_adjuster(df.query('HRYEAR4 == @year'))
    dfy = dfy.drop('NWSSWGT', axis=1)
    dfy.to_feather(f'clean/cps_union_rev{year}.ft')

#### Recoded industry and occupation 2000-2002

In [6]:
# Write the recoded industry and occupation variables to one feather file
# per year for 2000-2002.
dd_file = '2000-2extract.txt'

io_vars = ['NEIO1ICD', 'NEIO2ICD', 'NRDTIND1', 'NRDTIND2', 'NRDTOCC1',
           'NRDTOCC2', 'NRMJIND1', 'NRMJIND2', 'NRMJOCC1', 'NRMJOCC2',
           'NTIO1OCD', 'NTIO2OCD']
id_vars = ['HRMONTH', 'HRYEAR4', 'QSTNUM', 'OCCURNUM', 'NWSSWGT']
var_names = id_vars + io_vars

# NWSSWGT is read only to drop rows without a positive weight
filter_wgt = 'NWSSWGT'

d = data_dict_reader(dd_file, var_names)

# Manually fixing -- bug to be resolved later
d['NEIO1ICD'] = [13, 17, '4s', 'int16']

dtypes = [(name, loc[-1]) for name, loc in d.items()]
unpacker = struct_unpacker(d)

rev_data_path = 'pubuse2000_2002/'
file_names = os.listdir(rev_data_path)
files = [rev_data_path + month_file for month_file in file_names]

monthly = [data_file_reader(file, unpacker, dtypes, filter_wgt)
           for file in files]
df = pd.concat(monthly)
df = df.drop(columns=['NWSSWGT'])

# Census extract names -> output column names
new_names = {
    'NEIO1ICD': 'IND02', 'NEIO2ICD': 'IND202',
    'NTIO1OCD': 'OCC00', 'NTIO2OCD': 'OCC200',
    'NRDTIND1': 'IND03D', 'NRDTIND2': 'IND203D',
    'NRDTOCC1': 'OCC03D', 'NRDTOCC2': 'OCC203D',
    'NRMJIND1': 'IND03M', 'NRMJIND2': 'IND203M',
    'NRMJOCC1': 'OCC03M', 'NRMJOCC2': 'OCC203M',
}
df = df.rename(columns=new_names)

for year in [2000, 2001, 2002]:
    dfy = df_adjuster(df.query('HRYEAR4 == @year'))
    dfy.to_feather(f'clean/cps_io_rev{year}.ft')

#### Revised December 2007 weights

In [7]:
# Write the revised December 2007 weights to a feather file
dd_file = 'dec07revwgts_dd.txt'

var_names = ['QSTNUM', 'OCCURNUM', 'PWSSWGT', 'PWCMPWGT']

# Rows are kept only when this weight is positive
filter_wgt = 'PWSSWGT'

d = data_dict_reader(dd_file, var_names)
dtypes = [(name, loc[-1]) for name, loc in d.items()]
unpacker = struct_unpacker(d)

file = 'dec07revwgts.dat'

# Special code to remove rows with '.': every line containing a '.' byte is
# dropped. NOTE: this rewrites the data file on disk in place.
with open(file, 'rb+') as f:
    kept_lines = [line for line in f.readlines() if b'.' not in line]
    f.seek(0)
    f.writelines(kept_lines)
    f.truncate()

df = data_file_reader(file, unpacker, dtypes, filter_wgt)

normalize_weights(df, ['PWSSWGT', 'PWCMPWGT'])

# Use the same weight names as the 2000-2002 revised weights
df = df.rename(columns={'PWSSWGT': 'NWSSWGT', 'PWCMPWGT': 'NWCMPWGT'})

out = df.reset_index(drop=True)
out.to_feather('clean/cps_dec07_rev.ft')

#### Certification data for 2015-2016

In [8]:
# Write certification data to one feather file per year (2015, 2016)

# The 2016 data dictionary is applied to both years
dd_file = 'Certification_extract_file_2016_rec_layout.txt'

var_names = ['QSTNUM', 'PULINENO', 'MONTH', 'PECERT1']

# Passed as the filter field: only rows with PECERT1 > 0 are kept
filter_wgt = 'PECERT1'

d = data_dict_reader(dd_file, var_names)
dtypes = [(name, loc[-1]) for name, loc in d.items()]
unpacker = struct_unpacker(d)

# One input file and one output feather per year
for year in ['2015', '2016']:
    file = f'jan{year[2:]}-dec{year[2:]}cert_ext.dat'

    df = data_file_reader(file, unpacker, dtypes, filter_wgt)

    out = df.reset_index(drop=True)
    out = out.rename(columns={'PULINENO': 'LINENO'})
    out.to_feather(f'clean/cps_cert{year}.ft')

#### Disability Flag late 2008

In [9]:
# Write the late-2008 disability flag to a feather file

# Hard-coded layout, same shape as data_dict_reader output:
# name -> [start, end, struct width code, dtype]
d = {
    'QSTNUM': [0, 8, '8s', 'int32'],
    'HRMONTH': [8, 16, '8s', 'int8'],
    'OCCURNUM': [24, 32, '8s', 'int32'],
    'PRDISFLG': [80, 88, '8s', 'int32'],
}
dtypes = [(name, loc[-1]) for name, loc in d.items()]

unpacker = struct_unpacker(d)

# Unpack data; QSTNUM is the filter field, so rows need QSTNUM > 0
file = 'disability.dat'
df = data_file_reader(file, unpacker, dtypes, 'QSTNUM')

df = df.rename(columns={'HRMONTH': 'MONTH'})

out = df.reset_index(drop=True)
out.to_feather(f'clean/cps_disability2008.ft')

#### Telework Supplement Data (October 2022 - May 2024)

Monthly telework supplement files stored in Telework/ subfolder. Starting June 2024,
telework questions were folded into the main CPS survey.

Only two variables are used by bd CPS:
- PTCOVR1: Telework status (1=yes, 2=no)
- PTCOVR2: Hours teleworked

Requires: Unzipped telework supplement files from [Census CPS FTP](https://thedataweb.rm.census.gov/ftp/cps_ftp.html).

In [10]:
# Store telework supplement data as feather files
# Supplement ran Oct 2022 - May 2024; only PTCOVR1 and PTCOVR2 are used

telework_path = 'Telework/'
all_telework = []

# Fixed-width format: same positions in both V1 (30-byte) and V2 (22-byte) files
# QSTNUM: 1-5, OCCURNUM: 6-7, HRMONTH: 8-9, HRYEAR: 10-13, PTCOVR1: 14-15, PTCOVR2: 16-18
widths = [5, 2, 2, 4, 2, 3]
columns = ['QSTNUM', 'OCCURNUM', 'HRMONTH', 'HRYEAR', 'PTCOVR1', 'PTCOVR2']


def _parse_telework_line(line, widths):
    '''Split one fixed-width record (bytes) into a list of ints.

    Fields are consecutive slices of the given widths; a field that is blank
    or not an integer becomes NaN.
    '''
    row = []
    pos = 0
    for width in widths:
        field = line[pos:pos + width].decode('ascii').strip()
        try:
            row.append(int(field))
        except ValueError:
            row.append(np.nan)
        pos += width
    return row


for filename in sorted(os.listdir(telework_path)):
    if not filename.endswith('.dat'):
        continue
    filepath = telework_path + filename

    with open(filepath, 'rb') as f:
        # Strip line endings first so empty lines can be skipped
        records = (line.rstrip(b'\r\n') for line in f)
        data = [_parse_telework_line(line, widths) for line in records if line]

    df = pd.DataFrame(data, columns=columns)
    all_telework.append(df)
    print(f"  {filename}: {len(df):,} records")

# Combine and process
if all_telework:
    combined = pd.concat(all_telework, ignore_index=True)
    combined = combined.rename(columns={'HRMONTH': 'MONTH', 'HRYEAR': 'YEAR'})
    # Two-digit years are shifted into the 2000s
    combined.loc[combined['YEAR'] < 100, 'YEAR'] += 2000

    # Set dtypes (use nullable Int for PTCOVR2 since it can have NaN)
    combined = combined.astype({
        'QSTNUM': 'int32', 'OCCURNUM': 'int8', 'MONTH': 'int8', 'YEAR': 'int16',
        'PTCOVR1': 'int8', 'PTCOVR2': 'Int16'
    })

    # Save by year. tolist() converts NumPy scalars to plain ints so the log
    # line prints [10, 11, 12] instead of [np.int8(10), ...] under NumPy 2.
    for year in sorted(combined['YEAR'].unique().tolist()):
        year_data = combined[combined['YEAR'] == year].drop('YEAR', axis=1)
        year_data.reset_index(drop=True).to_feather(f'clean/cps_telework{year}.ft')
        months = sorted(year_data['MONTH'].unique().tolist())
        print(f"Saved cps_telework{year}.ft: {len(year_data):,} records (months: {months})")

  apr23cpucvr_pub.dat: 45,907 records
  apr24cpucvr_pub.dat: 45,965 records
  aug23cpucvr_pub.dat: 46,000 records
  dec22cpucvr_pub.dat: 45,113 records
  dec23cpucvr_puf.dat: 45,690 records
  feb23cpucvr_pub.dat: 44,568 records
  feb24cpucvr_puf.dat: 45,768 records
  jan23cpucvr_pub.dat: 45,200 records
  jan24cpucvr_puf.dat: 45,154 records
  jul23cpucvr_pub.dat: 45,094 records
  jun23cpucvr_pub.dat: 45,540 records
  mar23cpucvr_pub.dat: 43,831 records
  mar24cpucvr_pub.dat: 43,918 records
  may23cpucvr_pub.dat: 46,743 records
  may24cpucvr_puf.dat: 46,029 records
  nov22cpucvr_pub.dat: 45,653 records
  nov23cpucvr_puf.dat: 46,310 records
  oct22cpucvr_pub.dat: 46,319 records
  oct23cpucvr_pub.dat: 47,135 records
  sep23cpucvr_pub.dat: 46,112 records
Saved cps_telework2022.ft: 137,085 records (months: [np.int8(10), np.int8(11), np.int8(12)])
Saved cps_telework2023.ft: 548,130 records (months: [np.int8(1), np.int8(2), np.int8(3), np.int8(4), np.int8(5), np.int8(6), np.int8(7), np.int8(8)