# Codebook

## bd CPS list of variables and possible values

April 29, 2019

Brian Dew, @bd_econ, brianwdew@gmail.com

-----

Store a list of bd CPS variables along with each variable's data type, availability, average coverage, and list of possible values.

In [1]:
# Preliminaries
# Sets up imports, redirects printed output to the codebook text file, and
# moves the working directory to where the cleaned CPS data files live.
import pandas as pd
import numpy as np
import sys
# Keep a handle on the real stdout so it can be restored later, then send
# everything printed from here on to codebook.txt (opened in 'w' mode, so any
# existing file is overwritten).
# NOTE(review): the file object is never closed or flushed explicitly and
# sys.stdout is not restored in the active code of this notebook -- confirm
# the output is complete before relying on it.
old_stdout = sys.stdout
sys.stdout = open('/home/brian/Documents/econ_data/bd_CPS/codebook.txt', 'w')

import os
# Presumably bd_CPS_details.py lives in this directory and is importable only
# because the working directory is changed first -- do not reorder.
os.chdir('/home/brian/Documents/econ_data/bd_CPS/')

# CodebookNotes: per-variable mapping read with "Name" and "Notes" keys.
# ValueLabels: per-variable mapping of code -> label.
from bd_CPS_details import CodebookNotes, ValueLabels

# Switch to the data directory so relative file names (e.g. cps{year}.ft)
# resolve in later cells.
os.chdir('/home/brian/Documents/CPS/data/clean/')

In [2]:
# Go through data and store requested details
#
# Builds `d`, a dict keyed by variable name. Each entry is a dict with:
#   'avail'    - list of month start dates in which the variable has more
#                than one distinct non-missing value
#   'dtype'    - list of distinct dtype names seen, in order of appearance
#   'breaks'   - dates at which a dtype not seen before first appears
#   'values'   - distinct values seen (only for months where the variable is
#                listed rather than summarized by range)
#   'min_val', 'max_val' - running range for months summarized by range
#   'date_min', 'date_max' - first/last available month as 'YYYY-MM'
d = {}

# A variable with this many or more distinct values in a month is summarized
# by its min/max for that month instead of listing every value.
MAX_LISTED_VALUES = 500
# Variables whose name contains one of these fragments are always summarized
# by range, regardless of how many distinct values they have.
RANGE_ONLY_TAGS = ('HHID2', 'OTC')

for year in range(1989, 2021):
    data = pd.read_feather(f'cps{year}.ft')
    for month, df in data.groupby('MONTH'):
        date = pd.to_datetime(f'{year}-{month}-01')
        for var in df.columns:
            column = df[var]
            values_list = list(column.dropna().unique())

            # A month where the variable is empty or constant does not count
            # towards availability and is skipped entirely.
            if len(values_list) <= 1:
                continue

            dtype_name = column.dtype.name
            entry = d.get(var)
            if entry is None:
                # First sighting: create the record. The value/range update
                # below is shared with later sightings, so the same listing
                # rules apply from the very first month.
                entry = d[var] = {
                    'breaks': [],
                    'max_val': None,
                    'min_val': None,
                    'avail': [],
                    'dtype': [dtype_name],
                    'values': [],
                }
            elif dtype_name not in entry['dtype']:
                # A new dtype for an already-known variable is a break.
                entry['breaks'].append(date)
                entry['dtype'].append(dtype_name)

            entry['avail'].append(date)

            range_only = any(tag in var for tag in RANGE_ONLY_TAGS)
            if len(values_list) < MAX_LISTED_VALUES and not range_only:
                # Accumulate distinct values, preserving first-seen order.
                known = entry['values']
                for value in values_list:
                    if value not in known:
                        known.append(value)
            else:
                # Too many values (or a range-only variable): widen the
                # running min/max instead of listing values.
                loc_max = max(values_list)
                loc_min = min(values_list)
                if entry['max_val'] is None or loc_max >= entry['max_val']:
                    entry['max_val'] = loc_max
                if entry['min_val'] is None or loc_min <= entry['min_val']:
                    entry['min_val'] = loc_min

# Summarize availability as first and last month ('YYYY-MM') per variable.
for entry in d.values():
    avail = entry['avail']
    entry['date_max'] = max(avail).strftime('%Y-%m')
    entry['date_min'] = min(avail).strftime('%Y-%m')

## Print Codebook

In [3]:
# Print out the bd CPS codebook
# Output goes to wherever sys.stdout currently points.
banner = ('\n========================\n\n    bd CPS Codebook'
          '\n\n========================\n')
print(banner)
today = pd.to_datetime('today').strftime('%B %d, %Y')
print(f'updated: {today}\n\nvariables:\n')

# One section per variable: name (with notes when available), dtypes,
# availability window, dtype breaks, and either the value list or the range.
for name, info in d.items():
    if name in CodebookNotes.keys():
        note = CodebookNotes[name]
        print(f'{name} - {note["Name"]}')
        print(f'    Notes: {note["Notes"]}')
    else:
        print(f'{name}')
    print(f'    Data types: {info["dtype"]}')
    print(f'    Available from: {info["date_min"]} to: {info["date_max"]}')
    if len(info['breaks']) > 0:
        print(f'    Breaks in dtype: {info["breaks"]}')
    print('    Value range: ')
    listed = sorted(info['values'])
    if len(listed) > 0:
        # Listed values take precedence over the min/max range.
        print(listed)
    else:
        print(f'{info["min_val"]} to {info["max_val"]}')
    print('\n\n')

print('\n==============================\n\n    Variable value labels'
      '\n\n==============================\n\n')

# (heading, ValueLabels key) pairs, printed in this order. The leading
# newlines are part of each heading and control spacing between sections.
label_sections = [
    ('2010 Occupation codes (OCC):\n', 'OCC'),
    ('\n\n2010 Occupation detailed recodes (OCCD):\n', 'OCCD'),
    ('\n\n2010 Occupation major recodes (OCCM):\n', 'OCCM'),
    ('\n\n\n2012 Industry codes (IND):\n', 'IND'),
    ('\n\n2012 Industry detailed recodes (INDD):\n', 'INDD'),
    ('\n\n2012 Industry major recodes (INDM):\n', 'INDM'),
    ('\n\nCore-based statistical areas (CBSA):\n', 'CBSA'),
    ('\n\nConsolidated statistical areas (CSA):\n', 'CSA'),
    ('\n\nCounties (COUNTY):\n', 'COUNTY'),
]
for heading, label_key in label_sections:
    print(heading)
    for code, label in ValueLabels[label_key].items():
        print(f'{code}     {label}')

In [4]:
#sys.stdout = old_stdout

#codebook = open('/home/brian/Documents/econ_data/bd_CPS/codebook.txt', 'r').read()
#print(codebook)