# Codebook

## bd CPS list of variables and possible values

March 6, 2019

Brian Dew, @bd_econ, brianwdew@gmail.com

-----

Store a list of bd CPS variables and their datatype, availability, average coverage, and list of possible entries.

In [1]:
# Preliminaries
import pandas as pd
import numpy as np
import os

# Switch into the bd_CPS project directory before importing from it.
# NOTE(review): presumably this is what makes bd_CPS_details importable
# (current directory on the import path) -- confirm.
os.chdir('/home/brian/Documents/econ_data/bd_CPS/')

# CodebookNotes is looked up by variable name when the codebook is printed,
# supplying the "Name" and "Notes" text for each documented variable.
from bd_CPS_details import CodebookNotes

# Switch into the directory that holds the annual cps{year}.ft files, which
# are read below using paths relative to the working directory.
os.chdir('/home/brian/Documents/CPS/data/clean/')

In [2]:
# Go through data and store requested details

# Variables with fewer distinct values than this (in a given month) have their
# values enumerated; otherwise only the observed min/max range is tracked.
MAX_LISTED_VALUES = 1000


def update_codebook(d, df, date, max_listed=MAX_LISTED_VALUES):
    """Fold one month of data into the codebook dictionary ``d`` (in place).

    For every column of ``df`` the entry ``d[column]`` is created on first
    sight and then updated:

    * ``avail``   -- ``date`` is appended.
    * ``dtype``   -- the column's dtype name is appended if not seen before.
    * ``breaks``  -- ``date`` is appended whenever a dtype name appears that
      differs from the ones already recorded (never on first sight).
    * ``values``  -- distinct non-null values, accumulated in order of first
      appearance, for months with fewer than ``max_listed`` distinct values.
    * ``min_val`` / ``max_val`` -- running range, updated for months with
      ``max_listed`` or more distinct values.

    Columns whose name contains ``'ID'`` or ``'WAGE'`` are tracked for
    availability and dtype only; their values are never listed or ranged.

    Parameters
    ----------
    d : dict
        Codebook accumulated so far, keyed by variable name.
    df : pandas.DataFrame
        Data for a single month.
    date : object
        Label for the month; stored as-is in ``avail`` and ``breaks``.
    max_listed : int, optional
        Distinct-value count at which a column switches from an enumerated
        value list to a min/max range.

    Returns
    -------
    dict
        The same ``d`` object, for convenience.
    """
    for var in df.keys():
        column = df[var]

        # ID and wage variables are deliberately not enumerated, so skip the
        # (potentially expensive) unique-value scan for them altogether.
        if 'ID' in var or 'WAGE' in var:
            values_list = []
        else:
            values_list = list(column.dropna().unique())

        entry = d.get(var)
        if entry is None:
            # New variable: start from empty bookkeeping so the first month
            # goes through exactly the same update path as later months.
            entry = d[var] = {
                'breaks': [],
                'max_val': None,
                'min_val': None,
                'avail': [],
                'dtype': [],
                'values': [],
            }

        entry['avail'].append(date)

        dtype_name = column.dtype.name
        dtypes = entry['dtype']
        if dtype_name not in dtypes:
            # A dtype change is only a "break" if an earlier dtype exists.
            if dtypes:
                entry['breaks'].append(date)
            dtypes.append(dtype_name)

        if len(values_list) < max_listed:
            values = entry['values']
            known = set(values)  # O(1) membership instead of list scans
            values.extend(v for v in values_list if v not in known)
        else:
            # The two branches are exhaustive, so a month with exactly
            # ``max_listed`` distinct values is ranged rather than dropped.
            loc_min = min(values_list)
            loc_max = max(values_list)
            if entry['max_val'] is None or loc_max > entry['max_val']:
                entry['max_val'] = loc_max
            if entry['min_val'] is None or loc_min < entry['min_val']:
                entry['min_val'] = loc_min

    return d


d = {}

# In a notebook or when run as a script __name__ is '__main__', so the scan
# runs exactly as before; the guard only lets update_codebook be imported
# (for example, for testing) without the data files being present.
if __name__ == '__main__':
    for year in range(1989, 2020):
        data = pd.read_feather(f'cps{year}.ft')
        for month, df in data.groupby('MONTH'):
            date = pd.to_datetime(f'{year}-{month}-01')
            update_codebook(d, df, date)
                    
# Summarise each variable's availability as 'YYYY-MM' strings for the first
# and last month in which it was observed.
for entry in d.values():
    ordered_dates = sorted(entry['avail'])
    last_seen = ordered_dates[-1]
    first_seen = ordered_dates[0]
    entry.update(
        date_max=last_seen.strftime('%Y-%m'),
        date_min=first_seen.strftime('%Y-%m'),
    )

## Print Codebook

In [3]:
# Write the bd CPS codebook to stdout: a banner, then one entry per variable
# giving its name/notes (when documented), dtypes, availability window, any
# dtype breaks, and either the listed values or the min/max range.
banner = ('\n========================\n\n    bd CPS Codebook'
          '\n\n========================\n')
print(banner)
print('updated: March 6, 2019\n\nvariables:\n')
for var_name, info in d.items():
    if var_name not in CodebookNotes.keys():
        # Undocumented variable: show the bare name only
        print(f'{var_name}')
    else:
        print(f'{var_name} - {CodebookNotes[var_name]["Name"]}')
        print(f'    Notes: {CodebookNotes[var_name]["Notes"]}')
    print(f'    Data types: {info["dtype"]}')
    print(f'    Available from: {info["date_min"]} to: {info["date_max"]}')
    dtype_breaks = info['breaks']
    if dtype_breaks:
        print(f'    Breaks in dtype: {dtype_breaks}')
    print('    Value range: ')
    listed = sorted(info['values'])
    if listed:
        print(listed)
    else:
        # No enumerated values: fall back to the recorded min/max range
        print(f'{info["min_val"]} to {info["max_val"]}')
    print('\n\n')



    bd CPS Codebook


updated: March 6, 2019

variables:

MONTH - Month
    Notes: Survey reference month.
    Data types: ['int8', 'category']
    Available from: 1989-01 to: 2019-01
    Breaks in dtype: [Timestamp('1994-01-01 00:00:00')]
    Value range: 
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]



YEAR - Year
    Notes: Survey reference year.
    Data types: ['category']
    Available from: 1989-01 to: 2019-01
    Value range: 
[1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019]



MIS - Month in sample
    Notes: Household month in sample.
    Data types: ['int8', 'category']
    Available from: 1989-01 to: 2019-01
    Breaks in dtype: [Timestamp('1994-01-01 00:00:00')]
    Value range: 
[1, 2, 3, 4, 5, 6, 7, 8]



METSTA
    Data types: ['int8']
    Available from: 1989-01 to: 2019-01
    Value range: 
[-1, 1, 2, 3]



HHID - Household ID 1
    No