In [1]:
# Import python packages
import itertools
import pickle
import struct, os

import numpy as np
import pandas as pd

# Data path
path = 'E:/08_Other/Archive/'

# Open data dictionary. The with-block closes the file handle as soon as the
# pickle has been loaded instead of leaving it open for the kernel's lifetime.
# NOTE(review): pickle.load can execute arbitrary code; only load a
# dictionary file that was produced by a trusted process.
with open(f'{path}cps_dictionaries.pkl', 'rb') as dict_file:
    dd = pickle.load(dict_file)

FileNotFoundError: [Errno 2] No such file or directory: 'E:/08_Other/Archive/cps_dictionaries.pkl'

In [2]:
# Parse external data dictionary file

# Map each variable name to [start offset, end offset, struct width code].
# The layout's start position is shifted down by one (presumably because the
# record layout counts columns from 1 -- TODO confirm).
d = {var[0]: [var[1] - 1, var[1] - 1 + var[2], f'{var[2]}s']
     for var in dd['January_2017_Record_Layout.txt']['vlist']}

# Identify datatypes by variable length (number of characters in the field)
# NOTE(review): a 5-character field can hold values above the int16 maximum
# (32767) and a 10-14 character field can exceed the int32 maximum; confirm
# that the selected variables stay within these ranges.
dtypes = [(name,
           'int8' if n_chars < 3
           else 'int16' if n_chars < 6
           else 'int32' if n_chars < 15
           else 'object')
          for name, n_chars in ((key, int(spec[2].replace('s', '')))
                                for key, spec in d.items())]

In [16]:
# Functions for annual CPS feather files

def cps_to_feather(path, year, data_dict, dtypes):
    """Write one year of combined CPS rows to a feather file.

    Rows come from ``combine_data(path, year, data_dict)``, are cast to the
    structured ``dtypes`` through numpy, and are saved as
    ``data/cps_{year}x.ft`` relative to the current working directory.
    Prints a one-line summary and returns nothing.
    """
    # Monthly rows -> typed structured array -> dataframe, in one expression
    df = pd.DataFrame(
        np.array(combine_data(path, year, data_dict), dtype=dtypes))

    # Store as feather format file
    df.to_feather(f'data/cps_{year}x.ft')

    print(f'{year} Complete ({len(df)} obs.): data/cps_{year}x.ft')

def combine_data(path, year, data_dict):
    """Read and combine monthly CPS data for given year.

    Parameters
    ----------
    path : str
        Base directory, including the trailing separator. Monthly files are
        read from ``{path}data/``.
    year : int
        Calendar year. Files whose names end in ``{yy}pub.dat`` are read,
        where ``yy`` is the zero-padded two-digit year.
    data_dict : dict
        Maps variable name to ``[start, end, struct width code]`` and must
        contain the key ``'PWSSWGT'``.

    Returns
    -------
    list
        One unpacked tuple per kept row, for every matching file, in the
        order ``os.listdir`` returned the files.
    """
    # Zero-pad the two-digit year so that e.g. 2005 looks for '05pub.dat'
    # rather than '5pub.dat', which would also match the 2015 files.
    suffix = f'{year % 100:02d}pub.dat'
    monthly_file_list = [mo_file for mo_file in os.listdir(f'{path}data/')
                         if mo_file.endswith(suffix)]

    unpacker = unpack(data_dict)  # Create struct unpacker

    # Location of sample weight variable, taken from the dictionary that was
    # passed in rather than from a notebook-level global.
    wgt_start, wgt_end = data_dict['PWSSWGT'][:2]

    combined_data = []

    # Loop over monthly files, read and add to combined_data
    for file in monthly_file_list:
        # The with-block closes each monthly file once its rows are consumed
        with open(f'{path}data/{file}', 'rb') as raw_monthly_data:
            # Drop rows whose weight field is exactly b'-1' padded to 10 bytes
            combined_data.extend(
                unpacker(row) for row in raw_monthly_data
                if row[wgt_start:wgt_end] != b'-1        ')

    return combined_data

def unpack(data_dict):
    """Build a struct-based row unpacker from a variable layout.

    ``data_dict`` maps each variable name to ``[start, end, width code]``.
    The returned callable is ``struct.Struct(fmt).unpack_from``, where ``fmt``
    lists each width code in dictionary order, preceded by an ``Nx`` pad
    whenever a field starts after the previous field's end.
    """
    starts, ends, codes = zip(*data_dict.values())

    pieces = []
    previous_end = 0  # the first field is measured from the start of the row
    for begin, finish, code in zip(starts, ends, codes):
        gap = begin - previous_end
        if gap > 0:
            # bytes between two fields are skipped with a pad specifier
            pieces.append(f'{gap}x')
        pieces.append(code)
        previous_end = finish

    return struct.Struct(''.join(pieces)).unpack_from

In [17]:
cps_to_feather(path, 2017, d, dtypes)

2017 Complete (1530207 obs.): data/cps_2017x.ft


In [10]:
year = 2018

In [11]:
year % 100

18

In [None]:
def unpack(d):
    """Return an ``unpack_from`` callable for the layout described by ``d``.

    Each value of ``d`` is ``[start, end, width code]``. The struct format is
    assembled field by field: an ``Nx`` pad for any positive gap since the
    previous field's end (0 for the first field), then the field's own code.
    """
    starts, ends, codes = zip(*d.values())

    # end position of the field preceding each field
    prior_ends = (0,) + ends[:-1]

    fmt = ''
    for begin, prior, code in zip(starts, prior_ends, codes):
        gap = begin - prior
        fmt += (f'{gap}x' if gap > 0 else '') + code

    # struct can interpret row bytes with the format string
    return struct.Struct(fmt).unpack_from

In [None]:
def combine_data(path, year, d):
    """Read and combine monthly CPS data for given year.

    Files in ``{path}data/`` whose names end in the last two characters of
    ``year`` followed by ``pub.dat`` are read in binary mode. Rows whose
    ``'PWCMPWGT'`` slice equals ``b'-1'`` padded to 10 bytes are dropped; the
    rest are unpacked with the struct unpacker built from ``d``.

    Returns a list with one unpacked tuple per kept row.
    """
    # List of files from requested year
    flist = [f for f in os.listdir(f'{path}data/')
             if f.endswith(f'{str(year)[2:]}pub.dat')]

    unpacker = unpack(d)  # Create struct unpacker

    wgt = d['PWCMPWGT']  # Location of sample weight variable

    # List to fill with unpacked rows from every monthly file
    comb_data = []

    for file in flist:
        # The with-block closes each file once its rows are consumed;
        # previously the handle was left open.
        with open(f'{path}data/{file}', 'rb') as raw_data:
            comb_data.extend(unpacker(row) for row in raw_data
                             if row[wgt[0]:wgt[1]] != b'-1        ')

    return comb_data

In [None]:
def cps_to_feather(path, year, d, dtypes):
    """Save one year of combined CPS rows as ``data/cps_{year}x.ft``.

    The rows returned by ``combine_data(path, year, d)`` are typed with the
    structured ``dtypes`` and wrapped in a DataFrame before being written.
    Prints a one-line summary; returns nothing.
    """
    monthly_rows = combine_data(path, year, d)

    # numpy applies the per-column datatypes, pandas wraps the typed array
    typed_rows = np.array(monthly_rows, dtype=dtypes)
    df = pd.DataFrame(typed_rows)

    # Store as feather format file
    df.to_feather(f'data/cps_{year}x.ft')

    print(f'{year} Complete ({len(df)} obs.): data/cps_{year}x.ft')

In [None]:
def cps_reader(path, year, d):
    """Read one year of monthly CPS files, save them as feather, return them.

    Parameters
    ----------
    path : str
        Base directory, including the trailing separator; files are read
        from ``{path}data/``.
    year : int
        Files ending in the last two characters of ``year`` plus
        ``pub.dat`` are read.
    d : dict
        Maps variable name to ``[start, end, struct width code]`` and must
        contain ``'PWCMPWGT'``.

    Returns
    -------
    pandas.DataFrame
        The frame that was written to ``data/cps_{year}x.ft``. Earlier
        versions returned None, which broke callers that assign the result.

    Notes
    -----
    ``dtypes`` is not a parameter: it is read from the enclosing (notebook)
    namespace and must be defined before this function is called.
    """
    # lists of variable starts, ends, and lengths
    start, end, width = zip(*d.values())

    # create list of which characters to skip in each row
    skip = [f'{s - e}x' if (s - e) > 0 else ''
            for s, e in zip(start, [0] + list(end[:-1]))]

    # create format string by joining skip and variable segments
    unpack_fmt = ''.join([j for i in zip(skip, width) for j in i])

    # struct can interpret row bytes with the format string
    unpacker = struct.Struct(unpack_fmt).unpack_from

    # List of files from requested year
    flist = [f for f in os.listdir(f'{path}data/')
             if f.endswith(f'{str(year)[2:]}pub.dat')]

    wgt = d['PWCMPWGT']  # Location of sample weight variable

    # List to fill with unpacked rows from every monthly file
    comb_data = []

    for file in flist:
        # The with-block closes each file once its rows are consumed
        with open(f'{path}data/{file}', 'rb') as raw_data:
            comb_data.extend(unpacker(row) for row in raw_data
                             if row[wgt[0]:wgt[1]] != b'-1        ')

    # Read into numpy and set datatypes (global `dtypes`, see Notes)
    data = np.array(comb_data, dtype=dtypes)

    # Convert to pandas dataframe
    df = pd.DataFrame(data)

    # Store as feather format file
    df.to_feather(f'data/cps_{year}x.ft')

    print(f'{year} Complete ({len(df)} obs.): data/cps_{year}x.ft')

    return df

In [None]:
df = cps_reader(path, 2018, d)

In [None]:
%load_ext line_profiler

In [None]:
%lprun -f cps_reader cps_reader(path, 2018, d)

In [None]:
df.to_feather('test.ft')

In [None]:
len(pd.read_feather('data/cps_2018.ft'))

In [None]:
df

In [None]:
# Map each variable name to [start offset, end offset, struct width code];
# the layout's start position is shifted down by one.
d = {var[0]: [var[1] - 1, var[1] - 1 + var[2], f'{var[2]}s']
     for var in dd['January_2017_Record_Layout.txt']['vlist']}

# Pick a datatype from each field's character width; anything 15 characters
# or wider stays a Python object.
dtypes = [(name,
           'int8' if n_chars < 3
           else 'int16' if n_chars < 6
           else 'int32' if n_chars < 15
           else object)
          for name, n_chars in ((key, int(spec[2].replace('s', '')))
                                for key, spec in d.items())]

In [None]:
file = 'apr18pub.dat'
year = 2018

In [None]:
# Build a struct unpacker for one raw CPS row and list the year's files.
# Relies on notebook globals `d`, `path` and `year` set in earlier cells.

# lists of variable starts, ends, and lengths
start, end, width = zip(*d.values())

# create list of which characters to skip in each row
# (gap between a field's start and the previous field's end; 0 for the first)
skip = ([f'{s - e}x' if (s - e) > 0 else '' 
         for s, e in zip(start, [0] + list(end[:-1]))])

# create format string by joining skip and variable segments
unpack_fmt = ''.join([j for i in zip(skip, width) for j in i])

# end buffer: pad the format so that its total size is 1001 bytes
# NOTE(review): 1001 is presumably the length of one record including the
# newline -- confirm against the raw files.
eb = f'{1001 - struct.calcsize(unpack_fmt)}x'
unpack_fmt = ''.join([unpack_fmt, eb])

# struct can interpret row bytes with the format string
unpacker = struct.Struct(unpack_fmt).unpack_from
    
# List of files from requested year
flist = [f for f in os.listdir(f'{path}data/') 
         if f.endswith(f'{str(year)[2:]}pub.dat')]
    
wgt = d['PWCMPWGT']  # Location of sample weight variable

In [None]:
eb = f'{1001 - struct.calcsize(unpack_fmt)}x'

In [None]:
[len(i) for i in open(f'{path}data/{file}', 'rb')]

In [None]:
data = open(f'{path}data/{file}', 'rb')
t = struct.unpack(unpack_fmt*length, data.read())

In [None]:
data = open(f'{path}data/{file}', 'rb')
t = [unpacker(row) for row in data]

In [None]:
t = [t[x:x+55] for x in range(0, len(t), 55)]

In [None]:
a = np.array(t, dtype=dtypes)

In [None]:
pd.DataFrame(a)

In [None]:
length = len(open(f'{path}data/{file}', 'rb').readlines())

In [None]:
length

In [None]:
data.readlines()

In [None]:
open(f'{path}data/{file}', 'rb').read(1)

In [None]:
len(pd.read_feather('data/cps_2018.ft'))

In [None]:
import re

In [None]:
re.search(b'309025966210263', open(f'{path}data/{file}', 'rb').read())

In [None]:
len(raw_data[0])

In [None]:
unpacker(open(f'{path}data/{file}', 'rb').read()[3510507: 3511508])

In [None]:
df[df['PWSSWGT']==0]

In [None]:
unpack_wgt(raw_data[0])

In [None]:
df.HRMONTH.unique()

In [None]:
df.head()

In [None]:
b'          '

In [None]:
'a' not in [b'0         ', b'-1        ']

In [None]:
    flist = [f for f in os.listdir(f'{path}data/') 
         if f.endswith(f'{str(year)[2:]}pub.dat')]

In [None]:
flist

In [None]:
df.to_feather(f'{path}cps_{year}x.ft')

In [None]:
f'{path}cps_{year}x.ft'

In [None]:
data[0][0].isdigit()

In [None]:
%load_ext line_profiler

In [None]:
file = 'apr18pub.dat'
year = 2018

In [None]:
%lprun -f cps_reader cps_reader(path, 2018, d)

In [None]:
raw_data = open(f'{path}data/{file}', 'rb').readlines()

In [None]:
# Build the row unpacker and a weight-only unpacker, then keep the rows of
# `raw_data` whose sample weight is positive.
# Relies on notebook globals `d`, `path`, `year` and `raw_data`.

# lists of variable starts, ends, and lengths
start, end, width = zip(*d.values())

# create list of which characters to skip in each row
skip = [f'{s - e}x' if (s - e) > 0 else ''
        for s, e in zip(start, [0] + list(end[:-1]))]

# create format string by joining skip and variable segments
unpack_fmt = ''.join([j for i in zip(skip, width) for j in i])

# struct can interpret row bytes with the format string
unpacker = struct.Struct(unpack_fmt).unpack_from

# List of files from requested year
flist = [f for f in os.listdir(f'{path}data/')
         if f.endswith(f'{str(year)[2:]}pub.dat')]

wgt = d['PWCMPWGT']  # Location of sample weight variable

# Format to capture the sample weight: skip to its start, then read its width
wgt_unpack = f'{wgt[0]}x{wgt[2]}'

# struct can interpret row bytes with the format string
unpack_wgt = struct.Struct(wgt_unpack).unpack_from

# Test each row's own weight. The filter used to read raw_data[0] for every
# row, so the first row alone decided whether all rows were kept or dropped.
[unpacker(row) for row in raw_data if int(unpack_wgt(row)[0]) > 0]

In [None]:
data = [unpacker(row) for row in raw_data if int(unpack_wgt(row)[0]) > 0]

In [None]:
[int(i[2].replace('s', '')) for i in d.values()]

In [None]:
pd.DataFrame(np.array(data, dtype=dtypes))

In [None]:
# Choose a numpy integer type from each field's character width; fields of
# 15 or more characters stay Python objects.
dtypes = [(name,
           np.int8 if n_chars < 3
           else np.int16 if n_chars < 6
           else np.int32 if n_chars < 15
           else object)
          for name, n_chars in ((key, int(spec[2].replace('s', '')))
                                for key, spec in d.items())]

In [None]:
d = pd.DataFrame(data)

In [None]:
int(data[0][0])

In [None]:
b'-1'

In [None]:
import numpy as np

In [None]:
int(unpack_wgt(raw_data[0])[0])

In [None]:
# Keep rows with a positive sample weight. The filter previously referenced
# an undefined name (`raw`) and passed the whole unpacked tuple to int().
[unpacker(row) for row in raw_data if int(unpack_wgt(row)[0]) > 0]

In [None]:
[unpacker(*raw_data)]

In [None]:
raw_data

In [None]:
raw_data[0]

In [None]:
data = [[int(i) for i in unpacker(row)] for row in raw_data if int(row[wgt[0]:wgt[1]]) > 0]

In [None]:
data = [[*map(int, unpacker(row))] for row in raw_data if int(row[wgt[0]:wgt[1]]) > 0]

In [None]:
data

In [None]:
# Format to capture the sample weight
wgt_unpack = f'{wgt[0]}x{wgt[2]}'

# struct can interpret row bytes with the format string
unpack_wgt = struct.Struct(wgt_unpack).unpack_from

int(unpack_wgt(raw_data[0])[0]) > 0

In [None]:
int(unpack_wgt(raw_data[0])[0]) > 0

In [None]:
wgt

In [None]:
df = pd.DataFrame(data, columns=d.keys())

In [None]:
df

In [None]:
list(map(int, data))

In [None]:
open()

In [None]:
[int.from_bytes(i) for i in data[0]]

In [None]:
int(data[0][0])

In [None]:
data = [[[*map(int, unpacker(row))]
         for row in open(f'{path}data/{f}', 'rb').readlines() 
         if int(row[wgt[0]:wgt[1]]) > 0]
        for f in flist]

In [None]:
    sel_data = [[*map(int, unpacker(row))] 
                for row in raw_data 
                if int(row[wgt[0]:wgt[1]]) > 0]

In [None]:
df

In [None]:
# Build the struct unpacker for one raw CPS row from the layout in `d`
# (a notebook global) and print the resulting format string for inspection.

# lists of variable starts, ends, and lengths
start, end, width = zip(*d.values())

# create list of which characters to skip in each row
# (gap between a field's start and the previous field's end; 0 for the first)
skip = ([f'{s - e}x' if (s - e) > 0 else '' 
         for s, e in zip(start, [0] + list(end[:-1]))])

# create format string by joining skip and variable segments
unpack_fmt = ''.join([j for i in zip(skip, width) for j in i])
print(unpack_fmt)

# struct can interpret row bytes with the format string
unpacker = struct.Struct(unpack_fmt).unpack_from

In [None]:
year = 2018
flist = [f for f in os.listdir(f'{path}data/') 
         if f.endswith(f'{str(year)[2:]}pub.dat')]

In [None]:
wgt = d['PWCMPWGT']  # Location of sample weight variable

In [None]:
# Build one dataframe from every monthly file of the year, keeping only rows
# with a positive sample weight. Relies on notebook globals `path`, `flist`,
# `wgt`, `unpacker` and `d`.
# The original cell did not parse (unclosed parenthesis and a dangling
# `dtypes=` argument) and never closed the files it opened.
rows = []
for f in flist:
    with open(f'{path}data/{f}', 'rb') as monthly_file:
        rows.extend([*map(int, unpacker(row))] for row in monthly_file
                    if int(row[wgt[0]:wgt[1]]) > 0)

df = pd.DataFrame(rows, columns=d.keys())

In [None]:
def cps_reader(year, d):
    

In [None]:
data = [[[*map(int, unpacker(row))]
         for row in open(f'{path}data/{f}', 'rb').readlines() 
         if int(row[wgt[0]:wgt[1]]) > 0]
        for f in flist]

In [None]:
data

In [None]:
df

In [None]:
# Read each monthly file once, closing it as soon as its lines are in memory
# (the files were previously opened twice and never closed).
dlists = []
for f in flist:
    with open(f'{path}data/{f}', 'rb') as monthly_file:
        dlists.append(monthly_file.readlines())

# Flatten the per-file lists into a single list of raw rows
data = list(itertools.chain.from_iterable(dlists))

In [None]:
# Pandas dataframe built from sel_data, one column per key of d
# NOTE(review): the earlier comment described this as "women age 25 to 54",
# but no sex or age filter is visible here -- confirm how sel_data was built.
df = pd.DataFrame(sel_data, columns=d.keys())

In [None]:
columns = d[]

In [None]:
wgt = d['PWCMPWGT']  # Location of sample weight variable

data = []

# Loop over one year of files
for f in flist:
    # open file (read as binary) and read lines into "raw_data"; the
    # with-block closes the file, which was previously left open
    with open(f'{path}data/{f}', 'rb') as monthly_file:
        raw_data = monthly_file.readlines()

    # unpack and store data of interest if sample weight > 0
    data.extend([[*map(int, unpacker(row))] for row in raw_data
                 if int(row[wgt[0]:wgt[1]]) > 0])

In [None]:
dlists

In [None]:
data = []
list(map(data.extend, dlists))

In [None]:
dlists[0]

In [None]:
# Unpack every row of every monthly file whose sample weight is positive.
# The original comprehension iterated over the per-file line lists, so each
# `row` was a whole list of lines rather than one record and the slice/int
# filter could not work; the files were also never closed.
data = []
for f in flist:
    with open(f'{path}data/{f}', 'rb') as monthly_file:
        data.extend([*map(int, unpacker(row))] for row in monthly_file
                    if int(row[wgt[0]:wgt[1]]) > 0)

In [None]:
wgt = d['PWCMPWGT']  # Location of sample weight variable

data = []

# Loop over one year of files
for f in flist:
    # open file (read as binary) and read lines into "raw_data"; the
    # with-block closes the file, which was previously left open
    with open(f'{path}data/{f}', 'rb') as monthly_file:
        raw_data = monthly_file.readlines()

    # unpack and store data of interest if sample weight > 0
    data.extend([[*map(int, unpacker(row))] for row in raw_data
                 if int(row[wgt[0]:wgt[1]]) > 0])

In [None]:
data = []

# Loop over one year of files
for f in flist:
    # open file (read as binary) and read lines into "raw_data"; the
    # with-block closes the file, which was previously left open
    with open(f'{path}data/{f}', 'rb') as monthly_file:
        raw_data = monthly_file.readlines()

    # unpack and store data of interest if sample weight > 0
    data.extend([[*map(int, unpacker(row))] for row in raw_data
                 if int(row[wgt[0]:wgt[1]]) > 0])

In [None]:
[[*map(int, unpacker(row))] for row in open(f'{path}data/{f}', 'rb').readlines()
            if int(row[wgt[0]:wgt[1]]) > 0]

In [None]:
data

In [None]:
# open file (read as binary) and read lines into "raw_data"; the with-block
# closes the file, which was previously left open.
# Note that the file name is relative to the current working directory.
with open('apr17pub.dat', 'rb') as monthly_file:
    raw_data = monthly_file.readlines()

wgt = d['PWCMPWGT']  # Location of sample weight variable

# unpack and store data of interest if sample weight > 0
data = [[*map(int, unpacker(row))] for row in raw_data
        if int(row[wgt[0]:wgt[1]]) > 0]

print(data[:5])

In [None]:
d['PWCMPWGT']

In [None]:
wgt = d['PWCMPWGT']  # Location of sample weight variable

In [None]:
flist

In [None]:
flist

In [None]:
         're4': {'race': ('PRDTRACE', [(1, [1]), 
                                       (2, [2, 6, 10, 11, 12, 16, 17, 18, 22, 23]), 
                                       (4, [4, 5, 8, 9, 13, 14, 15, 19, 20, 21, 24]), 
                                       (5, [3, 7, 25, 26])]),
                 'hisp': ('PRDTHSP', (3, [1, 2, 3, 4, 5, 6, 7, 8])),
                 'start': '2014-01-01',
                 'end': '2018-12-01'}}