In [24]:
import pandas as pd
import numpy as np

In [25]:
# Stage-2 weights file: one WT<year> column per year (WT2011..WT2027 in the
# output below).
wt = pd.read_csv(filepath_or_buffer='puf_stage2/puf_weights.csv.gz')
# PUF records; the `s006` column is inspected below as the record weight.
puf = pd.read_csv(filepath_or_buffer='puf_data/puf.csv')

In [26]:
# For every weight column, report how many records have a weight of exactly 0.
print('Number of records with a zero weight')
zero_counts = (wt == 0).sum()  # one count per column, in column order
for year_col, n_zero in zero_counts.items():
    print(year_col, n_zero)

Number of records with a zero weight
WT2011 1
WT2012 1
WT2013 1
WT2014 1
WT2015 1
WT2016 0
WT2017 0
WT2018 0
WT2019 0
WT2020 0
WT2021 0
WT2022 0
WT2023 0
WT2024 0
WT2025 0
WT2026 0
WT2027 0


In [27]:
# Number of PUF records whose `s006` value is exactly zero.
int(puf['s006'].eq(0).sum())

1

In [28]:
# Display the full row(s) of the PUF whose `s006` value is exactly zero.
puf.loc[puf['s006'].eq(0)]

Unnamed: 0,DSI,e00200,e00300,e00400,e00600,e00650,e00700,e00800,e00900,e01100,...,e00200p,e00200s,e00900p,e00900s,e02100p,e02100s,k1bx14p,k1bx14s,agi_bin,g20500
35680,0,40000,0,0,0,0,0,0,0,0,...,40000,0,0,0,0,0,0,0,8,0


In [29]:
# CPS-matched PUF; its `matched_weight` column is compared with the rounded
# weights in the cells that follow.
match_puf = pd.read_csv(filepath_or_buffer='puf_data/cps-matched-puf.csv')

Before rounding, there are no records with a zero weight. The record that has a zero weight after rounding has a positive, though very small, weight before rounding.

In [30]:
# Number of matched-file records whose `matched_weight` is exactly zero.
int(match_puf['matched_weight'].eq(0).sum())

0

In [31]:
# `matched_weight` of the record at position 35680 (the row shown above with
# s006 == 0). NOTE(review): positional lookup assumes match_puf and puf list
# records in the same order -- confirm.
match_puf['matched_weight'].iat[35680]

0.0048743729892066776

As the weights are extrapolated to later years, that record's weight eventually reaches one (from 2016 onward).

In [32]:
# Print the weight of the record at position 35680 in every weight column.
for year_col in wt.columns:
    weight = wt[year_col].iat[35680]
    print(year_col, weight)

WT2011 0
WT2012 0
WT2013 0
WT2014 0
WT2015 0
WT2016 1
WT2017 1
WT2018 1
WT2019 1
WT2020 1
WT2021 1
WT2022 1
WT2023 1
WT2024 1
WT2025 1
WT2026 1
WT2027 1


This record also has the smallest weight in the file, both before rounding (`matched_weight`) and after rounding (`s006`).

In [33]:
# Ten smallest `matched_weight` values, labelled by record index
# (ties keep the first occurrence, which is the pandas default).
match_puf.loc[:, 'matched_weight'].nsmallest(n=10, keep='first')

35680     0.004874
118979    0.008001
68564     0.013373
108797    0.014240
81000     0.017081
184560    0.018572
44833     0.032717
164027    0.040766
143730    0.049468
6007      0.058495
Name: matched_weight, dtype: float64

In [34]:
# Ten smallest `s006` values in the PUF, labelled by record index
# (ties keep the first occurrence, which is the pandas default).
puf.loc[:, 's006'].nsmallest(n=10, keep='first')

35680     0
68564     1
108797    1
118979    1
81000     2
184560    2
44833     3
164027    4
143730    5
6007      6
Name: s006, dtype: int64

In [35]:
# Ten smallest weights in each weight column, keyed by column name.
# The per-column results can contain different record labels; the DataFrame
# constructor aligns them on the union of those labels and fills NaN wherever
# a record is not among that column's ten smallest.
smallest = {year_col: wt[year_col].nsmallest(10) for year_col in wt.columns}
pd.DataFrame(smallest)

Unnamed: 0,WT2011,WT2012,WT2013,WT2014,WT2015,WT2016,WT2017,WT2018,WT2019,WT2020,WT2021,WT2022,WT2023,WT2024,WT2025,WT2026,WT2027
6007,6.0,6.0,6.0,7.0,7.0,7.0,7.0,7.0,8.0,8.0,8.0,,,,9.0,,
35680,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
44833,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
68564,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0
81000,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
108797,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
118979,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
143730,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
164027,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
184560,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0


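Don't worry about the NaN values in the above table. I checked and the underlying weights aren't actually null (see below). The NaNs are not a quirk of the `nsmallest()` method itself: each column's `nsmallest(10)` can return a different set of records, and when `pd.DataFrame` combines the per-column results it aligns them on the union of their row labels, filling in NaN wherever a record is not among that column's ten smallest.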

In [36]:
# Spot check: the WT2023 weight of the record at position 6007, which shows
# as NaN in the table above, is an ordinary value in the weights file.
wt['WT2023'].iat[6007]

9

In [37]:
# Confirm that no weight column contains null values.
null_counts = wt.isnull().sum()  # one count per column, in column order
for year_col, n_null in null_counts.items():
    print(year_col, n_null)

WT2011 0
WT2012 0
WT2013 0
WT2014 0
WT2015 0
WT2016 0
WT2017 0
WT2018 0
WT2019 0
WT2020 0
WT2021 0
WT2022 0
WT2023 0
WT2024 0
WT2025 0
WT2026 0
WT2027 0
