In [1]:
import numpy as np, pandas as pd

# cell key perturbation example

Working through the example from the explanation at https://gss.civilservice.gov.uk/wp-content/uploads/2017/01/ExN-Disclosure-control-methodology-in-2021-Census-outputs-Spicer-Blanchard-Dove-ONS.docx

In [2]:
n = 1_000 # number of synthetic records to generate
b = 200 # parameter related to spread/granularity of noise; also the modulus used to form cell keys

# Seed numpy's legacy global RNG (the np.random.* functions used below) so that
# Restart & Run All reproduces the same perturbation table and synthetic records.
SEED = 0
np.random.seed(SEED)

In [3]:
M = 200 # upper bound on the count in any binned cell (a guess/overshoot); used as the number of rows of Ptable

In [4]:
# Lookup table that supplies the 'random noise' added to the counts:
# M rows by b columns of integers drawn uniformly from [-5, 5].
# The range is an arbitrary choice and could be tweaked as desired.
Ptable = np.random.randint(low=-5, high=6, size=(M, b))  # `high` is exclusive, hence 6

In [5]:
# Show an abbreviated view of the perturbation table (numpy elides the middle of large arrays).
print(Ptable)

[[-1 -4  5 ... -4  3 -5]
 [ 5  2 -4 ... -4  2 -2]
 [-2 -4  5 ... -5 -4 -1]
 ...
 [ 1  5 -4 ...  3 -1  1]
 [ 4  5 -3 ...  3  4 -4]
 [ 4 -2  0 ... -4 -4  1]]


In [6]:
# TODO (open question): would the census want the noise to be some function of the count size? If so, what would its distribution/range be?

In [7]:
# Synthetic record-level data, one row per record. The columns are drawn in the
# order sex -> age -> key, so the random stream is consumed in that order.
record_key = pd.DataFrame(dict(
    sex_id=np.random.choice(2, n),                       # random sex code: 0 or 1
    age=np.random.randint(0, 116, size=n),               # random age in whole years: 0..115
    key=np.random.choice(a=n, size=n, replace=False),    # unique random key: a shuffle of 0..n-1
    pweight=1,                                           # every record carries a weight of one
))

In [8]:
# TODO (open question): is this key assignment sufficiently random? What would happen if the keys were instead drawn from some distribution without replacement?

In [9]:
# Bin ages into 5-year bands labelled by the band's lower bound (e.g. 38 -> 35).
# Vectorized column arithmetic replaces the per-row Python loop.
record_key['5yr_age'] = record_key.age - (record_key.age % 5)

In [10]:
record_key.head() # first rows of the underlying / 'true' record-level data, before any aggregation or noise

Unnamed: 0,sex_id,age,key,pweight,5yr_age
0,1,38,780,1,35
1,1,104,72,1,100
2,0,55,680,1,55
3,0,30,547,1,30
4,1,95,277,1,95


In [11]:
# Aggregate to one row per (sex, 5-year age band). Summing 'pweight' gives the cell
# count; summing 'key' gives the total from which the cell key is later derived.
freq_table = (
    record_key[['sex_id', '5yr_age', 'key', 'pweight']]
    .groupby(['sex_id', '5yr_age'])
    .sum()
    .reset_index()
)

In [12]:
# Cell key is ((sum of random keys in bin) mod b), computed column-wise rather than
# with a per-row Python loop.
freq_table['cell_key'] = freq_table.key % b
freq_table.head()

Unnamed: 0,sex_id,5yr_age,key,pweight,cell_key
0,0,0,11627,21,27
1,0,5,7917,18,117
2,0,10,10550,22,150
3,0,15,10946,26,146
4,0,20,6709,12,109


In [13]:
# The noise lookup indexes Ptable rows by cell count, so M must be >= the largest
# observed count. Check it here instead of hitting an opaque IndexError later.
max_cell_count = freq_table.pweight.max()
assert max_cell_count <= M, f"M={M} is smaller than the largest cell count ({max_cell_count}); increase M"
max_cell_count

31

In [14]:
def add_noise(df, Ptable):
    """Look up each cell's perturbation in ``Ptable`` and add it to the cell count.

    The noise for a row of ``df`` is ``Ptable[pweight - 1][cell_key]``: the table
    row is selected by the cell count (1-based) and the column by the cell key.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with integer columns ``pweight`` (cell count) and ``cell_key``.
        It is modified in place: the columns ``noise`` and ``noisy_pweight``
        are added (or overwritten).
    Ptable : 2-D array-like
        Perturbation table. It needs at least ``max(pweight)`` rows and
        ``max(cell_key) + 1`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, now carrying the two new columns.

    Raises
    ------
    IndexError
        If any ``pweight`` lies outside ``1..n_rows`` or any ``cell_key`` lies
        outside ``0..n_cols - 1``.

    Notes
    -----
    No clamping is applied, so ``noisy_pweight`` is negative whenever the
    looked-up noise is below ``-pweight``.
    """
    table = np.asarray(Ptable)
    n_rows, n_cols = table.shape
    counts = df.pweight.to_numpy()
    keys = df.cell_key.to_numpy()

    # Validate explicitly: a count of 0 (row index -1) or a negative key would
    # otherwise wrap around through negative indexing and silently return noise
    # from the wrong row/column.
    if ((counts < 1) | (counts > n_rows)).any():
        raise IndexError(
            f"pweight values must lie in 1..{n_rows} to index the rows of Ptable"
        )
    if ((keys < 0) | (keys >= n_cols)).any():
        raise IndexError(
            f"cell_key values must lie in 0..{n_cols - 1} to index the columns of Ptable"
        )

    df['noise'] = [table[count - 1, key] for count, key in zip(counts, keys)]
    df['noisy_pweight'] = df.pweight + df.noise

    return df

In [15]:
# Perturb a copy: add_noise writes its result columns onto the frame it is given,
# and freq_table (already displayed above) should keep only the unperturbed columns.
output = add_noise(freq_table.copy(), Ptable)

In [16]:
output.head() # first rows of the perturbed table: true count (pweight), looked-up noise, and noisy count

Unnamed: 0,sex_id,5yr_age,key,pweight,cell_key,noise,noisy_pweight
0,0,0,11627,21,27,-2,19
1,0,5,7917,18,117,5,23
2,0,10,10550,22,150,-5,17
3,0,15,10946,26,146,0,26
4,0,20,6709,12,109,2,14


In [17]:
output.noise.value_counts() # distribution of noise added: number of cells per noise value, most frequent first

-2    7
 1    6
-1    6
 0    6
 4    5
 5    4
-5    4
 2    4
 3    3
-4    2
-3    1
Name: noise, dtype: int64