### https://samplics.readthedocs.io/en/latest/tutorial/psu_selection.html

In [1]:
import numpy as np
from samplics.datasets import load_psu_frame
from samplics.sampling import SampleSelection

In [3]:
# Fetch the example PSU frame from samplics.datasets; the table itself is
# stored under the "data" key of the returned object.
psu_frame_dict = load_psu_frame()
psu_frame = psu_frame_dict["data"]

# Preview the top of the frame (up to 15 rows).
psu_frame.head(n=15)

Unnamed: 0,cluster,region,number_households_census,cluster_status,comment
0,1,North,105,1,
1,2,North,85,1,
2,3,North,95,1,
3,4,North,75,1,
4,5,North,120,1,
5,6,North,90,1,
6,7,North,130,1,
7,8,North,55,1,
8,9,North,30,1,
9,10,North,600,1,due to a large building


In [4]:
# Number of PSUs to draw in each region, keyed by region name.
psu_sample_size = dict(East=3, West=2, North=2, South=3)
print(f"\nThe sample size per domain is: {psu_sample_size}\n")


The sample size per domain is: {'East': 3, 'West': 2, 'North': 2, 'South': 3}



In [6]:
from samplics import array_to_dict

# Tally how many clusters the frame holds in each region (stratum).
frame_size = array_to_dict(psu_frame["region"])
print(f"\nThe number of clusters per stratum is: {frame_size}")

# Start from a copy of the per-stratum counts so the keys keep the same
# order, then replace every count with the number of PSUs to select.
psu_sample_size = frame_size.copy()
psu_sample_size.update({"East": 3, "North": 2, "South": 3, "West": 2})
print(f"\nThe sample size per stratum is: {psu_sample_size}\n")


The number of clusters per stratum is: {'East': 25, 'North': 10, 'South': 20, 'West': 45}

The sample size per stratum is: {'East': 3, 'North': 2, 'South': 3, 'West': 2}



In [7]:
# First-stage design: method code "pps-sys", stratified, without replacement.
stage1_design = SampleSelection(
    method="pps-sys",
    stratification=True,
    with_replacement=False,
)

# Inclusion probability of every cluster under that design.
psu_inclusion_probs = stage1_design.inclusion_probs(
    psu_frame["cluster"],                   # sampling unit
    psu_sample_size,                        # PSUs to draw per stratum
    psu_frame["region"],                    # stratum
    psu_frame["number_households_census"],  # measure of size
)
psu_frame["psu_prob"] = psu_inclusion_probs

nb_obs = 15
print(f"\nFirst {nb_obs} observations of the PSU frame \n")
psu_frame.head(nb_obs)


First 15 observations of the PSU frame 



Unnamed: 0,cluster,region,number_households_census,cluster_status,comment,psu_prob
0,1,North,105,1,,0.151625
1,2,North,85,1,,0.122744
2,3,North,95,1,,0.137184
3,4,North,75,1,,0.108303
4,5,North,120,1,,0.173285
5,6,North,90,1,,0.129964
6,7,North,130,1,,0.187726
7,8,North,55,1,,0.079422
8,9,North,30,1,,0.043321
9,10,North,600,1,due to a large building,0.866426


In [8]:
# Fix the seed before drawing the sample.
np.random.seed(23)

# select() hands back three per-cluster results, stored below as the
# sample flag, the number of hits and the probability columns.
selected_flag, hit_count, selection_prob = stage1_design.select(
    psu_frame["cluster"],
    psu_sample_size,
    psu_frame["region"],
    psu_frame["number_households_census"],
)
psu_frame["psu_sample"] = selected_flag
psu_frame["psu_hits"] = hit_count
psu_frame["psu_probs"] = selection_prob

nb_obs = 15
print(f"\nFirst {nb_obs} observations of the PSU frame with the sampling information \n")
psu_frame.head(nb_obs)


First 15 observations of the PSU frame with the sampling information 



Unnamed: 0,cluster,region,number_households_census,cluster_status,comment,psu_prob,psu_sample,psu_hits,psu_probs
0,1,North,105,1,,0.151625,0,0,0.151625
1,2,North,85,1,,0.122744,0,0,0.122744
2,3,North,95,1,,0.137184,0,0,0.137184
3,4,North,75,1,,0.108303,0,0,0.108303
4,5,North,120,1,,0.173285,0,0,0.173285
5,6,North,90,1,,0.129964,0,0,0.129964
6,7,North,130,1,,0.187726,1,1,0.187726
7,8,North,55,1,,0.079422,0,0,0.079422
8,9,North,30,1,,0.043321,0,0,0.043321
9,10,North,600,1,due to a large building,0.866426,1,1,0.866426


In [9]:
# Re-seed with the same value used for the previous draw.
np.random.seed(23)

# Positional inputs for select(): sampling unit, sample sizes per stratum,
# stratum and measure of size.
selection_inputs = (
    psu_frame["cluster"],
    psu_sample_size,
    psu_frame["region"],
    psu_frame["number_households_census"],
)

# Ask for a dataframe restricted to the selected units.
psu_sample = stage1_design.select(*selection_inputs, to_dataframe=True, sample_only=True)

print("\nPSU sample without the non-sampled units\n")
psu_sample


PSU sample without the non-sampled units



Unnamed: 0,_samp_unit,_stratum,_mos,_sample,_hits,_probs
0,7,North,130,1,1,0.187726
1,10,North,600,1,1,0.866426
2,16,South,190,1,1,0.209174
3,24,South,75,1,1,0.082569
4,29,South,200,1,1,0.220183
5,34,East,305,1,1,0.210587
6,45,East,450,1,1,0.310702
7,52,East,700,1,1,0.483314
8,64,West,300,1,1,0.091673
9,86,West,280,1,1,0.085561


In [10]:
# Available sample selection methods (the class is SampleSelection, as imported above)
#SampleSelection(method="pps-sys", with_replacement=True)
#SampleSelection(method="pps-sys", with_replacement=False)
#SampleSelection(method="pps-brewer", with_replacement=False)
#SampleSelection(method="pps-hv", with_replacement=False) # Hanurav-Vijayan method
#SampleSelection(method="pps-murphy", with_replacement=False)
#SampleSelection(method="pps-rs", with_replacement=False) # Rao-Sampford method

NameError: name 'Sample' is not defined

In [11]:
# Seed the generator before the draw.
np.random.seed(23)

# Same stratified, without-replacement setup, but with method code "pps-rs".
stage1_sampford = SampleSelection(
    method="pps-rs",
    stratification=True,
    with_replacement=False,
)

# Positional inputs: sampling unit, sample sizes per stratum, stratum, measure of size.
sampford_inputs = (
    psu_frame["cluster"],
    psu_sample_size,
    psu_frame["region"],
    psu_frame["number_households_census"],
)
# Return a dataframe and keep the non-selected units in it as well.
sampford_options = {"to_dataframe": True, "sample_only": False}

psu_sample_sampford = stage1_sampford.select(*sampford_inputs, **sampford_options)

psu_sample_sampford

Unnamed: 0,_samp_unit,_stratum,_mos,_sample,_hits,_probs
0,1,North,105,0,0,0.151625
1,2,North,85,0,0,0.122744
2,3,North,95,1,1,0.137184
3,4,North,75,0,0,0.108303
4,5,North,120,0,0,0.173285
...,...,...,...,...,...,...
95,96,West,95,1,1,0.029030
96,97,West,40,0,0,0.012223
97,98,West,105,0,0,0.032086
98,99,West,320,0,0,0.097785
