# AMOSTRAGEM

Exemplo prático de técnicas de amostragem aplicadas ao Census - EUA.

The Census Bureau's mission is to serve as the nation's leading provider of quality data about its people and economy.

In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import StratifiedShuffleSplit

In [2]:
data = pd.read_csv(r"census.csv")
data.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
data.tail()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K
32560,52,Self-emp-inc,287927,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,40,United-States,>50K


# Implementação de técnicas de amostragem

## Amostragem aleatória simples

Indica-se o tamanho da amostra e ela é selecionada aleatoriamente na base de dados.

Todas as amostras de tamanho *n* tem a mesma chance de serem escolhidas

In [4]:
#criando amostragem a partir de uma função

def amostragem_ale_simples (dataset, amostras):
    return data.sample(n = amostras, random_state=1)


df_amostra_simples = amostragem_ale_simples(data, 100)

In [5]:
print(df_amostra_simples.shape)
df_amostra_simples.head()


(100, 15)


Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
9646,62,Self-emp-not-inc,26911,7th-8th,4,Widowed,Other-service,Not-in-family,White,Female,0,0,66,United-States,<=50K
709,18,Private,208103,11th,7,Never-married,Other-service,Other-relative,White,Male,0,0,25,United-States,<=50K
7385,25,Private,102476,Bachelors,13,Never-married,Farming-fishing,Own-child,White,Male,27828,0,50,United-States,>50K
16671,33,Private,511517,HS-grad,9,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,<=50K
21932,36,Private,292570,11th,7,Never-married,Machine-op-inspct,Unmarried,White,Female,0,0,40,United-States,<=50K


## Amostragem sistemática

Utilizada quando os elementos estão dispostos de maneira organizada.

Escolhe-se um ponto de partida e cada *k-ésimo* elemento da população.


In [6]:
# Amostra de 100 pessoas ao passo x
passo = len(data) // 100
passo

325

In [7]:
#inicio
random.seed(1) 
random.randint(0,325)

68

In [8]:
#seleção das amostras sistematicas
np.arange(68,len(data),step = 325)

array([   68,   393,   718,  1043,  1368,  1693,  2018,  2343,  2668,
        2993,  3318,  3643,  3968,  4293,  4618,  4943,  5268,  5593,
        5918,  6243,  6568,  6893,  7218,  7543,  7868,  8193,  8518,
        8843,  9168,  9493,  9818, 10143, 10468, 10793, 11118, 11443,
       11768, 12093, 12418, 12743, 13068, 13393, 13718, 14043, 14368,
       14693, 15018, 15343, 15668, 15993, 16318, 16643, 16968, 17293,
       17618, 17943, 18268, 18593, 18918, 19243, 19568, 19893, 20218,
       20543, 20868, 21193, 21518, 21843, 22168, 22493, 22818, 23143,
       23468, 23793, 24118, 24443, 24768, 25093, 25418, 25743, 26068,
       26393, 26718, 27043, 27368, 27693, 28018, 28343, 28668, 28993,
       29318, 29643, 29968, 30293, 30618, 30943, 31268, 31593, 31918,
       32243])

In [9]:
# Função
 
def amostragem_sistematica (dataset, num_amostras):
    passo = len(dataset) // num_amostras
    random.seed(1)
    inicio = random.randint(0, passo)
    indices = np.arange(inicio, len(data), step = passo)
    amostra_sistematica = data.iloc[indices]
    return amostra_sistematica

df_amostra_sistematica = amostragem_sistematica(data, 100)
print(df_amostra_sistematica.shape)
df_amostra_sistematica.head()

(100, 15)


Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
68,49,Self-emp-inc,191681,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,50,United-States,>50K
393,34,State-gov,98101,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,7688,0,45,?,>50K
718,22,Private,214399,Some-college,10,Never-married,Sales,Own-child,White,Female,0,0,15,United-States,<=50K
1043,44,Private,167005,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,7688,0,60,United-States,>50K
1368,52,Private,152234,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,Asian-Pac-Islander,Male,99999,0,40,Japan,>50K


## Amostragem por grupos

In [10]:
#definição do num de elementos em cada grupo (10)
len(data) // 10

3256

In [11]:
grupos = []
id_grupo = 0
cont = 0

for _ in data.iterrows():
    grupos.append(id_grupo)
    cont += 1
    if cont > 3256:
        cont = 0
        id_grupo += 1
        
#exibir valores unicos e sua contagem
np.unique(grupos, return_counts=True)


(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([3257, 3257, 3257, 3257, 3257, 3257, 3257, 3257, 3257, 3248],
       dtype=int64))

In [12]:
np.shape(grupos)

(32561,)

In [13]:
# add coluna grupo 
data['grupo'] = grupos
data.head(-1)

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income,grupo
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32555,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K,9
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K,9
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K,9
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K,9


In [14]:
#selecionando randomicamente o grupo
random.randint(0, 9)

9

In [15]:
#selecionando registros que grupo é 7

df_agrupamento = data[data['grupo'] == 7]
df_agrupamento.shape

(3257, 16)

In [16]:
df_agrupamento.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income,grupo
22799,25,Self-emp-not-inc,21472,Some-college,10,Never-married,Other-service,Not-in-family,White,Female,0,0,22,United-States,<=50K,7
22800,32,Private,90969,Assoc-voc,11,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,45,United-States,>50K,7
22801,26,Private,149734,HS-grad,9,Separated,Craft-repair,Unmarried,Black,Female,0,1594,40,United-States,<=50K,7
22802,42,Private,52849,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,<=50K,7
22803,39,Self-emp-not-inc,106347,Some-college,10,Divorced,Sales,Unmarried,White,Male,0,0,47,United-States,<=50K,7


In [17]:
# Função
def amostragem_grupo(data, num_grupos):
    intervalo = len(data) // num_grupos
    grupos = []
    id_grupo = 0
    cont = 0
    for _ in data.iterrows():
        grupos.append(id_grupo)
        cont += 1
        if cont > intervalo:
            cont = 0
            id_grupo += 1          
    data['grupo'] = grupos
    grupo_selecao = random.randint(0, num_grupos)
    return data[data['grupo'] == grupo_selecao]

In [18]:
df_amostra_agrupamento = amostragem_grupo(data, 100)
print(df_amostra_agrupamento.shape, df_amostra_agrupamento['grupo'].value_counts)
df_amostra_agrupamento.head(-1)

(326, 16) <bound method IndexOpsMixin.value_counts of 31622    97
31623    97
31624    97
31625    97
31626    97
         ..
31943    97
31944    97
31945    97
31946    97
31947    97
Name: grupo, Length: 326, dtype: int64>


Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income,grupo
31622,61,Self-emp-not-inc,268831,HS-grad,9,Married-civ-spouse,Sales,Husband,White,Male,0,0,53,United-States,<=50K,97
31623,45,Self-emp-not-inc,149640,Prof-school,15,Never-married,Prof-specialty,Not-in-family,White,Male,0,0,30,United-States,>50K,97
31624,29,Private,261725,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,35,United-States,<=50K,97
31625,74,Private,161387,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Female,0,0,16,United-States,<=50K,97
31626,61,Local-gov,260167,HS-grad,9,Widowed,Prof-specialty,Not-in-family,White,Female,0,0,40,United-States,<=50K,97
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31942,25,Local-gov,192321,Bachelors,13,Never-married,Prof-specialty,Own-child,White,Female,0,0,40,United-States,<=50K,97
31943,24,Private,88095,Some-college,10,Never-married,Machine-op-inspct,Not-in-family,White,Female,0,0,24,Mexico,<=50K,97
31944,44,Private,144067,Bachelors,13,Divorced,Adm-clerical,Not-in-family,White,Male,0,0,12,?,<=50K,97
31945,32,Private,124187,9th,5,Married-civ-spouse,Farming-fishing,Husband,Black,Male,0,0,40,United-States,<=50K,97


## Amostragem estratificada

Divide a população ou o "objeto de estudo" em diferentes subgrupos ou estratos diferentes, de maneira que um indivíduo pode fazer parte apenas de um único estrato ou camada.

Retorna a proporção da base de dados

In [19]:
data['income'].value_counts()

 <=50K    24720
 >50K      7841
Name: income, dtype: int64

In [20]:
# Porcentagem maior que 50k 

maior_50k = (7841 / len(data)) * 100
maior_50k

24.080955744602438

In [21]:
menor_50k = (24720 / len(data)) * 100
menor_50k

75.91904425539757

In [22]:
#x = 90%  y= 10%
split = StratifiedShuffleSplit(test_size=0.1)
for x, y in split.split(data, data['income']):
    df_x = data.iloc[x]
    df_y = data.iloc[y]
    
df_x.shape, df_y.shape

((29304, 16), (3257, 16))

In [23]:
100 / len(data)

0.0030711587481956942

In [24]:
split = StratifiedShuffleSplit(test_size= 100 / len(data))
for x, y in split.split(data, data['income']):
    df_x = data.iloc[x]
    df_y = data.iloc[y]

df_x.shape, df_y.shape


((32461, 16), (100, 16))

In [25]:
#verificando se a amostra foi feita corretamente
df_y['income'].value_counts()

 <=50K    76
 >50K     24
Name: income, dtype: int64

In [31]:
# Função 
def amostragem_estratificada (data, percentual):
    split = StratifiedShuffleSplit(test_size= percentual, random_state = 1)
    for _, y in split.split(data, data['income']):
        df_y = data.iloc[y]
    return df_y

df_amostra_estratificada = amostragem_estratificada(data, 100 / len(data))
df_amostra_estratificada.shape

(100, 16)

## Amostra de Reservatório

**Data stream** de itens com tamanho desconhecido que pode ser acessado somente uma vez.

Dados estão sendo sempre atualizados

In [None]:
def amostragem_reservatorio(data, amostras):
    stream = []
    for i in (len(data)):
        stream.append()
    i = 0
    tamanho = len(data)
    reservatorio = [0] * amostras
    for i in range(amostras):
        reservatorio[i] = stream[i]
    while i < tamanho:
        j = random.randrange(i + 1)
        if j < amostras:
            reservatorio[j] = stream[i]
        i += 1
    return data.iloc[reservatorio]

# Comparativo de Amostragens

In [32]:
data['age'].mean()

38.58164675532078

In [33]:
df_amostra_simples['age'].mean()

39.41

In [34]:
df_amostra_estratificada['age'].mean()

36.9

In [35]:
df_amostra_agrupamento['age'].mean()

40.38957055214724

In [36]:
df_amostra_sistematica['age'].mean()

37.57