# Amostragem

### Carregamento da base de dados

In [86]:
import pandas as pd
import random
import numpy as np

from sklearn.model_selection import StratifiedShuffleSplit

In [6]:
df = pd.read_csv('../dados/census.csv')

In [8]:
df.shape

(32561, 15)

In [9]:
df.head(2)

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K


### Amostragem aleatória simples

In [12]:
df_amostra_aleatoria_simples = df.sample(n=100, random_state=1)

In [13]:
df_amostra_aleatoria_simples.head(3)

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
9646,62,Self-emp-not-inc,26911,7th-8th,4,Widowed,Other-service,Not-in-family,White,Female,0,0,66,United-States,<=50K
709,18,Private,208103,11th,7,Never-married,Other-service,Other-relative,White,Male,0,0,25,United-States,<=50K
7385,25,Private,102476,Bachelors,13,Never-married,Farming-fishing,Own-child,White,Male,27828,0,50,United-States,>50K


In [17]:
def amostragem_aleatoria_simples(df, n, seed):
    """Draw a simple random sample of ``n`` rows from ``df``.

    Args:
        df: Object exposing a pandas-style ``sample`` method.
        n: Number of rows to draw.
        seed: Value forwarded as ``random_state`` so the draw is repeatable.

    Returns:
        The result of ``df.sample`` for the requested size and seed.
    """
    return df.sample(random_state=seed, n=n)

In [18]:
df_amostragem_simples = amostragem_aleatoria_simples(df, 100, 2)

df_amostragem_simples.head(3)

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
16054,42,State-gov,160369,HS-grad,9,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,>50K
32382,44,Local-gov,150171,HS-grad,9,Divorced,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
10749,39,Local-gov,256997,Bachelors,13,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K


### Amostragem sistemática

In [21]:
tamanho_df = len(df)
n = 100

tamanho_df//n

325

In [23]:
# randrange excludes its upper bound, so the start offset always falls inside
# the first interval of 325 rows (randint(0, 325) could also return 325).
random.seed(1)
random.randrange(325)

68

In [26]:
np.arange(68, tamanho_df, step=325)

array([   68,   393,   718,  1043,  1368,  1693,  2018,  2343,  2668,
        2993,  3318,  3643,  3968,  4293,  4618,  4943,  5268,  5593,
        5918,  6243,  6568,  6893,  7218,  7543,  7868,  8193,  8518,
        8843,  9168,  9493,  9818, 10143, 10468, 10793, 11118, 11443,
       11768, 12093, 12418, 12743, 13068, 13393, 13718, 14043, 14368,
       14693, 15018, 15343, 15668, 15993, 16318, 16643, 16968, 17293,
       17618, 17943, 18268, 18593, 18918, 19243, 19568, 19893, 20218,
       20543, 20868, 21193, 21518, 21843, 22168, 22493, 22818, 23143,
       23468, 23793, 24118, 24443, 24768, 25093, 25418, 25743, 26068,
       26393, 26718, 27043, 27368, 27693, 28018, 28343, 28668, 28993,
       29318, 29643, 29968, 30293, 30618, 30943, 31268, 31593, 31918,
       32243])

In [48]:
def amostragem_sistematica(df, n, seed=1):
    """Return a systematic sample of exactly ``n`` rows from ``df``.

    Rows are picked at a fixed interval of ``len(df) // n`` positions,
    starting from a random offset drawn inside the first interval.

    Args:
        df: DataFrame to sample from; rows are selected by position.
        n: Number of rows to return; must satisfy ``1 <= n <= len(df)``.
        seed: Seed for the random start offset. Defaults to 1, the seed that
            used to be hard-coded in this function.

    Returns:
        A DataFrame with ``n`` rows selected through ``df.iloc``.

    Raises:
        ValueError: If ``n`` is not between 1 and ``len(df)``.
    """
    total = len(df)
    if n <= 0 or n > total:
        raise ValueError("n must be between 1 and len(df)")

    passo = total // n
    # A private generator keeps the draw reproducible without reseeding the
    # module-level RNG that other code may be relying on.
    # randrange excludes ``passo`` itself: the offset must stay inside the
    # first interval, otherwise the first ``passo`` rows are over-skipped.
    inicio = random.Random(seed).randrange(passo)

    # When len(df) is not a multiple of n the arange can hold one position
    # too many, so it is trimmed to the requested sample size.
    posicoes = np.arange(inicio, total, step=passo)[:n]
    return df.iloc[posicoes]

In [49]:
amostragem_sistematica(df, 100).head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
68,49,Self-emp-inc,191681,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,50,United-States,>50K
393,34,State-gov,98101,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,7688,0,45,?,>50K
718,22,Private,214399,Some-college,10,Never-married,Sales,Own-child,White,Female,0,0,15,United-States,<=50K
1043,44,Private,167005,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,7688,0,60,United-States,>50K
1368,52,Private,152234,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,Asian-Pac-Islander,Male,99999,0,40,Japan,>50K


### Amostragem por grupos

In [75]:
len(df) / 10

3256.1

In [51]:
# One group id per row, in row order: ids start at 0 and advance after every
# 3257 consecutive rows, so the last group takes whatever rows are left.
grupos = [posicao // 3257 for posicao in range(len(df))]

In [53]:
np.unique(grupos, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([3257, 3257, 3257, 3257, 3257, 3257, 3257, 3257, 3257, 3248],
       dtype=int64))

In [54]:
np.shape(grupos), df.shape

((32561,), (32561, 15))

In [55]:
df['grupo'] = grupos

In [57]:
df.head(1)

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income,grupo
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,0


In [59]:
df.tail(1)

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income,grupo
32560,52,Self-emp-inc,287927,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,40,United-States,>50K,9


In [71]:
random.seed(9)
random.randint(0,9)

7

In [72]:
df_agrupamento = df[df['grupo'] == 7]

In [74]:
df_agrupamento.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income,grupo
22799,25,Self-emp-not-inc,21472,Some-college,10,Never-married,Other-service,Not-in-family,White,Female,0,0,22,United-States,<=50K,7
22800,32,Private,90969,Assoc-voc,11,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,45,United-States,>50K,7
22801,26,Private,149734,HS-grad,9,Separated,Craft-repair,Unmarried,Black,Female,0,1594,40,United-States,<=50K,7
22802,42,Private,52849,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,<=50K,7
22803,39,Self-emp-not-inc,106347,Some-college,10,Divorced,Sales,Unmarried,White,Male,0,0,47,United-States,<=50K,7


In [80]:
def amostragem_agrupamento(df, numero_grupos, seed=8):
    """Return one randomly chosen block of consecutive rows (cluster sample).

    The rows of ``df`` are cut, in order, into blocks of
    ``ceil(len(df) / numero_grupos)`` rows; the last block holds the
    remainder. One block is drawn at random and returned with a ``grupo``
    column holding its id. The caller's ``df`` is left untouched.

    Args:
        df: DataFrame to sample from.
        numero_grupos: Desired number of groups; must be positive.
        seed: Seed for the group draw. Defaults to 8, the seed that used to
            be hard-coded in this function.

    Returns:
        A new DataFrame with the rows of the drawn group plus the ``grupo``
        column. An empty ``df`` yields an empty result.

    Raises:
        ValueError: If ``numero_grupos`` is not positive.
    """
    if numero_grupos <= 0:
        raise ValueError("numero_grupos must be a positive number")

    total = len(df)
    if total == 0:
        return df.assign(grupo=np.array([], dtype=int))

    # Ceiling division: evenly divisible inputs get groups of exactly
    # total / numero_grupos rows instead of one row too many.
    tamanho_grupo = -(-total // numero_grupos)
    grupos = np.arange(total) // tamanho_grupo

    # Draw only among groups that actually exist, so the result is never an
    # empty frame caused by picking an id beyond the last group.
    grupos_formados = int(grupos[-1]) + 1
    grupo_sorteado = random.Random(seed).randrange(grupos_formados)

    # Boolean-mask selection returns a new frame; assign() adds the label to
    # that new frame only.
    return df[grupos == grupo_sorteado].assign(grupo=grupo_sorteado)

In [81]:
df_amostragem_grupo = amostragem_agrupamento(df, 10)

In [82]:
df_amostragem_grupo.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income,grupo
9771,24,Private,168997,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K,3
9772,39,Private,168894,Some-college,10,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,40,United-States,<=50K,3
9773,21,Private,149809,Assoc-acdm,12,Never-married,Sales,Own-child,White,Male,0,0,40,United-States,<=50K,3
9774,34,Private,344073,HS-grad,9,Separated,Adm-clerical,Not-in-family,White,Male,0,0,40,United-States,>50K,3
9775,22,Private,416165,Some-college,10,Never-married,Sales,Unmarried,White,Female,0,0,32,United-States,<=50K,3


### Amostragem estratificada

In [88]:
df['income'].value_counts()

 <=50K    24720
 >50K      7841
Name: income, dtype: int64

In [89]:
7841 / len(df)

0.2408095574460244

In [90]:
24720 / len(df)

0.7591904425539756

In [94]:
100 / len(df)

0.0030711587481956942

In [95]:
# An integer test_size asks for exactly 100 test rows (no pasted fraction),
# and a single split is enough because only one train/test pair is kept.
split = StratifiedShuffleSplit(n_splits=1, test_size=100)

In [96]:
# Every pass overwrites df_x / df_y, so the pair from the last split yielded
# is the one that remains after the loop.
for indices_x, indices_y in split.split(df, df['income']):
    df_x, df_y = df.iloc[indices_x], df.iloc[indices_y]

In [97]:
df_x.shape, df_y.shape

((32461, 16), (100, 16))

In [98]:
df_y['income'].value_counts()

 <=50K    76
 >50K     24
Name: income, dtype: int64

### Amostragem de reservatório

In [99]:
# Row positions 0 .. len(df) - 1, in order, standing in for a data stream.
stream = list(range(len(df)))

In [107]:
def amostragem_reservatorio(df, amostras, seed=None):
    """Sample ``amostras`` distinct rows from ``df`` via reservoir sampling.

    Row positions are visited once, in order. The first ``amostras``
    positions fill the reservoir; each later position ``i`` replaces a
    random slot with probability ``amostras / (i + 1)``.

    Args:
        df: DataFrame to sample from; rows are selected by position.
        amostras: Reservoir size (number of rows wanted); must be >= 0.
        seed: Optional seed for a private generator. When ``None`` the
            module-level ``random`` state is used.

    Returns:
        A DataFrame with ``min(amostras, len(df))`` rows taken through
        ``df.iloc``, with no row repeated.

    Raises:
        ValueError: If ``amostras`` is negative.
    """
    if amostras < 0:
        raise ValueError("amostras must be non-negative")

    gerador = random if seed is None else random.Random(seed)
    tamanho = len(df)

    # A stream shorter than the reservoir simply ends up entirely inside it.
    reservatorio = list(range(min(amostras, tamanho)))

    # Replacement starts at the first position NOT already in the reservoir;
    # starting one earlier would copy an existing entry over another slot.
    for i in range(amostras, tamanho):
        j = gerador.randrange(i + 1)
        if j < amostras:
            reservatorio[j] = i

    return df.iloc[reservatorio]
    

In [109]:
amostragem_reservatorio(df, 100).head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income,grupo
10107,31,Private,341672,Bachelors,13,Married-civ-spouse,Adm-clerical,Husband,Asian-Pac-Islander,Male,0,0,60,India,<=50K,3
20862,45,Private,155489,Some-college,10,Married-civ-spouse,Sales,Husband,White,Male,0,0,60,United-States,>50K,6
3313,23,Private,227594,Bachelors,13,Never-married,Prof-specialty,Own-child,White,Female,0,0,38,United-States,<=50K,1
3176,49,Private,160647,Bachelors,13,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,40,United-States,>50K,0
1735,64,Self-emp-not-inc,134960,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,15024,0,35,United-States,>50K,0
