In [None]:
# https://jakevdp.github.io/PythonDataScienceHandbook/01.01-help-and-documentation.html
# https://www.dataquest.io/blog/jupyter-notebook-tips-tricks-shortcuts/

In [2]:
import pandas as pd
import numpy as np

RESOURCES_FOLDER = 'resources'

In [3]:
# Load datasets
iris_dataset = pd.read_csv(f'{RESOURCES_FOLDER}/iris.csv')
infert_dataset = pd.read_csv(f'{RESOURCES_FOLDER}/infert.csv')

# Sampling (amostragem)

## Simple (simples)

`np.random.choice`
- `a` : sample generator
- `size` : sample size
- `replace` : items from `a` can repeat
- `p` : prob of each `a` item to appear.
    * MUST sum 1

In [10]:
# See documentation
?np.random.choice

In [8]:
print(iris_dataset.shape)
iris_dataset.describe()

(150, 5)


Unnamed: 0,sepal length,sepal width,petal length,petal width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [29]:
# Everytime that these commands are executed the sample changes
sample_mask = np.random.choice(a=[0,1], size=iris_dataset.shape[0], replace=True, p=[0.5, 0.5])
print(sample_mask[:20])

print(len(sample_mask[sample_mask==1]))
print(len(sample_mask[sample_mask==0]))

iris_dataset[sample_mask==1][:10]

[0 1 1 1 1 1 1 0 0 0 0 1 0 0 1 0 1 0 1 0]
74
76


Unnamed: 0,sepal length,sepal width,petal length,petal width,class
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
6,4.6,3.4,1.4,0.3,Iris-setosa
11,4.8,3.4,1.6,0.2,Iris-setosa
14,5.8,4.0,1.2,0.2,Iris-setosa
16,5.4,3.9,1.3,0.4,Iris-setosa
18,5.7,3.8,1.7,0.3,Iris-setosa


In [26]:
# Reproduce random results
## seed must be set BEFORE sampling.
np.random.seed(1) 
sample_mask = np.random.choice(a=[0,1], size=iris_dataset.shape[0], replace=True, p=[0.5, 0.5])
print(sample_mask[:20])

print(len(sample_mask[sample_mask==1]))
print(len(sample_mask[sample_mask==0]))

iris_dataset[sample_mask==1][:10]

[0 1 0 0 0 0 0 0 0 1 0 1 0 1 0 1 0 1 0 0]
76
74


Unnamed: 0,sepal length,sepal width,petal length,petal width,class
1,4.9,3.0,1.4,0.2,Iris-setosa
9,4.9,3.1,1.5,0.1,Iris-setosa
11,4.8,3.4,1.6,0.2,Iris-setosa
13,4.3,3.0,1.1,0.1,Iris-setosa
15,5.7,4.4,1.5,0.4,Iris-setosa
17,5.1,3.5,1.4,0.3,Iris-setosa
20,5.4,3.4,1.7,0.2,Iris-setosa
21,5.1,3.7,1.5,0.4,Iris-setosa
23,5.1,3.3,1.7,0.5,Iris-setosa
24,4.8,3.4,1.9,0.2,Iris-setosa


## Stratified (estratificada)

Compenents percentual division

`train_test_split`

- `arrays` : data to split, data class to splict
- `test_size` : test percentual size
- `stratify` : column to guide stratification

In [1]:
from sklearn.model_selection import train_test_split

In [1]:
?train_test_split

Object `train_test_split` not found.


### Iris dataset

In [8]:
# Count distincts
iris_dataset['class'].value_counts()

Iris-setosa        50
Iris-virginica     50
Iris-versicolor    50
Name: class, dtype: int64

In [9]:
x,_,y,_ = train_test_split(iris_dataset.iloc[:, :4], iris_dataset.iloc[:, 4], 
                           test_size=0.5, stratify=iris_dataset.iloc[:, 4])
print(f'x.shape : {x.shape}')
print(y.value_counts())

x.shape : (75, 4)
Iris-virginica     25
Iris-versicolor    25
Iris-setosa        25
Name: class, dtype: int64


### Infert dataset

In [42]:
# Always brings different results
x,_,y,_ = train_test_split(infert_dataset.iloc[:, 2:], infert_dataset.iloc[:, 1],
                          test_size=0.6, stratify=infert_dataset.iloc[:, 2])
print(f'x.shape : {x.shape}')
print(y.value_counts())

x.shape : (99, 7)
6-11yrs    52
12+ yrs    44
0-5yrs      3
Name: education, dtype: int64


## Systemic


In [7]:
?np.random.randint

In [70]:
from math import ceil,floor

In [104]:
# NOTE: error prone. Why? by using ceil, depedending on sample_size the acummulator overfloats
sample_size = 40
sample_step = ceil(iris_dataset.shape[0]/sample_size)
accumulator = np.random.randint(low=0, high=sample_step+1, size=1)[0]

print(f'sample_size : {sample_size}')
print(f'sample_step : {sample_step}')
print(f'acummulator: {accumulator}')

sample = list()

for _ in range(sample_size):
    sample.append(acummulator)
    acummulator += sample_step

print(f'sample: {sample}')

sample_size : 40
sample_step : 4
acummulator: 1
sample: [1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61, 65, 69, 73, 77, 81, 85, 89, 93, 97, 101, 105, 109, 113, 117, 121, 125, 129, 133, 137, 141, 145, 149, 153, 157]


In [222]:
# R package : teaching sampling : S.SY function (adapted)
population_size = iris_dataset.shape[0]                                          
sample_step = 6                                                                 
accumulator = np.random.randint(low=0, high=sample_step+1, size=1)[0]            
sample_ceil = population_size - sample_step * floor(population_size/sample_step)

if accumulator <= sample_ceil:
    sample_size = floor(population_size/sample_step) + 1
else:
    sample_size = floor(population_size/sample_step)

sample = list()
for i in range(sample_size):
    idx = accumulator + (sample_step * (i))
    sample.append(idx)

print(f'sample_ceil : {sample_ceil}')
print(f'sample_size : {sample_size}')
print(f'acummulator: {accumulator}')
print(f'sample: {sample}')

sample_ceil : 0
sample_size : 25
acummulator: 1
sample: [1, 7, 13, 19, 25, 31, 37, 43, 49, 55, 61, 67, 73, 79, 85, 91, 97, 103, 109, 115, 121, 127, 133, 139, 145]
