In [21]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import StratifiedShuffleSplit


In [9]:
data = pd.read_csv("credit_data.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   i#clientid  2000 non-null   int64  
 1   income      2000 non-null   float64
 2   age         1997 non-null   float64
 3   loan        2000 non-null   float64
 4   c#default   2000 non-null   int64  
dtypes: float64(3), int64(2)
memory usage: 78.2 KB


In [38]:
data.head(-1)

Unnamed: 0,i#clientid,income,age,loan,c#default,grupo
0,1,66155.925095,59.017015,8106.532131,0,0
1,2,34415.153966,48.117153,6564.745018,0,0
2,3,57317.170063,63.108049,8020.953296,0,0
3,4,42709.534201,45.751972,6103.642260,0,1
4,5,66952.688845,18.584336,8770.099235,1,1
...,...,...,...,...,...,...
1994,1995,24254.700791,37.751622,2225.284643,0,664
1995,1996,59221.044874,48.518179,1926.729397,0,665
1996,1997,69516.127573,23.162104,3503.176156,0,665
1997,1998,44311.449262,28.017167,5522.786693,1,665


In [40]:
def amostra_aleatoria_simples(data, n_amostras):
    return data.sample(n = n_amostras, random_state = 1)

df_aas = amostra_aleatoria_simples(data, 1000)
df_aas.shape

(1000, 6)

In [41]:
def amostra_sistematica(data, n_amostras):
    passo = len(data) // n_amostras
    random.seed(1)
    inicio = random.randint(0, passo)
    indices = np.arange(inicio, len(data), step = passo)
    amostra_sistematica = data.iloc[indices]
    return amostra_sistematica

df_amostra_sist = amostra_sistematica(data, 1000)
df_amostra_sist.shape

(1000, 6)

In [45]:
def amostragem_grupo(data, num_grupos):
    intervalo = len(data) // num_grupos
    grupos = []
    id_grupo = 0
    cont = 0
    for _ in data.iterrows():
        grupos.append(id_grupo)
        cont += 1
        if cont > intervalo:
            cont = 0
            id_grupo += 1          
    data['grupo'] = grupos
    grupo_selecao = random.randint(0, num_grupos)
    return data[data['grupo'] == grupo_selecao]

df_amostra_grupo = amostragem_grupo(data, 2)
df_amostra_grupo['grupo'].value_counts

<bound method IndexOpsMixin.value_counts of 1001    1
1002    1
1003    1
1004    1
1005    1
       ..
1995    1
1996    1
1997    1
1998    1
1999    1
Name: grupo, Length: 999, dtype: int64>

In [46]:
amostra_grupo.head()

Unnamed: 0,i#clientid,income,age,loan,c#default,grupo
783,784,36029.301577,52.640624,2928.100439,0,261
784,785,48457.963548,22.344924,8108.172683,1,261
785,786,46038.510655,39.038673,6868.987805,0,261


In [49]:
def amostragem_estratificada (data, percentual, campo):
    split = StratifiedShuffleSplit(test_size=percentual, random_state=1)
    for _, y in split.split(data, data[campo]):
        df_y = data.iloc[y]
    return df_y

df_amostra_estratif = amostragem_estratificada(data, 0.5, 'c#default')
df_amostra_estratif.shape

(1000, 6)

In [51]:
def amostragem_reservatorio(dataset, amostras):
  stream = []
  for i in range(len(dataset)):
    stream.append(i)

  i = 0
  tamanho = len(dataset)

  reservatorio = [0] * amostras
  for i in range(amostras):
    reservatorio[i] = stream[i]

  while (i < tamanho):
    j = random.randrange(i + 1)
    if (j < amostras):
      reservatorio[j] = stream[i]
    i += 1

  return dataset.iloc[reservatorio]

df_amostra_reserv = amostragem_reservatorio(data, 1000)
df_amostra_reserv.shape

(1000, 6)

In [52]:
print(data['age'].mean(), data['income'].mean(),
      data['loan'].mean())


40.80755937840458 45331.60001779333 4444.369694688262


In [34]:
print(df_aas['age'].mean(),
df_aas['income'].mean(),
df_aas['loan'].mean())




40.49552561124429 45563.26865376901 4449.4469004423645


In [35]:
print(df_amostra_grupo['age'].mean(),
df_aas['income'].mean(),
df_aas['loan'].mean())

34.852124052743136 45563.26865376901 4449.4469004423645


In [36]:
print(df_amostra_sist['age'].mean(),
      df_aas['income'].mean(),
      df_aas['loan'].mean())


40.91117381141754 45563.26865376901 4449.4469004423645


In [37]:
print(df_amostra_estratif['age'].mean(),
      df_aas['income'].mean(),
      df_aas['loan'].mean())


40.53363707173919 45563.26865376901 4449.4469004423645
