In [1]:
# -*- coding:utf-8 -*- #

### 6.4.1  簡單隨機抽樣 

In [2]:
import statsmodels.api as sm
import pandas as pd
import numpy as np
import random

# Load the `anes96` dataset that ships with statsmodels as a pandas DataFrame.
anes96 = sm.datasets.anes96.load_pandas()
data = anes96.data

# Simple random sampling: draw 50 rows at random (unseeded, so every run
# of this cell yields a different subset).
df = data.sample(n=50)
print(df.head())

     popul  TVnews  selfLR  ClinLR  DoleLR  PID   age  educ  income  vote  \
84     0.0     2.0     6.0     4.0     5.0  4.0  34.0   4.0     5.0   1.0   
693  290.0     1.0     5.0     3.0     6.0  2.0  37.0   4.0    21.0   0.0   
884    9.0     1.0     3.0     3.0     6.0  2.0  65.0   7.0    24.0   0.0   
707    0.0     2.0     5.0     3.0     6.0  5.0  31.0   7.0    21.0   1.0   
70   350.0     1.0     4.0     5.0     6.0  5.0  30.0   6.0     5.0   0.0   

     logpopul  
84  -2.302585  
693  5.670226  
884  2.208274  
707 -2.302585  
70   5.858219  


### 6.4.2  系統抽樣 

In [3]:
# Systematic sampling: take every 10th row by position, starting at row 0.
index_list = list(range(0, len(data), 10))
df = data.iloc[index_list]
print(df.head())

     popul  TVnews  selfLR  ClinLR  DoleLR  PID   age  educ  income  vote  \
0      0.0     7.0     7.0     1.0     6.0  6.0  36.0   3.0     1.0   1.0   
10  1600.0     0.0     3.0     2.0     4.0  4.0  26.0   2.0     1.0   0.0   
20    74.0     7.0     4.0     4.0     7.0  2.0  88.0   2.0     2.0   0.0   
30   350.0     0.0     3.0     4.0     7.0  2.0  26.0   2.0     2.0   0.0   
40   180.0     6.0     5.0     5.0     5.0  0.0  51.0   2.0     3.0   0.0   

    logpopul  
0  -2.302585  
10  7.377821  
20  4.305416  
30  5.858219  
40  5.193512  


### 6.4.3  分層抽樣 

In [4]:
def typicalSampling(grp, typicalFracDict, random_state=None):
    """Sample one stratum at the fraction configured for that stratum.

    Written to be passed to ``DataFrameGroupBy.apply``, which exposes the
    group key as ``grp.name``.

    Parameters
    ----------
    grp : pandas.DataFrame
        Rows of a single stratum; ``grp.name`` must hold the stratum key.
    typicalFracDict : mapping
        Maps each stratum key to the fraction of that stratum's rows to draw.
    random_state : optional
        Forwarded unchanged to ``DataFrame.sample``. The default ``None``
        leaves the draw unseeded; pass a seed for reproducible results.

    Returns
    -------
    pandas.DataFrame
        The randomly selected rows of ``grp``.

    Raises
    ------
    KeyError
        If ``typicalFracDict`` is a dict with no entry for ``grp.name``.
    """
    frac = typicalFracDict[grp.name]
    return grp.sample(frac=frac, random_state=random_state)

# Sampling fraction for each stratum, keyed by the value of the `vote` column.
typicalFracDict = {0.0: 0.35, 1.0: 0.5}

# Stratified sampling: split the rows on `vote`, then sample every stratum
# at its own fraction.
strata = data.groupby('vote')
df = strata.apply(typicalSampling, typicalFracDict)
print(df.head())

          popul  TVnews  selfLR  ClinLR  DoleLR  PID   age  educ  income  \
vote                                                                       
0.0  255   93.0     2.0     1.0     3.0     6.0  1.0  41.0   6.0    14.0   
     311  190.0     4.0     2.0     3.0     6.0  0.0  62.0   7.0    15.0   
     89    31.0     7.0     2.0     2.0     7.0  0.0  22.0   4.0     6.0   
     1    190.0     1.0     3.0     3.0     5.0  1.0  20.0   4.0     1.0   
     558  290.0     7.0     2.0     2.0     7.0  0.0  35.0   4.0    19.0   

          vote  logpopul  
vote                      
0.0  255   0.0  4.533674  
     311   0.0  5.247550  
     89    0.0  3.437208  
     1     0.0  5.247550  
     558   0.0  5.670226  


### 6.4.4  整羣抽樣 

In [5]:
# Cluster sampling: treat each distinct `income` value as one cluster, pick
# 2 clusters at random, and keep every row that belongs to a picked cluster.
unique = np.unique(data['income'])
sample = random.sample(list(unique), 2)

# Collect the slice for each chosen cluster and concatenate once. Growing a
# frame with pd.concat inside the loop re-copies all accumulated rows on each
# pass, and seeding it with an empty DataFrame is deprecated in recent pandas.
# Row order is unchanged: clusters appear in the order they were drawn.
df = pd.concat([data[data['income'] == label] for label in sample])
print(df.head())

     popul  TVnews  selfLR  ClinLR  DoleLR  PID   age  educ  income  vote  \
573    1.0     3.0     6.0     1.0     6.0  4.0  49.0   6.0    20.0   1.0   
574   22.0     7.0     5.0     2.0     7.0  2.0  43.0   7.0    20.0   0.0   
575    1.0     0.0     4.0     3.0     5.0  6.0  39.0   4.0    20.0   1.0   
576    2.0     2.0     4.0     3.0     7.0  1.0  27.0   3.0    20.0   0.0   
577   34.0     4.0     6.0     2.0     6.0  6.0  30.0   6.0    20.0   1.0   

     logpopul  
573  0.095310  
574  3.095578  
575  0.095310  
576  0.741937  
577  3.529297  
