In [1]:
# -*- coding:utf-8 -*-

import pandas as pd
import numpy as np

### 3.3.1  分組groupby

In [2]:
import statsmodels.api as sm

# Load the ANES96 dataset bundled with statsmodels and preview it.
anes = sm.datasets.anes96.load_pandas()
data = anes.data
print(data.head())

# Group by three kinds of key and print how many groups each one produces.
grouping_keys = (
    'educ',             # a single column
    ['educ', 'vote'],   # two columns
    lambda n: n % 2,    # a function of the index label (even/odd)
)
for key in grouping_keys:
    grp = data.groupby(key)
    print(len(grp))

# `grp` is now the even/odd grouping; key 1 selects the odd index labels.
odd_rows = grp.get_group(1)
print(odd_rows.head())

# Iterating over the grouped frame yields (key, sub-frame) pairs.
for desc, item in grp:
    print(desc, item.head())

# Selecting a column first changes what each iteration yields; print its type.
age_groups = grp['age']
for desc, item in age_groups:
    print(desc, type(item))

grp = data.groupby(['vote'])
vote_column = grp['vote']
print(vote_column.count())  # non-null count per group (people per group)

# Mean age per group, with the group key turned back into a regular column.
mean_age = grp['age'].mean()
df = mean_age.reset_index()
print(type(df))
print(df)

   popul  TVnews  selfLR  ClinLR  DoleLR  PID   age  educ  income  vote  \
0    0.0     7.0     7.0     1.0     6.0  6.0  36.0   3.0     1.0   1.0   
1  190.0     1.0     3.0     3.0     5.0  1.0  20.0   4.0     1.0   0.0   
2   31.0     7.0     2.0     2.0     6.0  1.0  24.0   6.0     1.0   0.0   
3   83.0     4.0     3.0     4.0     5.0  1.0  28.0   6.0     1.0   0.0   
4  640.0     7.0     5.0     6.0     4.0  0.0  68.0   6.0     1.0   0.0   

   logpopul  
0 -2.302585  
1  5.247550  
2  3.437208  
3  4.420045  
4  6.461624  
7
14
2
    popul  TVnews  selfLR  ClinLR  DoleLR  PID   age  educ  income  vote  \
1   190.0     1.0     3.0     3.0     5.0  1.0  20.0   4.0     1.0   0.0   
3    83.0     4.0     3.0     4.0     5.0  1.0  28.0   6.0     1.0   0.0   
5   110.0     3.0     3.0     4.0     6.0  1.0  21.0   4.0     1.0   0.0   
7    31.0     1.0     5.0     4.0     5.0  4.0  21.0   4.0     1.0   0.0   
9  2800.0     0.0     3.0     3.0     7.0  0.0  39.0   3.0     1.0   0.0   

 

### 3.3.2  聚合agg

In [3]:
# Load the credit-card dataset and show its first 5 rows.
data = sm.datasets.ccard.load_pandas().data
print(data.head())

grp = data.groupby('OWNRENT')

# Per-group column means. The aggregation is named by string: passing
# np.mean to agg() is deprecated in recent pandas, which recommends 'mean'.
print(grp.agg('mean'))

# For each OWNRENT group, show the row with the highest INCOME.
# agg() invokes a callable once per column, so a lambda that expects the whole
# group frame (df.INCOME ...) only worked through a fallback that newer pandas
# no longer provides. Look up the winning row labels explicitly instead.
top_income_labels = grp['INCOME'].idxmax()
print(data.loc[top_income_labels].set_index('OWNRENT'))

   AVGEXP   AGE  INCOME  INCOMESQ  OWNRENT
0  124.98  38.0    4.52   20.4304      1.0
1    9.85  33.0    2.42    5.8564      0.0
2   15.00  34.0    4.50   20.2500      1.0
3  137.87  31.0    2.54    6.4516      0.0
4  546.50  32.0    9.79   95.8441      1.0
             AVGEXP        AGE    INCOME   INCOMESQ
OWNRENT                                            
0.0      203.000667  28.866667  2.818667   8.764329
1.0      361.751111  35.296296  4.467778  24.490293
         AVGEXP   AGE  INCOME  INCOMESQ
OWNRENT                                
0.0      306.03  41.0     6.0      36.0
1.0      548.03  40.0    10.0     100.0


### 3.3.3  轉換Transform

In [4]:
# Read the credit-card dataset and group it by OWNRENT.
data = sm.datasets.ccard.load_pandas().data
grp = data.groupby('OWNRENT')

# Group-wise transform: subtract each group's mean INCOME from its members
# and store the result as a new column.
income_by_owner = grp['INCOME']
data['NEW_INCOME'] = income_by_owner.transform(lambda x: x - x.mean())

shown_columns = ['INCOME', 'NEW_INCOME', 'OWNRENT']
print(data[shown_columns].head())

   INCOME  NEW_INCOME  OWNRENT
0    4.52    0.052222      1.0
1    2.42   -0.398667      0.0
2    4.50    0.032222      1.0
3    2.54   -0.278667      0.0
4    9.79    5.322222      1.0


### 3.3.4  過濾Filter

In [5]:
data = sm.datasets.ccard.load_pandas().data
grp = data.groupby('OWNRENT')

# Drop every group whose mean INCOME is below 3; keep the rows of the rest.
kept_rows = grp.filter(lambda df: not (df['INCOME'].mean() < 3))
print(kept_rows.head())

   AVGEXP   AGE  INCOME  INCOMESQ  OWNRENT
0  124.98  38.0    4.52   20.4304      1.0
2   15.00  34.0    4.50   20.2500      1.0
4  546.50  32.0    9.79   95.8441      1.0
7  150.79  29.0    2.37    5.6169      1.0
8  777.82  37.0    3.80   14.4400      1.0


### 3.3.5  應用Apply

In [6]:
# `grp` is the OWNRENT grouping created in the preceding cell.

# Per-column means of each group. df.mean() is called explicitly because
# np.mean(DataFrame) forwards axis=None, which pandas >= 2.0 treats as
# "reduce over both axes" and would collapse each group to one scalar.
print(grp.apply(lambda df: df.mean()))

# Element-wise result per group: same values as the transform example.
print(grp['INCOME'].apply(lambda x: x - x.mean()).head())

# Returning None for a group leaves it out of the combined result, so only
# groups with mean INCOME below 3 appear.
print(grp.apply(lambda df: df if df['INCOME'].mean() < 3 else None).head())

# Same selection, but keep only the first 3 rows of each retained group.
print(grp.apply(lambda df: df.head(3) if df['INCOME'].mean() < 3 else None).head())

             AVGEXP        AGE    INCOME   INCOMESQ  OWNRENT
OWNRENT                                                     
0.0      203.000667  28.866667  2.818667   8.764329      0.0
1.0      361.751111  35.296296  4.467778  24.490293      1.0
0    0.052222
1   -0.398667
2    0.032222
3   -0.278667
4    5.322222
Name: INCOME, dtype: float64
           AVGEXP   AGE  INCOME  INCOMESQ  OWNRENT
OWNRENT                                           
0.0     1    9.85  33.0    2.42    5.8564      0.0
        3  137.87  31.0    2.54    6.4516      0.0
        5   92.00  23.0    2.50    6.2500      0.0
        6   40.83  28.0    3.96   15.6816      0.0
        9   52.58  28.0    3.20   10.2400      0.0
           AVGEXP   AGE  INCOME  INCOMESQ  OWNRENT
OWNRENT                                           
0.0     1    9.85  33.0    2.42    5.8564      0.0
        3  137.87  31.0    2.54    6.4516      0.0
        5   92.00  23.0    2.50    6.2500      0.0
