## Categorical data

### 개발 배경과 동기

In [1]:
import numpy as np
import pandas as pd

In [2]:
values = pd.Series(['apple','orange','apple','apple']*2)

values

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
dtype: object

In [3]:
pd.unique(values)

array(['apple', 'orange'], dtype=object)

In [5]:
pd.value_counts(values)

apple     6
orange    2
dtype: int64

In [6]:
values = pd.Series([0,1,0,0]*2)
dim = pd.Series(['apple','orange'])

In [7]:
values, dim

(0    0
 1    1
 2    0
 3    0
 4    0
 5    1
 6    0
 7    0
 dtype: int64,
 0     apple
 1    orange
 dtype: object)

In [8]:
dim.take(values)

0     apple
1    orange
0     apple
0     apple
0     apple
1    orange
0     apple
0     apple
dtype: object

### pandas의 Categorical

In [9]:
fruits = ['apple','orange','apple','apple']*2
N = len(fruits)
df = pd.DataFrame({'fruit':fruits
                  ,'basket_id':np.arange(N)
                  ,'count':np.random.randint(3,15,size=N)
                  ,'weight':np.random.uniform(0,4,size=N)})

In [12]:
df = df[['basket_id', 'fruit', 'count', 'weight']]
df

Unnamed: 0,basket_id,fruit,count,weight
0,0,apple,12,3.541196
1,1,orange,6,2.527259
2,2,apple,3,2.237291
3,3,apple,7,2.166079
4,4,apple,14,0.650534
5,5,orange,10,2.832233
6,6,apple,11,0.546465
7,7,apple,7,1.897097


In [13]:
fruit_car = df.fruit.astype('category') # 범주형으로 바꾸기
fruit_car

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: category
Categories (2, object): [apple, orange]

In [16]:
c = fruit_car.values
type(c)

pandas.core.arrays.categorical.Categorical

In [17]:
c.categories

Index(['apple', 'orange'], dtype='object')

In [18]:
c.codes

array([0, 1, 0, 0, 0, 1, 0, 0], dtype=int8)

In [19]:
df.fruit = df.fruit.astype('category')

In [22]:
my_categories = pd.Categorical(['foo','bar','baz','foo','bar'])
my_categories

[foo, bar, baz, foo, bar]
Categories (3, object): [bar, baz, foo]

In [24]:
categories = ['foo','bar','baz']
codes = [0,1,2,0,0,1]

my_cats_2 = pd.Categorical.from_codes(codes,categories)
my_cats_2

[foo, bar, baz, foo, foo, bar]
Categories (3, object): [foo, bar, baz]

In [25]:
categories = ['foo','baz','bar']
ordered_cat = pd.Categorical.from_codes(codes,categories,ordered=True)
ordered_cat # 카테고리별 순서를 나타내기

[foo, baz, bar, foo, foo, baz]
Categories (3, object): [foo < baz < bar]

In [26]:
my_cats_2.as_ordered() # 순서가 없던 카테고리에 순서 부여하기

[foo, bar, baz, foo, foo, bar]
Categories (3, object): [foo < bar < baz]

### Categorical 연산

In [27]:
np.random.seed(12345)

draws = np.random.randn(1000)

draws[:5]

array([-0.20470766,  0.47894334, -0.51943872, -0.5557303 ,  1.96578057])

In [31]:
bins = pd.qcut(draws,4,labels=['Q1','Q2','Q3','Q4'])
bins

[Q2, Q3, Q2, Q2, Q4, ..., Q3, Q2, Q1, Q3, Q4]
Length: 1000
Categories (4, object): [Q1 < Q2 < Q3 < Q4]

In [32]:
bins.codes

array([1, 2, 1, 1, 3, 3, 2, 2, 3, 3, 3, 0, 2, 2, 3, 3, 0, 1, 3, 1, 1, 2,
       3, 0, 1, 2, 2, 2, 2, 3, 0, 0, 0, 0, 0, 2, 0, 2, 0, 2, 0, 1, 0, 0,
       0, 2, 2, 0, 2, 3, 2, 2, 1, 3, 3, 0, 0, 2, 3, 1, 3, 2, 2, 3, 3, 0,
       1, 0, 1, 0, 0, 3, 3, 3, 3, 1, 1, 0, 0, 2, 2, 0, 3, 2, 3, 3, 0, 3,
       1, 3, 2, 3, 1, 3, 2, 3, 2, 0, 2, 2, 0, 1, 1, 0, 1, 1, 3, 3, 1, 3,
       1, 2, 3, 0, 0, 1, 2, 1, 3, 0, 0, 0, 2, 3, 1, 1, 2, 1, 1, 1, 0, 0,
       3, 3, 2, 2, 3, 1, 1, 2, 0, 3, 2, 1, 2, 1, 1, 1, 3, 3, 0, 3, 1, 0,
       2, 0, 1, 3, 3, 1, 1, 1, 0, 3, 0, 3, 1, 3, 3, 3, 3, 1, 3, 0, 0, 1,
       0, 0, 2, 1, 1, 1, 0, 3, 3, 3, 0, 1, 1, 1, 0, 0, 3, 2, 3, 3, 0, 0,
       3, 2, 3, 1, 2, 1, 0, 0, 0, 1, 0, 1, 3, 2, 1, 3, 1, 3, 0, 2, 3, 1,
       1, 2, 0, 0, 2, 3, 3, 2, 1, 2, 0, 0, 0, 3, 1, 0, 0, 2, 2, 2, 0, 1,
       3, 1, 3, 0, 2, 1, 3, 1, 3, 0, 1, 0, 2, 0, 1, 1, 0, 2, 0, 3, 1, 3,
       0, 0, 1, 2, 1, 2, 2, 1, 0, 2, 2, 1, 0, 0, 1, 1, 1, 1, 2, 0, 3, 0,
       0, 3, 0, 2, 1, 2, 0, 2, 2, 1, 3, 3, 1, 3, 3,

In [33]:
bins = pd.Series(bins, name='quartile')

result =(pd.Series(draws).groupby(bins).agg(['count','min','max']).reset_index())

In [34]:
result

Unnamed: 0,quartile,count,min,max
0,Q1,250,-2.949343,-0.685484
1,Q2,250,-0.683066,-0.010115
2,Q3,250,-0.010032,0.628894
3,Q4,250,0.634238,3.927528


In [35]:
result['quartile']

0    Q1
1    Q2
2    Q3
3    Q4
Name: quartile, dtype: category
Categories (4, object): [Q1 < Q2 < Q3 < Q4]

In [37]:
N = 10000000
draws = pd.Series(np.random.randn(N))
labels = pd.Series(['foo','bar','baz','qux']*(N//4))

In [38]:
categories = labels.astype('category')

In [39]:
labels.memory_usage()

80000128

In [40]:
categories.memory_usage()

10000320

In [42]:
%time _ = labels.astype('category')

Wall time: 452 ms


### Categotical 메서드

In [45]:
s = pd.Series(['a','b','c','d']*2)

cat_s = s.astype('category')
cat_s

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (4, object): [a, b, c, d]

In [46]:
cat_s.cat.codes

0    0
1    1
2    2
3    3
4    0
5    1
6    2
7    3
dtype: int8

In [124]:
cat_s.cat.categories

Index(['a', 'b', 'c', 'd'], dtype='object')

In [54]:
actual_category = ['a','b','c','d','e']
cat_s2 = cat_s.cat.set_categories(actual_category) # 새로운 값이 있는경우 세팅 e가 추가됨
cat_s2.value_counts()

d    2
c    2
b    2
a    2
e    0
dtype: int64

In [87]:
cat_s[cat_s.isin(['a'])].cat.codes

0    0
4    0
dtype: int8


In [97]:
cat = pd.Categorical(['a','b','c'])
dir(cat)

['T',
 '__annotations__',
 '__array__',
 '__array_priority__',
 '__array_ufunc__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_accessors',
 '_can_hold_na',
 '_concat_same_type',
 '_constructor',
 '_deprecations',
 '_dir_additions',
 '_dir_deletions',
 '_dtype',
 '_formatter',
 '_from_factorized',
 '_from_inferred_categories',
 '_from_sequence',
 '_from_sequence_of_strings',
 '_get_codes',
 '_get_repr',
 '_internal_get_values',
 '_maybe_coerce_indexer',
 '_ndarray_values',
 '_reduce',
 '_repr_categories',
 '_repr_categories_info',
 '_repr_footer',
 '_reset_cache',
 '_reverse

In [122]:
pd.Categorical(['b','b','a'],categories=cat.categories).codes

array([1, 1, 0], dtype=int8)

In [127]:
cat_s3 = cat_s[cat_s.isin(['a','b'])]
cat_s3

0    a
1    b
4    a
5    b
dtype: category
Categories (4, object): [a, b, c, d]

In [148]:
a = pd.Series(['a','c'])
pd.get_dummies(a,columns=['a','b','c'])

Unnamed: 0,a,c
0,1,0
1,0,1


In [138]:
list(cat.categories)

['a', 'b', 'c']

In [154]:
pd.get_dummies(cat_s)[['a','b','c','d']]

Unnamed: 0,a,b,c,d
0,1,0,0,0
1,0,1,0,0
2,0,0,1,0
3,0,0,0,1
4,1,0,0,0
5,0,1,0,0
6,0,0,1,0
7,0,0,0,1


#### 중요!!

In [155]:
cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c'])

In [158]:
pd.get_dummies(cat)

Unnamed: 0,a,b,c
0,1,0,0
1,1,0,0
2,0,0,1


### 모델링을 위한 더미값 생성하기

In [160]:
cat_s = pd.Series(['a','b','c','d']*2, dtype='category')

In [161]:
pd.get_dummies(cat_s)

Unnamed: 0,a,b,c,d
0,1,0,0,0
1,0,1,0,0
2,0,0,1,0
3,0,0,0,1
4,1,0,0,0
5,0,1,0,0
6,0,0,1,0
7,0,0,0,1


## 고급 Groupby 사용

### 그룹변환과 Gruopby 객체 풀어내기

In [163]:
df = pd.DataFrame({'key':['a','b','c']*4
                  ,'value':np.arange(12)})
df

Unnamed: 0,key,value
0,a,0
1,b,1
2,c,2
3,a,3
4,b,4
5,c,5
6,a,6
7,b,7
8,c,8
9,a,9


In [166]:
g = df.groupby('key').value
g.mean()

key
a    4.5
b    5.5
c    6.5
Name: value, dtype: float64

In [167]:
g.transform(lambda x:x.mean()) # x에 그룹별로 인식된다.

0     4.5
1     5.5
2     6.5
3     4.5
4     5.5
5     6.5
6     4.5
7     5.5
8     6.5
9     4.5
10    5.5
11    6.5
Name: value, dtype: float64

In [168]:
g.transform('mean')

0     4.5
1     5.5
2     6.5
3     4.5
4     5.5
5     6.5
6     4.5
7     5.5
8     6.5
9     4.5
10    5.5
11    6.5
Name: value, dtype: float64

In [169]:
g.transform(lambda x:x*2)

0      0
1      2
2      4
3      6
4      8
5     10
6     12
7     14
8     16
9     18
10    20
11    22
Name: value, dtype: int32

In [170]:
g.transform(lambda x:x.rank(ascending=False)) # 그룹별 등수

0     4
1     4
2     4
3     3
4     3
5     3
6     2
7     2
8     2
9     1
10    1
11    1
Name: value, dtype: int32

In [171]:
def normalize(x):
    return (x-x.mean())/x.std() # 표준화

g.transform(normalize)

0    -1.161895
1    -1.161895
2    -1.161895
3    -0.387298
4    -0.387298
5    -0.387298
6     0.387298
7     0.387298
8     0.387298
9     1.161895
10    1.161895
11    1.161895
Name: value, dtype: float64

In [172]:
g.apply(normalize)

0    -1.161895
1    -1.161895
2    -1.161895
3    -0.387298
4    -0.387298
5    -0.387298
6     0.387298
7     0.387298
8     0.387298
9     1.161895
10    1.161895
11    1.161895
Name: value, dtype: float64

In [173]:
g.transform('mean')

0     4.5
1     5.5
2     6.5
3     4.5
4     5.5
5     6.5
6     4.5
7     5.5
8     6.5
9     4.5
10    5.5
11    6.5
Name: value, dtype: float64

In [174]:
normalized = (df['value']-g.transform('mean'))/g.transform('std')
normalized

0    -1.161895
1    -1.161895
2    -1.161895
3    -0.387298
4    -0.387298
5    -0.387298
6     0.387298
7     0.387298
8     0.387298
9     1.161895
10    1.161895
11    1.161895
Name: value, dtype: float64

### 시계열 그룹 리샘플링

In [175]:
N =15
times = pd.date_range('2017-05-20 00:00:00',freq='1min',periods=N)
df = pd.DataFrame({'time':times,'value':np.arange(N)})
df

Unnamed: 0,time,value
0,2017-05-20 00:00:00,0
1,2017-05-20 00:01:00,1
2,2017-05-20 00:02:00,2
3,2017-05-20 00:03:00,3
4,2017-05-20 00:04:00,4
5,2017-05-20 00:05:00,5
6,2017-05-20 00:06:00,6
7,2017-05-20 00:07:00,7
8,2017-05-20 00:08:00,8
9,2017-05-20 00:09:00,9


In [179]:
df.set_index('time').resample('5min').sum()

Unnamed: 0_level_0,value
time,Unnamed: 1_level_1
2017-05-20 00:00:00,10
2017-05-20 00:05:00,35
2017-05-20 00:10:00,60


In [183]:
df2 = pd.DataFrame({'time':times.repeat(3)
                   ,'key':np.tile(['a','b','c'],N)
                   ,'value':np.arange(N*3)})
df2

Unnamed: 0,time,key,value
0,2017-05-20 00:00:00,a,0
1,2017-05-20 00:00:00,b,1
2,2017-05-20 00:00:00,c,2
3,2017-05-20 00:01:00,a,3
4,2017-05-20 00:01:00,b,4
5,2017-05-20 00:01:00,c,5
6,2017-05-20 00:02:00,a,6
7,2017-05-20 00:02:00,b,7
8,2017-05-20 00:02:00,c,8
9,2017-05-20 00:03:00,a,9


In [194]:
resampled = df2.set_index('time').groupby('key').resample('5min').sum()
resampled

Unnamed: 0_level_0,Unnamed: 1_level_0,value
key,time,Unnamed: 2_level_1
a,2017-05-20 00:00:00,30
a,2017-05-20 00:05:00,105
a,2017-05-20 00:10:00,180
b,2017-05-20 00:00:00,35
b,2017-05-20 00:05:00,110
b,2017-05-20 00:10:00,185
c,2017-05-20 00:00:00,40
c,2017-05-20 00:05:00,115
c,2017-05-20 00:10:00,190


In [196]:
resampled.reset_index()

Unnamed: 0,key,time,value
0,a,2017-05-20 00:00:00,30
1,a,2017-05-20 00:05:00,105
2,a,2017-05-20 00:10:00,180
3,b,2017-05-20 00:00:00,35
4,b,2017-05-20 00:05:00,110
5,b,2017-05-20 00:10:00,185
6,c,2017-05-20 00:00:00,40
7,c,2017-05-20 00:05:00,115
8,c,2017-05-20 00:10:00,190


## 메서드 연결 기법

In [203]:
from sklearn import datasets
iris = datasets.load_iris()

In [211]:
iris 

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [214]:
df = pd.DataFrame(iris.data, columns=['sepal_length' ,'sepal_width' ,'petal_length' ,'petal_width'])
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [215]:
df2 = df.assign(taget = 'iris')
df2

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,taget
0,5.1,3.5,1.4,0.2,iris
1,4.9,3.0,1.4,0.2,iris
2,4.7,3.2,1.3,0.2,iris
3,4.6,3.1,1.5,0.2,iris
4,5.0,3.6,1.4,0.2,iris
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,iris
146,6.3,2.5,5.0,1.9,iris
147,6.5,3.0,5.2,2.0,iris
148,6.2,3.4,5.4,2.3,iris


In [216]:
result = (df2.assign(demaned=df2.sepal_length - df2.sepal_length.mean())
         )

In [217]:
result

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,taget,demaned
0,5.1,3.5,1.4,0.2,iris,-0.743333
1,4.9,3.0,1.4,0.2,iris,-0.943333
2,4.7,3.2,1.3,0.2,iris,-1.143333
3,4.6,3.1,1.5,0.2,iris,-1.243333
4,5.0,3.6,1.4,0.2,iris,-0.843333
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,iris,0.856667
146,6.3,2.5,5.0,1.9,iris,0.456667
147,6.5,3.0,5.2,2.0,iris,0.656667
148,6.2,3.4,5.4,2.3,iris,0.356667


In [219]:
df3 = df[lambda x: x.petal_width<1]
df3

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
5,5.4,3.9,1.7,0.4
6,4.6,3.4,1.4,0.3
7,5.0,3.4,1.5,0.2
8,4.4,2.9,1.4,0.2
9,4.9,3.1,1.5,0.1
