# Category

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

import pandas as pd
import numpy as np
import xarray as xr

def show_component(cmp):
    """Format ``cmp`` as text in which every line is preceded by a newline and a tab.

    The returned string always begins with ``'\\n\\t'``, and every newline
    inside the formatted text is followed by a tab as well, so the value can
    be appended directly after a label in a ``print`` call.
    """
    lines = '{}'.format(cmp).split('\n')
    return ''.join('\n\t' + line for line in lines)

def show_df(df):
    """Display ``df`` as an HTML table wrapped in a div with a 55px left margin.

    IPython is imported inside the function, so it is only needed when the
    function is actually called.
    """
    from IPython.display import HTML, display
    table = df.to_html()
    display(HTML('<div style="margin-left:55px">{}</div>'.format(table)))

## 创建分类对象

### 创建具备类别的列集

#### 创建 Series 对象时指定为分类类型

In [None]:
# Build the Series as categorical from the start by passing dtype='category'.
s = pd.Series(['a', 'b', 'c', 'a'], dtype='category')
print('* series is: {}'.format(show_component(s)))

#### 分类对象

- 根据数据自动分类

In [None]:
# No `categories` argument is given, so the categories are derived from the values.
c = pd.Categorical(['a', 'b', 'c', 'a'])
print('* category is: {}'.format(show_component(c)))

- 指定分类
    - `ordered` 参数定义分类是否有序（即令类别列表中的先后顺序作为类别的大小顺序）

In [None]:
# Note that 'd' appears in the data but is not listed in `category` below.
data = ['a', 'b', 'c', 'a', 'b', 'c', 'd', 'b', 'c', 'a']
# With ordered=True, this list order is the category order: a < c < b.
category = ['a', 'c', 'b']

# Explicit categories plus ordered=True. Per the pandas Categorical docs,
# values that are not in `categories` are replaced by NaN.
c = pd.Categorical(data, categories=category, ordered=True)
print('* category is: {}'.format(show_component(c)))

- 创建列集

In [None]:
# Wrap the Categorical `c` created in the preceding cell in a Series.
s = pd.Series(c)
print('* series is: {}'.format(show_component(s)))

### 在 DataFrame 中包含分类列

- 即在`DataFrame`中包含`dtype`为`category`的列或类型为`Categorical`的列

#### DataFrame 包含不同类型列时产生“描述”的差异

In [None]:
# Compare what describe() reports as columns of different types are added to
# the same frame: categorical only, then plus strings, then plus floats.
rows = 6
categories = ['test', 'train']

# Column 'A' repeats the two categories rows//2 times, so `rows` must be even
# for its length to match the index. The index labels are 'a', 'b', ...
# (0x61 is the code point of 'a').
df = pd.DataFrame({
    'A': pd.Categorical(np.tile(categories, reps=rows//2), categories=categories, ordered=True)
}, index=[chr(0x61 + n) for n in range(rows)])

# Stage 1: the frame holds only the categorical column 'A'.
print('* when data frame “df” is:')
show_df(df)

# Frame-level summary.
summary = df.describe()
print('  then "df.describe()" is: {}'.format(show_component(summary)))

# Summary of the single column, for comparison with the frame-level one.
summary = df['A'].describe()
print('\n  and "df[\'A\'].describe()" is: {}'.format(show_component(summary)))

# Stage 2: add a column of plain strings (one value per row, six in total).
df['B'] = ['a', 'b', 'a', 'b', 'c', 'a']
print('\n* when data frame “df” is:')
show_df(df)

summary = df.describe()
print('  then "df.describe()" is: {}'.format(show_component(summary)))

summary = df['B'].describe()
print('\n  and "df[\'B\'].describe()" is: {}'.format(show_component(summary)))

# Stage 3: add a numeric column of uniform random floats drawn from [0, 1).
# Per the pandas describe() docs, once a frame has numeric columns the default
# summary covers only those, so 'A' and 'B' are expected to drop out of it.
df['C'] = np.random.uniform(low=0, high=1, size=rows)
print('\n* when data frame “df” is:')
show_df(df)

summary = df.describe()
print('  then "df.describe()" is: {}'.format(show_component(summary)))

summary = df['C'].describe()
print('\n  and "df[\'C\'].describe()" is: {}'.format(show_component(summary)))

### 利用分类约束数据

In [None]:
# A small frame whose column 'B' is categorical; its categories are derived
# from the values 'M' and 'F'.
df = pd.DataFrame({
    'A': ['Alvin', 'Lily', 'Lucy'],
    'B': pd.Categorical(['M', 'F', 'F'])
})

# Unused draft of rows to add, kept for reference. Its second row uses 'X',
# which is not one of the values column 'B' was built from.
# rows = pd.DataFrame([
#     ['Tom', 'M'],
#     ['Author', 'X']
# ], columns=['A', 'B'])

# Label 3 is not in the index (0..2), so this assignment adds a new row.
# 'M' is already one of the values used to build column 'B'.
df.loc[3] = ['Tom', 'M']
# Bare expression: the notebook shows the resulting frame as the cell output.
df