# Pandas 基础数据结构

- Pandas 处理以下三个数据结构
    - 系列 (Series)
    - 数据帧 (DataFrame)
    - 数据集 (Dataset，由 xarray 提供)

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

import pandas as pd
import numpy as np
import xarray as xr

def show_component(cmp):
    """Return the text form of *cmp* as an indented block.

    The result starts on a fresh line and every line of the rendered
    object is prefixed with a tab, so multi-line reprs line up under the
    caption they are appended to.
    """
    rendered_lines = '{}'.format(cmp).split('\n')
    # The leading empty entry yields the initial newline + tab.
    return '\n\t'.join([''] + rendered_lines)

def show_df(df):
    """Display *df* in the notebook as an HTML table indented by 55px."""
    # Imported inside the function, so IPython is only required when a
    # frame is actually displayed.
    from IPython.display import HTML, display

    table_markup = df.to_html()
    wrapped = '<div style="margin-left:55px">{}</div>'.format(table_markup)
    display(HTML(wrapped))

## Series

-  1D 数据结构，表示一个数据列

In [None]:
# Six sample values; np.nan marks a missing observation.
data = [1, 3, 5, np.nan, 6, 8]

# Label the entries 1..len(data) instead of using the default 0-based index.
labels = list(range(1, len(data) + 1))
series = pd.Series(data, index=labels)

# Report the series itself, its raw values, its summary statistics and dtype.
series_reports = (
    ('* Series is: {}', series),
    ('\n  then values are: {}', series.values),
    ('\n  and summary is: {}', series.describe()),
    ('\n  and dtypes of series is: {}', series.dtypes),
)
for template, component in series_reports:
    print(template.format(show_component(component)))

## DataFrame

- 2D 数据结构，表示一个表格

### DataFrame 基本

- 设置 DataFrame 的列名和索引

In [None]:
# Shape of the demo table: 7 rows and four columns labelled A..D.
rows = 7
columns = ['A', 'B', 'C', 'D']
# One daily timestamp per row, starting on 2019-01-01; used as the row index.
indecs = pd.date_range(start='20190101', periods=rows)

- 准备一个矩阵

In [None]:
# Fill a (rows x len(columns)) matrix with the consecutive integers 1..N.
column_count = len(columns)
data = np.arange(1, rows * column_count + 1).reshape(rows, column_count)
print('* data is: {} (shape={})'.format(show_component(data), data.shape))

- 生成 DataFrame 对象

In [None]:
# Combine the matrix, the date index and the column labels into one frame.
frame_spec = {'data': data, 'index': indecs, 'columns': columns}
df = pd.DataFrame(**frame_spec)

print('* data frame "df" is:')
show_df(df)

- 查看 DataFrame 属性

In [None]:
# Inspect the frame's metadata: per-column dtypes, row index and column labels.
attribute_reports = (
    ('* dtypes of "df" is: {}', df.dtypes),
    ('\n* index of "df" is: {}', df.index),
    ('\n* columns of "df" is: {}', df.columns),
)
for template, attribute in attribute_reports:
    print(template.format(show_component(attribute)))

# The underlying array is reported together with its shape.
raw_values = df.values
print('\n* values of "df" is: {} (shape={})'.format(show_component(raw_values), raw_values.shape))

- DataFrame 内容概览

In [None]:
# Quick overviews of "df": summary statistics, first rows, last rows and
# the transposed table. Each caption is printed right before its table.
overviews = (
    ('* describe of "df" is: ', df.describe()),
    ('\n* head of "df" is:', df.head()),
    ('\n* tail of "df" is:', df.tail()),
    ('\n* transpos of "df" is: ', df.transpose()),
)
for caption, frame in overviews:
    print(caption)
    show_df(frame)

### DataFrame 操作

#### 按列创建 DataFrame 对象

In [None]:
rows = 6

# Row labels 'a', 'b', ... - one per row. They are attached to column "C".
row_labels = [chr(0x61 + n) for n in range(rows)]

# Build a frame column by column, each column with its own dtype.
df = pd.DataFrame({
    # Random floats scaled from [0, 1) to [1, 10).
    'A': np.random.rand(rows) * (10 - 1) + 1,
    # Consecutive days starting on 2019-01-01.
    'B': pd.date_range('20190101', periods=rows),
    # 0.1, 0.2, ... derived from integers so the length always equals
    # `rows`; a float-step np.arange has a rounding-dependent length.
    'C': pd.Series((np.arange(rows) + 1) / 10.0, index=row_labels, dtype='float32'),
    'D': np.array([3] * rows, dtype='int32'),
    # Alternating 'test'/'train' labels; yields 2 * (rows // 2) entries,
    # so `rows` must be even.
    'E': pd.Categorical(np.tile(['test', 'train'], reps=rows//2)),
    # A scalar is repeated for every row.
    'F': 'foo'
})

print('* when data frame “df” is:')
show_df(df)

#### 列操作

- 列集合

In [None]:
# Gather the column index, its labels as a plain array, and the position
# of column "C" before reporting them.
cols = df.columns
col_names = cols.values
col_index = cols.get_loc('C')

print('* columns of "df" are: {}'.format(show_component(cols)))

print('  and column names are: {}'.format(show_component(col_names)))

print('  and index of column "C" is: {}'.format(col_index))

- 根据列名获取该列数据

In [None]:
# Select column "C" by name; the result is a Series keyed by the row labels.
column_c = df['C']
print('* data of column "C" are: {}'.format(show_component(column_c)))
print('\n  and column values are: {}'.format(show_component(column_c.values)))

# Label-based access to a single cell of the column.
data_c_d = column_c['d']
print('\n  and data of row "d" of column is: {:.2f}'.format(data_c_d))

# Position-based access goes through .iloc: plain `column_c[n]` on a
# label-indexed Series relies on a deprecated integer-position fallback.
# Position 2 is read so the value matches the "row 2" reported below.
data_c_2 = column_c.iloc[2]
print('  and data of row 2 of column is: {:.2f}'.format(data_c_2))

- 添加列

In [None]:
# Work on a copy so the shared frame "df" stays untouched.
df_ = df.copy()

# A new column needs one value per ROW, so size it by the row count
# (not by the number of existing columns).
row_count = len(df_.index)

# Values 'A', 'B', ... keyed by the row labels 'a', 'b', ...
series = pd.Series(data=[chr(0x41 + n) for n in range(row_count)],
                   index=[chr(0x61 + n) for n in range(row_count)])
print('* when column data is: {}'.format(show_component(series)))

# Assigning a Series aligns its values to the frame by index label.
df_['G'] = series

print('\n  after column was add as "G", dataframe "df" is:')
show_df(df_)

- 删除列

In [None]:
# Work on a copy so the shared frame "df" stays untouched.
df_ = df.copy()

# pop() removes the column from the frame and hands it back as a Series.
del_col = df_.pop('B')
print('* removed column "B" is: {}'.format(show_component(del_col)))

print('\n  then dataframe "df" is:')
show_df(df_)

# Remove column "D" as well, this time without keeping the removed data.
df_ = df_.drop(columns='D')
print('\n* after removed column "D", dataframe "df" is:')
show_df(df_)

#### 行操作

- 根据行索引获取数据

In [None]:
# Look up the row labelled "b"; the result is a Series keyed by column name.
row_label = 'b'
row_b = df.loc[row_label]
row_b_values = row_b.values
print('* data of row "b" are: {}'.format(show_component(row_b)))
print('\n  and row values are: {}'.format(show_component(row_b_values)))

# A single cell is then reachable through the column name.
column_b = row_b['B']
print('\n  and column "B" of row is: {}'.format(column_b))

- 根据行下标获取数据

In [None]:
# Read from the shared frame "df" rather than the scratch copy "df_",
# whose contents depend on which other cells were run before this one.
row_3 = df.iloc[3]
print('* data of row 3 are: {}'.format(show_component(row_3)))
print('\n  and row values are: {}'.format(show_component(row_3.values)))

# The row is a Series keyed by column name, so positional access must go
# through .iloc; plain `row_3[2]` relies on a deprecated fallback.
column_2 = row_3.iloc[2]
print('\n  and column 2 of row is: {}'.format(column_2))

- 添加行

In [None]:
# Work on a copy so the shared frame "df" stays untouched.
df_ = df.copy()

# Two extra rows with the same columns as "df"; the Series in column "F"
# carries the new row labels 'g' and 'h'.
rows = 2
new_rows = pd.DataFrame({
    'A': np.random.rand(rows) * (10 - 1) + 1,
    'B': pd.date_range('20190107', periods=rows),
    'C': [0.7, 0.8],
    'D': np.array([3] * rows, dtype='int32'),
    'E': ['test', 'train'],
    'F': pd.Series(['foo'] * rows, index=['g', 'h'])
})
print('* when new row is: {}'.format(show_component(new_rows)))

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames
# row-wise and also works on older pandas releases.
df_ = pd.concat([df_, new_rows])
print('\n  after append new row, data frame "df" is:')
show_df(df_)

- 以 Series 结构添加一行

In [None]:
# Work on a copy so the shared frame "df" stays untouched.
df_ = df.copy()

# One row expressed as a Series: keys are the column names and the Series
# name ('g') becomes the label of the new row.
new_row = pd.Series(data={
    'A': np.random.rand() * (10 - 1) + 1,
    'B': pd.Timestamp('20190107'),
    'C': 0.7,
    'D': 3,
    'E': 'test',
    'F': 'foo'
}, name='g')
print('\n* when new row is: {}'.format(show_component(new_row)))

# DataFrame.append was removed in pandas 2.0. Turn the Series into a
# one-row frame (to_frame().T), let pandas re-infer the column dtypes
# from the object-typed cells, then stack it under the existing rows.
new_row_frame = new_row.to_frame().T.infer_objects()
df_ = pd.concat([df_, new_row_frame])

print('  after append new row, data frame "df" is:')
show_df(df_)

- 删除行

In [None]:
# Work on a copy; drop() returns a new frame and leaves "df_" unchanged,
# so every example below starts from the full set of rows.
df_ = df.copy()

# Drop a single row by label.
df_droped = df_.drop(index='a')
print('* after drop row "a", then the "df" is')
show_df(df_droped)

# Drop several rows by label.
df_droped = df_.drop(index=['a', 'b', 'c'])
print('* after drop rows ["a", "b", "c"], then the "df" is')
show_df(df_droped)

# Drop rows by position: translate the positions into labels by indexing
# the row index with the position list.
num_to_delete = list(range(1, 5))
df_droped = df_.drop(index=df_.index[num_to_delete])
print('* after drop row {}, then the "df" is'.format(num_to_delete))
show_df(df_droped)

#### 行列操作

- 获取行列数据 (根据索引)

In [None]:
# Label-based selection with .loc[row_selector, column_selector].
# Selectors are named first so each lookup reads as "which rows, which columns".
every = slice(None)  # equivalent to a bare ":" selector

# Single label on one axis -> Series.
row = df.loc['a', every]
print('* row of index "a" is: {}'.format(show_component(row)))

column = df.loc[every, 'B']
print('\n* column of name "B" is: {}'.format(show_component(column)))

# List of labels -> DataFrame restricted to exactly those labels.
picked_rows = ['a', 'b', 'd']
rows = df.loc[picked_rows, every]
print('\n* rows of index ["a", "b", "d"] is: {}'.format(show_component(rows)))

picked_columns = ['A', 'B', 'D']
columns = df.loc[every, picked_columns]
print('\n* columns of name ["A", "B", "D"] is: {}'.format(show_component(columns)))

# Label slices.
row_span = slice('b', 'e')
rows = df.loc[row_span, every]
print('\n* rows of index "b":"e" is: {}'.format(show_component(rows)))

column_span = slice('B', 'E')
columns = df.loc[every, column_span]
print('\n* columns of name "B":"E" is: {}'.format(show_component(columns)))

# Both axes at once: label lists, then label slices.
datas = df.loc[['a', 'd'], picked_columns]
print('\n* rows ["a", "d"] and columns ["A", "B", "D"] is: {}'.format(show_component(datas)))

datas = df.loc[slice('c', 'e'), slice('A', 'D')]
print('\n* rows "c":"e" and columns "A":"D" is: {}'.format(show_component(datas)))

- 获取行列数据 (根据行列号)

In [None]:
# Position-based selection with .iloc[row_selector, column_selector]:
# single integers, integer lists and half-open integer slices.
row = df.iloc[0, :]
print('* row of num 0 is: {}'.format(show_component(row)))

column = df.iloc[:, 1]
print('\n* column of num 1 is: {}'.format(show_component(column)))

rows = df.iloc[[1, 2, 4], :]
print('\n* rows of num [1, 2, 4] is: {}'.format(show_component(rows)))

columns = df.iloc[:, [1, 2, 4]]
print('\n* columns of num [1, 2, 4] is: {}'.format(show_component(columns)))

# The captions use the same slice notation as the code.
rows = df.iloc[2:5, :]
print('\n* rows of num 2:5 is: {}'.format(show_component(rows)))

columns = df.iloc[:, 2:5]
print('\n* columns of num 2:5 is: {}'.format(show_component(columns)))

datas = df.iloc[[2, 5], [1, 2, 4]]
print('\n* rows [2, 5] and columns [1, 2, 4] is: {}'.format(show_component(datas)))

datas = df.iloc[2:5, 1:4]
print('\n* rows 2:5 and columns 1:4 is: {}'.format(show_component(datas)))

#### 遍历

- 遍历列名

In [None]:
# List every column label together with its position.
print('* column names are: ')
for position, label in enumerate(df.columns):
    print('\tcolumn {} is "{}"'.format(position, label))

- 按列遍历数据

In [None]:
# Walk the frame column by column, then cell by cell within each column.
# items() replaces iteritems(), which was removed in pandas 2.0.
print('* data of "df" is:')
for column, column_data in df.items():
    print('\tcolumn "{}"'.format(column))
    # column_data is a Series: each item is (row label, cell value).
    # A dedicated name keeps the notebook-level `rows` from being clobbered.
    for index, value in column_data.items():
        print('\t\t{} => {}'.format(index, value))

- 按行遍历数据 1

In [None]:
# Walk the frame row by row; iterrows() yields (row label, row Series).
print('* data of "df" is:')
for row_index, row in df.iterrows():
    print('\trow "{}"'.format(row_index))
    # items() replaces Series.iteritems(), which was removed in pandas 2.0.
    for column, data in row.items():
        print('\t\t{} => {}'.format(column, data))

- 按行遍历数据 2

In [None]:
# Walk the frame row by row with itertuples(): each tuple starts with the
# row label, followed by the cell values in column order.
print('* data of "df" is:')
for record in df.itertuples():
    print('\trow "{}"'.format(record[0]))
    # Pair each column label with the matching cell of the tuple.
    for label, cell in zip(df.columns, record[1:]):
        print('\t\t{} => {}'.format(label, cell))

#### 排序

- 按索引排序

In [None]:
# Sort the rows by their index labels: first descending ...
descending = False
df_sorted = df.sort_index(ascending=descending)
print('* after sort index by desc, the "df" is: ')
show_df(df_sorted)

# ... then sort that result back into ascending order.
df_sorted = df_sorted.sort_index(ascending=True)
print('* after sort index by asc, the "df" is: ')
show_df(df_sorted)

- 对列名进行排序

In [None]:
# Reorder the columns by their names: first descending ...
df_sorted = df.sort_index(axis='columns', ascending=False)
print('* after sort column name by desc, the "df" is: ')
show_df(df_sorted)

# ... then sort that result back into ascending column order.
df_sorted = df_sorted.sort_index(axis='columns', ascending=True)
print('* after sort column name by asc, the "df" is: ')
show_df(df_sorted)

- 按值排序

In [None]:
# Sort rows by the values of one column, descending.
df_sorted = df.sort_values(by='A', ascending=False)
print('* after sort values by column "A" desc, the "df" is: ')
show_df(df_sorted)

# Sort by several columns, each with its own direction.
df_sorted = df.sort_values(by=['E', 'B'], ascending=[True, False])
print('* after sort values by column "E" asc and "B" desc, the "df" is: ')
show_df(df_sorted)

# Same sort key with each of the supported sorting algorithms.
for kind in ('mergesort', 'heapsort', 'quicksort'):
    df_sorted = df.sort_values(by='A', kind=kind)
    print('* after sort values by column "A" with "{}", the "df" is: '.format(kind))
    show_df(df_sorted)

#### 分组

In [None]:
# Work on a copy so the shared frame "df" stays untouched.
df_ = df.copy()

# Drop the fractional part of column "A" so that equal integer parts can
# later fall into the same group.
truncated_a = np.trunc(df_['A'])
df_['A'] = truncated_a
print('* the new dataframe "df" is: ')
show_df(df_)

- 按列分组

In [None]:
# Group the rows of "df_" by the value of column "E".
groupby = df_.groupby('E')
print('* group by column "E", the result is: {}'.format(show_component(groupby)))

# `groups` maps each group key to the row labels that belong to it.
groups = groupby.groups
print('  and the groups is: {}'.format(show_component(groups)))

group_test = groups['test']
print('  and index of group "test" are: {}'.format(show_component(group_test.values)))

group_train = groups['train']
print('  and index of group "train" are: {}'.format(show_component(group_train.values)))

# Captions of the per-group tables are indented by seven spaces.
indent = ' ' * 7

# Iterating a groupby yields (group key, sub-frame) pairs.
print('\n* the detail of each groups are:')
for group_key, group_frame in groupby:
    print('{}the group "{}"'.format(indent, group_key))
    show_df(group_frame)

# Grouping by two columns.
print('\n* group by column "E" and "A":')
groupby = df_.groupby(['E', 'A'])
for group_key, group_frame in groupby:
    print('{}the group "{}"'.format(indent, group_key))
    show_df(group_frame)

## xArray

- Dataset 表示由多个 DataArray 组成的数据集

In [None]:
# First table: 2 x 5, filled with 1..10.
df1 = pd.DataFrame(data=np.reshape(np.arange(10) + 1, (2, 5)))
print('* when dataframe "df1" is:')
show_df(df1)

# Wrap it as a named DataArray with labelled dimensions "n1" and "n2".
da_one = xr.DataArray(df1,
                      dims=['n1', 'n2'],
                      coords={'n1': ['x', 'y'], 'n2': ['a', 'b', 'c', 'd', 'e']},
                      name='one')
print('  then convert to DataArray "da_one" is: {}'.format(da_one))

# Second table: 3 x 6, filled with 1..18.
df2 = pd.DataFrame(data=np.reshape(np.arange(18) + 1, (3, 6)))
print('\n  and dataframe "df2" is:')
show_df(df2)

# Same dimension names, with one more label on each axis than "da_one".
da_two = xr.DataArray(df2,
                      dims=['n1', 'n2'],
                      coords={'n1': ['x', 'y', 'z'], 'n2': ['a', 'b', 'c', 'd', 'e', 'f']},
                      name='two')
# Report the array that was just built.
print('  then convert to DataArray "da_two" is: {}'.format(da_two))

# Combine both arrays into one Dataset and show it as a flat table.
ds = xr.Dataset(data_vars={'one': da_one, 'two': da_two})
print('\n  then dataset "ds" is: {}'.format(show_component(ds)))
print('  then ds is: ')
show_df(ds.to_dataframe())

In [None]:
# Sort by column "E" descending, then by column "C" ascending within
# rows that share the same "E" value.
sort_columns = ['E', 'C']
sort_ascending = [False, True]
df_sorted = df.sort_values(by=sort_columns, ascending=sort_ascending)

print('* and sorted by "E" and "C" columns of frame is: ')
show_df(df_sorted)