# Dataframe

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

import pandas as pd
import numpy as np
import xarray as xr

def show_component(cmp):
    """Render *cmp* as text that starts on a new line, indented by one tab.

    The value is formatted with ``str.format``, prefixed with a newline, and
    every line break is followed by a tab, so a multi-line representation
    reads as an indented block underneath a caption.
    """
    lines = '\n{}'.format(cmp).split('\n')
    return '\n\t'.join(lines)

def show_df(df):
    """Display *df* as an HTML table wrapped in a div with a 55px left margin."""
    # Imported inside the function, so IPython is only needed when a frame is
    # actually displayed.
    from IPython.display import HTML, display

    table = df.to_html()
    wrapped = '<div style="margin-left:55px">{}</div>'.format(table)
    display(HTML(wrapped))

def show_dict(d):
    """Format the mapping *d* as a brace-delimited block, one entry per line.

    The first entry is padded with one space (it follows the opening brace),
    every later entry with two, so the keys line up. The result is passed
    through ``show_component`` for indentation.
    """
    entries = [
        '{}{}: {}'.format(' ' if pos == 0 else '  ', key, value)
        for pos, (key, value) in enumerate(d.items())
    ]
    body = ',\n'.join(entries)
    return show_component('{' + body + ' }')

def show_list(l):
    """Format the iterable *l* as a bracket-delimited block, one item per line.

    The first item is padded with one space (it follows the opening bracket),
    every later item with two, so the items line up. The result is passed
    through ``show_component`` for indentation.
    """
    items = [
        '{}{}'.format(' ' if pos == 0 else '  ', item)
        for pos, item in enumerate(l)
    ]
    body = ',\n'.join(items)
    return show_component('[' + body + ' ]')

## 创建

In [None]:
# Shared fixtures used by the construction examples: six rows labelled
# 'a'..'f' and six columns labelled 'A'..'F'.
rows = 6
columns = list('ABCDEF')
indces = [chr(ord('a') + n) for n in range(rows)]

# One value (or sequence of values) per column.
c_a = 1 + (10 - 1) * np.random.rand(rows)          # random floats in [1, 10)
c_b = pd.date_range(start='20190101', periods=rows)  # consecutive days
c_c = np.arange(0.1, 0.7, 0.1)                      # 0.1, 0.2, ... 0.6
c_d = np.array([3 for _ in range(rows)], dtype=np.int32)
c_e = np.tile(['test', 'train'], rows // 2)         # alternating labels
c_f = 'foo'                                         # a single scalar

### 通过字典创建

In [None]:
# Pair each column name with its prepared data.
d_ = dict(zip(columns, [c_a, c_b, c_c, c_d, c_e, c_f]))
dict_text = show_dict(d_)
print('* when dict is: {}'.format(dict_text))

# Build the frame from the mapping, labelling the rows with `indces`.
df = pd.DataFrame(data=d_, index=indces)

print('\n  then data frame “df” is:')
show_df(df)

### 通过集合创建

- `list(zip(*l))` 求`l`的转置矩阵，`l`必须为二维数组

In [None]:
# Column-wise data, one plain Python list per column.
column_data = [
    c_a.tolist(),
    c_b.tolist(),
    c_c.tolist(),
    c_d.tolist(),
    c_e.tolist(),
    [c_f] * rows,
]
# Transpose the columns into a list of row tuples.
l_ = list(zip(*column_data))
list_text = show_list(l_)
print('* when list is: {}'.format(list_text))

# Rows come from the tuples; labels are supplied separately.
df = pd.DataFrame(data=l_, index=indces, columns=columns)

print('\n  then data frame “df” is:')
show_df(df)

### 通过字典的集合创建

In [None]:
# Build one {column name: value} dict per row.
l_ = [
    dict(zip(columns, row))
    for row in zip(c_a.tolist(),
                   c_b.tolist(),
                   c_c.tolist(),
                   c_d.tolist(),
                   c_e.tolist(),
                   [c_f] * rows)
]

list_text = show_list(l_)
print('* when list is: {}'.format(list_text))

# Column names come from the dict keys, so only the row labels are passed.
df = pd.DataFrame(data=l_, index=indces)

print('\n  then data frame “df” is:')
show_df(df)

### 通过 Series 对象创建

- 只要有一个 Series 列具备索引，则整个 DataFrame 具备索引

In [None]:
# A float Series carrying the row labels.
s_a = pd.Series(data=c_a, index=indces)
s_a_text = show_component(s_a)
print('* when series "s_a" is: {}'.format(s_a_text))

# A categorical Series carrying the same row labels.
s_e = pd.Series(data=pd.Categorical(c_e), index=indces)
s_e_text = show_component(s_e)
print('\n  and series "s_e" is: {}'.format(s_e_text))

# Mix the labelled Series with the unlabelled column data; no explicit
# `index` is passed to the constructor here.
d_ = dict(zip(columns, [s_a, c_b, c_c, c_d, s_e, c_f]))
df = pd.DataFrame(data=d_, columns=columns)

print('\n  then data frame “df” is:')
show_df(df)

## 列操作

### 列集合

In [None]:
# Gather the column index, its raw label array and the position of "C"
# first, then report them in order.
cols = df.columns
col_names = cols.values
col_index = cols.get_loc('C')

cols_text = show_component(cols)
print('* columns of "df" are: {}'.format(cols_text))

names_text = show_component(col_names)
print('  and column names are: {}'.format(names_text))

print('  and index of column "C" is: {}'.format(col_index))

### 根据列名获取数据

In [None]:
# Selecting by column name yields that column as a Series keyed by row label.
column_c = df['C']
print('* data of column "C" are: {}'.format(show_component(column_c)))
print('\n  and column values are: {}'.format(show_component(column_c.values)))

# Label-based access into the column.
data_c_d = column_c['d']
print('\n  and data of row "d" of column is: {:.2f}'.format(data_c_d))

# Position-based access goes through `.iloc`: using a bare integer key as a
# position on a label-indexed Series is deprecated in recent pandas. The
# position read is 2 so that it matches the caption printed below.
data_c_2 = column_c.iloc[2]
print('  and data of row 2 of column is: {:.2f}'.format(data_c_2))

### 添加列

In [None]:
# Work on a copy so the shared frame `df` stays unchanged.
df_ = df.copy()

# Column assignment aligns the Series with the frame by row label, so the new
# column needs one value per *row*. Reuse the frame's own index rather than
# deriving labels from the number of columns, which only matches when the
# frame happens to be square.
series = pd.Series(data=[chr(0x41 + n) for n in range(len(df_.index))],
                   index=df_.index)
print('* when column data is: {}'.format(show_component(series)))

df_['G'] = series

print('\n  after column was add as "G", dataframe "df" is:')
show_df(df_)

### 删除列

In [None]:
# Work on a copy so the shared frame `df` stays unchanged.
df_ = df.copy(deep=True)

# pop() removes the column in place and hands it back.
del_col = df_.pop('B')
removed_text = show_component(del_col)
print('* removed column "B" is: {}'.format(removed_text))

print('\n  then dataframe "df" is:')
show_df(df_)

# `del` removes a column in place without returning it.
del df_['D']
print('\n* after removed column "D", dataframe "df" is:')
show_df(df_)

## 行操作

### 根据行索引获取数据

In [None]:
# `.loc` with a single row label yields that row as a Series keyed by column
# name.
row_b = df.loc['b']
row_text = show_component(row_b)
print('* data of row "b" are: {}'.format(row_text))
values_text = show_component(row_b.values)
print('\n  and row values are: {}'.format(values_text))

# Pick one cell of the row by column name.
column_b = row_b['B']
print('\n  and column "B" of row is: {}'.format(column_b))

### 根据行下标获取数据

In [None]:
# Read from the shared frame `df`, not the scratch copy `df_`, whose contents
# depend on whichever cell ran last.
row_3 = df.iloc[3]
print('* data of row 3 are: {}'.format(show_component(row_3)))
print('\n  and row values are: {}'.format(show_component(row_3.values)))

# The row is a Series keyed by column name, so positional access has to go
# through `.iloc`; a bare integer key used as a position is deprecated in
# recent pandas.
column_2 = row_3.iloc[2]
print('\n  and column 2 of row is: {}'.format(column_2))

### 添加行

#### 以 DataFrame 对象为行集添加

In [None]:
# Work on a copy so the shared frame `df` stays unchanged.
df_ = df.copy()

# Two extra rows, labelled 'g' and 'h', with the same columns as `df`.
rows = 2
new_rows = pd.DataFrame(
    {
        'A': np.random.rand(rows) * (10 - 1) + 1,
        'B': pd.date_range('20190107', periods=rows),
        'C': [0.7, 0.8],
        'D': np.array([3] * rows, dtype='int32'),
        'E': ['test', 'train'],
        'F': ['foo'] * rows,
    }, index=['g', 'h']
)
print('* when new row is: {}'.format(show_component(new_rows)))

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames
# row-wise and keeps both sets of row labels.
df_ = pd.concat([df_, new_rows])
print('\n  after append new row, data frame "df" is:')
show_df(df_)

#### 直接通过索引添加

In [None]:
# Work on a copy so the shared frame `df` stays unchanged.
df_ = df.copy(deep=True)

# One value per column, in column order.
new_row = [
    np.random.rand() * (10 - 1) + 1,
    pd.to_datetime('20190107'),
    0.7,
    3.,
    'test',
    'bar',
]
row_text = show_component(new_row)
print('* when new row is: {}'.format(row_text))

# Assigning to a row label that does not exist yet adds the row.
df_.loc['g'] = new_row

print('\n  after append new row, data frame "df" is:')
show_df(df_)

#### 以 Series 结构添加

In [None]:
# Work on a copy so the shared frame `df` stays unchanged.
df_ = df.copy()

# The Series keys are the column names; its `name` becomes the row label.
new_row = pd.Series(data={
    'A': np.random.rand() * (10 - 1) + 1,
    'B': pd.Timestamp('20190107'),
    'C': 0.7,
    'D': 3,
    'E': 'test',
    'F': 'foo'
}, name='g')

print('* when new row is: {}'.format(show_component(new_row)))

# DataFrame.append was removed in pandas 2.0. Wrap the Series in a one-row
# frame (row label taken from the Series name) and stack it with pd.concat.
df_ = pd.concat([df_, pd.DataFrame([new_row])])

print('  after append new row, data frame "df" is:')
show_df(df_)

#### 忽略索引

- `df.append`方法的`ignore_index`如果为`True`，则添加完后的结果中，索引会重新从`0`编排，之前的索引失效

In [None]:
# A small frame with string row labels.
df_ = pd.DataFrame([
    ['A', 100],
    ['B', 200],
    ['C', 300]
], columns=['C1', 'C2'], index=['a', 'b', 'c'])

print('* when "df" is')
show_df(df_)

# DataFrame.append was removed in pandas 2.0. Turn the dict into a one-row
# frame and stack it with pd.concat; ignore_index=True discards the existing
# labels and renumbers the rows from 0.
new_row = pd.DataFrame([{'C1': 'D', 'C2': 400}])
df_ = pd.concat([df_, new_row], ignore_index=True)
print('  then after append ignore index, "df" is:')
show_df(df_)

### 删除行

In [None]:
# Work on a copy so the shared frame `df` stays unchanged.
df_ = df.copy()

# Drop one row by label. The result is bound to a new name and `df_` is
# reused below, so each drop starts from the full copy.
df_droped = df_.drop(index='a')
print('* after drop row "a", then the "df" is')
show_df(df_droped)

# Drop several rows by label; the caption names all three labels.
df_droped = df_.drop(index=['a', 'b', 'c'])
print('* after drop rows ["a", "b", "c"], then the "df" is')
show_df(df_droped)

# Drop rows by position: translate the positions to labels through the index.
num_to_delete = list(range(1, 5))
df_droped = df_.drop(index=df_.index[num_to_delete])
print('* after drop row {}, then the "df" is'.format(num_to_delete))
show_df(df_droped)

## 行列操作

### 获取行列数据 (根据索引)

In [None]:
# Single row / single column, addressed by label.
row = df.loc['a', :]
text = show_component(row)
print('* row of index "a" is: {}'.format(text))

column = df.loc[:, 'B']
text = show_component(column)
print('\n* column of name "B" is: {}'.format(text))

# Explicit lists of labels.
row_labels = ['a', 'b', 'd']
rows = df.loc[row_labels, :]
text = show_component(rows)
print('\n* rows of index ["a", "b", "d"] is: {}'.format(text))

column_labels = ['A', 'B', 'D']
columns = df.loc[:, column_labels]
text = show_component(columns)
print('\n* columns of name ["A", "B", "D"] is: {}'.format(text))

# Label slices.
rows = df.loc['b':'e', :]
text = show_component(rows)
print('\n* rows of index "b":"e" is: {}'.format(text))

columns = df.loc[:, 'B':'E']
text = show_component(columns)
print('\n* columns of name "B":"E" is: {}'.format(text))

# Rows and columns restricted at the same time.
datas = df.loc[['a', 'd'], ['A', 'B', 'D']]
text = show_component(datas)
print('\n* rows ["a", "d"] and columns ["A", "B", "D"] is: {}'.format(text))

datas = df.loc['c':'e', 'A':'D']
text = show_component(datas)
print('\n* rows "c":"e" and columns "A":"D" is: {}'.format(text))

### 获取行列数据 (根据行列号)

In [None]:
# Single row / single column, addressed by position.
row = df.iloc[0, :]
print('* row of num 0 is: {}'.format(show_component(row)))

column = df.iloc[:, 1]
print('\n* column of num 1 is: {}'.format(show_component(column)))

# Explicit lists of positions.
rows = df.iloc[[1, 2, 4], :]
print('\n* rows of num [1, 2, 4] is: {}'.format(show_component(rows)))

columns = df.iloc[:, [1, 2, 4]]
print('\n* columns of num [1, 2, 4] is: {}'.format(show_component(columns)))

# Position slices. The captions quote the slice exactly as written in the
# code, matching the notation used by the last caption of this cell.
rows = df.iloc[2:5, :]
print('\n* rows of num 2:5 is: {}'.format(show_component(rows)))

columns = df.iloc[:, 2:5]
print('\n* columns of num 2:5 is: {}'.format(show_component(columns)))

# Rows and columns restricted at the same time.
datas = df.iloc[[2, 5], [1, 2, 4]]
print('\n* rows [2, 5] and columns [1, 2, 4] is: {}'.format(show_component(datas)))

datas = df.iloc[2:5, 1:4]
print('\n* rows 2:5 and columns 1:4 is: {}'.format(show_component(datas)))

## 遍历

### 遍历列名

In [None]:
# Walk the column labels together with their positions.
print('* column names are: ')
for n, col in enumerate(df.columns):
    line = '\tcolumn {} is "{}"'.format(n, col)
    print(line)

### 遍历数据

#### 按列遍历

In [None]:
# Walk the frame column by column, then each column by row label.
# `iteritems` was removed in pandas 2.0; `items` yields the same
# (label, value) pairs and also exists in older releases.
print('* data of "df" is:')
for column, rows in df.items():
    print('\tcolumn "{}"'.format(column))
    for index, value in rows.items():
        print('\t\t{} => {}'.format(index, value))

#### 按行遍历

In [None]:
# Walk the frame row by row; each row arrives as a Series keyed by column
# name. `Series.iteritems` was removed in pandas 2.0; `items` yields the same
# (label, value) pairs and also exists in older releases.
print('* data of "df" is:')
for row_index, row in df.iterrows():
    print('\trow "{}"'.format(row_index))
    for column, data in row.items():
        print('\t\t{} => {}'.format(column, data))

#### 整行遍历

In [None]:
# Walk the frame as tuples: element 0 of each tuple is the row label and the
# remaining elements are the cell values in column order.
print('* data of "df" is:')
for row in df.itertuples():
    index, values = row[0], row[1:]
    print('\trow "{}"'.format(index))
    for col, value in zip(df.columns, values):
        print('\t\t{} => {}'.format(col, value))