# Pandas入门
## 目录
1. Object Creation 创建对象
2. Viewing Data 显示数据
3. Selection
    - Getting
    - Selection by Label
    - Selection by Position
    - Boolean Indexing
    - Setting
4. Missing Data
5. Operations
    - Stats
    - Apply
    - Histogramming
    - String Methods
6. Merge
    - Concat
    - Join
    - Append
7. Grouping
8. Reshaping
    - Stack
    - Pivot Tables
9. Time Series
10. Categoricals
11. Plotting
12. Getting Data In/Out
    - CSV
    - HDF5
    - Excel
13. Gotchas

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Object Creation 创建对象

In [6]:
# Create a Series by passing a list of values. The recorded output shows a
# default integer index (0-5) and dtype float64 for this data.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [10]:
# Create a DataFrame from a NumPy array, using a DatetimeIndex of six
# consecutive days as the row index and the letters A-D as column labels.
# The values are random draws, so they differ on every run.
dates = pd.date_range(start='20130101', periods=6)
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=list('ABCD'),
)
print(dates)
print(df)

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')
                   A         B         C         D
2013-01-01  2.052510  1.446556 -0.153591  0.805145
2013-01-02 -0.021575  2.062778 -0.844067 -0.351292
2013-01-03 -0.899637 -0.568931 -0.272361  0.106631
2013-01-04 -0.016522  1.540225 -1.254772 -1.860999
2013-01-05 -0.256377 -0.679196 -1.167124 -1.041644
2013-01-06 -0.254593 -0.235811  1.141688 -0.705360


In [None]:
# Create a DataFrame from a dict whose values can each be converted into a
# Series-like column: plain scalars, a Timestamp, a Series, a NumPy array
# and a Categorical are mixed here.
df2 = pd.DataFrame(
    {
        'A': 1.0,
        'B': pd.Timestamp('20190428'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.array([3] * 4, dtype='int32'),
        'E': pd.Categorical(['test', 'train', 'test', 'train']),
        'F': 'foo',
    }
)
df2

In [14]:
# Show the dtype of every column of df2 (each column has its own dtype).
df2.dtypes
# df2.<tab>
# NOTE(review): the line above presumably hints at interactive tab completion
# on `df2.` to list its attributes - confirm in your notebook environment.

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

## Viewing Data 显示数据

In [15]:
# Look at the first rows and the last three rows of the frame.
# NOTE(review): the recorded output below contains only the tail(3) rows;
# the value of the head() expression on the first line is not displayed.
df.head()
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,-0.016522,1.540225,-1.254772,-1.860999
2013-01-05,-0.256377,-0.679196,-1.167124,-1.041644
2013-01-06,-0.254593,-0.235811,1.141688,-0.70536


In [16]:
# Show the index, the columns, and the underlying NumPy data.
# This cell shows the row index; the next two cells show the other two.
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [17]:
# Show the column labels of the frame.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [18]:
# Show the underlying data as a NumPy array. to_numpy() is the accessor the
# pandas documentation recommends in place of the older .values attribute.
df.to_numpy()

array([[ 2.05250958,  1.44655586, -0.15359147,  0.80514485],
       [-0.02157533,  2.0627784 , -0.84406693, -0.35129215],
       [-0.89963678, -0.56893144, -0.27236075,  0.10663066],
       [-0.01652185,  1.54022501, -1.25477221, -1.8609989 ],
       [-0.25637744, -0.67919566, -1.16712397, -1.04164399],
       [-0.25459309, -0.23581126,  1.14168843, -0.70535972]])

In [20]:
# Show a quick statistical summary of the data
# (count, mean, std, min, quartiles and max per column).
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.100634,0.59427,-0.425038,-0.50792
std,1.009224,1.219961,0.890658,0.925312
min,-0.899637,-0.679196,-1.254772,-1.860999
25%,-0.255931,-0.485651,-1.08636,-0.957573
50%,-0.138084,0.605372,-0.558214,-0.528326
75%,-0.017785,1.516808,-0.183284,-0.00785
max,2.05251,2.062778,1.141688,0.805145


In [21]:
# Transpose the data: the column labels become the row index and the dates
# become the columns.
df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,2.05251,-0.021575,-0.899637,-0.016522,-0.256377,-0.254593
B,1.446556,2.062778,-0.568931,1.540225,-0.679196,-0.235811
C,-0.153591,-0.844067,-0.272361,-1.254772,-1.167124,1.141688
D,0.805145,-0.351292,0.106631,-1.860999,-1.041644,-0.70536
