# Pandas入门
## 目录
1. Object Creation 创建对象
2. Viewing Data 显示数据
3. Selection
    - Getting
    - Selection by Label
    - Selection by Position
    - Boolean Indexing
    - Setting
4. Missing Data
5. Operations
    - Stats
    - Apply
    - Histogramming
    - String Methods
6. Merge
    - Concat
    - Join
    - Append
7. Grouping
8. Reshaping
    - Stack
    - Pivot Tables
9. Time Series
10. Categoricals
11. Plotting
12. Getting Data In/Out
    - CSV
    - HDF5
    - Excel
13. Gotchas

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## 创建对象

In [2]:
# Create a Series by passing a list of values; np.nan marks a missing entry.
s = pd.Series([1,3,5,np.nan,6,8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [3]:
# Build a DatetimeIndex of six consecutive days starting 2013-01-01.
# It is used as the row index of the DataFrame created in the next cell.
dates = pd.date_range('20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [4]:
# Create a DataFrame from a 6x4 NumPy array of random values, using the
# datetime index `dates` as row labels and 'A'..'D' as column labels.
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.83126,-1.103149,1.040924,-0.474663
2013-01-02,-2.870575,-0.132806,-0.644409,-0.925214
2013-01-03,-0.366481,-1.659924,0.062137,-1.141534
2013-01-04,0.099317,0.395525,-0.755232,0.617548
2013-01-05,-0.289928,0.170239,0.449273,0.848893
2013-01-06,-0.306429,-1.715975,1.351024,0.030628


In [5]:
# Create a DataFrame from a dict whose values can each be converted to a
# column: scalars are repeated for every row, array-likes are used as-is.
df2 = pd.DataFrame(
    {
        'A': 1.0,
        'B': pd.Timestamp('20190428'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.full(4, 3, dtype='int32'),
        'E': pd.Categorical(['test', 'train'] * 2),
        'F': 'foo',
    }
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2019-04-28,1.0,3,test,foo
1,1.0,2019-04-28,1.0,3,train,foo
2,1.0,2019-04-28,1.0,3,test,foo
3,1.0,2019-04-28,1.0,3,train,foo


In [6]:
# The columns of df2 have different dtypes.
df2.dtypes
# Tip: in IPython/Jupyter, typing `df2.` and pressing <TAB> lists the
# available attributes and column names via tab completion.

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

## Viewing Data 显示数据

In [7]:
# View the top rows and the bottom rows of the frame.
# A notebook cell displays only its last expression, so the output below is
# the three rows from tail(3); the head() result is not shown.
df.head()
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,0.099317,0.395525,-0.755232,0.617548
2013-01-05,-0.289928,0.170239,0.449273,0.848893
2013-01-06,-0.306429,-1.715975,1.351024,0.030628


In [8]:
# Display the index (row labels); the columns and the underlying NumPy data
# are shown in the next two cells.
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [9]:
# Column labels of df.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [10]:
# Underlying data of df as a NumPy array (index and column labels are dropped).
# NOTE(review): recent pandas docs recommend df.to_numpy() instead -- confirm
# against the pandas version this notebook targets.
df.values

array([[-1.83126004, -1.10314854,  1.04092435, -0.47466257],
       [-2.87057478, -0.13280595, -0.64440921, -0.92521365],
       [-0.36648138, -1.65992368,  0.06213716, -1.14153368],
       [ 0.09931704,  0.39552473, -0.7552322 ,  0.61754785],
       [-0.28992804,  0.17023856,  0.44927319,  0.84889313],
       [-0.30642852, -1.71597524,  1.35102429,  0.03062815]])

In [11]:
# Show a quick statistical summary of each column.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.927559,-0.674348,0.25062,-0.174057
std,1.162209,0.93717,0.862934,0.813089
min,-2.870575,-1.715975,-0.755232,-1.141534
25%,-1.465065,-1.52073,-0.467773,-0.812576
50%,-0.336455,-0.617977,0.255705,-0.222017
75%,-0.294053,0.094477,0.893012,0.470818
max,0.099317,0.395525,1.351024,0.848893


In [12]:
# Transpose the data: rows become columns and columns become rows.
df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,-1.83126,-2.870575,-0.366481,0.099317,-0.289928,-0.306429
B,-1.103149,-0.132806,-1.659924,0.395525,0.170239,-1.715975
C,1.040924,-0.644409,0.062137,-0.755232,0.449273,1.351024
D,-0.474663,-0.925214,-1.141534,0.617548,0.848893,0.030628


In [13]:
# Sort by an axis: axis=1 orders the column labels, here descending (D, C, B, A).
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,-0.474663,1.040924,-1.103149,-1.83126
2013-01-02,-0.925214,-0.644409,-0.132806,-2.870575
2013-01-03,-1.141534,0.062137,-1.659924,-0.366481
2013-01-04,0.617548,-0.755232,0.395525,0.099317
2013-01-05,0.848893,0.449273,0.170239,-0.289928
2013-01-06,0.030628,1.351024,-1.715975,-0.306429


In [14]:
# Sort the rows by the values in column 'B' (ascending).
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2013-01-06,-0.306429,-1.715975,1.351024,0.030628
2013-01-03,-0.366481,-1.659924,0.062137,-1.141534
2013-01-01,-1.83126,-1.103149,1.040924,-0.474663
2013-01-02,-2.870575,-0.132806,-0.644409,-0.925214
2013-01-05,-0.289928,0.170239,0.449273,0.848893
2013-01-04,0.099317,0.395525,-0.755232,0.617548


## Selection
注意：虽然用于选择和设置的标准 Python / NumPy 表达式非常直观，也便于交互式使用，但对于生产代码，我们建议使用经过优化的 pandas 数据访问方法：`.at`、`.iat`、`.loc` 和 `.iloc`。详见索引文档：《索引和选择数据》以及《MultiIndex / 高级索引》。
### Getting

In [15]:
# Select a single column, which yields a Series; equivalent to df.A.
df['A']

2013-01-01   -1.831260
2013-01-02   -2.870575
2013-01-03   -0.366481
2013-01-04    0.099317
2013-01-05   -0.289928
2013-01-06   -0.306429
Freq: D, Name: A, dtype: float64

In [16]:
# Slicing with [] selects rows (here the first three), not columns.
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-1.83126,-1.103149,1.040924,-0.474663
2013-01-02,-2.870575,-0.132806,-0.644409,-0.925214
2013-01-03,-0.366481,-1.659924,0.062137,-1.141534


In [17]:
# Slicing with date strings selects rows by index label; the end label
# (2013-01-04) is included in the result.
df['20130101':'20130104']

Unnamed: 0,A,B,C,D
2013-01-01,-1.83126,-1.103149,1.040924,-0.474663
2013-01-02,-2.870575,-0.132806,-0.644409,-0.925214
2013-01-03,-0.366481,-1.659924,0.062137,-1.141534
2013-01-04,0.099317,0.395525,-0.755232,0.617548


### 用标签选择

In [18]:
# Get a cross-section using a label: the row for dates[0], returned as a Series.
df.loc[dates[0]]

A   -1.831260
B   -1.103149
C    1.040924
D   -0.474663
Name: 2013-01-01 00:00:00, dtype: float64

In [19]:
# Select on both axes by label: all rows, columns 'A' and 'B'.
df.loc[:,['A','B']]

Unnamed: 0,A,B
2013-01-01,-1.83126,-1.103149
2013-01-02,-2.870575,-0.132806
2013-01-03,-0.366481,-1.659924
2013-01-04,0.099317,0.395525
2013-01-05,-0.289928,0.170239
2013-01-06,-0.306429,-1.715975


In [20]:
# A single row label reduces the dimensions of the returned object:
# the result is a Series rather than a DataFrame.
df.loc['20130101',['A','B']]

A   -1.831260
B   -1.103149
Name: 2013-01-01 00:00:00, dtype: float64

In [21]:
# Fast access to a single scalar by row and column label (the same value a
# .loc lookup with these two labels would return).
df.at[dates[0],'A']

-1.8312600351105242