# pandas

pandas是专门为处理表格和混杂数据设计，而Numpy更适合处理统一的数值数组数据。

In [3]:
import pandas as pd
from pandas import Series, DataFrame

import numpy as np


<br/>

## 5.1 pandas数据结构

两个主要数据结构:

- Series
- DataFrame

<br/>

### Series

Series是一种类似于一位数组的对象。它由数据标签(索引)和数据组成

In [4]:
obj = pd.Series([4, 7, -5, 3])
obj
#未指定索引会自动创建

0    4
1    7
2   -5
3    3
dtype: int64

In [5]:
obj.values

array([ 4,  7, -5,  3])

In [6]:
obj.index
for i in obj.index:
    print(i)

0
1
2
3


In [7]:
#自带索引
obj2 = pd.Series([4, 7, 5, 3], index=['d', 'b', 'a', 'c'])
obj2

d    4
b    7
a    5
c    3
dtype: int64

In [8]:
obj2['a']

5

In [9]:
obj2[['a', 'b', 'c']]

a    5
b    7
c    3
dtype: int64

In [10]:
obj2[obj2 > 3]

d    4
b    7
a    5
dtype: int64

In [11]:
obj2 **2

d    16
b    49
a    25
c     9
dtype: int64

In [12]:
np.exp(obj2)

d      54.598150
b    1096.633158
a     148.413159
c      20.085537
dtype: float64

In [13]:
'b' in obj2

True

In [14]:
11 in obj2

False

In [15]:
#dict --> Series
sdata = {'Ohio': 3500, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = pd.Series(sdata)

In [16]:
obj3

Ohio       3500
Oregon    16000
Texas     71000
Utah       5000
dtype: int64


<br/>

### DataFrame

DataFrame是一个表格型的数据结构，它含有一组有序的列，每列可以是不同的值类型。
DateFrame既有行索引，又有列索引。

In [17]:
data = { 'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
         'year': [2000, 2001, 2002, 2001, 2002, 2003],
         'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

frame = pd.DataFrame(data)
frame
#自动添加索引

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002
5,3.2,Nevada,2003


In [18]:
#指定索引
pd.DataFrame(data, index=['one', 'two', 'three', 'four', 'five', 'six'])

Unnamed: 0,pop,state,year
one,1.5,Ohio,2000
two,1.7,Ohio,2001
three,3.6,Ohio,2002
four,2.4,Nevada,2001
five,2.9,Nevada,2002
six,3.2,Nevada,2003


In [19]:
#指定序列
pd.DataFrame(data, columns=['state', 'year', 'pop'])

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [20]:
#同时指定索引和列
#未存在的列会存在缺失，索引个数不对会报错
frame2 = pd.DataFrame(data, columns=['state', 'year', 'pop', 'debt', 'addition'], index=['one', 'two', 'three', 'four', 'five', 'six'])
frame2

Unnamed: 0,state,year,pop,debt,addition
one,Ohio,2000,1.5,,
two,Ohio,2001,1.7,,
three,Ohio,2002,3.6,,
four,Nevada,2001,2.4,,
five,Nevada,2002,2.9,,
six,Nevada,2003,3.2,,


In [21]:
frame2.columns

Index(['state', 'year', 'pop', 'debt', 'addition'], dtype='object')

In [22]:
frame2['year']

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

In [23]:
frame2.loc['three']

state       Ohio
year        2002
pop          3.6
debt         NaN
addition     NaN
Name: three, dtype: object

In [24]:
#给列赋值
frame2['debt'] = np.arange(6)
frame2

Unnamed: 0,state,year,pop,debt,addition
one,Ohio,2000,1.5,0,
two,Ohio,2001,1.7,1,
three,Ohio,2002,3.6,2,
four,Nevada,2001,2.4,3,
five,Nevada,2002,2.9,4,
six,Nevada,2003,3.2,5,


In [25]:

val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
val

two    -1.2
four   -1.5
five   -1.7
dtype: float64

In [26]:
#Series and DataFrame
frame2['addition'] =  val
frame2

Unnamed: 0,state,year,pop,debt,addition
one,Ohio,2000,1.5,0,
two,Ohio,2001,1.7,1,-1.2
three,Ohio,2002,3.6,2,
four,Nevada,2001,2.4,3,-1.5
five,Nevada,2002,2.9,4,-1.7
six,Nevada,2003,3.2,5,


In [27]:
#删除
del frame2['addition']
frame2

Unnamed: 0,state,year,pop,debt
one,Ohio,2000,1.5,0
two,Ohio,2001,1.7,1
three,Ohio,2002,3.6,2
four,Nevada,2001,2.4,3
five,Nevada,2002,2.9,4
six,Nevada,2003,3.2,5


In [28]:
#另一种嵌套字典
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
      'Ohio': {2000: 1.5, 2001: 1.7 , 2002: 3.6}}

frame3 = pd.DataFrame(pop)
frame3

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [29]:
#转置(行列交换)
frame3.T

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


In [30]:
frame3.index.name = 'year'; frame3.columns.name = 'state'
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [31]:
frame3.values

array([[nan, 1.5],
       [2.4, 1.7],
       [2.9, 3.6]])

In [32]:
frame2.values

array([['Ohio', 2000, 1.5, 0],
       ['Ohio', 2001, 1.7, 1],
       ['Ohio', 2002, 3.6, 2],
       ['Nevada', 2001, 2.4, 3],
       ['Nevada', 2002, 2.9, 4],
       ['Nevada', 2003, 3.2, 5]], dtype=object)

In [33]:
!date

Mon Sep  3 16:19:07 CST 2018


In [34]:
#与python的集合不同，pandas的index可以包含重复的标签
dup_labels = pd.Index(['foo', 'foo', 'bar', 'bar'])
dup_labels

Index(['foo', 'foo', 'bar', 'bar'], dtype='object')


<br/>

## 5.1 基本功能

Series和DataFrame的数据基本手段。


<br/>

### reindex



In [35]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['b', 'd', 'a', 'c'])
obj

b    4.5
d    7.2
a   -5.3
c    3.6
dtype: float64

In [36]:
#索引如果不存在会引入缺失值
obj2 =  obj.reindex(['a', 'b', 'e', 'c', 'd'])
obj2

a   -5.3
b    4.5
e    NaN
c    3.6
d    7.2
dtype: float64

In [37]:
#值填充
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3

0      blue
2    purple
4    yellow
dtype: object

In [38]:
obj3.reindex(range(6))

0      blue
1       NaN
2    purple
3       NaN
4    yellow
5       NaN
dtype: object

In [39]:
#前向值填充
obj3.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [40]:
#后向值填充
obj3.reindex(range(6), method='bfill')

0      blue
1    purple
2    purple
3    yellow
4    yellow
5       NaN
dtype: object


<br/>

### 丢弃指定轴上的项



In [41]:
obj = pd.Series(['a', 'b', 'c', 'd'])
obj

0    a
1    b
2    c
3    d
dtype: object

In [42]:
new_obj = obj.drop(2)
new_obj

0    a
1    b
3    d
dtype: object

In [43]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                   columns=['a', 'b', 'c', 'd'])

data

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [44]:
data.drop([1, 3])

Unnamed: 0,a,b,c,d
0,0,1,2,3
2,8,9,10,11


In [45]:
data.drop(['a', 'c'], axis='columns')

Unnamed: 0,b,d
0,1,3
1,5,7
2,9,11
3,13,15



<br/>

### loc和iloc

- `loc`: 轴标签
- `iloc`: 索引

In [46]:
data.loc[0, ['a', 'b', 'c']]

a    0
b    1
c    2
Name: 0, dtype: int64

In [47]:
data.iloc[2, [0, 1, 2]]

a     8
b     9
c    10
Name: 2, dtype: int64


<br/>

### 整数索引

pandas的整数索引对象与Python内置的列表和元组的索引语法不同，请注意。

In [48]:
ser = pd.Series(np.arange(3.))
ser

0    0.0
1    1.0
2    2.0
dtype: float64

In [49]:
ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c'])
ser2

a    0.0
b    1.0
c    2.0
dtype: float64

In [50]:
#对于整数索引，有歧义，会报错
#ser[-1]


#对于非整数索引，不会有歧义
ser2[-1]

2.0

In [51]:
#为了更精确，请使用 loc/iloc
ser[:1]

0    0.0
dtype: float64

In [52]:
ser.loc[:1]

0    0.0
1    1.0
dtype: float64

In [53]:
#索引
ser.iloc[:1]

0    0.0
dtype: float64


<br/>

### 算数运算和数据对齐



In [54]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = pd.Series([-2.1, 3.6, -.15, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])

In [55]:
s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [56]:
s2

a   -2.10
c    3.60
e   -0.15
f    4.00
g    3.10
dtype: float64

In [57]:
#相加
#自动的数据对齐操作在不重叠的索引处引入了NA值，缺失值会在算数运算中传播
s1 + s2

a    5.20
c    1.10
d     NaN
e    1.35
f     NaN
g     NaN
dtype: float64

In [58]:
df1 = pd.DataFrame(np.arange(9.).reshape(3, 3), columns=list('bcd'), index=['Ohio', 'Texas', 'Colorado'])
df2 = pd.DataFrame(np.arange(12.).reshape(4, 3), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])

In [59]:
df1

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [60]:
df2

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [61]:
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [62]:
df3 = pd.DataFrame({'A': [1, 2]})
df4 = pd.DataFrame({'B': [3, 4]})

In [63]:
df3

Unnamed: 0,A
0,1
1,2


In [64]:
df4

Unnamed: 0,B
0,3
1,4


In [65]:
df3 - df4

Unnamed: 0,A,B
0,,
1,,



<br/>

### 在算数方法中填充值



In [66]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)), columns=list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde'))

In [67]:
df2.loc[1, 'b'] = np.nan

In [68]:
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [69]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [70]:
df1 + df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [71]:
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [72]:
df1.reindex(columns=df2.columns, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,0
1,4.0,5.0,6.0,7.0,0
2,8.0,9.0,10.0,11.0,0



<br>

### DataFrame和Series之间的运算



In [73]:
arr = np.arange(12.).reshape((3, 4))
arr

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])

In [74]:
arr[0]

array([0., 1., 2., 3.])

In [75]:
arr - arr[0]

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])


<br/>

### 函数应用和映射


In [76]:
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame

Unnamed: 0,b,d,e
Utah,-0.968497,0.889375,0.604942
Ohio,1.205142,-0.264666,0.806868
Texas,-0.274819,-1.264442,-0.187878
Oregon,1.504245,0.24757,-1.089808


In [77]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,0.968497,0.889375,0.604942
Ohio,1.205142,0.264666,0.806868
Texas,0.274819,1.264442,0.187878
Oregon,1.504245,0.24757,1.089808


In [78]:
f = lambda x: x.max() - x.min()

In [79]:
#列
frame.apply(f)

b    2.472742
d    2.153817
e    1.896676
dtype: float64

In [80]:
#row
frame.apply(f, axis='columns')

Utah      1.857872
Ohio      1.469807
Texas     1.076565
Oregon    2.594053
dtype: float64

In [81]:
def f(x):
    return pd.Series([x.min(), x.max()], index=['min', 'max'])

In [82]:
frame.apply(f)

Unnamed: 0,b,d,e
min,-0.968497,-1.264442,-1.089808
max,1.504245,0.889375,0.806868


In [83]:
frame.apply(f, axis='columns')

Unnamed: 0,min,max
Utah,-0.968497,0.889375
Ohio,-0.264666,1.205142
Texas,-1.264442,-0.187878
Oregon,-1.089808,1.504245


In [84]:
format1 = lambda x: '%.2f' % x
format2 = lambda x: '{:.3f}'.format(x)

In [85]:
frame.applymap(format1)

Unnamed: 0,b,d,e
Utah,-0.97,0.89,0.6
Ohio,1.21,-0.26,0.81
Texas,-0.27,-1.26,-0.19
Oregon,1.5,0.25,-1.09


In [86]:
frame.applymap(format2)

Unnamed: 0,b,d,e
Utah,-0.968,0.889,0.605
Ohio,1.205,-0.265,0.807
Texas,-0.275,-1.264,-0.188
Oregon,1.504,0.248,-1.09



<br>

### 排序和排名


In [87]:
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])

In [88]:
obj

d    0
a    1
b    2
c    3
dtype: int64

In [89]:
#索引排序
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [90]:
#值排序
obj.sort_values()

d    0
a    1
b    2
c    3
dtype: int64

In [91]:
obj2 = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj2.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [92]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
                    index=['three', 'one'],
                    columns=['d', 'a', 'b', 'c'])

In [93]:
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [94]:
#索引排序
frame.sort_index(ascending=False)

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [95]:
#值排序
frame.sort_values(by='b')

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [96]:
frame.sort_values(by=['a', 'b'], ascending=False)

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


<br>

### 带有重复标签的轴索引

虽然许多pandans函数都要求标签唯一，但这并不是强制性的。

In [97]:
obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])

In [98]:
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [99]:
obj.index.is_unique

False

In [100]:
obj['a']

a    0
a    1
dtype: int64

In [101]:
df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])

In [102]:
df

Unnamed: 0,0,1,2
a,-0.327245,0.541054,-0.314721
a,-0.12679,-2.175487,0.01718
b,0.072036,-0.082775,1.527897
b,0.83535,-2.43146,0.419306


In [103]:
df.loc['b']

Unnamed: 0,0,1,2
b,0.072036,-0.082775,1.527897
b,0.83535,-2.43146,0.419306



<br/>

## 汇总和计算描述统计

In [104]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
                  [np.nan, np.nan], [0.75, -1.3]],
                  index=['a', 'b', 'c', 'd'],
                  columns=['one', 'two'])

In [105]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [106]:
df.sum()

one    9.25
two   -5.80
dtype: float64

In [107]:
df.sum(axis=1)

a    1.40
b    2.60
c     NaN
d   -0.55
dtype: float64

In [108]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


<br>

### 相关系数和协方差


In [114]:
import pandas_datareader.data as web

all_data = {ticker: web.get_data_yahoo(ticker)
           for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}

price = pd.DataFrame({ticker: data['Adj Close'] for ticker, data in all_data.items()})

volume = pd.DataFrame({ticker: data['Volume'] for ticker, data in all_data.items()})

ImmediateDeprecationError: 
Yahoo Actions has been immediately deprecated due to large breaks in the API without the
introduction of a stable replacement. Pull Requests to re-enable these data
connectors are welcome.

See https://github.com/pydata/pandas-datareader/issues
