In [2]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

## 排序和排名

In [3]:
# Series whose index labels are deliberately out of alphabetical order.
obj = Series(data=range(4), index=list('dabc'))
# sort_index() returns a new object ordered by its index labels.
obj2 = obj.sort_index()
print(obj2)

a    1
b    2
c    3
d    0
dtype: int64


In [6]:
# 2x4 frame whose row and column labels are both unsorted.
frame = DataFrame(
    np.arange(8).reshape(2, 4),
    index=['three', 'one'],
    columns=list('dabc'),
)

# axis=0 (the default) orders by row labels; axis=1 orders by column labels.
frame2 = frame.sort_index(axis=0)
frame3 = frame.sort_index(axis=1)
print(frame2)
print(frame3)

       d  a  b  c
one    4  5  6  7
three  0  1  2  3
       a  b  c  d
three  1  2  3  0
one    5  6  7  4


In [7]:
# Order the columns by label, descending.
columns_desc = frame.sort_index(axis=1, ascending=False)
print(columns_desc)

       d  c  b  a
three  0  3  2  1
one    4  7  6  5


In [8]:
# Sort a Series by its values rather than by its index.
obj = Series(data=[4, 7, -3, 2])
by_value = obj.sort_values()
print(by_value)

2   -3
3    2
0    4
1    7
dtype: int64


In [9]:
# When sorting by value, missing values (NaN) are placed at the end.
obj = Series(data=[4, np.nan, 7, np.nan, -3, 2])
nan_last = obj.sort_values()
print(nan_last)

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64


In [12]:
# Sort a DataFrame by the values of one column, then by several columns.
frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
print(frame)
for sort_keys in ('b', ['a', 'b']):
    print(frame.sort_values(by=sort_keys))

   b  a
0  4  0
1  7  1
2 -3  0
3  2  1
   b  a
2 -3  0
3  2  1
0  4  0
1  7  1
   b  a
2 -3  0
0  4  0
3  2  1
1  7  1


In [13]:
# Ranking
obj = Series(data=[7, -5, 7, 4, 2, 0, 4])
# The default method ('average') gives tied values the mean of the ranks
# they occupy.
average_rank = obj.rank(method='average')
print(average_rank)

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64


In [14]:
# method='first': tied values are ranked in the order they appear in the data.
first_rank = obj.rank(method='first')
print(first_rank)

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64


In [15]:
# Descending ranks, printed twice: 'max' gives every member of a tie group
# the group's largest rank, 'min' gives it the group's smallest rank.
for tie_rule in ('max', 'min'):
    print(obj.rank(ascending=False, method=tie_rule))


0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64
0    1.0
1    7.0
2    1.0
3    3.0
4    5.0
5    6.0
6    3.0
dtype: float64


In [16]:
frame = DataFrame({
    'b': [4.3, 7, -3, 2],
    'a': [0, 1, 0, 1],
    'c': [-2, 5, 8, -2.5],
})
# axis=1 ranks the values within each row instead of within each column.
row_ranks = frame.rank(axis=1)
print(frame)
print(row_ranks)

     b  a    c
0  4.3  0 -2.0
1  7.0  1  5.0
2 -3.0  0  8.0
3  2.0  1 -2.5
     b    a    c
0  3.0  2.0  1.0
1  3.0  1.0  2.0
2  1.0  2.0  3.0
3  3.0  2.0  1.0


## 带有重复值的轴索引

In [17]:
# Series whose index contains repeated labels.
obj = Series(data=range(5), index=list('aabbc'))
print(obj)

a    0
a    1
b    2
b    3
c    4
dtype: int64


In [18]:
# is_unique tells whether every index label occurs only once.
labels_unique = obj.index.is_unique
print(labels_unique)

False


In [19]:
# A label that occurs several times selects a Series ...
repeated_label = obj['a']
# ... while a label that occurs once selects a scalar value.
single_label = obj['c']
print(repeated_label)
print(single_label)


a    0
a    1
dtype: int64
4


In [20]:
# Random 4x3 frame with repeated row labels. No seed is set in this cell, so
# the printed values change from run to run.
df = DataFrame(data=np.random.randn(4, 3), index=list('aabb'))
print(df)
# Selecting a repeated row label returns all rows carrying that label.
rows_b = df.loc['b']
print(rows_b)

          0         1         2
a  0.566535  1.594597 -0.340173
a  0.498034  0.114467 -0.467178
b -0.420690 -0.379445 -0.149683
b -0.880011  1.048230 -0.061825
          0         1         2
b -0.420690 -0.379445 -0.149683
b -0.880011  1.048230 -0.061825


## 汇总和计算描述统计

In [21]:
# Small frame containing missing values (NaN).
df = DataFrame(
    [
        [1.4, np.nan],
        [7.1, -4.5],
        [np.nan, np.nan],
        [0.75, -1.3],
    ],
    index=list('abcd'),
    columns=['one', 'two'],
)
print(df)

    one  two
a  1.40  NaN
b  7.10 -4.5
c   NaN  NaN
d  0.75 -1.3


In [23]:
# Sums; missing values (NA) are excluded.
column_totals = df.sum()        # one total per column
row_totals = df.sum(axis=1)     # one total per row
print(column_totals)
print(row_totals)

one    9.25
two   -5.80
dtype: float64
a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64


In [24]:
# skipna=False keeps NA in the calculation, so a row containing NaN yields NaN.
row_means = df.mean(axis=1, skipna=False)
print(row_means)


a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64


In [26]:
column_max = df.max()
# idxmax() gives the index label at which each column's maximum occurs.
column_max_label = df.idxmax()
print(column_max)
print(column_max_label)

one    7.1
two   -1.3
dtype: float64
one    b
two    d
dtype: object


In [27]:
# Cumulative sum down each column.
running_total = df.cumsum()
print(running_total)


    one  two
a  1.40  NaN
b  8.50 -4.5
c   NaN  NaN
d  9.25 -5.8


In [28]:
# Summary statistics computed for each column.
column_summary = df.describe()
print(column_summary)

            one       two
count  3.000000  2.000000
mean   3.083333 -2.900000
std    3.493685  2.262742
min    0.750000 -4.500000
25%    1.075000 -3.700000
50%    1.400000 -2.900000
75%    4.250000 -2.100000
max    7.100000 -1.300000


In [29]:
# For string data, describe() reports count / unique / top / freq.
obj = Series(data=['a', 'a', 'b', 'c'] * 4)
text_summary = obj.describe()
print(text_summary)

count     16
unique     3
top        a
freq       8
dtype: object


## 相关系数与协方差

In [40]:
# NOTE(review): `pandas.io.data` was deprecated in pandas 0.17 and removed in
# pandas 0.19, so this import fails on current pandas. Its maintained successor
# is the separate `pandas-datareader` package
# (`import pandas_datareader.data as web`); switch the import once that package
# is available in this environment.
import pandas.io.data as web

# Fetch the data for each ticker from Yahoo (requires network access).
all_data = {}
for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
    all_data[ticker] = web.get_data_yahoo(ticker)

# Build one frame per field, with one column per ticker.
# dict.items() works on both Python 2 and Python 3, whereas dict.iteritems()
# exists only on Python 2 and raises AttributeError on Python 3.
price = DataFrame({tic: data['Adj Close']
                   for tic, data in all_data.items()})
volume = DataFrame({tic: data['Volume']
                    for tic, data in all_data.items()})


In [None]:
# Percentage change between consecutive rows,
# e.g. [90, 91] -> (91 - 90) / 90.
returns = price.pct_change(periods=1)
returns.tail(5)

In [None]:
# Correlation coefficient between the MSFT and IBM return columns.
returns['MSFT'].corr(returns['IBM'])

In [None]:
# Covariance between the MSFT and IBM return columns.
returns['MSFT'].cov(returns['IBM'])

In [None]:
# Pairwise correlation matrix of all columns (Pearson is the default method).
returns.corr(method='pearson')

In [None]:
# Pairwise covariance matrix of all columns (min_periods=None is the default).
returns.cov(min_periods=None)

In [None]:
# Correlation of the frame's columns (or rows) with another Series or
# DataFrame; here every column is correlated with the IBM column.
returns.corrwith(returns['IBM'])

In [None]:
# Passing a DataFrame computes the correlations between columns paired by
# matching column name.
returns.corrwith(other=volume)


## 唯一值、值计数以及成员资格
