#  描述性统计的概述与运算


In [1]:
import numpy as np
import pandas as pd

In [2]:
# Build a 3x3 frame holding 0..8 with row labels a-c and column labels d1-d3.
df = pd.DataFrame(np.arange(9).reshape(3,3),index=['a','b','c'],columns=['d1','d2','d3'])
# Convert the two columns that will receive missing values to float first.
# Writing NaN straight into an integer column relies on implicit upcasting,
# which pandas has deprecated (PDEP-6); d1 stays an integer column.
df = df.astype({'d2': 'float64', 'd3': 'float64'})
# Blank out one cell in d2 (row a) and one in d3 (row b).
df.iloc[0,1]=np.nan
df.iloc[1,2]=np.nan
df

Unnamed: 0,d1,d2,d3
a,0,,2.0
b,3,4.0,
c,6,7.0,8.0


In [3]:
# pandas reductions cope with missing data: sum() skips the NaN cells.
# axis=0 (the default, spelled out here) yields one total per column.
df.sum(axis=0)

d1     9.0
d2    11.0
d3    10.0
dtype: float64

In [4]:
# axis=0 reduces along the index (one result per column); axis=1, also
# spelled "columns", reduces across the columns (one result per row).
df.sum(axis="columns")

a     2.0
b     7.0
c    21.0
dtype: float64

In [5]:
#NaN值在计算时会被自动排除，除非整行/整列全是NaN；传入skipna=False可以禁用这一行为

#### sum() 参数列表
<img src="8.png" width="40%">

#### idxmin和idxmax返回的是间接统计信息，比如最小值或者最大值的索引值
 

In [6]:
# Label of the row holding the maximum of each column (axis=0 is the default).
df.idxmax(axis=0)

d1    c
d2    c
d3    c
dtype: object

In [7]:
# Switch the axis: for every row, report the column label holding its maximum.
df.idxmax(axis="columns")

a    d3
b    d2
c    d3
dtype: object

In [8]:
# Running total down each column (axis=0 is the default); NaN cells stay NaN
# in the result and do not contribute to the accumulated value.
df.cumsum(axis=0)

Unnamed: 0,d1,d2,d3
a,0.0,,2.0
b,3.0,4.0,
c,9.0,11.0,10.0


In [9]:
# Show the frame again before producing several summary statistics at once.
df

Unnamed: 0,d1,d2,d3
a,0,,2.0
b,3,4.0,
c,6,7.0,8.0


In [10]:
# One-shot summary per column: count, mean, std, min, quartiles and max.
# The percentiles listed are the defaults, written out explicitly.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,d1,d2,d3
count,3.0,2.0,2.0
mean,3.0,5.5,5.0
std,3.0,2.12132,4.242641
min,0.0,4.0,2.0
25%,1.5,4.75,3.5
50%,3.0,5.5,5.0
75%,4.5,6.25,6.5
max,6.0,7.0,8.0


In [11]:
#如果不是数值型，describe()产生另一种汇总统计

In [12]:
# A Series of single letters; 'f' is the only letter that appears twice.
obj = pd.Series([*'abcdeff'])
obj

0    a
1    b
2    c
3    d
4    e
5    f
6    f
dtype: object

In [13]:
# For non-numeric data describe() reports count, unique, top and freq instead.
obj.describe()

count     7
unique    6
top       f
freq      2
dtype: object

#### 汇总统计及其相关方法
<img src="9.png" width="50%" >
<img src="9-1.png" width="50%" >

## 相关性、协方差


In [14]:
import pandas_datareader.data as web

# Fetch the Yahoo Finance history for every ticker; this needs network access.
# NOTE(review): the notebook records that this cell could not be run.
all_data = {
    symbol: web.get_data_yahoo(symbol)
    for symbol in ['AAPL', 'IBM', 'MSFT', 'GOOG']
}


def _field_by_ticker(field):
    """Collect one column of every download into a frame keyed by ticker."""
    return pd.DataFrame({symbol: frame[field] for symbol, frame in all_data.items()})


price = _field_by_ticker('Adj Close')
volume = _field_by_ticker('Volume')

# Fractional change of the adjusted close between consecutive rows; the cell
# displays the last few rows.
returns = price.pct_change()
returns.tail()

##### 暂时运行不出来，参考P161

## 唯一值、计数和成员属性

In [15]:
# A Series of letters with many repeats, used for the uniqueness examples.
obj = pd.Series([*'adfasdafadsfdaf'])
obj

0     a
1     d
2     f
3     a
4     s
5     d
6     a
7     f
8     a
9     d
10    s
11    f
12    d
13    a
14    f
dtype: object

In [16]:
# unique() returns an array of the distinct values.
uniques=obj.unique()
uniques

array(['a', 'd', 'f', 's'], dtype=object)

In [17]:
# Count how often each value occurs, most frequent first (both arguments are
# the defaults, written out explicitly).
obj.value_counts(sort=True, ascending=False)

a    5
f    4
d    4
s    2
dtype: int64

In [18]:
# Membership test: True wherever the element is one of the wanted letters.
wanted = ['s', 'd']
mask = obj.isin(wanted)
mask

0     False
1      True
2     False
3     False
4      True
5      True
6     False
7     False
8     False
9      True
10     True
11    False
12     True
13    False
14    False
dtype: bool

In [19]:
# Keep only the elements flagged True by the boolean mask.
obj.loc[mask]

1     d
4     s
5     d
9     d
10    s
12    d
dtype: object

In [20]:
# Index.get_indexer() maps every entry of a possibly repetitive array to its
# position inside an array of distinct values.
to_match = pd.Series([*'cabbca'])
unique_values = pd.Series([*'cba'])
lookup = pd.Index(unique_values)
lookup.get_indexer(to_match)

array([0, 2, 1, 1, 0, 2], dtype=int64)

#### 唯一值、计数、集合成员属性方法
<img src="10.png" width="50%" >
<img src="10-1.png" width="50%" >

In [21]:
# Build a small frame of answers so that a histogram can be computed over
# several related columns.
# The literal is bound to a descriptive name: calling it ``dict`` would shadow
# the builtin, breaking any later ``dict(...)`` call in the session.
answers = {'Qu1': [1, 3, 4, 3, 4], 'Qu2': [2, 3, 1, 2, 3], 'Qu3': [1, 5, 2, 4, 4]}
data = pd.DataFrame(answers)
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [22]:
# Count the occurrences of each value separately for every column; a value
# that never occurs in a column shows up as NaN.
data.apply(lambda column: column.value_counts())

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,,2.0,1.0
3,2.0,2.0,
4,2.0,,2.0
5,,,1.0


In [23]:
# Values that never occur in a column come back as NaN; show them as 0.
counts = data.apply(pd.Series.value_counts)
counts.fillna(0)

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0
