
# 第 3 章　使用 Pyhton 进行数据分析

## 第 1 节　使用 Python 进行描述统计：单变量



### 1. 统计分析与 scipy

In [1]:
# 用于数值计算的库
import numpy as np
import scipy as sp

In [2]:
# 设置浮点数打印精度
# %precision 3
np.set_printoptions(precision=3) # 设置浮点数打印精度为4

### 2. 单变量数据的操作

In [3]:
fish_data = np.array([2,3,3,4,4,4,4,5,5,6])
fish_data

array([2, 3, 3, 4, 4, 4, 4, 5, 5, 6])

### 3. 实现：总和与样本容量

In [4]:
# 总和
sp.sum(fish_data)

  


40

In [5]:
# 参考
np.sum(fish_data)

40

In [6]:
# 参考
fish_data.sum()

40

In [7]:
# 参考
sum(fish_data)

40

In [8]:
# 样本容量
len(fish_data)

10

### 4. 实现：均值（期望值）

In [9]:
# 计算均值
N = len(fish_data)
sum_value = sp.sum(fish_data)
mu = sum_value / N
mu

  This is separate from the ipykernel package so we can avoid doing imports until


4.0

In [10]:
# 计算均值的函数
sp.mean(fish_data)

  


4.0

### 5. 实现：样本方差

In [11]:
# 样本方差
sigma_2_sample = sp.sum((fish_data - mu) ** 2) / N
sigma_2_sample

  


1.2

In [12]:
fish_data

array([2, 3, 3, 4, 4, 4, 4, 5, 5, 6])

In [13]:
fish_data - mu

array([-2., -1., -1.,  0.,  0.,  0.,  0.,  1.,  1.,  2.])

In [14]:
(fish_data - mu) ** 2

array([4., 1., 1., 0., 0., 0., 0., 1., 1., 4.])

In [15]:
sp.sum((fish_data - mu) ** 2)

  """Entry point for launching an IPython kernel.


12.0

In [16]:
# 计算样本方差的函数
sp.var(fish_data, ddof = 0)

  


1.2

In [17]:
arr = np.arange(9)
print(arr.var())
help(arr.var())

6.666666666666667
Help on float64 object:

class float64(floating, builtins.float)
 |  Double-precision floating-point number type, compatible with Python `float`
 |  and C ``double``.
 |  Character code: ``'d'``.
 |  Canonical name: ``np.double``.
 |  Alias: ``np.float_``.
 |  Alias *on this platform*: ``np.float64``: 64-bit precision floating-point number type: sign bit, 11 bits exponent, 52 bits mantissa.
 |  
 |  Method resolution order:
 |      float64
 |      floating
 |      inexact
 |      number
 |      generic
 |      builtins.float
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __abs__(self, /)
 |      abs(self)
 |  
 |  __add__(self, value, /)
 |      Return self+value.
 |  
 |  __bool__(self, /)
 |      self != 0
 |  
 |  __divmod__(self, value, /)
 |      Return divmod(self, value).
 |  
 |  __eq__(self, value, /)
 |      Return self==value.
 |  
 |  __float__(self, /)
 |      float(self)
 |  
 |  __floordiv__(self, value, /)
 |      Return self//value.
 

### 6. 实现：无偏方差

In [18]:
# 无偏方差
sigma_2 = sp.sum((fish_data - mu) ** 2) / (N - 1)
sigma_2

  


1.3333333333333333

In [19]:
# 无偏方差
sp.var(fish_data, ddof = 1)

  


1.3333333333333333

### 7. 实现：标准差

In [20]:
# 标准差
sigma = sp.sqrt(sigma_2)
sigma

  


1.1547005383792515

In [21]:
# 计算标准差的函数
sp.std(fish_data, ddof = 1)

  


1.1547005383792515

### 8. 补充：标准化

In [22]:
fish_data - mu

array([-2., -1., -1.,  0.,  0.,  0.,  0.,  1.,  1.,  2.])

In [23]:
sp.mean(fish_data - mu)

  """Entry point for launching an IPython kernel.


0.0

In [24]:
fish_data / sigma

array([1.732, 2.598, 2.598, 3.464, 3.464, 3.464, 3.464, 4.33 , 4.33 ,
       5.196])

In [25]:
sp.std(fish_data / sigma, ddof = 1)

  """Entry point for launching an IPython kernel.


1.0

In [26]:
standard = (fish_data - mu) / sigma
standard

array([-1.732, -0.866, -0.866,  0.   ,  0.   ,  0.   ,  0.   ,  0.866,
        0.866,  1.732])

In [27]:
sp.mean(standard)

  """Entry point for launching an IPython kernel.


2.2204460492503132e-17

In [28]:
sp.std(standard, ddof = 1)

  """Entry point for launching an IPython kernel.


1.0

### 9. 补充：其他统计量

In [29]:
# 最大值
sp.amax(fish_data)

  


6

In [30]:
# 最小值
sp.amin(fish_data)

  


2

In [31]:
# 中位数
sp.median(fish_data)

  


4.0

In [32]:
fish_data_2 = np.array([2,3,3,4,4,4,4,5,5,100])

In [33]:
sp.mean(fish_data_2)

  """Entry point for launching an IPython kernel.


13.4

In [34]:
sp.median(fish_data_2)

  """Entry point for launching an IPython kernel.


4.0

### 10. 实现：scipy.stats 与四分位数

In [35]:
from scipy import stats

In [36]:
fish_data_3 = np.array([1,2,3,4,5,6,7,8,9])
stats.scoreatpercentile(fish_data_3, 25)

3.0

In [37]:
stats.scoreatpercentile(fish_data_3, 75)

7.0