# 统计分析

## 基本操作

### 计算型统计

#### 1. 条数统计

In [3]:
import pandas as pd
import numpy as np

In [4]:
x = 100

In [5]:
arr = np.arange(1, 25).reshape(4,6)
arr

array([[ 1,  2,  3,  4,  5,  6],
       [ 7,  8,  9, 10, 11, 12],
       [13, 14, 15, 16, 17, 18],
       [19, 20, 21, 22, 23, 24]])

In [6]:
data = {
    "Person":["John", "Myla", "Lewis", "John", "Myla"],
    "Age": [24., np.nan, 21., 33, 26],
    "Single": [False, True, True, True, False]
}

df = pd.DataFrame(data)
df

Unnamed: 0,Person,Age,Single
0,John,24.0,False
1,Myla,,True
2,Lewis,21.0,True
3,John,33.0,True
4,Myla,26.0,False


In [7]:
df.count()

Person    5
Age       4
Single    5
dtype: int64

In [8]:
df['Single'] == True

0    False
1     True
2     True
3     True
4    False
Name: Single, dtype: bool

In [9]:
df_single = df[df['Single'] == True]
df_single

Unnamed: 0,Person,Age,Single
1,Myla,,True
2,Lewis,21.0,True
3,John,33.0,True


In [10]:
df_single['Single'].count()

3

In [11]:
df[df['Age'] < 30].count()

Person    3
Age       3
Single    3
dtype: int64

#### 2. 维度统计

In [12]:
df.shape

(5, 3)

In [13]:
df_single.shape

(3, 3)

In [14]:
num_rows = df.shape[0]
num_rows

5

In [15]:
num_cols = df.shape[1]
num_cols

3

In [16]:
num_rows = len(df)
num_rows

5

In [17]:
df.index

RangeIndex(start=0, stop=5, step=1)

In [18]:
df.values

array([['John', 24.0, False],
       ['Myla', nan, True],
       ['Lewis', 21.0, True],
       ['John', 33.0, True],
       ['Myla', 26.0, False]], dtype=object)

In [19]:
len(df.values)

5

In [20]:
df.axes

[RangeIndex(start=0, stop=5, step=1),
 Index(['Person', 'Age', 'Single'], dtype='object')]

In [21]:
list(df.index)

[0, 1, 2, 3, 4]

In [22]:
list(df.axes[0])

[0, 1, 2, 3, 4]

In [23]:
df.columns

Index(['Person', 'Age', 'Single'], dtype='object')

In [24]:
list(df.axes[1])

['Person', 'Age', 'Single']

In [25]:
df.ndim

2

In [26]:
df.size

15

In [27]:
df.any(axis=1)

0    True
1    True
2    True
3    True
4    True
dtype: bool

#### 3. 特定值统计

In [28]:
df.groupby(['Single'], as_index=False).size()

Single
False    2
True     3
dtype: int64

In [29]:
df['Single'].value_counts()

True     3
False    2
Name: Single, dtype: int64

In [30]:
df2 = pd.DataFrame(np.arange(16).reshape(4,4),\
                    index = list("ABCD"),\
                    columns=list('wxyz'))
df2

Unnamed: 0,w,x,y,z
A,0,1,2,3
B,4,5,6,7
C,8,9,10,11
D,12,13,14,15


In [31]:
df2 > 6

Unnamed: 0,w,x,y,z
A,False,False,False,False
B,False,False,False,True
C,True,True,True,True
D,True,True,True,True


In [34]:
(df2 > 6).sum(axis = 0)

w    2
x    2
y    2
z    3
dtype: int64

In [33]:
df2[df2.x > 8]

Unnamed: 0,w,x,y,z
C,8,9,10,11
D,12,13,14,15


In [90]:
df2[df2['y'].isin([2,10])]

Unnamed: 0,w,x,y,z
A,0,1,2,3
C,8,9,10,11


In [91]:
df2['x'].value_counts()

13    1
5     1
9     1
1     1
Name: x, dtype: int64

In [97]:
len(df2[df2['w'] == 8])

1

### 特征型统计

#### 1. 最大值

In [98]:
df.max()

Person    Myla
Age         33
Single    True
dtype: object

In [99]:
df2.max()

w    12
x    13
y    14
z    15
dtype: int64

In [101]:
df2['x'].max()

13

In [110]:
df2.max().max()

15

In [104]:
df2['x']

A     1
B     5
C     9
D    13
Name: x, dtype: int64

In [106]:
df2.idxmax()

w    D
x    D
y    D
z    D
dtype: object

#### 2. 最小值

In [107]:
df2.min()

w    0
x    1
y    2
z    3
dtype: int64

In [108]:
df2.min().min()

0

In [111]:
df2.idxmin()

w    A
x    A
y    A
z    A
dtype: object

In [116]:
df2.idxmin()[0]

'A'

In [117]:
idx = pd.MultiIndex.from_arrays([
    ['warm', 'warm', 'cold', 'cold'],
    ['dog', 'falcon', 'fish', 'spider']],
    names=['blooded', 'animal']
)
s = pd.Series([4, 2, 0, 8], name='legs', index=idx)
s

blooded  animal
warm     dog       4
         falcon    2
cold     fish      0
         spider    8
Name: legs, dtype: int64

In [118]:
s.min()

0

In [122]:
min_blooded = s.min(level='blooded')
min_blooded

blooded
warm    2
cold    0
Name: legs, dtype: int64

In [124]:
min_blooded['warm']

2

In [125]:
min_blooded['cold']

0

In [130]:
min_animal_legs = s.min(level=1).min()
min_animal_legs

0

#### 3. 中值

In [133]:
df2

Unnamed: 0,w,x,y,z
A,0,1,2,3
B,4,5,6,7
C,8,9,10,11
D,12,13,14,15


In [35]:
median_rows = df2.median()
median_rows

w    6.0
x    7.0
y    8.0
z    9.0
dtype: float64

In [137]:
median_rows.median()

7.5

In [39]:
median_cols = df2.median(axis = 1)
median_cols

A     1.5
B     5.5
C     9.5
D    13.5
dtype: float64

In [40]:
median_cols.median()

7.5

## 案例 - 菜品订单数据分析

### 1. 读取数据表

In [3]:
import pandas as pd
# 读取订单信息表
orders = pd.read_csv('data/meal_order_info.csv', encoding='gbk')
len(orders)

945

In [4]:
# 读取客户信息Excel数据
users = pd.read_excel('data/users.xlsx')
len(users)

734

In [5]:
# 读取订单详情表
order_details = pd.read_excel('data/meal_order_detail.xlsx')
len(order_details)

2779

### 2. 任务分析

#### 1. 查看数据基本信息

1. 基本属性

In [6]:
# 查看元素
users.values

array([[1, 'admin', '超级管理员', ..., '广东广州', '泰迪科技', 23.0],
       [981, nan, '老师', ..., nan, nan, nan],
       [982, nan, '叶亦凯', ..., '广东广州', '广州', 21.0],
       ...,
       [1644, nan, '杜小悦', ..., '广东广州', '佛山', 23.0],
       [1645, nan, '杜雨玲', ..., '福建厦门', '广州', 37.0],
       [1646, nan, '杜依醇', ..., '广西南宁', '佛山', 46.0]], dtype=object)

In [7]:
orders.values

array([[417, 1442, 4, ..., 1, 18688880641, '苗宇怡'],
       [301, 1095, 3, ..., 1, 18688880174, '赵颖'],
       [413, 1147, 6, ..., 1, 18688880276, '徐毅凡'],
       ...,
       [692, 1155, 8, ..., 1, 18688880327, '习一冰'],
       [647, 1094, 4, ..., 1, 18688880207, '章春华'],
       [570, 1113, 8, ..., 1, 18688880313, '唐雅嘉']], dtype=object)

In [8]:
order_details.values

array([[2956, 417, 610062, ..., nan, 'caipu/104001.jpg', 1442],
       [2958, 417, 609957, ..., nan, 'caipu/202003.jpg', 1442],
       [2961, 417, 609950, ..., nan, 'caipu/303001.jpg', 1442],
       ...,
       [6756, 774, 609949, ..., nan, 'caipu/404005.jpg', 1138],
       [6763, 774, 610014, ..., nan, 'caipu/302003.jpg', 1138],
       [6764, 774, 610017, ..., nan, 'caipu/302006.jpg', 1138]],
      dtype=object)

In [9]:
# 查看索引
users.index

RangeIndex(start=0, stop=734, step=1)

In [11]:
# 查看列名
users.columns

Index(['USER_ID', 'MYID', 'ACCOUNT', 'NAME', 'ORGANIZE_ID', 'ORGANIZE_NAME',
       'DUTY_ID', 'TITLE_ID', 'PASSWORD', 'EMAIL', 'LANG', 'THEME',
       'FIRST_VISIT', 'PREVIOUS_VISIT', 'LAST_VISITS', 'LOGIN_COUNT',
       'ISEMPLOYEE', 'STATUS', 'IP', 'DESCRIPTION', 'QUESTION_ID', 'ANSWER',
       'ISONLINE', 'CREATED', 'LASTMOD', 'CREATER', 'MODIFYER', 'TEL', 'stuNo',
       'qq', 'weixin', 'meal_arithmetic_id', 'arithmetic_name', 'sex', 'poo',
       'address', 'age'],
      dtype='object')

In [12]:
# 查看类型
users.dtypes

USER_ID                        int64
MYID                          object
ACCOUNT                       object
NAME                          object
ORGANIZE_ID                    int64
ORGANIZE_NAME                 object
DUTY_ID                      float64
TITLE_ID                     float64
PASSWORD                      object
EMAIL                         object
LANG                         float64
THEME                        float64
FIRST_VISIT           datetime64[ns]
PREVIOUS_VISIT               float64
LAST_VISITS           datetime64[ns]
LOGIN_COUNT                  float64
ISEMPLOYEE                   float64
STATUS                       float64
IP                           float64
DESCRIPTION                   object
QUESTION_ID                  float64
ANSWER                       float64
ISONLINE                     float64
CREATED               datetime64[ns]
LASTMOD               datetime64[ns]
CREATER                      float64
MODIFYER                     float64
T

In [13]:
orders.dtypes

info_id                 int64
emp_id                  int64
number_consumers        int64
mode                  float64
dining_table_id         int64
dining_table_name       int64
expenditure             int64
dishes_count            int64
accounts_payable        int64
use_start_time         object
check_closed          float64
lock_time              object
cashier_id            float64
pc_id                 float64
order_number          float64
org_id                  int64
print_doc_bill_num    float64
lock_table_info       float64
order_status            int64
phone                   int64
name                   object
dtype: object

In [14]:
order_details.dtypes

detail_id                     int64
order_id                      int64
dishes_id                     int64
logicprn_name               float64
parent_class_name           float64
dishes_name                  object
itemis_add                    int64
counts                        int64
amounts                       int64
cost                        float64
place_order_time     datetime64[ns]
discount_amt                float64
discount_reason             float64
kick_back                   float64
add_inprice                   int64
add_info                    float64
bar_code                    float64
picture_file                 object
emp_id                        int64
dtype: object

2. 大小、纬度和形状

In [15]:
# 大小
order_details.size

52801

In [16]:
users.size

27158

In [17]:
orders.size

19845

In [18]:
# 维度数
order_details.ndim

2

In [19]:
orders.ndim

2

In [20]:
users.ndim

2

In [21]:
# 数据形状
order_details.shape

(2779, 19)

In [22]:
orders.shape

(945, 21)

In [23]:
users.shape

(734, 37)

In [24]:
# 转置
order_details.T.shape

(19, 2779)

#### 2. 访问数据

1. 基本访问

In [25]:
# 查看单列多行
users['ACCOUNT'][:5]

0    超级管理员
1       老师
2      叶亦凯
3      邓彬彬
4      张建涛
Name: ACCOUNT, dtype: object

In [26]:
orders.dishes_count[:5]

0     5
1     6
2    15
3    10
4    24
Name: dishes_count, dtype: int64

In [27]:
order_details.dishes_name.str.strip()[:5]

0     蒜蓉生蚝
1    蒙古烤羊腿
2     大蒜苋菜
3    芝麻烤紫菜
4      蒜香包
Name: dishes_name, dtype: object

In [28]:
# 查看多列多行
invalid_cols = users.isnull().sum() > 700
valid_users = users.loc[:, ~invalid_cols]
valid_users.iloc[:5,:5]

Unnamed: 0,USER_ID,ACCOUNT,NAME,ORGANIZE_ID,ORGANIZE_NAME
0,1,超级管理员,admin,130,根目录
1,981,老师,teacher,328,统计班
2,982,叶亦凯,sx,328,统计班
3,983,邓彬彬,lyy,328,统计班
4,984,张建涛,zad,328,统计班


In [29]:
# 查看前几行和后几行
valid_users.iloc[-5:,-5:]

Unnamed: 0,stuNo,sex,poo,address,age
729,2017771,女,江西九江,佛山,48.0
730,2017772,男,天津,广州,43.0
731,2017773,女,广东广州,佛山,23.0
732,2017774,女,福建厦门,广州,37.0
733,2017775,女,广西南宁,佛山,46.0


2. 切片和索引

In [30]:
# 切片
valid_orders = orders.loc[:,~(orders.isnull().sum() > 900)]
valid_orders.loc[:,'info_id':'dining_table_id'].tail(10)

Unnamed: 0,info_id,emp_id,number_consumers,dining_table_id
935,599,1102,8,1491
936,721,1129,6,1486
937,533,1118,1,1482
938,551,1086,8,1519
939,695,1096,6,1486
940,641,1095,8,1492
941,672,1089,6,1489
942,692,1155,8,1492
943,647,1094,4,1485
944,570,1113,8,1517


In [33]:
# 花式索引
valid_order_details = order_details.loc[:,order_details.isnull().sum() < 2000]
valid_order_details.loc[:,'dishes_name'] = valid_order_details['dishes_name'].str.strip()
valid_order_details.iloc[:5, [0,1,3,5,6]]

Unnamed: 0,detail_id,order_id,dishes_name,counts,amounts
0,2956,417,蒜蓉生蚝,1,49
1,2958,417,蒙古烤羊腿,1,48
2,2961,417,大蒜苋菜,1,30
3,2966,417,芝麻烤紫菜,1,25
4,2968,417,蒜香包,1,13


In [34]:
# 条件索引
amounts_gt50 = valid_order_details.loc[:,'amounts'] > 50
amounts_gt50 = valid_order_details.loc[amounts_gt50]
amounts_gt50[['order_id','dishes_name','amounts']].head(10)

Unnamed: 0,order_id,dishes_name,amounts
5,301,白斩鸡,88
6,301,香烤牛排,55
7,301,干锅田鸡,88
11,413,芝士烩波士顿龙虾,175
12,413,葱姜炒蟹,109
14,413,爆炒鳝碌,55
15,413,干锅田鸡,88
17,413,重庆特色油烧兔,69
27,413,焖猪手,58
37,392,剁椒鱼头,55


3. 增删改查

In [35]:
# 更改数据
valid_users.loc[users['NAME'] == 'teacher','ACCOUNT']= 'BAI'
valid_users.loc[users['NAME'] == 'teacher'].iloc[:,:5]

Unnamed: 0,USER_ID,ACCOUNT,NAME,ORGANIZE_ID,ORGANIZE_NAME
1,981,BAI,teacher,328,统计班


In [69]:
# 增加数据
valid_order_details.loc[:,'payment'] = \
valid_order_details['counts'] * order_details['amounts']
valid_order_details['payment'].head()

0    49.0
1    48.0
2    30.0
3    25.0
4    13.0
Name: payment, dtype: float64

In [70]:
# 删除某列或某行数据
valid_order_details.drop([len(valid_order_details)- 1], inplace=True)
valid_order_details.iloc[-5:,5:]

Unnamed: 0,counts,amounts,place_order_time,add_inprice,picture_file,emp_id,payment
2770,1,33,2016-08-10 20:57:11,0,caipu/304005.jpg,1112,33.0
2771,1,78,2016-08-10 21:50:35,0,caipu/103002.jpg,1138,78.0
2772,1,65,2016-08-10 21:53:17,0,caipu/103005.jpg,1138,65.0
2773,1,45,2016-08-10 21:55:58,0,caipu/201004.jpg,1138,45.0
2774,1,10,2016-08-10 21:56:24,0,caipu/601005.jpg,1138,10.0


#### 3. 描述性统计

描述性统计是用来概括、表述事物整体状况、以及事物间关联、类属关系的统计方法。

##### 1. 数值型特征的描述性统计

Numpy中的描述性统计方法

方法名称 | 说明 | 方法名称 | 说明
---|---|---|---
np.min | 最小值 | np.max | 最大值
np.mean| 均值 | np.ptp |极差
np.median|中位数 |np.std|标准差
np.var |方差 | np.cov |协方差

Pandas描述性统计方法

方法名称 | 说明 | 方法名称 | 说明
---|---|---|---
min|最小值 |max|最大值
mean |均值 | ptp | 极差
median | 中位数 | std | 标准差
var |方差 | cov | 协方差 
sem | 标准误差 | mode |众数
skew |样本偏度 | kurt |样本峰度
quantile|四分位数 | count |非空值数目
describe|描述统计 |mad|平均绝对离差

In [55]:
# 计算订单详情表中菜品销量的平均值
order_details[['counts','amounts']].mean()

counts      1.111191
amounts    45.337172
dtype: float64

In [53]:
# 使用describe查看数量和销售额的描述性统计
order_details[['counts','amounts']].describe()

Unnamed: 0,counts,amounts
count,2779.0,2779.0
mean,1.111191,45.337172
std,0.625428,36.80855
min,1.0,1.0
25%,1.0,25.0
50%,1.0,35.0
75%,1.0,56.0
max,10.0,178.0


##### 2. 类别型特征的描述性统计

In [60]:
# 对菜品名称频数统计
order_details['dishes_name'].describe(include='O')

count      2779
unique      154
top       白饭/大碗
freq         92
Name: dishes_name, dtype: object

In [61]:
# 使用category类型（字段）进行类别型描述性统计
order_details['dishes_name'] =\
order_details['dishes_name'].astype('category')
order_details.describe(include='category')

Unnamed: 0,dishes_name
count,2779
unique,154
top,白饭/大碗
freq,92


### 3. 任务实现

1. 查看餐饮数据的大小和纬度

In [62]:
users.size, users.shape

(27158, (734, 37))

In [63]:
orders.size, orders.shape

(19845, (945, 21))

In [64]:
order_details.size, order_details.shape

(52801, (2779, 19))

2. 统计餐饮菜品销售状况

In [71]:
order_details[['counts','amounts']].describe()

Unnamed: 0,counts,amounts
count,2779.0,2779.0
mean,1.111191,45.337172
std,0.625428,36.80855
min,1.0,1.0
25%,1.0,25.0
50%,1.0,35.0
75%,1.0,56.0
max,10.0,178.0


3. 剔除全为空值或者所有元素取值相同的列

In [1]:
!jt -l

Available Themes: 
   chesterish
   grade3
   gruvboxd
   gruvboxl
   monokai
   oceans16
   onedork
   solarizedd
   solarizedl
