In [6]:
import pandas as pd
import numpy as np

In [7]:
# 创建一个示例DataFrame
np.random.seed(0)
df = pd.DataFrame({
    'Column': np.random.choice(['A', 'B', 'C', 'D'], size=100),
    'NumericColumn': np.random.randn(100) * 100,
    'Size': np.random.randint(0, 51, size=100)
})

1. describe()
注释: describe() 方法提供了数据集中数值型列的描述性统计信息，如计数、平均值、标准差、最小值、四分位数和最大值。

In [12]:
# 1. describe() - 获取数据集的描述性统计信息
desc_stats = df.describe()
print("Describe Stats:\n", desc_stats)

# 实例1: 获取数据集的描述性统计信息
desc_stats = df.describe()
print("\nDescribe Stats:\n", desc_stats)

# 实例2: 只获取数据集的均值和标准差
desc_stats_mean_std = df.describe(include=[np.number], exclude=[np.dtype(object).type])
print("\nMean and Std:\n", desc_stats_mean_std)

# 实例3: 排除NaN值并获取描述性统计信息
desc_stats_no_nan = df.dropna().describe()
print("\nDescribe without NaN:\n", desc_stats_no_nan)

Describe Stats:
        NumericColumn        Size
count     100.000000  100.000000
mean        6.218384   23.350000
std        97.828718   13.731461
min      -172.628260    0.000000
25%       -67.695202   11.750000
50%         5.405537   22.500000
75%        71.220252   32.250000
max       195.077540   50.000000

Describe Stats:
        NumericColumn        Size
count     100.000000  100.000000
mean        6.218384   23.350000
std        97.828718   13.731461
min      -172.628260    0.000000
25%       -67.695202   11.750000
50%         5.405537   22.500000
75%        71.220252   32.250000
max       195.077540   50.000000

Mean and Std:
        NumericColumn        Size
count     100.000000  100.000000
mean        6.218384   23.350000
std        97.828718   13.731461
min      -172.628260    0.000000
25%       -67.695202   11.750000
50%         5.405537   22.500000
75%        71.220252   32.250000
max       195.077540   50.000000

Describe without NaN:
        NumericColumn        Size
c

2. value_counts()
注释: value_counts() 方法返回数据集中某个列的值出现的次数。

In [13]:
# 2. value_counts() - 计算某列中每个值的出现次数
value_counts = df['Column'].value_counts()
print("\nValue Counts:\n", value_counts)

# 实例1: 计算某列中每个值的出现次数
value_counts = df['Column'].value_counts()
print("\nValue Counts:\n", value_counts)

# 实例2: 计算出现次数并排序
sorted_counts = df['Column'].value_counts(ascending=False)
print("\nSorted Value Counts:\n", sorted_counts)

# 实例3: 计算出现次数并去除NaN值
counts_without_nan = df['Column'].value_counts(dropna=True)
print("\nCounts Without NaN:\n", counts_without_nan)


Value Counts:
 Column
D    32
A    25
B    24
C    19
Name: count, dtype: int64

Value Counts:
 Column
D    32
A    25
B    24
C    19
Name: count, dtype: int64

Sorted Value Counts:
 Column
D    32
A    25
B    24
C    19
Name: count, dtype: int64

Counts Without NaN:
 Column
D    32
A    25
B    24
C    19
Name: count, dtype: int64


3. cut()
注释: cut() 方法将连续数值变量切分为多个区间（分箱），可以指定区间边界和标签。

In [15]:
# 3. cut() - 将连续数值变量切分为多个区间（分箱）
# 实例1: 根据指定的区间划分数值列
bins = [0, 10, 20, 30, 40, 50]
labels = ['0-10', '10-20', '20-30', '30-40', '40-50']
df['Binned'] = pd.cut(df['Size'], bins=bins, labels=labels)

# 实例2: 使用默认的区间和标签
# 这里需要指定bins的值，例如分箱数量为5
df['BinnedDefault'] = pd.cut(df['Size'], bins=5, right=False)  # right=False 表示包含左区间，不包含右区间

# 实例3: 包含区间的上界
df['BinnedUpper'] = pd.cut(df['Size'], bins=[0, 10, 20, 30, 40, np.inf], labels=['0-10', '10-20', '20-30', '30-40', '40+'])

# 打印分箱结果
print("\nBinned Column:\n", df['Binned'].value_counts())
print("\nBinned with Default:\n", df['BinnedDefault'].value_counts())
print("\nBinned Upper:\n", df['BinnedUpper'].value_counts())


Binned Column:
 Binned
20-30    28
10-20    22
0-10     19
40-50    15
30-40    13
Name: count, dtype: int64

Binned with Default:
 BinnedDefault
[20.0, 30.0)     27
[0.0, 10.0)      21
[10.0, 20.0)     21
[30.0, 40.0)     16
[40.0, 50.05)    15
Name: count, dtype: int64

Binned Upper:
 BinnedUpper
20-30    28
10-20    22
0-10     19
40+      15
30-40    13
Name: count, dtype: int64


4. groupby()
注释: groupby() 方法用于根据一个或多个键对数据进行分组。

In [18]:
# 4. groupby() - 根据一个或多个键对数据进行分组
grouped = df.groupby('Column')['NumericColumn'].mean()
print("\nGrouped Mean:\n", grouped)

# 实例1: 根据某列的值对数据进行分组
grouped = df.groupby('Column')['NumericColumn'].mean()
print("\nGrouped Mean:\n", grouped)

# 实例2: 多级分组
grouped = df.groupby(['Column', pd.cut(df['Size'], bins=[0, 25, 50, np.inf], labels=['0-25', '25-50', '50+'])]).size()
print("\nMulti Index Grouped:\n", grouped)


Grouped Mean:
 Column
A    22.774037
B    11.046260
C     9.968720
D   -12.563387
Name: NumericColumn, dtype: float64

Grouped Mean:
 Column
A    22.774037
B    11.046260
C     9.968720
D   -12.563387
Name: NumericColumn, dtype: float64

Multi Index Grouped:
 Column  Size 
A       0-25     16
        25-50     8
        50+       0
B       0-25     10
        25-50    14
        50+       0
C       0-25     11
        25-50     7
        50+       0
D       0-25     16
        25-50    15
        50+       0
dtype: int64


  grouped = df.groupby(['Column', pd.cut(df['Size'], bins=[0, 25, 50, np.inf], labels=['0-25', '25-50', '50+'])]).size()


5. mean()
注释: mean() 方法计算分组后的平均值。

In [19]:
# 5. mean() - 计算分组后的平均值
mean_values = grouped.mean()
print("\nMean of Grouped Values:\n", mean_values)

# 实例1: 计算分组的平均值
mean_values = df.groupby('Column')['NumericColumn'].mean()
print("\nMean Values:\n", mean_values)


Mean of Grouped Values:
 8.083333333333334

Mean Values:
 Column
A    22.774037
B    11.046260
C     9.968720
D   -12.563387
Name: NumericColumn, dtype: float64


6. reset_index()
注释: reset_index() 方法重置分组操作后的索引，将其转换回普通列。

In [20]:
# 6. reset_index() - 重置分组操作后的索引
reset_df = mean_values.reset_index()
print("\nReset Index:\n", reset_df)

# 实例1: 重置分组后的索引
reset_df = mean_values.reset_index()
print("\nReset Index:\n", reset_df)


Reset Index:
   Column  NumericColumn
0      A      22.774037
1      B      11.046260
2      C       9.968720
3      D     -12.563387

Reset Index:
   Column  NumericColumn
0      A      22.774037
1      B      11.046260
2      C       9.968720
3      D     -12.563387
