In [25]:
# Print the version of the `python` executable that the shell resolves (IPython `!` shell escape).
! python --version

Python 3.11.9


### 1.explode()
explode()函数可以用于将列表或数组列转换为多行，也就是可以将一行变多行。

In [4]:
import pandas as pd

# Small sample frame built row by row: each record pairs a person ("姓名") with a
# list of belongings ("物品"), so the "物品" column holds Python lists.
df = pd.DataFrame(
    [
        {"姓名": "杰克", "物品": ["手机", "电脑", "iPad"]},
        {"姓名": "史密斯", "物品": ["跑车", "摩托"]},
    ]
)

In [5]:
# Expand the list-valued "物品" column so that every list element gets its own row;
# the other columns and the original row label are repeated for each element.
df_explode = df.explode(column="物品")
df_explode

Unnamed: 0,姓名,物品
0,杰克,手机
0,杰克,电脑
0,杰克,iPad
1,史密斯,跑车
1,史密斯,摩托


### 2.implode
pandas 中并没有名为 "implode" 的函数；explode() 的逆操作（将多行重新聚合为一行中的列表）可以通过 groupby 和 agg 函数来实现。

In [6]:
# Group the exploded rows by "姓名" and collect each group's "物品" values back into
# a Python list. groupby sorts the group keys by default, so the row order of the
# result can differ from the frame that was exploded.
df_implode = df_explode.groupby("姓名", as_index=False).agg(list)
df_implode

Unnamed: 0,姓名,物品
0,史密斯,"[跑车, 摩托]"
1,杰克,"[手机, 电脑, iPad]"


### 3.melt()
melt 函数通过解透视（列转行）的方式对 DataFrame 进行重塑。在需要将宽格式（列多行少）的数据转换为长格式（行多列少）时，这种方法非常有用。

In [7]:
# Wide-format sample data: one row per person, one column per item slot.
data = {
    "name": ["Jack", "John"],
    "Item1": ["apple", "banana"],
    "Item2": ["pear", "orange"],
    "Item3": ["car", "bike"],
}
# Each dict key becomes a column (orient="columns" is from_dict's default).
df = pd.DataFrame.from_dict(data, orient="columns")
df

Unnamed: 0,name,Item1,Item2,Item3
0,Jack,apple,pear,car
1,John,banana,orange,bike


In [8]:
# Unpivot the three item columns into ("Item Type", "Item Name") pairs while keeping
# "name" as the identifier column: one output row per person per item column.
df_melt = pd.melt(
    df,
    id_vars=["name"],
    value_vars=["Item1", "Item2", "Item3"],
    var_name="Item Type",
    value_name="Item Name",
)
df_melt

Unnamed: 0,name,Item Type,Item Name
0,Jack,Item1,apple
1,John,Item1,banana
2,Jack,Item2,pear
3,John,Item2,orange
4,Jack,Item3,car
5,John,Item3,bike


### 4.pivot_table()
pivot_table 函数用于根据 DataFrame 创建数据透视表。它适用于按一个或多个分类变量对大量数值进行分组汇总，生成展示各类别数据分布的摘要表。

In [9]:
# Long-format sample data: each row records how many of one item a person holds.
data = {
    "Name": ["John", "Jack", "John", "Jack"],
    "Item": ["apple", "banana", "car", "bike"],
    "Quantity": [1, 3, 4, 5],
}
df = pd.DataFrame(data=data)
df

Unnamed: 0,Name,Item,Quantity
0,John,apple,1
1,Jack,banana,3
2,John,car,4
3,Jack,bike,5


In [10]:
# Pivot the long frame: one row per "Name", one column per "Item", with "Quantity"
# summed inside each cell. (Name, Item) combinations absent from the data show as NaN.
df_pivot_table = pd.pivot_table(
    df, values="Quantity", index="Name", columns="Item", aggfunc="sum"
)
df_pivot_table

Item,apple,banana,bike,car
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Jack,,3.0,5.0,
John,1.0,,,4.0


### 5.stack()
stack函数通过将列标签的最内层移动为行标签的最内层，从而更改DataFrame的形状。

In [11]:
# Move the "Item" column labels into the innermost level of the row index, giving a
# Series indexed by (Name, Item).
# future_stack=True selects the stack implementation that replaces the legacy one in
# pandas 3.0. Unlike the legacy default it keeps cells that are NaN, so they are
# dropped explicitly to retain only the (Name, Item) pairs that actually have a value.
df_stack = df_pivot_table.stack(future_stack=True).dropna()
df_stack

Name  Item  
Jack  banana    3.0
      bike      5.0
John  apple     1.0
      car       4.0
dtype: float64

### 6.unstack()
unstack 函数通过将行索引的最内层移动为列标签的最内层来更改 DataFrame 的形状，是 stack 的逆操作。

In [12]:
# Pivot the innermost row-index level (level=-1, the default) back into columns,
# which restores the wide layout of the pivot table.
df_unstack = df_stack.unstack(level=-1)
df_unstack

Item,apple,banana,bike,car
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Jack,,3.0,5.0,
John,1.0,,,4.0


### 7.crosstab()
crosstab 函数用于计算两个或多个因子的简单交叉表（频率表），它是 pivot_table 的特例。

In [13]:
# Sample data: one (gender, project) observation per row.
df = pd.DataFrame(
    [("Male", "Yes"), ("Female", "No"), ("Female", "Yes"), ("Male", "No")],
    columns=["gender", "project"],
)
# Count how often each gender/project combination occurs: rows are "gender" values,
# columns are "project" values.
df_crosstab = pd.crosstab(df["gender"], df["project"])
df_crosstab

project,No,Yes
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,1,1
Male,1,1


In [14]:
# Reshape the frequency table into a Series indexed by (gender, project).
# future_stack=True selects the stack implementation that replaces the legacy one in
# pandas 3.0; the table has no missing cells, so the result is the same either way.
df_crosstab_stack = df_crosstab.stack(future_stack=True)
df_crosstab_stack

gender  project
Female  No         1
        Yes        1
Male    No         1
        Yes        1
dtype: int64

### 8.cut() and qcut()
pandas 中的 cut() 和 qcut() 函数用于将连续数据分成离散的区间（箱）。cut() 按数值范围分箱：传入整数时划分为等宽的箱，也可以像下面的示例那样传入自定义的边界列表；qcut() 则按分位数分箱，使每个箱中的观测值数量大致相等。

In [15]:
import numpy as np

In [16]:
# Sample data: a single "Quantity" column holding the integers 1 through 10.
df = pd.DataFrame({"Quantity": np.arange(1, 11)})
df

Unnamed: 0,Quantity
0,1
1,2
2,3
3,4
4,5
5,6
6,7
7,8
8,9
9,10


In [17]:
# Bin "Quantity" using explicit edges: pd.cut turns the `bins` list into the
# right-closed intervals (0, 3], (3, 6] and (6, 10].
bins = [0, 3, 6, 10]
df["Quantity_cut"] = pd.cut(df["Quantity"], bins)

# Bin "Quantity" into 3 quantile-based intervals: pd.qcut derives the edges from the
# data so that each bin holds roughly the same number of observations.
df["Quantity_qcut"] = pd.qcut(df["Quantity"], 3)
df

Unnamed: 0,Quantity,Quantity_cut,Quantity_qcut
0,1,"(0, 3]","(0.999, 4.0]"
1,2,"(0, 3]","(0.999, 4.0]"
2,3,"(0, 3]","(0.999, 4.0]"
3,4,"(3, 6]","(4.0, 7.0]"
4,5,"(3, 6]","(4.0, 7.0]"
5,6,"(3, 6]","(4.0, 7.0]"
6,7,"(6, 10]","(7.0, 10.0]"
7,8,"(6, 10]","(7.0, 10.0]"
8,9,"(6, 10]","(7.0, 10.0]"
9,10,"(6, 10]","(7.0, 10.0]"


### 9.eval()
用于高效的 DataFrame 列运算，支持更快的计算速度，并可以简化链式操作。

In [18]:
# Build a large frame of random integers in [1, 50] so that the cost of column
# arithmetic is noticeable. A seeded Generator makes the sample, and therefore the
# displayed values, reproducible across kernel restarts.
rng = np.random.default_rng(42)
df = pd.DataFrame(
    data=rng.integers(1, 51, size=(5_000_000, 4)), columns=list("abcd")
)
# eval() evaluates the expression string against the frame's columns and returns a
# new frame with column "e" added; rebinding the result avoids in-place mutation.
df = df.eval("e = (a + b) / (c + d)")
df

Unnamed: 0,a,b,c,d,e
0,31,25,11,12,2.434783
1,39,19,47,33,0.725000
2,8,22,25,17,0.714286
3,38,45,29,20,1.693878
4,42,4,4,17,2.190476
...,...,...,...,...,...
4999995,5,31,28,15,0.837209
4999996,6,12,24,43,0.268657
4999997,35,44,10,30,1.975000
4999998,40,37,1,44,1.711111


### 10.apply()
对 DataFrame 中的数据应用一个函数，可以作用于行或列。

In [19]:
# Apply np.floor to the frame one column at a time (axis=0 is apply's default).
df_apply = df.apply(np.floor, axis=0)
df_apply

Unnamed: 0,a,b,c,d,e
0,31.0,25.0,11.0,12.0,2.0
1,39.0,19.0,47.0,33.0,0.0
2,8.0,22.0,25.0,17.0,0.0
3,38.0,45.0,29.0,20.0,1.0
4,42.0,4.0,4.0,17.0,2.0
...,...,...,...,...,...
4999995,5.0,31.0,28.0,15.0,0.0
4999996,6.0,12.0,24.0,43.0,0.0
4999997,35.0,44.0,10.0,30.0,1.0
4999998,40.0,37.0,1.0,44.0,1.0


### 11.map()
对DataFrame的每个元素应用一个函数，主要用于元素级函数应用。

In [20]:
# Square every cell individually: DataFrame.map calls the function once per element.
df_map = df.map(lambda value: value**2)
df_map

Unnamed: 0,a,b,c,d,e
0,961,625,121,144,5.928166
1,1521,361,2209,1089,0.525625
2,64,484,625,289,0.510204
3,1444,2025,841,400,2.869221
4,1764,16,16,289,4.798186
...,...,...,...,...,...
4999995,25,961,784,225,0.700919
4999996,36,144,576,1849,0.072176
4999997,1225,1936,100,900,3.900625
4999998,1600,1369,1,1936,2.927901


### 12.rolling()
提供滚动窗口计算，常用于时间序列数据分析，如滚动平均、滚动标准差等。

In [24]:
# 创建样本数据
df = pd.DataFrame({"Quantity": np.arange(10) + 1})
df["rolling_avg"] = df["Quantity"].rolling(window=3).mean()
df

Unnamed: 0,Quantity,rolling_avg
0,1,
1,2,
2,3,2.0
3,4,3.0
4,5,4.0
5,6,5.0
6,7,6.0
7,8,7.0
8,9,8.0
9,10,9.0
