In [2]:
import pandas as pd
import numpy as np
import pyspark.pandas as ps
from pyspark.sql import SparkSession

import warnings
# Install a blanket "ignore" filter for Python warnings. This hides
# deprecation notices too. The Spark `WARN WindowExec` log lines still appear
# in later outputs, so they are evidently not covered by this filter.
warnings.filterwarnings('ignore')

# 对象创建

通过传递值列表创建 pandas-on-Spark Series，Spark 上的 pandas API 会自动创建一个默认的整数索引：

In [4]:
# Build a pandas-on-Spark Series from a plain Python list. np.nan marks a
# missing entry. No index is supplied, so a default integer index is
# generated (0..5 in the output below).
# The blanket warnings filter is installed once in the import cell, so it is
# not repeated here.
s = ps.Series([1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

通过传递一个字典（其中的值可以转换为类似 Series 的对象）来创建 pandas-on-Spark DataFrame：

In [6]:
# Build a pandas-on-Spark DataFrame from a dict of equal-length columns and
# give the rows explicit labels instead of relying on the default index.
psdf = ps.DataFrame(
    data={
        'a': [1, 2, 3, 4, 5, 6],
        'b': [100, 200, 300, 400, 500, 600],
        'c': ["one", "two", "three", "four", "five", "six"],
    },
    index=[10, 20, 30, 40, 50, 60],
)

In [7]:
# Display the frame; the labels 10..60 passed as `index` appear as row labels.
psdf

Unnamed: 0,a,b,c
10,1,100,one
20,2,200,two
30,3,300,three
40,4,400,four
50,5,500,five
60,6,600,six


通过传递一个带有日期时间索引和标签列的 numpy 数组来创建 pandas DataFrame：

In [8]:
# Six consecutive calendar days starting on 2022-01-01, used as a row index.
dates = pd.date_range(start='20220101', periods=6, freq='D')

In [9]:
# Display the DatetimeIndex (daily frequency, 2022-01-01 .. 2022-01-06).
dates

DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04',
               '2022-01-05', '2022-01-06'],
              dtype='datetime64[ns]', freq='D')

In [18]:
# Plain pandas frame: 6 rows (one per date) by 4 columns of standard-normal
# draws. No seed is set, so the numbers differ on every run.
pdf = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=['z', '2', 'C', 'D'],
)

In [19]:
# Display the pandas frame: dates as row labels, columns z / 2 / C / D.
pdf

Unnamed: 0,z,2,C,D
2022-01-01,2.031821,1.862059,-1.245968,-0.611189
2022-01-02,-1.621555,0.234756,-1.076365,1.782918
2022-01-03,0.106431,0.070107,1.811104,0.367067
2022-01-04,0.572783,0.052547,0.911452,-0.15855
2022-01-05,2.767634,-0.006317,0.800687,1.520986
2022-01-06,0.220979,0.400373,-0.594883,0.013214


In [24]:
# Confirm this is a plain pandas DataFrame (not yet a pandas-on-Spark one).
type(pdf)

pandas.core.frame.DataFrame

这个 pandas DataFrame 可以转换为 pandas-on-Spark DataFrame

In [21]:
# Convert the pandas frame into a pandas-on-Spark frame. Note that this rebinds
# `psdf`, replacing the small dict-based frame created earlier.
psdf = ps.from_pandas(pdf)

In [22]:
# Confirm the converted object is a pyspark.pandas DataFrame.
type(psdf)

pyspark.pandas.frame.DataFrame

In [23]:
# Display the converted frame; the date index and values match `pdf`.
psdf

Unnamed: 0,z,2,C,D
2022-01-01,2.031821,1.862059,-1.245968,-0.611189
2022-01-02,-1.621555,0.234756,-1.076365,1.782918
2022-01-03,0.106431,0.070107,1.811104,0.367067
2022-01-04,0.572783,0.052547,0.911452,-0.15855
2022-01-05,2.767634,-0.006317,0.800687,1.520986
2022-01-06,0.220979,0.400373,-0.594883,0.013214


从 pandas DataFrame 创建 Spark DataFrame

In [27]:
# Obtain a SparkSession, reusing an existing one if there is one.
spark = SparkSession.builder.getOrCreate()

In [28]:
# Build a Spark DataFrame from the pandas frame. The `show()` output below has
# only the four data columns, so the date index is not carried over.
sdf = spark.createDataFrame(pdf)

In [29]:
# Print the Spark DataFrame as a text table (long values are truncated with "...").
sdf.show()

+-------------------+--------------------+-------------------+--------------------+
|                  z|                   2|                  C|                   D|
+-------------------+--------------------+-------------------+--------------------+
| 2.0318205724827476|  1.8620592848375082|-1.2459677771933935| -0.6111885267913448|
|-1.6215545861374359| 0.23475572955867338| -1.076364582122888|  1.7829184101091524|
|0.10643131731516978| 0.07010748803344258| 1.8111042967052309| 0.36706735389804784|
| 0.5727834796877288| 0.05254665526257438| 0.9114523579308513|-0.15854951021817854|
| 2.7676341085068605|-0.00631661882340...| 0.8006873399601735|  1.5209856754157327|
|  0.220979463306122| 0.40037346605234214|-0.5948831944401617|0.013213868440551372|
+-------------------+--------------------+-------------------+--------------------+



从 Spark DataFrame 创建 pandas-on-Spark DataFrame。

In [30]:
# Wrap the Spark DataFrame as a pandas-on-Spark DataFrame (rebinding `psdf`).
# The display further down shows a 0..5 integer index rather than dates.
# NOTE(review): newer PySpark releases reportedly prefer `sdf.pandas_api()` for
# this conversion - confirm against the installed version before switching.
psdf = sdf.to_pandas_on_spark()

In [32]:
# Display the frame obtained from the Spark DataFrame. Its index is an integer
# range 0..5 (see the output below), not the dates of the source pandas frame.
# The blanket warnings filter is installed once in the import cell, so it is
# not repeated here.
psdf

Unnamed: 0,z,2,C,D
0,2.031821,1.862059,-1.245968,-0.611189
1,-1.621555,0.234756,-1.076365,1.782918
2,0.106431,0.070107,1.811104,0.367067
3,0.572783,0.052547,0.911452,-0.15855
4,2.767634,-0.006317,0.800687,1.520986
5,0.220979,0.400373,-0.594883,0.013214


In [33]:
# Per-column dtypes; all four columns are float64 in the output below.
psdf.dtypes

z    float64
2    float64
C    float64
D    float64
dtype: object

Spark DataFrame 中的数据不保留自然顺序。通过设置 `compute.ordered_head` 选项可以保留自然顺序，但它会因内部排序而带来性能开销。

In [36]:
# Show the leading rows (five rows in the output below).
psdf.head()

22/04/27 19:31:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/04/27 19:31:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


Unnamed: 0,z,2,C,D
0,2.031821,1.862059,-1.245968,-0.611189
1,-1.621555,0.234756,-1.076365,1.782918
2,0.106431,0.070107,1.811104,0.367067
3,0.572783,0.052547,0.911452,-0.15855
4,2.767634,-0.006317,0.800687,1.520986


显示索引、列和基础 numpy 数据。

In [37]:
# Row labels of the frame (an integer index 0..5 here).
psdf.index

Int64Index([0, 1, 2, 3, 4, 5], dtype='int64')

In [38]:
# Column labels of the frame; note that '2' is a string label, not an integer.
psdf.columns

Index(['z', '2', 'C', 'D'], dtype='object')

In [39]:
# Return the values as a NumPy array (6x4 in the output below).
# NOTE(review): presumably this gathers every row onto the driver, so it is
# only appropriate for small frames - confirm against the API docs.
psdf.to_numpy()

22/04/27 19:33:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/04/27 19:33:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/04/27 19:33:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


array([[ 2.03182057,  1.86205928, -1.24596778, -0.61118853],
       [-1.62155459,  0.23475573, -1.07636458,  1.78291841],
       [ 0.10643132,  0.07010749,  1.8111043 ,  0.36706735],
       [ 0.57278348,  0.05254666,  0.91145236, -0.15854951],
       [ 2.76763411, -0.00631662,  0.80068734,  1.52098568],
       [ 0.22097946,  0.40037347, -0.59488319,  0.01321387]])

In [40]:
# Summary statistics per column: count, mean, std, min, quartiles and max.
psdf.describe()

Unnamed: 0,z,2,C,D
count,6.0,6.0,6.0,6.0
mean,0.679682,0.435588,0.101005,0.485741
std,1.551384,0.714419,1.245441,0.960222
min,-1.621555,-0.006317,-1.245968,-0.611189
25%,0.106431,0.052547,-1.076365,-0.15855
50%,0.220979,0.070107,-0.594883,0.013214
75%,2.031821,0.400373,0.911452,1.520986
max,2.767634,1.862059,1.811104,1.782918


In [42]:
# Re-display the frame; the calls above did not modify it.
psdf

Unnamed: 0,z,2,C,D
0,2.031821,1.862059,-1.245968,-0.611189
1,-1.621555,0.234756,-1.076365,1.782918
2,0.106431,0.070107,1.811104,0.367067
3,0.572783,0.052547,0.911452,-0.15855
4,2.767634,-0.006317,0.800687,1.520986
5,0.220979,0.400373,-0.594883,0.013214


In [41]:
# Transpose the data: the columns z / 2 / C / D become rows and the row labels
# 0..5 become columns.
psdf.T

22/04/27 19:36:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/04/27 19:36:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


Unnamed: 0,0,1,2,3,4,5
z,2.031821,-1.621555,0.106431,0.572783,2.767634,0.220979
2,1.862059,0.234756,0.070107,0.052547,-0.006317,0.400373
C,-1.245968,-1.076365,1.811104,0.911452,0.800687,-0.594883
D,-0.611189,1.782918,0.367067,-0.15855,1.520986,0.013214


In [43]:
# Order rows by index label, highest first (5 down to 0 in the output).
psdf.sort_index(ascending=False)

22/04/27 19:37:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/04/27 19:37:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/04/27 19:37:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


Unnamed: 0,z,2,C,D
5,0.220979,0.400373,-0.594883,0.013214
4,2.767634,-0.006317,0.800687,1.520986
3,0.572783,0.052547,0.911452,-0.15855
2,0.106431,0.070107,1.811104,0.367067
1,-1.621555,0.234756,-1.076365,1.782918
0,2.031821,1.862059,-1.245968,-0.611189


In [44]:
# Order rows by the values of column '2', smallest first (see output below).
psdf.sort_values(by='2')

22/04/27 19:37:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/04/27 19:37:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/04/27 19:37:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


Unnamed: 0,z,2,C,D
4,2.767634,-0.006317,0.800687,1.520986
3,0.572783,0.052547,0.911452,-0.15855
2,0.106431,0.070107,1.811104,0.367067
1,-1.621555,0.234756,-1.076365,1.782918
5,0.220979,0.400373,-0.594883,0.013214
0,2.031821,1.862059,-1.245968,-0.611189


# 缺失数据

Spark 上的 pandas API 主要使用 `np.nan` 值来表示缺失数据。默认情况下，缺失值不参与计算。

In [45]:
# Display the original pandas frame (date-indexed) used as the starting point
# for the missing-data examples.
pdf

Unnamed: 0,z,2,C,D
2022-01-01,2.031821,1.862059,-1.245968,-0.611189
2022-01-02,-1.621555,0.234756,-1.076365,1.782918
2022-01-03,0.106431,0.070107,1.811104,0.367067
2022-01-04,0.572783,0.052547,0.911452,-0.15855
2022-01-05,2.767634,-0.006317,0.800687,1.520986
2022-01-06,0.220979,0.400373,-0.594883,0.013214


In [46]:
# Keep only the first four dates and append a new column 'E'. The new column
# has no source data, so it shows up empty (missing) in the display below.
pdf1 = pdf.reindex(
    index=dates[:4],
    columns=[*pdf.columns, 'E'],
)

In [47]:
# Display the reindexed frame; column 'E' is entirely missing at this point.
pdf1

Unnamed: 0,z,2,C,D,E
2022-01-01,2.031821,1.862059,-1.245968,-0.611189,
2022-01-02,-1.621555,0.234756,-1.076365,1.782918,
2022-01-03,0.106431,0.070107,1.811104,0.367067,
2022-01-04,0.572783,0.052547,0.911452,-0.15855,


In [48]:
# Set 'E' to 1 for the rows labelled dates[0] through dates[1]. Both end labels
# are included: the display below shows 1.0 on the first two rows.
# This mutates `pdf1` in place.
pdf1.loc[dates[0]:dates[1], 'E'] = 1

In [49]:
# Display the frame after the assignment: 'E' is 1.0 on the first two rows and
# still missing on the last two.
pdf1

Unnamed: 0,z,2,C,D,E
2022-01-01,2.031821,1.862059,-1.245968,-0.611189,1.0
2022-01-02,-1.621555,0.234756,-1.076365,1.782918,1.0
2022-01-03,0.106431,0.070107,1.811104,0.367067,
2022-01-04,0.572783,0.052547,0.911452,-0.15855,


In [53]:
# Confirm `pdf1` is still a plain pandas DataFrame.
type(pdf1)

pandas.core.frame.DataFrame

In [50]:
# Convert the pandas frame with missing values into a pandas-on-Spark frame.
psdf1 = ps.from_pandas(pdf1)

In [51]:
# Display the pandas-on-Spark frame; the missing 'E' entries are preserved.
psdf1

Unnamed: 0,z,2,C,D,E
2022-01-01,2.031821,1.862059,-1.245968,-0.611189,1.0
2022-01-02,-1.621555,0.234756,-1.076365,1.782918,1.0
2022-01-03,0.106431,0.070107,1.811104,0.367067,
2022-01-04,0.572783,0.052547,0.911452,-0.15855,


In [52]:
# Confirm `psdf1` is a pyspark.pandas DataFrame.
type(psdf1)

pyspark.pandas.frame.DataFrame

In [54]:
# Drop every row that contains at least one missing value. Only the two rows
# with 'E' set remain in the output. The result is not assigned, so `psdf1`
# itself keeps all four rows.
psdf1.dropna(how='any')

Unnamed: 0,z,2,C,D,E
2022-01-01,2.031821,1.862059,-1.245968,-0.611189,1.0
2022-01-02,-1.621555,0.234756,-1.076365,1.782918,1.0


In [55]:
# Replace missing values with 5 (the empty 'E' cells become 5.0 in the output).
# The result is displayed only; `psdf1` is not reassigned.
psdf1.fillna(value=5)

Unnamed: 0,z,2,C,D,E
2022-01-01,2.031821,1.862059,-1.245968,-0.611189,1.0
2022-01-02,-1.621555,0.234756,-1.076365,1.782918,1.0
2022-01-03,0.106431,0.070107,1.811104,0.367067,5.0
2022-01-04,0.572783,0.052547,0.911452,-0.15855,5.0


# 操作

In [57]:
# Mean of each column. `psdf` here is the six-row frame derived from the Spark
# DataFrame; the values match the `mean` row of the earlier describe() output.
psdf.mean()

z    0.679682
2    0.435588
C    0.101005
D    0.485741
dtype: float64

In [58]:
# Confirm `psdf` is a pyspark.pandas DataFrame.
type(psdf)

pyspark.pandas.frame.DataFrame

## Spark配置

PySpark 中的各种配置可以在 Spark 上的 pandas API 内部应用。例如，您可以启用 Arrow 优化以极大地加快内部 pandas 转换。

https://spark.apache.org/docs/latest/api/python/user_guide/sql/arrow_pandas.html?highlight=arrow

In [59]:
# Save the current value of the Arrow setting so that a later cell can put it
# back after the timing experiments.
prev = spark.conf.get("spark.sql.execution.arrow.pyspark.enabled") 
# Switch the default index type to "distributed". Per the original author's
# note, this is meant to avoid the overhead of building the default index.
ps.set_option("compute.default_index_type", "distributed")  

In [60]:
# Turn Arrow on, then time converting a 300000-row pandas-on-Spark range to
# pandas.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", True)
%timeit ps.range(300000).to_pandas()

                                                                                

332 ms ± 42.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [61]:
# Turn Arrow off and repeat the same conversion for comparison.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", False)
%timeit ps.range(300000).to_pandas()

1.91 s ± 363 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [66]:
 # Set its default value back.
# Undo both changes made for the experiment: reset the default index type and
# put the Arrow setting back to the value saved in `prev`.
ps.reset_option("compute.default_index_type")
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", prev) 
# Time the conversion once more under the restored settings.
%timeit ps.range(300000).to_pandas()

22/04/27 19:54:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/04/27 19:54:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/04/27 19:54:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/04/27 19:54:03 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/04/27 19:54:03 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/04/27 19:54:03 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/04/27 1

1.76 s ± 196 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# 分组

“分组依据”是指涉及以下一个或多个步骤的过程：

根据某些标准将数据分组

将函数独立应用于每个组

将结果组合成数据结构

In [67]:
# Example frame for grouping: two string key columns (A, B) and two numeric
# columns (C, D) of unseeded standard-normal draws. This rebinds `psdf`.
psdf = ps.DataFrame(
    data={
        'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
        'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
        'C': np.random.randn(8),
        'D': np.random.randn(8),
    }
)

In [68]:
# Display the eight-row example frame.
psdf

Unnamed: 0,A,B,C,D
0,foo,one,1.937254,-1.234743
1,bar,one,0.197343,0.450929
2,foo,two,1.424602,-0.989486
3,bar,three,-2.223636,-0.743189
4,foo,two,-2.150958,-0.164525
5,bar,two,0.398104,-1.604183
6,foo,one,-0.991699,-0.77424
7,foo,three,-0.51846,-0.457759


In [69]:
# Group rows by column A and sum each group. The output has one row per key
# (bar, foo) and contains the numeric columns C and D.
psdf.groupby('A').sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-1.628189,-1.896443
foo,-0.299261,-3.620754


In [70]:
# Summary statistics; only the numeric columns C and D appear in the output.
psdf.describe()

Unnamed: 0,C,D
count,8.0,8.0
mean,-0.240931,-0.68965
std,1.527787,0.639628
min,-2.223636,-1.604183
25%,-2.150958,-1.234743
50%,-0.51846,-0.77424
75%,0.398104,-0.457759
max,1.937254,0.450929


In [71]:
# Group by the (A, B) pair and sum each group. The rows in the output below
# are not in sorted key order.
psdf.groupby(['A', 'B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
foo,one,0.945555,-2.008983
bar,one,0.197343,0.450929
foo,two,-0.726356,-1.154011
bar,three,-2.223636,-0.743189
bar,two,0.398104,-1.604183
foo,three,-0.51846,-0.457759


# 绘图

In [77]:
# pandas Series of 1000 unseeded standard-normal draws, indexed by consecutive
# days starting on 2022-01-01.
pser = pd.Series(
    data=np.random.randn(1000),
    index=pd.date_range(start='1/1/2022', periods=1000),
)

In [78]:
# Build a pandas-on-Spark Series from the pandas Series.
psser = ps.Series(pser)

In [79]:
# Replace the series with its running (cumulative) maximum. `psser` is rebound,
# so the raw draws are only available via `pser` from here on.
psser = psser.cummax()

In [82]:
# Last ten entries. They are all the same value in the output because the
# running maximum had stopped changing by then.
psser.tail(10)

22/04/27 19:58:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/04/27 19:58:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/04/27 19:58:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


2024-09-17    2.926613
2024-09-18    2.926613
2024-09-19    2.926613
2024-09-20    2.926613
2024-09-21    2.926613
2024-09-22    2.926613
2024-09-23    2.926613
2024-09-24    2.926613
2024-09-25    2.926613
2024-09-26    2.926613
dtype: float64

In [81]:
# Plot the cumulative-maximum series (the figure is not part of the text output).
psser.plot()

22/04/27 19:57:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/04/27 19:57:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/04/27 19:57:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [83]:
# 1000x4 pandas frame of unseeded standard-normal draws that shares the daily
# index of `pser`. This rebinds `pdf`.
pdf = pd.DataFrame(
    data=np.random.randn(1000, 4),
    index=pser.index,
    columns=list('ABCD'),
)

In [84]:
# Convert the 1000-row pandas frame to pandas-on-Spark (rebinding `psdf`).
psdf = ps.from_pandas(pdf)

In [85]:
# Replace each column with its running maximum, then display the result.
# In the output every column is non-decreasing from one row to the next.
psdf = psdf.cummax()
psdf

22/04/27 19:58:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/04/27 19:58:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/04/27 19:58:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


Unnamed: 0,A,B,C,D
2022-01-01,-0.834554,1.480901,-1.286437,-0.034071
2022-01-02,0.115041,1.480901,1.082066,0.025251
2022-01-03,0.115041,1.480901,1.082066,0.808691
2022-01-04,0.668869,1.480901,1.082066,0.808691
2022-01-05,1.42289,1.480901,1.082066,0.808691
2022-01-06,1.42289,1.480901,1.082066,0.808691
2022-01-07,1.42289,1.480901,1.082066,0.808691
2022-01-08,1.42289,1.480901,1.082066,0.808691
2022-01-09,1.42289,1.480901,1.082066,1.563619
2022-01-10,1.42289,1.736714,1.082066,1.563619


In [86]:
# Plot the cumulative-maximum frame (the figure is not part of the text output).
psdf.plot()

22/04/27 19:59:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/04/27 19:59:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/04/27 19:59:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


# 输入/输出数据

写入和读取CSV

In [87]:
# Write the frame out in CSV format at the relative path 'foo.csv'.
psdf.to_csv('foo.csv')

22/04/27 22:16:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/04/27 22:16:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/04/27 22:16:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

In [88]:
# Display the in-memory frame for reference (still date-indexed).
psdf

Unnamed: 0,A,B,C,D
2022-01-01,-0.834554,1.480901,-1.286437,-0.034071
2022-01-02,0.115041,1.480901,1.082066,0.025251
2022-01-03,0.115041,1.480901,1.082066,0.808691
2022-01-04,0.668869,1.480901,1.082066,0.808691
2022-01-05,1.42289,1.480901,1.082066,0.808691
2022-01-06,1.42289,1.480901,1.082066,0.808691
2022-01-07,1.42289,1.480901,1.082066,0.808691
2022-01-08,1.42289,1.480901,1.082066,0.808691
2022-01-09,1.42289,1.480901,1.082066,1.563619
2022-01-10,1.42289,1.736714,1.082066,1.563619


In [93]:
# Time reading the CSV back and taking the first ten rows.
%timeit ps.read_csv('foo.csv').head(10)

315 ms ± 25 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Parquet

In [95]:
# Write the frame in Parquet format to 'bar.parquet', then time reading it back
# and taking the first ten rows.
psdf.to_parquet('bar.parquet')
%timeit ps.read_parquet('bar.parquet').head(10)

22/04/27 22:20:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/04/27 22:20:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/04/27 22:20:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


235 ms ± 21.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [96]:
# Show the first ten rows read back from Parquet. The output has an integer
# index 0..9 instead of dates, so the date index was not restored by this
# write/read pair.
ps.read_parquet('bar.parquet').head(10)

22/04/27 22:20:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/04/27 22:20:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


Unnamed: 0,A,B,C,D
0,-0.834554,1.480901,-1.286437,-0.034071
1,0.115041,1.480901,1.082066,0.025251
2,0.115041,1.480901,1.082066,0.808691
3,0.668869,1.480901,1.082066,0.808691
4,1.42289,1.480901,1.082066,0.808691
5,1.42289,1.480901,1.082066,0.808691
6,1.42289,1.480901,1.082066,0.808691
7,1.42289,1.480901,1.082066,0.808691
8,1.42289,1.480901,1.082066,1.563619
9,1.42289,1.736714,1.082066,1.563619


# Spark IO

In [97]:
# Write the frame through the generic Spark IO writer using the ORC format,
# then time reading it back and taking the first ten rows.
psdf.to_spark_io('zoo.orc', format="orc")
%timeit ps.read_spark_io('zoo.orc', format="orc").head(10)

22/04/27 22:21:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/04/27 22:21:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/04/27 22:21:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

117 ms ± 13.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [98]:
# Show the first ten rows read back from ORC. As with Parquet above, the
# output has an integer index 0..9 rather than the original dates.
ps.read_spark_io('zoo.orc', format="orc").head(10)

22/04/27 22:21:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/04/27 22:21:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


Unnamed: 0,A,B,C,D
0,-0.834554,1.480901,-1.286437,-0.034071
1,0.115041,1.480901,1.082066,0.025251
2,0.115041,1.480901,1.082066,0.808691
3,0.668869,1.480901,1.082066,0.808691
4,1.42289,1.480901,1.082066,0.808691
5,1.42289,1.480901,1.082066,0.808691
6,1.42289,1.480901,1.082066,0.808691
7,1.42289,1.480901,1.082066,0.808691
8,1.42289,1.480901,1.082066,1.563619
9,1.42289,1.736714,1.082066,1.563619
