# Spark DataFrames to Pandas conversion

## toPandas with Arrow

In [2]:
# Benchmark input for the toPandas timings below: a Spark DataFrame built from
# an RDD over range(1 << 22), with a `uid` column taken from the range values
# and an `x` column produced by rand().
from pyspark.sql import Row, SparkSession
from pyspark.sql.functions import rand

ss = SparkSession.builder.getOrCreate()

df = (
    ss.sparkContext
    .range(1 << 22)
    .map(lambda n: Row(uid=n))   # wrap each value in a single-field Row
    .toDF()
    .withColumn("x", rand())     # rand() is called without a seed
)
df.printSchema()

root
 |-- uid: long (nullable = true)
 |-- x: double (nullable = false)



In [12]:
# Baseline measurement: set the Arrow flag to "false" before converting, so
# this timing can be compared with the Arrow-enabled run in the next cell.
# NOTE(review): newer Spark releases appear to have renamed this key to
# "spark.sql.execution.arrow.pyspark.enabled" -- confirm against the Spark
# version this notebook targets before changing it.
ss.conf.set("spark.sql.execution.arrow.enabled", "false")

# %time covers only this one statement (the conversion of `df` to pandas).
%time pdf = df.toPandas()

CPU times: user 16.2 s, sys: 1.35 s, total: 17.5 s
Wall time: 32.2 s


In [13]:
# Same conversion as the previous cell, but with the Arrow flag set to "true".
# NOTE(review): newer Spark releases appear to have renamed this key to
# "spark.sql.execution.arrow.pyspark.enabled" -- confirm against the Spark
# version this notebook targets before changing it.
ss.conf.set("spark.sql.execution.arrow.enabled", "true")

# %time covers only this one statement; `pdf` from the earlier run is replaced.
%time pdf = df.toPandas()

CPU times: user 57.6 ms, sys: 69.2 ms, total: 127 ms
Wall time: 11.9 s


## toPandas using mapPartitions

In [5]:
# Rebuild the benchmark DataFrame (same construction as in the Arrow section)
# and repartition it into 100 partitions for the mapPartitions experiment.
from pyspark.sql import Row, SparkSession
from pyspark.sql.functions import rand

ss = SparkSession.builder.getOrCreate()

df = (
    ss.sparkContext
    .range(1 << 22)
    .map(lambda n: Row(uid=n))   # wrap each value in a single-field Row
    .toDF()
    .withColumn("x", rand())     # rand() is called without a seed
    .repartition(100)
)
df.printSchema()

root
 |-- uid: long (nullable = true)
 |-- x: double (nullable = false)



In [7]:
import pandas as pd

def map_to_pandas(rdds):
    """Convert the rows of one partition into a single pandas DataFrame.

    Written to be passed to ``RDD.mapPartitions``, which calls it once per
    partition with an iterator over that partition's rows.

    Parameters
    ----------
    rdds : iterable
        The rows of one partition. Rows may be plain tuples/lists or objects
        that carry their field names in a ``__fields__`` attribute (as
        pyspark ``Row`` objects do).

    Returns
    -------
    list of pandas.DataFrame
        A one-element list holding the partition's DataFrame, so that
        ``mapPartitions(...).collect()`` yields one DataFrame per partition.
        An empty partition yields an empty DataFrame.
    """
    rows = list(rdds)

    # pandas only picks up column names from namedtuple-style `_fields`; rows
    # that store their names in `__fields__` would otherwise end up with
    # positional column labels (0, 1, ...), so pass the names explicitly.
    fields = getattr(rows[0], "__fields__", None) if rows else None
    columns = list(fields) if fields is not None else None

    return [pd.DataFrame(rows, columns=columns)]

%time parts = df.rdd.mapPartitions(map_to_pandas).collect()
%time pdf = pd.concat(parts)

CPU times: user 73 ms, sys: 55.8 ms, total: 129 ms
Wall time: 10.5 s
CPU times: user 54.4 ms, sys: 49.5 ms, total: 104 ms
Wall time: 104 ms
