# Converting Between PySpark and pandas DataFrames with Apache Arrow

- A PySpark DataFrame can be converted to a pandas DataFrame with the `toPandas()` method

In [None]:
#!python -m pip install pandas

In [2]:
# Storage root and the directory that holds the foreclosure-prediction CSVs.
home = "/mnt"
dataset_dir = f"{home}/data/Predict-Forclosure/Foreclosure-Prediction-Dataset"

# One path per input file, all under the same dataset directory.
path_LMS = f"{dataset_dir}/LMS_31JAN2019.csv"
path_RMS_Final = f"{dataset_dir}/RF_Final_Data.csv"
path_Customer = f"{dataset_dir}/Customers_31JAN2019.csv"

In [3]:
# Read the LMS CSV: the first row is the header and column types are inferred.
df_lms = (
    spark.read
    .options(inferSchema="true", header="true")
    .csv(path_LMS)
)
df_lms

DataFrame[AGREEMENTID: int, CUSTOMERID: string, LOAN_AMT: string, NET_DISBURSED_AMT: string, INTEREST_START_DATE: string, CURRENT_ROI: double, ORIGNAL_ROI: double, CURRENT_TENOR: int, ORIGNAL_TENOR: int, DUEDAY: int, AUTHORIZATIONDATE: string, CITY: string, PRE_EMI_DUEAMT: double, PRE_EMI_RECEIVED_AMT: double, PRE_EMI_OS_AMOUNT: double, EMI_DUEAMT: double, EMI_RECEIVED_AMT: double, EMI_OS_AMOUNT: double, EXCESS_AVAILABLE: double, EXCESS_ADJUSTED_AMT: double, BALANCE_EXCESS: double, NET_RECEIVABLE: double, OUTSTANDING_PRINCIPAL: double, PAID_PRINCIPAL: double, PAID_INTEREST: double, MONTHOPENING: double, LAST_RECEIPT_DATE: string, LAST_RECEIPT_AMOUNT: double, NET_LTV: double, COMPLETED_TENURE: int, BALANCE_TENURE: int, DPD: int, FOIR: double, PRODUCT: string, SCHEMEID: string, NPA_IN_LAST_MONTH: string, NPA_IN_CURRENT_MONTH: string, MOB: int]

In [4]:
# Read the customer CSV: the first row is the header and column types are inferred.
df_Customer = (
    spark.read
    .options(inferSchema="true", header="true")
    .csv(path_Customer)
)
df_Customer

DataFrame[CUSTOMERID: int, CUST_CONSTTYPE_ID: int, CUST_CATEGORYID: int, PROFESSION: string, AGE: int, SEX: string, MARITAL_STATUS: string, QUALIFICATION: string, NO_OF_DEPENDENT: int, OCCUPATION: string, POSITION: string, GROSS_INCOME: double, PRE_JOBYEARS: int, NETTAKEHOMEINCOME: double, BRANCH_PINCODE: string]

## Convert PySpark Dataframe to Pandas DataFrame
- toPandas() results in the collection of all records in the PySpark DataFrame to the driver program
- Should be done only on a small subset of the data.
- Running it on a larger dataset can cause an out-of-memory error and crash the application
- To deal with a larger dataset, you can also try increasing memory on the driver.

In [5]:
# toPandas() collects every record of df_Customer to the driver and returns
# them as a pandas DataFrame; print() then shows its plain-text rendering.
pandasDF = df_Customer.toPandas()
print(pandasDF)

      CUSTOMERID  CUST_CONSTTYPE_ID  CUST_CATEGORYID PROFESSION   AGE SEX  \
0       12001000                  1                5       None  33.0   M   
1       12001001                  1                5       None  44.0   M   
2       12001002                  1                7       None  50.0   M   
3       12001003                  3                4       None  40.0   M   
4       12001004                  3                4       None  27.0   M   
...          ...                ...              ...        ...   ...  ..   
9995    12014215                  1                4       None  36.0   M   
9996    12022042                  1                5       None  33.0   M   
9997    12024250                  1                4       None  54.0   M   
9998    12023448                  1                4       None  64.0   M   
9999    12023683                  1                4       None  30.0   M   

     MARITAL_STATUS QUALIFICATION  NO_OF_DEPENDENT OCCUPATION POSITION  \
0

## Apache Arrow in Spark
- An in-memory columnar data format that is used in Spark to efficiently transfer data between JVM and Python processes
- This currently is most beneficial to Python users that work with Pandas/NumPy data
- Its usage is not automatic and might require some minor changes to configuration or code to take full advantage and ensure compatibility

### Ensure PyArrow Installed

In [7]:
# Use any one of the commands below to install PyArrow (all are left commented out).
#!python -m pip install pyspark[sql]
#!conda install -c conda-forge pyarrow
#!python -m pip install Cython
#!python -m pip install pyarrow==0.9.0
# NOTE(review): pyarrow 0.9.0 is a very old release. This notebook later sets
# `spark.sql.execution.arrow.pyspark.enabled`, a Spark 3.0+ setting, and Spark 3.x
# presumably needs a newer PyArrow. Confirm the minimum supported version in the
# PySpark installation docs before pinning.

## Enabling for Conversion to/from Pandas
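- To use Arrow when executing these calls, users need to first set the Spark configuration `spark.sql.execution.arrow.pyspark.enabled` to `true` (before Spark 3.0 this setting was named `spark.sql.execution.arrow.enabled`, which is now deprecated)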
- This is disabled by default.

In [8]:
import numpy as np
import pandas as pd

In [9]:
# Display the pandas DataFrame obtained from df_Customer.toPandas().
pandasDF

Unnamed: 0,CUSTOMERID,CUST_CONSTTYPE_ID,CUST_CATEGORYID,PROFESSION,AGE,SEX,MARITAL_STATUS,QUALIFICATION,NO_OF_DEPENDENT,OCCUPATION,POSITION,GROSS_INCOME,PRE_JOBYEARS,NETTAKEHOMEINCOME,BRANCH_PINCODE
0,12001000,1,5,,33.0,M,M,POSTGRAD,0.0,,,198375.22180,8.0,198375.22180,400070
1,12001001,1,5,,44.0,M,M,POSTGRAD,0.0,,,242703.98290,10.0,242703.98290,400070
2,12001002,1,7,,50.0,M,M,GRAD,0.0,,,0.00000,,0.00000,400070
3,12001003,3,4,,40.0,M,M,GRAD,0.0,,,365263.51160,,365263.51610,400070
4,12001004,3,4,,27.0,M,M,GRAD,0.0,,,187342.19900,,187342.19900,400070
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,12014215,1,4,,36.0,M,M,UG,0.0,,,679254.64900,,679254.65350,395002
9996,12022042,1,5,,33.0,M,M,GRAD,0.0,,,78043.36087,6.0,78043.36087,440010
9997,12024250,1,4,,54.0,M,M,GRAD,0.0,,,75807.18077,,75807.18077,500034
9998,12023448,1,4,,64.0,M,M,GRAD,0.0,,,124614.56250,,124614.55790,400070


In [10]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType

In [11]:
# Column name -> Spark type for the customer data, in column order.
_customer_columns = [
    ("CUSTOMERID", IntegerType()),
    ("CUST_CONSTTYPE_ID", IntegerType()),
    ("CUST_CATEGORYID", IntegerType()),
    ("PROFESSION", StringType()),
    ("AGE", DoubleType()),
    ("SEX", StringType()),
    ("MARITAL_STATUS", StringType()),
    ("QUALIFICATION", StringType()),
    ("NO_OF_DEPENDENT", DoubleType()),
    ("OCCUPATION", StringType()),
    ("POSITION", StringType()),
    ("GROSS_INCOME", DoubleType()),
    ("PRE_JOBYEARS", DoubleType()),
    ("NETTAKEHOMEINCOME", DoubleType()),
    ("BRANCH_PINCODE", StringType()),
]

# Every field is nullable (third StructField argument is True).
my_schema = StructType(
    [StructField(column, spark_type, True) for column, spark_type in _customer_columns]
)

In [12]:
# Create a Spark DataFrame from a pandas DataFrame, applying the explicit schema.
# NOTE: the Arrow setting is only switched on in a later cell, so on a fresh
# top-to-bottom run this conversion happens before Arrow has been enabled.
df = spark.createDataFrame(pandasDF, schema=my_schema)

df.show()

+----------+-----------------+---------------+----------+----+---+--------------+-------------+---------------+----------+--------+------------+------------+-----------------+--------------+
|CUSTOMERID|CUST_CONSTTYPE_ID|CUST_CATEGORYID|PROFESSION| AGE|SEX|MARITAL_STATUS|QUALIFICATION|NO_OF_DEPENDENT|OCCUPATION|POSITION|GROSS_INCOME|PRE_JOBYEARS|NETTAKEHOMEINCOME|BRANCH_PINCODE|
+----------+-----------------+---------------+----------+----+---+--------------+-------------+---------------+----------+--------+------------+------------+-----------------+--------------+
|  12001000|                1|              5|      null|33.0|  M|             M|     POSTGRAD|            0.0|      null|    null| 198375.2218|         8.0|      198375.2218|        400070|
|  12001001|                1|              5|      null|44.0|  M|             M|     POSTGRAD|            0.0|      null|    null| 242703.9829|        10.0|      242703.9829|        400070|
|  12001002|                1|              7

In [13]:
# Enable Arrow-based columnar data transfers for the pandas conversions
# (toPandas / createDataFrame from pandas) executed after this cell.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

- In addition, optimizations enabled by `spark.sql.execution.arrow.pyspark.enabled` can fall back automatically to the non-Arrow implementation if an error occurs before the actual computation within Spark
- This can be controlled by `spark.sql.execution.arrow.pyspark.fallback.enabled` (named `spark.sql.execution.arrow.fallback.enabled` before Spark 3.0).

In [14]:
# Allow Arrow-backed conversions to fall back to the non-Arrow implementation
# when an error occurs. This uses the Spark 3.0+ key so that it matches the
# `spark.sql.execution.arrow.pyspark.enabled` setting used to turn Arrow on;
# the un-prefixed `spark.sql.execution.arrow.fallback.enabled` is deprecated.
spark.conf.set("spark.sql.execution.arrow.pyspark.fallback.enabled", "true")

In [15]:
# Convert the Spark DataFrame to a pandas DataFrame; this runs after the Arrow
# setting was enabled. select("*") keeps every column of df_Customer.
result_pdf = df_Customer.select("*").toPandas()
result_pdf

Unnamed: 0,CUSTOMERID,CUST_CONSTTYPE_ID,CUST_CATEGORYID,PROFESSION,AGE,SEX,MARITAL_STATUS,QUALIFICATION,NO_OF_DEPENDENT,OCCUPATION,POSITION,GROSS_INCOME,PRE_JOBYEARS,NETTAKEHOMEINCOME,BRANCH_PINCODE
0,12001000,1,5,,33.0,M,M,POSTGRAD,0.0,,,198375.22180,8.0,198375.22180,400070
1,12001001,1,5,,44.0,M,M,POSTGRAD,0.0,,,242703.98290,10.0,242703.98290,400070
2,12001002,1,7,,50.0,M,M,GRAD,0.0,,,0.00000,,0.00000,400070
3,12001003,3,4,,40.0,M,M,GRAD,0.0,,,365263.51160,,365263.51610,400070
4,12001004,3,4,,27.0,M,M,GRAD,0.0,,,187342.19900,,187342.19900,400070
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,12014215,1,4,,36.0,M,M,UG,0.0,,,679254.64900,,679254.65350,395002
9996,12022042,1,5,,33.0,M,M,GRAD,0.0,,,78043.36087,6.0,78043.36087,440010
9997,12024250,1,4,,54.0,M,M,GRAD,0.0,,,75807.18077,,75807.18077,500034
9998,12023448,1,4,,64.0,M,M,GRAD,0.0,,,124614.56250,,124614.55790,400070


- Using the above optimizations with Arrow will produce the same results as when Arrow is not enabled
- Even with Arrow, toPandas() results in the collection of all records in the DataFrame to the driver program and should be done on a small subset of the data
- Not all Spark data types are currently supported and an error can be raised if a column has an unsupported type, see Supported SQL Types
- If an error occurs during createDataFrame(), Spark will fall back to create the DataFrame without Arrow.

## Pandas UDFs
- User defined functions that are executed by Spark using Arrow to transfer data and Pandas to work with the data.

### Scalar

In [16]:
import pandas as pd

from pyspark.sql.functions import col, pandas_udf
from pyspark.sql.types import LongType

# Declare the function and create the UDF
def compute_func(a, b):
    """Return the difference ``a - b``.

    Only the ``-`` operator is used, so the function works on pandas Series
    (element-wise) as well as on plain numbers.
    """
    difference = a - b
    return difference

# Wrap compute_func as a pandas UDF. It is applied to two double columns
# (GROSS_INCOME and NETTAKEHOMEINCOME), so the declared return type must be
# double; a LongType result cannot hold the fractional differences.
my_pudf_compute = pandas_udf(compute_func, returnType="double")


In [17]:
# Display the pandas DataFrame returned by the earlier toPandas() call.
result_pdf

Unnamed: 0,CUSTOMERID,CUST_CONSTTYPE_ID,CUST_CATEGORYID,PROFESSION,AGE,SEX,MARITAL_STATUS,QUALIFICATION,NO_OF_DEPENDENT,OCCUPATION,POSITION,GROSS_INCOME,PRE_JOBYEARS,NETTAKEHOMEINCOME,BRANCH_PINCODE
0,12001000,1,5,,33.0,M,M,POSTGRAD,0.0,,,198375.22180,8.0,198375.22180,400070
1,12001001,1,5,,44.0,M,M,POSTGRAD,0.0,,,242703.98290,10.0,242703.98290,400070
2,12001002,1,7,,50.0,M,M,GRAD,0.0,,,0.00000,,0.00000,400070
3,12001003,3,4,,40.0,M,M,GRAD,0.0,,,365263.51160,,365263.51610,400070
4,12001004,3,4,,27.0,M,M,GRAD,0.0,,,187342.19900,,187342.19900,400070
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,12014215,1,4,,36.0,M,M,UG,0.0,,,679254.64900,,679254.65350,395002
9996,12022042,1,5,,33.0,M,M,GRAD,0.0,,,78043.36087,6.0,78043.36087,440010
9997,12024250,1,4,,54.0,M,M,GRAD,0.0,,,75807.18077,,75807.18077,500034
9998,12023448,1,4,,64.0,M,M,GRAD,0.0,,,124614.56250,,124614.55790,400070


In [53]:
# A pandas_udf function must also run on local pandas data, so try it
# directly on two columns of the pandas DataFrame first.
gross_income = result_pdf['GROSS_INCOME']
net_take_home = result_pdf['NETTAKEHOMEINCOME']
print(compute_func(gross_income, net_take_home))

0       0.0000
1       0.0000
2       0.0000
3      -0.0045
4       0.0000
         ...  
9995   -0.0045
9996    0.0000
9997    0.0000
9998    0.0046
9999    0.0000
Length: 10000, dtype: float64


In [18]:
# Run the same function inside Spark as a vectorized (pandas) UDF.
income_gap = my_pudf_compute(col("GROSS_INCOME"), col("NETTAKEHOMEINCOME"))
df_Customer.select(income_gap).show()

+----------------------------------+
|(GROSS_INCOME - NETTAKEHOMEINCOME)|
+----------------------------------+
|                               0.0|
|                               0.0|
|                               0.0|
|              -0.00449999998090...|
|                               0.0|
|                               0.0|
|              0.004510000006121118|
|                               0.0|
|              -0.00450000001001...|
|              0.004500000010011718|
|                               0.0|
|                               0.0|
|                               0.0|
|                               0.0|
|                               0.0|
|                               0.0|
|                               0.0|
|                               0.0|
|              -0.00449999998090...|
|                               0.0|
+----------------------------------+
only showing top 20 rows

